mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 17:49:33 +00:00 
			
		
		
		
	Compare commits
	
		
			164 Commits
		
	
	
		
			feature/sh
			...
			ISC-freeze
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 251b904a28 | ||
|  | 5dfd216a34 | ||
|  | 5a112feac3 | ||
|  | c2e8d0aa88 | ||
|  | bf96a4bdbf | ||
|  | 84685c9bc3 | ||
|  | 013ea4e8d1 | ||
|  | 7fbbb31a50 | ||
|  | 0e127b1fc7 | ||
|  | 68c028b0a6 | ||
| a61e0df54b | |||
| f871fb0c6d | |||
|  | 25d1cadd3b | ||
|  | c24d53bbd1 | ||
| 3c7a4106ed | |||
|  | 6eed167f0c | ||
|  | 4ad0df6fde | ||
|  | 68a5079f33 | ||
|  | 8634e19f1b | ||
|  | 9ada378e38 | ||
| bfbf2f1fa0 | |||
|  | 587bfcc0f4 | ||
|  | 8c658de179 | ||
|  | ba37d51ee9 | ||
|  | 4f4181c54a | ||
|  | 4d4ac2517b | ||
|  | e568c24d1d | ||
|  | b458326744 | ||
|  | 6e7d5e2243 | ||
|  | b35169f1dd | ||
|  | 441ad7498d | ||
|  | 6f6c5c549a | ||
|  | 1584e17b54 | ||
|  | 12982a4455 | ||
|  | 172f412102 | ||
|  | a64497265d | ||
|  | c45f24a1b5 | ||
|  | aaf37ee4d7 | ||
|  | 1dddd17e3c | ||
|  | 661f1d3e8e | ||
|  | edcf9b9293 | ||
|  | fe6860b4dd | ||
|  | d6406b13e1 | ||
|  | e369d7306d | ||
|  | 9f8d63e104 | ||
|  | 9b0240d101 | ||
|  | b27f0e5a53 | ||
|  | 75e4483407 | ||
|  | 0734e9ddd4 | ||
|  | 809b1cdd58 | ||
|  | 1be8089604 | ||
|  | 3e0eff6468 | ||
|  | 7ecc47ac89 | ||
|  | e9f1ac09de | ||
|  | fa0d8feff4 | ||
|  | 05b44aef6b | ||
|  | 03e9832efa | ||
|  | 28a375d35d | ||
|  | 3b06381745 | ||
|  | 91a0a3f820 | ||
|  | 8f44c799a6 | ||
|  | 96272f3841 | ||
|  | 5c936d88a0 | ||
|  | 1c64ee926e | ||
|  | 2cbb72a81c | ||
|  | 31d83ee046 | ||
|  | a9e8758a01 | ||
|  | 3e125c5b61 | ||
|  | eac6ec4b5e | ||
|  | 213f8db6a2 | ||
|  | 6358f35b7e | ||
|  | 43f5a0df50 | ||
|  | c897878776 | ||
|  | 2baf193031 | ||
|  | 362ba0443a | ||
|  | 276a2353df | ||
|  | c5b9147b53 | ||
|  | 64ac815fd9 | ||
|  | a1be533329 | ||
| 141da3ae71 | |||
| 94edf9cf8b | |||
| c11a3ca0a7 | |||
|  | 870b1a85ae | ||
|  | b5510427f9 | ||
|  | 26ed65c8f8 | ||
|  | f7f043d8cf | ||
|  | ddcaa6ad29 | ||
| c8d4d184ee | |||
| 1569a374a9 | |||
| eddf023b8a | |||
|  | f089bf5629 | ||
|  | 276f113f28 | ||
| a13c109111 | |||
|  | ab6afd18ac | ||
|  | 5bde64d48b | ||
|  | 2f5add4d5f | ||
| c5a885dcd6 | |||
|  | c9c073eee4 | ||
|  | f290b2e908 | ||
|  | 5f8225461b | ||
| 20e186a1e0 | |||
|  | 6ef4af989b | ||
|  | ccde8b817f | ||
|  | 68168bf72d | ||
|  | e93d0feaa7 | ||
|  | 8f601d9b39 | ||
|  | 5436308e4a | ||
|  | 07fe7d0cbe | ||
|  | 60b57706c4 | ||
|  | 954e38bebe | ||
|  | b1a38bde7a | ||
|  | 2581875edc | ||
|  | 6c6d43eb4e | ||
|  | e1dcfd3553 | ||
|  | 888838473a | ||
|  | 01568b0e62 | ||
|  | d5ce66f6ab | ||
|  | d86936a3de | ||
|  | 0fb84fa34b | ||
|  | 0880747edb | ||
|  | b801e1fcd6 | ||
| 360cface33 | |||
|  | 80302e95a8 | ||
| caf2f6b274 | |||
| c49be8988b | |||
| 971c2379bd | |||
|  | 94b0d66e4c | ||
|  | 5e8af396fd | ||
| a7d19dbb64 | |||
| 90dbe03e17 | |||
| 8b14096990 | |||
|  | b938202081 | ||
| 485c5db0fe | |||
|  | c399c2b44d | ||
|  | af7de7a294 | ||
|  | 1dc86efd26 | ||
| 30391cb2eb | |||
|  | 2e88408f5c | ||
|  | 0f468e2179 | ||
|  | 4790e99817 | ||
|  | 2dd63aa7a4 | ||
|  | 559a501140 | ||
|  | 945684c470 | ||
|  | e30a80a234 | ||
|  | c96483e3bd | ||
|  | ae31a6a760 | ||
|  | dd8f2a64fe | ||
|  | 7b8b2731e7 | ||
|  | 237a8ec918 | ||
|  | 896f3a8002 | ||
|  | f0fcdf75b5 | ||
|  | 53bffb83d4 | ||
|  | cd44e851f1 | ||
|  | fb24e3a7d2 | ||
|  | 655a69259a | ||
|  | 507c4e9efc | ||
|  | f8a5194c70 | ||
|  | cff3bae155 | ||
| 6e3ce7423e | |||
| 15f15a7cfd | |||
| 0e5f626226 | |||
|  | 97b9c6f03d | ||
|  | 63982819c6 | ||
|  | 24162c9ead | 
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -123,6 +123,7 @@ make-bin-BUCK.sh | |||||||
| ##################### | ##################### | ||||||
| lib/qcd/spin/gamma-gen/*.h | lib/qcd/spin/gamma-gen/*.h | ||||||
| lib/qcd/spin/gamma-gen/*.cc | lib/qcd/spin/gamma-gen/*.cc | ||||||
|  | lib/version.h | ||||||
|  |  | ||||||
| # vs code editor files # | # vs code editor files # | ||||||
| ######################## | ######################## | ||||||
|   | |||||||
							
								
								
									
										17
									
								
								.travis.yml
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								.travis.yml
									
									
									
									
									
								
							| @@ -19,6 +19,8 @@ before_install: | |||||||
|     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi |     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi | ||||||
|      |      | ||||||
| install: | install: | ||||||
|  |     - export CWD=`pwd` | ||||||
|  |     - echo $CWD | ||||||
|     - export CC=$CC$VERSION |     - export CC=$CC$VERSION | ||||||
|     - export CXX=$CXX$VERSION |     - export CXX=$CXX$VERSION | ||||||
|     - echo $PATH |     - echo $PATH | ||||||
| @@ -36,11 +38,22 @@ script: | |||||||
|     - ./bootstrap.sh |     - ./bootstrap.sh | ||||||
|     - mkdir build |     - mkdir build | ||||||
|     - cd build |     - cd build | ||||||
|     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none |     - mkdir lime | ||||||
|  |     - cd lime | ||||||
|  |     - mkdir build | ||||||
|  |     - cd build | ||||||
|  |     - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz | ||||||
|  |     - tar xf lime-1.3.2.tar.gz | ||||||
|  |     - cd lime-1.3.2 | ||||||
|  |     - ./configure --prefix=$CWD/build/lime/install | ||||||
|  |     - make -j4 | ||||||
|  |     - make install | ||||||
|  |     - cd $CWD/build | ||||||
|  |     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install | ||||||
|     - make -j4  |     - make -j4  | ||||||
|     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals |     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||||
|     - echo make clean |     - echo make clean | ||||||
|     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none |     - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install | ||||||
|     - make -j4 |     - make -j4 | ||||||
|     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals |     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||||
|     - make check |     - make check | ||||||
|   | |||||||
| @@ -5,6 +5,10 @@ include $(top_srcdir)/doxygen.inc | |||||||
|  |  | ||||||
| bin_SCRIPTS=grid-config | bin_SCRIPTS=grid-config | ||||||
|  |  | ||||||
|  | BUILT_SOURCES = version.h | ||||||
|  |  | ||||||
|  | version.h: | ||||||
|  | 	echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d\\"%n" HEAD`" > $(srcdir)/lib/version.h | ||||||
|  |  | ||||||
| .PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) | .PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used: | |||||||
| | `<code>`    | Description                            | | | `<code>`    | Description                            | | ||||||
| | ----------- | -------------------------------------- | | | ----------- | -------------------------------------- | | ||||||
| | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) | | | `KNL`       | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) | | ||||||
|  | | `SKL`       | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) | | ||||||
| | `BGQ`       | Blue Gene/Q                            | | | `BGQ`       | Blue Gene/Q                            | | ||||||
|  |  | ||||||
| #### Notes: | #### Notes: | ||||||
| - We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced. | - We currently support AVX512 for the Intel compiler and GCC (KNL and SKL target). Support for clang will appear in future versions of Grid when the AVX512 support in the compiler will be more advanced. | ||||||
| - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. | - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform. | ||||||
| - BG/Q performances are currently rather poor. This is being investigated for future versions. | - BG/Q performances are currently rather poor. This is being investigated for future versions. | ||||||
| - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`. | - The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`. | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								VERSION
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								VERSION
									
									
									
									
									
								
							| @@ -1,4 +1,4 @@ | |||||||
| Version : 0.7.0 | Version : 0.8.0 | ||||||
|  |  | ||||||
| - Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended | - Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended | ||||||
| - MPI and MPI3 comms optimisations for KNL and OPA finished | - MPI and MPI3 comms optimisations for KNL and OPA finished | ||||||
|   | |||||||
							
								
								
									
										108
									
								
								benchmarks/Benchmark_IO.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								benchmarks/Benchmark_IO.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,108 @@ | |||||||
|  | #include <Grid/Grid.h> | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |  | ||||||
|  | using namespace std; | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  | #define MSG cout << GridLogMessage | ||||||
|  | #define SEP \ | ||||||
|  | "=============================================================================" | ||||||
|  | #ifndef BENCH_IO_LMAX | ||||||
|  | #define BENCH_IO_LMAX 40 | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | typedef function<void(const string, LatticeFermion &)> WriterFn; | ||||||
|  | typedef function<void(LatticeFermion &, const string)> ReaderFn; | ||||||
|  |  | ||||||
|  | string filestem(const int l) | ||||||
|  | { | ||||||
|  |   return "iobench_l" + to_string(l); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void limeWrite(const string filestem, LatticeFermion &vec) | ||||||
|  | { | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacWriter    binWriter(vec._grid->IsBoss()); | ||||||
|  |  | ||||||
|  |   binWriter.open(filestem + ".bin"); | ||||||
|  |   binWriter.writeScidacFieldRecord(vec, record); | ||||||
|  |   binWriter.close(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void limeRead(LatticeFermion &vec, const string filestem) | ||||||
|  | { | ||||||
|  |   emptyUserRecord record; | ||||||
|  |   ScidacReader    binReader; | ||||||
|  |  | ||||||
|  |   binReader.open(filestem + ".bin"); | ||||||
|  |   binReader.readScidacFieldRecord(vec, record); | ||||||
|  |   binReader.close(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void writeBenchmark(const int l, const WriterFn &write) | ||||||
|  | { | ||||||
|  |   auto                      mpi  = GridDefaultMpi(); | ||||||
|  |   auto                      simd = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||||
|  |   vector<int>               latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||||
|  |   unique_ptr<GridCartesian> gPt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); | ||||||
|  |   GridCartesian             *g = gPt.get(); | ||||||
|  |   GridParallelRNG           rng(g); | ||||||
|  |   LatticeFermion            vec(g); | ||||||
|  |   emptyUserRecord           record; | ||||||
|  |   ScidacWriter              binWriter(g->IsBoss()); | ||||||
|  |  | ||||||
|  |   cout << "-- Local volume " << l << "^4" << endl; | ||||||
|  |   random(rng, vec); | ||||||
|  |   write(filestem(l), vec); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void readBenchmark(const int l, const ReaderFn &read) | ||||||
|  | { | ||||||
|  |   auto                      mpi  = GridDefaultMpi(); | ||||||
|  |   auto                      simd = GridDefaultSimd(Nd, vComplex::Nsimd()); | ||||||
|  |   vector<int>               latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||||
|  |   unique_ptr<GridCartesian> gPt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); | ||||||
|  |   GridCartesian             *g = gPt.get(); | ||||||
|  |   LatticeFermion            vec(g); | ||||||
|  |   emptyUserRecord           record; | ||||||
|  |   ScidacReader              binReader; | ||||||
|  |  | ||||||
|  |   cout << "-- Local volume " << l << "^4" << endl; | ||||||
|  |   read(vec, filestem(l)); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |   auto simd = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
|  |   auto mpi  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|  |   int64_t threads = GridThread::GetThreads(); | ||||||
|  |   MSG << "Grid is setup to use " << threads << " threads" << endl; | ||||||
|  |   MSG << SEP << endl; | ||||||
|  |   MSG << "Benchmark Lime write" << endl; | ||||||
|  |   MSG << SEP << endl; | ||||||
|  |   for (int l = 4; l <= BENCH_IO_LMAX; l += 2) | ||||||
|  |   { | ||||||
|  |     writeBenchmark(l, limeWrite); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   MSG << "Benchmark Lime read" << endl; | ||||||
|  |   MSG << SEP << endl; | ||||||
|  |   for (int l = 4; l <= BENCH_IO_LMAX; l += 2) | ||||||
|  |   { | ||||||
|  |     readBenchmark(l, limeRead); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  |  | ||||||
|  |   return EXIT_SUCCESS; | ||||||
|  | } | ||||||
|  | #else | ||||||
|  | int main (int argc, char ** argv) | ||||||
|  | { | ||||||
|  |   return EXIT_SUCCESS; | ||||||
|  | } | ||||||
|  | #endif | ||||||
| @@ -158,8 +158,10 @@ public: | |||||||
|  |  | ||||||
| 	  dbytes=0; | 	  dbytes=0; | ||||||
| 	  ncomm=0; | 	  ncomm=0; | ||||||
|  | #ifdef GRID_OMP | ||||||
| 	  parallel_for(int dir=0;dir<8;dir++){ | #pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads) | ||||||
|  | #endif | ||||||
|  | 	  for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
| 	    double tbytes; | 	    double tbytes; | ||||||
| 	    int mu =dir % 4; | 	    int mu =dir % 4; | ||||||
| @@ -175,9 +177,14 @@ public: | |||||||
| 		int comm_proc = mpi_layout[mu]-1; | 		int comm_proc = mpi_layout[mu]-1; | ||||||
| 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
| 	      } | 	      } | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | 	int tid = omp_get_thread_num();  | ||||||
|  | #else  | ||||||
|  |         int tid = dir; | ||||||
|  | #endif | ||||||
| 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | 	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
| 						 (void *)&rbuf[dir][0], recv_from_rank, | 						 (void *)&rbuf[dir][0], recv_from_rank, | ||||||
| 						 bytes,dir); | 						 bytes,tid); | ||||||
| 	   | 	   | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
| #pragma omp atomic | #pragma omp atomic | ||||||
|   | |||||||
| @@ -169,7 +169,11 @@ int main (int argc, char ** argv) | |||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=8;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat,lat,lat,lat}); |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|  |                                     lat*mpi_layout[1], | ||||||
|  |                                     lat*mpi_layout[2], | ||||||
|  |                                     lat*mpi_layout[3]}); | ||||||
|  |  | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|       RealD Nrank = Grid._Nprocessors; |       RealD Nrank = Grid._Nprocessors; | ||||||
| @@ -446,7 +450,7 @@ int main (int argc, char ** argv) | |||||||
|   }     |   }     | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef GRID_OMP | ||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; |   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
| @@ -485,7 +489,8 @@ int main (int argc, char ** argv) | |||||||
| 	dbytes=0; | 	dbytes=0; | ||||||
| 	ncomm=0; | 	ncomm=0; | ||||||
|  |  | ||||||
| 	parallel_for(int dir=0;dir<8;dir++){ | #pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads) | ||||||
|  | 	for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
| 	  double tbytes; | 	  double tbytes; | ||||||
| 	  int mu =dir % 4; | 	  int mu =dir % 4; | ||||||
| @@ -502,9 +507,9 @@ int main (int argc, char ** argv) | |||||||
| 	      int comm_proc = mpi_layout[mu]-1; | 	      int comm_proc = mpi_layout[mu]-1; | ||||||
| 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
| 	    } | 	    } | ||||||
|  |             int tid = omp_get_thread_num(); | ||||||
| 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
| 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); | 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,tid); | ||||||
|  |  | ||||||
| #pragma omp atomic | #pragma omp atomic | ||||||
| 	    dbytes+=tbytes; | 	    dbytes+=tbytes; | ||||||
| @@ -532,7 +537,7 @@ int main (int argc, char ** argv) | |||||||
|   |   | ||||||
|     } |     } | ||||||
|   }     |   }     | ||||||
|  | #endif | ||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl; |   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   | |||||||
| @@ -48,7 +48,6 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|  |  | ||||||
|   int threads = GridThread::GetThreads(); |   int threads = GridThread::GetThreads(); | ||||||
|   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; |  | ||||||
|  |  | ||||||
|   std::vector<int> latt4 = GridDefaultLatt(); |   std::vector<int> latt4 = GridDefaultLatt(); | ||||||
|   int Ls=16; |   int Ls=16; | ||||||
| @@ -57,6 +56,10 @@ int main (int argc, char ** argv) | |||||||
|       std::stringstream ss(argv[i+1]); ss >> Ls; |       std::stringstream ss(argv[i+1]); ss >> Ls; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |   GridLogLayout(); | ||||||
|  |  | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|  |  | ||||||
|  |  | ||||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
| @@ -187,7 +190,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=1344*volume*ncall; |     double flops=single_site_flops*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; |     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||||
| @@ -226,7 +229,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=1344*volume*ncall; |     double flops=single_site_flops*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; |     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
| @@ -277,7 +280,7 @@ int main (int argc, char ** argv) | |||||||
|     double t1=usecond(); |     double t1=usecond(); | ||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=1344*volume*ncall; |     double flops=single_site_flops*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; |     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
| @@ -355,7 +358,7 @@ int main (int argc, char ** argv) | |||||||
|       //      sDw.stat.print(); |       //      sDw.stat.print(); | ||||||
|  |  | ||||||
|       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|       double flops=(1344.0*volume*ncall)/2; |       double flops=(single_site_flops*volume*ncall)/2.0; | ||||||
|  |  | ||||||
|       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl; |       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
|       std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; |       std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||||
| @@ -478,7 +481,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=(1344.0*volume*ncall)/2; |     double flops=(single_site_flops*volume*ncall)/2.0; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; |     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; |     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||||
|   | |||||||
| @@ -51,6 +51,7 @@ int main (int argc, char ** argv) | |||||||
| { | { | ||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; |   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
| @@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report ) | |||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); |   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|  |  | ||||||
|   std::vector<int> seeds4({1,2,3,4}); |   std::vector<int> seeds4({1,2,3,4}); | ||||||
|   std::vector<int> seeds5({5,6,7,8}); |   std::vector<int> seeds5({5,6,7,8}); | ||||||
| @@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report ) | |||||||
|    |    | ||||||
|   if ( ! report ) { |   if ( ! report ) { | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=1344*volume*ncall; |     double flops=single_site_flops*volume*ncall; | ||||||
|     std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t"; |     std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t"; | ||||||
|   } |   } | ||||||
|    |    | ||||||
| @@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report ) | |||||||
|      |      | ||||||
|     if(!report){ |     if(!report){ | ||||||
|       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|       double flops=(1344.0*volume*ncall)/2; |       double flops=(single_site_flops*volume*ncall)/2.0; | ||||||
|       std::cout<< flops/(t1-t0); |       std::cout<< flops/(t1-t0); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report ) | |||||||
| #define CHECK_SDW | #define CHECK_SDW | ||||||
| void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report ) | void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report ) | ||||||
| { | { | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|  |  | ||||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); |   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); | ||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||||
| @@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report ) | |||||||
|     Counter.Report(); |     Counter.Report(); | ||||||
|   } else {  |   } else {  | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=1344*volume*ncall; |     double flops=single_site_flops*volume*ncall; | ||||||
|     std::cout<<"\t"<< flops/(t1-t0); |     std::cout<<"\t"<< flops/(t1-t0); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report ) | |||||||
|     CounterSdw.Report(); |     CounterSdw.Report(); | ||||||
|   } else { |   } else { | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=(1344.0*volume*ncall)/2; |     double flops=(single_site_flops*volume*ncall)/2.0; | ||||||
|     std::cout<<"\t"<< flops/(t1-t0); |     std::cout<<"\t"<< flops/(t1-t0); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -107,7 +107,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=2*1344*volume*ncall; |     double flops=2*1320*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; |     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||||
| @@ -134,7 +134,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=2*1344*volume*ncall; |     double flops=2*1320*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; |     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
| @@ -174,7 +174,7 @@ int main (int argc, char ** argv) | |||||||
|     FGrid_d->Barrier(); |     FGrid_d->Barrier(); | ||||||
|      |      | ||||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|     double flops=2*1344*volume*ncall; |     double flops=2*1320*volume*ncall; | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; |     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; |     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||||
|   | |||||||
| @@ -55,7 +55,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|   uint64_t lmax=96; |   uint64_t lmax=64; | ||||||
| #define NLOOP (10*lmax*lmax*lmax*lmax/vol) | #define NLOOP (10*lmax*lmax*lmax*lmax/vol) | ||||||
|   for(int lat=8;lat<=lmax;lat+=8){ |   for(int lat=8;lat<=lmax;lat+=8){ | ||||||
|  |  | ||||||
|   | |||||||
| @@ -35,9 +35,11 @@ using namespace Grid::QCD; | |||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
| #define LMAX (64) | #define LMAX (32) | ||||||
|  | #define LMIN (16) | ||||||
|  | #define LINC (4) | ||||||
|  |  | ||||||
|   int64_t Nloop=20; |   int64_t Nloop=2000; | ||||||
|  |  | ||||||
|   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); |   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
|   std::vector<int> mpi_layout  = GridDefaultMpi(); |   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
| @@ -51,7 +53,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
| @@ -83,7 +85,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
| @@ -114,7 +116,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
| @@ -145,7 +147,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|      |      | ||||||
|     std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |     std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|     int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |     int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
| @@ -165,10 +167,87 @@ int main (int argc, char ** argv) | |||||||
|     double time = (stop-start)/Nloop*1000.0; |     double time = (stop-start)/Nloop*1000.0; | ||||||
|      |      | ||||||
|     double bytes=3*vol*Nc*Nc*sizeof(Complex); |     double bytes=3*vol*Nc*Nc*sizeof(Complex); | ||||||
|       double flops=Nc*Nc*(8+8+8)*vol; |     double flops=Nc*Nc*(6+8+8)*vol; | ||||||
|     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; |     std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||||
|      |      | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|  |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|  |  | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|  |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |  | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|  |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|  |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|  |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |  | ||||||
|  |       for(int mu=0;mu<4;mu++){ | ||||||
|  | 	      double start=usecond(); | ||||||
|  | 	      for(int64_t i=0;i<Nloop;i++){ | ||||||
|  | 	        z = PeriodicBC::CovShiftForward(x,mu,y); | ||||||
|  | 	    } | ||||||
|  | 	    double stop=usecond(); | ||||||
|  | 	    double time = (stop-start)/Nloop*1000.0; | ||||||
|  | 	 | ||||||
|  | 	 | ||||||
|  | 	    double bytes=3*vol*Nc*Nc*sizeof(Complex); | ||||||
|  | 	    double flops=Nc*Nc*(6+8+8)*vol; | ||||||
|  | 	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||||
|  |       } | ||||||
|  |   } | ||||||
|  | #if 1 | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|  |   for(int lat=LMIN;lat<=LMAX;lat+=LINC){ | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|  |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |  | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|  |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|  |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|  |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |       LatticeColourMatrix tmp(&Grid); | ||||||
|  |  | ||||||
|  |       for(int mu=0;mu<4;mu++){ | ||||||
|  | 	double tshift=0; | ||||||
|  | 	double tmult =0; | ||||||
|  |  | ||||||
|  | 	double start=usecond(); | ||||||
|  | 	for(int64_t i=0;i<Nloop;i++){ | ||||||
|  | 	  tshift-=usecond(); | ||||||
|  | 	  tmp = Cshift(y,mu,-1); | ||||||
|  | 	  tshift+=usecond(); | ||||||
|  | 	  tmult-=usecond(); | ||||||
|  | 	  z   = x*tmp; | ||||||
|  | 	  tmult+=usecond(); | ||||||
|  | 	} | ||||||
|  | 	double stop=usecond(); | ||||||
|  | 	double time = (stop-start)/Nloop; | ||||||
|  | 	tshift = tshift/Nloop; | ||||||
|  | 	tmult  = tmult /Nloop; | ||||||
|  | 	 | ||||||
|  | 	double bytes=3*vol*Nc*Nc*sizeof(Complex); | ||||||
|  | 	double flops=Nc*Nc*(6+8+8)*vol; | ||||||
|  | 	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl; | ||||||
|  | 	time = time * 1000; // convert to NS for GB/s | ||||||
|  | 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ | |||||||
|  |  | ||||||
|     Source file: ./benchmarks/Benchmark_wilson.cc |     Source file: ./benchmarks/Benchmark_wilson.cc | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2018 | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
| @@ -32,6 +32,9 @@ using namespace std; | |||||||
| using namespace Grid; | using namespace Grid; | ||||||
| using namespace Grid::QCD; | using namespace Grid::QCD; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #include "Grid/util/Profiling.h" | ||||||
|  |  | ||||||
| template<class d> | template<class d> | ||||||
| struct scal { | struct scal { | ||||||
|   d internal; |   d internal; | ||||||
| @@ -45,6 +48,7 @@ struct scal { | |||||||
|   }; |   }; | ||||||
|  |  | ||||||
| bool overlapComms = false; | bool overlapComms = false; | ||||||
|  | bool perfProfiling = false; | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) | int main (int argc, char ** argv) | ||||||
| { | { | ||||||
| @@ -53,6 +57,12 @@ int main (int argc, char ** argv) | |||||||
|   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ |   if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ | ||||||
|     overlapComms = true; |     overlapComms = true; | ||||||
|   } |   } | ||||||
|  |   if( GridCmdOptionExists(argv,argv+argc,"--perf") ){ | ||||||
|  |     perfProfiling = true; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::vector<int> latt_size   = GridDefaultLatt(); |   std::vector<int> latt_size   = GridDefaultLatt(); | ||||||
|   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); |   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
| @@ -61,10 +71,15 @@ int main (int argc, char ** argv) | |||||||
|   GridRedBlackCartesian     RBGrid(&Grid); |   GridRedBlackCartesian     RBGrid(&Grid); | ||||||
|  |  | ||||||
|   int threads = GridThread::GetThreads(); |   int threads = GridThread::GetThreads(); | ||||||
|   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; |  | ||||||
|  |   GridLogLayout(); | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl; |   std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl; |   std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl; |   std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::vector<int> seeds({1,2,3,4}); |   std::vector<int> seeds({1,2,3,4}); | ||||||
|   GridParallelRNG          pRNG(&Grid); |   GridParallelRNG          pRNG(&Grid); | ||||||
| @@ -134,9 +149,25 @@ int main (int argc, char ** argv) | |||||||
|     Dw.Dhop(src,result,0); |     Dw.Dhop(src,result,0); | ||||||
|   } |   } | ||||||
|   double t1=usecond(); |   double t1=usecond(); | ||||||
|   double flops=1344*volume*ncall; |   double flops=single_site_flops*volume*ncall; | ||||||
|  |    | ||||||
|  |   if (perfProfiling){ | ||||||
|  |   std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl; | ||||||
|  |      | ||||||
|  |   System::profile("kernel", [&]() { | ||||||
|  |     for(int i=0;i<ncall;i++){ | ||||||
|  |       Dw.Dhop(src,result,0); | ||||||
|  |     } | ||||||
|  |   }); | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl; | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "Called Dw"<<std::endl; |   std::cout<<GridLogMessage << "Called Dw"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl; | ||||||
|   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; |   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; |   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; |   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||||
|   | |||||||
| @@ -62,6 +62,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; |   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl; | ||||||
|   std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl; |   std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl; | ||||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; |   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl; | ||||||
|   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; |   if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
| @@ -69,13 +70,15 @@ int main (int argc, char ** argv) | |||||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; |   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "* OpenMP threads       : "<< GridThread::GetThreads() <<std::endl; | ||||||
|  |   std::cout << GridLogMessage << "* MPI tasks            : "<< GridCmdVectorIntToString(mpi_layout) << std::endl; | ||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "============================================================================="<< std::endl; |   std::cout<<GridLogMessage << "================================================================================================="<< std::endl; | ||||||
|   std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl; |   std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl; | ||||||
|   std::cout<<GridLogMessage << "============================================================================="<< std::endl; |   std::cout<<GridLogMessage << "================================================================================================="<< std::endl; | ||||||
|   std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl; |   std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl; | ||||||
|   std::cout<<GridLogMessage << "============================================================================="<< std::endl; |   std::cout<<GridLogMessage << "================================================================================================="<< std::endl; | ||||||
|  |  | ||||||
|   int Lmax = 32; |   int Lmax = 32; | ||||||
|   int dmin = 0; |   int dmin = 0; | ||||||
| @@ -98,12 +101,19 @@ int main (int argc, char ** argv) | |||||||
| 	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); | 	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); | ||||||
| 	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu); | 	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu); | ||||||
| 	  LatticeFermion        src(&Grid); random(pRNG,src); | 	  LatticeFermion        src(&Grid); random(pRNG,src); | ||||||
|  | 	  LatticeFermion    src_o(&RBGrid); pickCheckerboard(Odd,src_o,src); | ||||||
| 	  LatticeFermion     result(&Grid); result=zero; | 	  LatticeFermion     result(&Grid); result=zero; | ||||||
|  | 	  LatticeFermion result_e(&RBGrid); result_e=zero; | ||||||
|  |  | ||||||
| 	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>()); | 	  double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>()); | ||||||
|  |  | ||||||
| 	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); | 	  WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); | ||||||
|  |  | ||||||
|  |     // Full operator       | ||||||
|  | 	  bench_wilson(src,result,Dw,volume,DaggerNo); | ||||||
|  | 	  bench_wilson(src,result,Dw,volume,DaggerYes); | ||||||
|  |     std::cout << "\t"; | ||||||
|  |     // EO | ||||||
| 	  bench_wilson(src,result,Dw,volume,DaggerNo); | 	  bench_wilson(src,result,Dw,volume,DaggerNo); | ||||||
| 	  bench_wilson(src,result,Dw,volume,DaggerYes); | 	  bench_wilson(src,result,Dw,volume,DaggerYes); | ||||||
| 	  std::cout << std::endl; | 	  std::cout << std::endl; | ||||||
| @@ -122,9 +132,26 @@ void bench_wilson ( | |||||||
| 		   int const           dag ) | 		   int const           dag ) | ||||||
| { | { | ||||||
|   int ncall    = 1000; |   int ncall    = 1000; | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|   double t0    = usecond(); |   double t0    = usecond(); | ||||||
|   for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); } |   for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); } | ||||||
|   double t1    = usecond(); |   double t1    = usecond(); | ||||||
|   double flops = 1344 * volume * ncall; |   double flops = single_site_flops * volume * ncall; | ||||||
|  |   std::cout << flops/(t1-t0) << "\t\t"; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void bench_wilson_eo ( | ||||||
|  | 		   LatticeFermion &    src, | ||||||
|  | 		   LatticeFermion & result, | ||||||
|  | 		   WilsonFermionR &     Dw, | ||||||
|  | 		   double const     volume, | ||||||
|  | 		   int const           dag ) | ||||||
|  | { | ||||||
|  |   int ncall    = 1000; | ||||||
|  |   long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc); | ||||||
|  |   double t0    = usecond(); | ||||||
|  |   for(int i=0; i<ncall; i++) { Dw.DhopEO(src,result,dag); } | ||||||
|  |   double t1    = usecond(); | ||||||
|  |   double flops = (single_site_flops * volume * ncall)/2.0; | ||||||
|   std::cout << flops/(t1-t0) << "\t\t"; |   std::cout << flops/(t1-t0) << "\t\t"; | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										15
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in | |||||||
|       AVX512) |       AVX512) | ||||||
|         AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) |         AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) | ||||||
|         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; |         SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; | ||||||
|  |       SKL) | ||||||
|  |         AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon]) | ||||||
|  |         SIMD_FLAGS='-march=skylake-avx512';; | ||||||
|       KNC) |       KNC) | ||||||
|         AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner]) |         AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner]) | ||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
| @@ -337,7 +340,7 @@ case ${ac_PRECISION} in | |||||||
| esac | esac | ||||||
|  |  | ||||||
| ######################  Shared memory allocation technique under MPI3 | ######################  Shared memory allocation technique under MPI3 | ||||||
| AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs], | AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone], | ||||||
|               [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) |               [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) | ||||||
|  |  | ||||||
| case ${ac_SHM} in | case ${ac_SHM} in | ||||||
| @@ -346,6 +349,14 @@ case ${ac_SHM} in | |||||||
|      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] ) |      AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] ) | ||||||
|      ;; |      ;; | ||||||
|  |  | ||||||
|  |      shmget) | ||||||
|  |      AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) | ||||||
|  |      ;; | ||||||
|  |  | ||||||
|  |      shmnone) | ||||||
|  |      AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] ) | ||||||
|  |      ;; | ||||||
|  |  | ||||||
|      hugetlbfs) |      hugetlbfs) | ||||||
|      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] ) |      AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] ) | ||||||
|      ;; |      ;; | ||||||
| @@ -359,7 +370,7 @@ esac | |||||||
| AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path], | AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path], | ||||||
|               [Select SHM mmap base path for hugetlbfs])], |               [Select SHM mmap base path for hugetlbfs])], | ||||||
| 	      [ac_SHMPATH=${enable_shmpath}], | 	      [ac_SHMPATH=${enable_shmpath}], | ||||||
| 	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/]) | 	      [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/]) | ||||||
| AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) | AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) | ||||||
|  |  | ||||||
| ############### communication type selection | ############### communication type selection | ||||||
|   | |||||||
| @@ -43,12 +43,6 @@ using namespace Hadrons; | |||||||
| Application::Application(void) | Application::Application(void) | ||||||
| { | { | ||||||
|     initLogger(); |     initLogger(); | ||||||
|     LOG(Message) << "Modules available:" << std::endl; |  | ||||||
|     auto list = ModuleFactory::getInstance().getBuilderList(); |  | ||||||
|     for (auto &m: list) |  | ||||||
|     { |  | ||||||
|         LOG(Message) << "  " << m << std::endl; |  | ||||||
|     } |  | ||||||
|     auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim); |     auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim); | ||||||
|     locVol_ = 1; |     locVol_ = 1; | ||||||
|     for (unsigned int d = 0; d < dim.size(); ++d) |     for (unsigned int d = 0; d < dim.size(); ++d) | ||||||
|   | |||||||
| @@ -57,6 +57,7 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| #include <Grid/Hadrons/Modules/MAction/DWF.hpp> | #include <Grid/Hadrons/Modules/MAction/DWF.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MAction/Wilson.hpp> | #include <Grid/Hadrons/Modules/MAction/Wilson.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp> | #include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalarSUN/Div.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp> | #include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp> | #include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp> | #include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp> | ||||||
|   | |||||||
| @@ -57,7 +57,7 @@ std::vector<std::string> TFundtoHirep<Rep>::getOutput(void) | |||||||
| template <typename Rep> | template <typename Rep> | ||||||
| void TFundtoHirep<Rep>::setup(void) | void TFundtoHirep<Rep>::setup(void) | ||||||
| { | { | ||||||
|     env().template registerLattice<typename Rep::LatticeField>(getName()); |     envCreateLat(typename Rep::LatticeField, getName()); | ||||||
| } | } | ||||||
|  |  | ||||||
| // execution /////////////////////////////////////////////////////////////////// | // execution /////////////////////////////////////////////////////////////////// | ||||||
| @@ -70,6 +70,6 @@ void TFundtoHirep<Rep>::execute(void) | |||||||
|     Rep TargetRepresentation(U._grid); |     Rep TargetRepresentation(U._grid); | ||||||
|     TargetRepresentation.update_representation(U); |     TargetRepresentation.update_representation(U); | ||||||
|  |  | ||||||
|    typename Rep::LatticeField &URep = *env().template createLattice<typename Rep::LatticeField>(getName()); |     auto &URep = envGet(typename Rep::LatticeField, getName()); | ||||||
|     URep = TargetRepresentation.U; |     URep = TargetRepresentation.U; | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										166
									
								
								extras/Hadrons/Modules/MScalarSUN/Div.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										166
									
								
								extras/Hadrons/Modules/MScalarSUN/Div.hpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,166 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  | Source file: extras/Hadrons/Modules/MScalarSUN/Div.hpp | ||||||
|  |  | ||||||
|  | Copyright (C) 2015-2018 | ||||||
|  |  | ||||||
|  | Author: Antonin Portelli <antonin.portelli@me.com> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef Hadrons_MScalarSUN_Div_hpp_ | ||||||
|  | #define Hadrons_MScalarSUN_Div_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Hadrons/Global.hpp> | ||||||
|  | #include <Grid/Hadrons/Module.hpp> | ||||||
|  | #include <Grid/Hadrons/ModuleFactory.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                         Div                                 * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | BEGIN_MODULE_NAMESPACE(MScalarSUN) | ||||||
|  |  | ||||||
|  | // Serializable parameter set of the divergence module. | ||||||
|  | class DivPar: Serializable | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     // finite-difference scheme selector: undef, forward = 1, backward = 2, central = 3 | ||||||
|  |     GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3); | ||||||
|  |     // op:     names of the input fields (setup() requires one entry per dimension) | ||||||
|  |     // type:   finite-difference scheme used by execute() | ||||||
|  |     // output: result file name; execute() writes nothing when it is empty | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(DivPar, | ||||||
|  |                                     std::vector<std::string>, op, | ||||||
|  |                                     DiffType,                 type, | ||||||
|  |                                     std::string,              output); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // Module summing finite differences of nd complex fields into one lattice. | ||||||
|  | template <typename SImpl> | ||||||
|  | class TDiv: public Module<DivPar> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     using Field        = typename SImpl::Field; | ||||||
|  |     using ComplexField = typename SImpl::ComplexField; | ||||||
|  |     // record serialised by execute() when an output file is requested | ||||||
|  |     class Result: Serializable | ||||||
|  |     { | ||||||
|  |     public: | ||||||
|  |         GRID_SERIALIZABLE_CLASS_MEMBERS(Result, | ||||||
|  |                                         DivPar::DiffType, type, | ||||||
|  |                                         Complex,          value); | ||||||
|  |     }; | ||||||
|  |     // construction / destruction | ||||||
|  |     TDiv(const std::string name); | ||||||
|  |     virtual ~TDiv() = default; | ||||||
|  |     // names of the objects read and produced by the module | ||||||
|  |     virtual std::vector<std::string> getInput(); | ||||||
|  |     virtual std::vector<std::string> getOutput(); | ||||||
|  |     // parameter check and creation of the output lattice | ||||||
|  |     virtual void setup(); | ||||||
|  |     // computation of the divergence | ||||||
|  |     virtual void execute(); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // Register TDiv for ScalarNxNAdjImplR<N>, N = 2..6, as DivSU2..DivSU6 in MScalarSUN. | ||||||
|  | MODULE_REGISTER_NS(DivSU2, TDiv<ScalarNxNAdjImplR<2>>, MScalarSUN); | ||||||
|  | MODULE_REGISTER_NS(DivSU3, TDiv<ScalarNxNAdjImplR<3>>, MScalarSUN); | ||||||
|  | MODULE_REGISTER_NS(DivSU4, TDiv<ScalarNxNAdjImplR<4>>, MScalarSUN); | ||||||
|  | MODULE_REGISTER_NS(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN); | ||||||
|  | MODULE_REGISTER_NS(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN); | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                 TDiv implementation                             * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | // constructor ///////////////////////////////////////////////////////////////// | ||||||
|  | // Forwards the module instance name to the Module<DivPar> base class. | ||||||
|  | template <typename SImpl> | ||||||
|  | TDiv<SImpl>::TDiv(const std::string name) | ||||||
|  | : Module<DivPar>(name) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | // dependencies/products /////////////////////////////////////////////////////// | ||||||
|  | // The inputs are the fields named in the 'op' parameter. | ||||||
|  | template <typename SImpl> | ||||||
|  | std::vector<std::string> TDiv<SImpl>::getInput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> in = par().op; | ||||||
|  |  | ||||||
|  |     return in; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // The only output is the object carrying the module's own name. | ||||||
|  | template <typename SImpl> | ||||||
|  | std::vector<std::string> TDiv<SImpl>::getOutput(void) | ||||||
|  | { | ||||||
|  |     return {getName()}; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // setup /////////////////////////////////////////////////////////////////////// | ||||||
|  | // Checks that one input field is given per dimension, then creates the | ||||||
|  | // ComplexField output under the module name. | ||||||
|  | template <typename SImpl> | ||||||
|  | void TDiv<SImpl>::setup(void) | ||||||
|  | { | ||||||
|  |     const bool sizeMismatch = (par().op.size() != env().getNd()); | ||||||
|  |  | ||||||
|  |     if (sizeMismatch) | ||||||
|  |     { | ||||||
|  |         HADRON_ERROR(Size, "the number of components differs from number of dimensions"); | ||||||
|  |     } | ||||||
|  |     envCreateLat(ComplexField, getName()); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | // Accumulates, over all directions mu, the finite difference of op[mu] along | ||||||
|  | // mu into the output lattice; optionally writes the lattice sum to a file. | ||||||
|  | template <typename SImpl> | ||||||
|  | void TDiv<SImpl>::execute(void) | ||||||
|  | { | ||||||
|  |     const auto nd = env().getNd(); | ||||||
|  |  | ||||||
|  |     // assemble the component list first so that the whole message goes through | ||||||
|  |     // the logger in a single statement instead of partly through raw std::cout | ||||||
|  |     std::string opList; | ||||||
|  |     for (unsigned int mu = 0; mu < nd; ++mu) | ||||||
|  |     { | ||||||
|  |         opList += par().op[mu] + ((mu == nd - 1) ? "]" : ", "); | ||||||
|  |     } | ||||||
|  |     LOG(Message) << "Computing the " << par().type << " divergence of [" | ||||||
|  |                  << opList << std::endl; | ||||||
|  |  | ||||||
|  |     auto &div = envGet(ComplexField, getName()); | ||||||
|  |     div = zero; | ||||||
|  |     for (unsigned int mu = 0; mu < nd; ++mu) | ||||||
|  |     { | ||||||
|  |         auto &op = envGet(ComplexField, par().op[mu]); | ||||||
|  |         switch(par().type) | ||||||
|  |         { | ||||||
|  |             case DivPar::DiffType::backward: | ||||||
|  |                 div += op - Cshift(op, mu, -1); | ||||||
|  |                 break; | ||||||
|  |             case DivPar::DiffType::forward: | ||||||
|  |                 div += Cshift(op, mu, 1) - op; | ||||||
|  |                 break; | ||||||
|  |             case DivPar::DiffType::central: | ||||||
|  |                 div += 0.5*(Cshift(op, mu, 1) - Cshift(op, mu, -1)); | ||||||
|  |                 break; | ||||||
|  |             default: | ||||||
|  |                 // 'undef' (or any other value) used to leave div at zero silently | ||||||
|  |                 HADRON_ERROR(Argument, "derivative type invalid"); | ||||||
|  |                 break; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     // the summed value is only written when an output file name was given | ||||||
|  |     if (!par().output.empty()) | ||||||
|  |     { | ||||||
|  |         Result       r; | ||||||
|  |         ResultWriter writer(RESULT_FILE_NAME(par().output)); | ||||||
|  |  | ||||||
|  |         r.type  = par().type; | ||||||
|  |         r.value = TensorRemove(sum(div)); | ||||||
|  |         write(writer, "div", r); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
|  | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // Hadrons_MScalarSUN_Div_hpp_ | ||||||
| @@ -44,6 +44,7 @@ modules_hpp =\ | |||||||
|   Modules/MAction/DWF.hpp \ |   Modules/MAction/DWF.hpp \ | ||||||
|   Modules/MAction/Wilson.hpp \ |   Modules/MAction/Wilson.hpp \ | ||||||
|   Modules/MAction/WilsonClover.hpp \ |   Modules/MAction/WilsonClover.hpp \ | ||||||
|  |   Modules/MScalarSUN/Div.hpp \ | ||||||
|   Modules/MScalarSUN/TrMag.hpp \ |   Modules/MScalarSUN/TrMag.hpp \ | ||||||
|   Modules/MScalarSUN/TwoPoint.hpp \ |   Modules/MScalarSUN/TwoPoint.hpp \ | ||||||
|   Modules/MScalarSUN/TrPhi.hpp \ |   Modules/MScalarSUN/TrPhi.hpp \ | ||||||
|   | |||||||
| @@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/algorithms/approx/MultiShiftFunction.h> | #include <Grid/algorithms/approx/MultiShiftFunction.h> | ||||||
| #include <Grid/algorithms/approx/Forecast.h> | #include <Grid/algorithms/approx/Forecast.h> | ||||||
|  |  | ||||||
|  | #include <Grid/algorithms/iterative/Deflation.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateGradient.h> | #include <Grid/algorithms/iterative/ConjugateGradient.h> | ||||||
| #include <Grid/algorithms/iterative/ConjugateResidual.h> | #include <Grid/algorithms/iterative/ConjugateResidual.h> | ||||||
| #include <Grid/algorithms/iterative/NormalEquations.h> | #include <Grid/algorithms/iterative/NormalEquations.h> | ||||||
|   | |||||||
| @@ -309,36 +309,59 @@ namespace Grid { | |||||||
|       class SchurStaggeredOperator :  public SchurOperatorBase<Field> { |       class SchurStaggeredOperator :  public SchurOperatorBase<Field> { | ||||||
|     protected: |     protected: | ||||||
|       Matrix &_Mat; |       Matrix &_Mat; | ||||||
|  |       Field tmp; | ||||||
|  |       RealD mass; | ||||||
|  |       double tMpc; | ||||||
|  |       double tIP; | ||||||
|  |       double tMeo; | ||||||
|  |       double taxpby_norm; | ||||||
|  |       uint64_t ncall; | ||||||
|     public: |     public: | ||||||
|       SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){}; |       void Report(void) | ||||||
|  |       { | ||||||
|  | 	std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << " HermOpAndNorm.IP  "<< tIP /ncall<<" usec "<<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << " Mpc.MeoMoe        "<< tMeo/ncall<<" usec "<<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << " Mpc.axpby_norm    "<< taxpby_norm/ncall<<" usec "<<std::endl; | ||||||
|  |       } | ||||||
|  |       SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())  | ||||||
|  |       {  | ||||||
|  | 	assert( _Mat.isTrivialEE() ); | ||||||
|  | 	mass = _Mat.Mass(); | ||||||
|  | 	tMpc=0; | ||||||
|  | 	tIP =0; | ||||||
|  |         tMeo=0; | ||||||
|  |         taxpby_norm=0; | ||||||
|  | 	ncall=0; | ||||||
|  |       } | ||||||
|       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ |       virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ | ||||||
| 	GridLogIterative.TimingMode(1); | 	ncall++; | ||||||
| 	std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl; | 	tMpc-=usecond(); | ||||||
| 	n2 = Mpc(in,out); | 	n2 = Mpc(in,out); | ||||||
| 	std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl; | 	tMpc+=usecond(); | ||||||
|  | 	tIP-=usecond(); | ||||||
| 	ComplexD dot= innerProduct(in,out); | 	ComplexD dot= innerProduct(in,out); | ||||||
| 	std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl; | 	tIP+=usecond(); | ||||||
| 	n1 = real(dot); | 	n1 = real(dot); | ||||||
|       } |       } | ||||||
|       virtual void HermOp(const Field &in, Field &out){ |       virtual void HermOp(const Field &in, Field &out){ | ||||||
| 	std::cout << GridLogIterative << " HermOp "<<std::endl; | 	ncall++; | ||||||
| 	Mpc(in,out); | 	tMpc-=usecond(); | ||||||
|  | 	_Mat.Meooe(in,out); | ||||||
|  | 	_Mat.Meooe(out,tmp); | ||||||
|  | 	tMpc+=usecond(); | ||||||
|  | 	taxpby_norm-=usecond(); | ||||||
|  | 	axpby(out,-1.0,mass*mass,tmp,in); | ||||||
|  | 	taxpby_norm+=usecond(); | ||||||
|       } |       } | ||||||
|       virtual  RealD Mpc      (const Field &in, Field &out) { |       virtual  RealD Mpc      (const Field &in, Field &out) { | ||||||
| 	Field tmp(in._grid); | 	tMeo-=usecond(); | ||||||
| 	Field tmp2(in._grid); |  | ||||||
|  |  | ||||||
| 	std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl; |  | ||||||
| 	_Mat.Mooee(in,out); |  | ||||||
| 	_Mat.Mooee(out,tmp); |  | ||||||
| 	std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl; |  | ||||||
|  |  | ||||||
| 	_Mat.Meooe(in,out); | 	_Mat.Meooe(in,out); | ||||||
| 	_Mat.Meooe(out,tmp2); | 	_Mat.Meooe(out,tmp); | ||||||
| 	std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl; | 	tMeo+=usecond(); | ||||||
|  | 	taxpby_norm-=usecond(); | ||||||
| 	RealD nn=axpy_norm(out,-1.0,tmp2,tmp); | 	RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in); | ||||||
| 	std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl; | 	taxpby_norm+=usecond(); | ||||||
| 	return nn; | 	return nn; | ||||||
|       } |       } | ||||||
|       virtual  RealD MpcDag   (const Field &in, Field &out){ |       virtual  RealD MpcDag   (const Field &in, Field &out){ | ||||||
|   | |||||||
| @@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|  |  | ||||||
|   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { |   void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { | ||||||
|  |  | ||||||
|  |  | ||||||
|     psi.checkerboard = src.checkerboard; |     psi.checkerboard = src.checkerboard; | ||||||
|     conformable(psi, src); |     conformable(psi, src); | ||||||
|  |  | ||||||
| @@ -70,7 +71,6 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|      |      | ||||||
|     Linop.HermOpAndNorm(psi, mmp, d, b); |     Linop.HermOpAndNorm(psi, mmp, d, b); | ||||||
|  |  | ||||||
|  |  | ||||||
|     r = src - mmp; |     r = src - mmp; | ||||||
|     p = r; |     p = r; | ||||||
|  |  | ||||||
| @@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; |               << "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl; | ||||||
|  |  | ||||||
|     GridStopWatch LinalgTimer; |     GridStopWatch LinalgTimer; | ||||||
|  |     GridStopWatch InnerTimer; | ||||||
|  |     GridStopWatch AxpyNormTimer; | ||||||
|  |     GridStopWatch LinearCombTimer; | ||||||
|     GridStopWatch MatrixTimer; |     GridStopWatch MatrixTimer; | ||||||
|     GridStopWatch SolverTimer; |     GridStopWatch SolverTimer; | ||||||
|  |  | ||||||
|     SolverTimer.Start(); |     SolverTimer.Start(); | ||||||
|     int k; |     int k; | ||||||
|     for (k = 1; k <= MaxIterations; k++) { |     for (k = 1; k <= MaxIterations*1000; k++) { | ||||||
|       c = cp; |       c = cp; | ||||||
|  |  | ||||||
|       MatrixTimer.Start(); |       MatrixTimer.Start(); | ||||||
|       Linop.HermOpAndNorm(p, mmp, d, qq); |       Linop.HermOp(p, mmp); | ||||||
|       MatrixTimer.Stop(); |       MatrixTimer.Stop(); | ||||||
|  |  | ||||||
|       LinalgTimer.Start(); |       LinalgTimer.Start(); | ||||||
|       //  RealD    qqck = norm2(mmp); |  | ||||||
|       //  ComplexD dck  = innerProduct(p,mmp); |  | ||||||
|  |  | ||||||
|  |       InnerTimer.Start(); | ||||||
|  |       ComplexD dc  = innerProduct(p,mmp); | ||||||
|  |       InnerTimer.Stop(); | ||||||
|  |       d = dc.real(); | ||||||
|       a = c / d; |       a = c / d; | ||||||
|       b_pred = a * (a * qq - d) / c; |  | ||||||
|  |  | ||||||
|  |       AxpyNormTimer.Start(); | ||||||
|       cp = axpy_norm(r, -a, mmp, r); |       cp = axpy_norm(r, -a, mmp, r); | ||||||
|  |       AxpyNormTimer.Stop(); | ||||||
|       b = cp / c; |       b = cp / c; | ||||||
|  |  | ||||||
|       // Fuse these loops ; should be really easy |       LinearCombTimer.Start(); | ||||||
|       psi = a * p + psi; |       parallel_for(int ss=0;ss<src._grid->oSites();ss++){ | ||||||
|       p = p * b + r; | 	vstream(psi[ss], a      *  p[ss] + psi[ss]); | ||||||
|  | 	vstream(p  [ss], b      *  p[ss] + r[ss]); | ||||||
|  |       } | ||||||
|  |       LinearCombTimer.Stop(); | ||||||
|       LinalgTimer.Stop(); |       LinalgTimer.Stop(); | ||||||
|  |  | ||||||
|       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k |       std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k | ||||||
|                 << " residual " << cp << " target " << rsq << std::endl; |                 << " residual " << cp << " target " << rsq << std::endl; | ||||||
|       std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl; |  | ||||||
|       std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl; |  | ||||||
|  |  | ||||||
|       // Stopping condition |       // Stopping condition | ||||||
|       if (cp <= rsq) { |       if (cp <= rsq) { | ||||||
| @@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
| 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl; | ||||||
| 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl; | ||||||
| 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | 	std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << "\tInner      " << InnerTimer.Elapsed() <<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl; | ||||||
|  | 	std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; | ||||||
|  |  | ||||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); |         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -43,6 +43,7 @@ namespace Grid { | |||||||
| public:                                                 | public:                                                 | ||||||
|     RealD   Tolerance; |     RealD   Tolerance; | ||||||
|     Integer MaxIterations; |     Integer MaxIterations; | ||||||
|  |     Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion | ||||||
|     int verbose; |     int verbose; | ||||||
|     MultiShiftFunction shifts; |     MultiShiftFunction shifts; | ||||||
|  |  | ||||||
| @@ -164,6 +165,15 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|     axpby(psi[s],0.,-bs[s]*alpha[s],src,src); |     axpby(psi[s],0.,-bs[s]*alpha[s],src,src); | ||||||
|   } |   } | ||||||
|   |   | ||||||
|  |   /////////////////////////////////////// | ||||||
|  |   // Timers | ||||||
|  |   /////////////////////////////////////// | ||||||
|  |   GridStopWatch AXPYTimer; | ||||||
|  |   GridStopWatch ShiftTimer; | ||||||
|  |   GridStopWatch QRTimer; | ||||||
|  |   GridStopWatch MatrixTimer; | ||||||
|  |   GridStopWatch SolverTimer; | ||||||
|  |   SolverTimer.Start(); | ||||||
|    |    | ||||||
|   // Iteration loop |   // Iteration loop | ||||||
|   int k; |   int k; | ||||||
| @@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|   for (k=1;k<=MaxIterations;k++){ |   for (k=1;k<=MaxIterations;k++){ | ||||||
|      |      | ||||||
|     a = c /cp; |     a = c /cp; | ||||||
|  |     AXPYTimer.Start(); | ||||||
|     axpy(p,a,p,r); |     axpy(p,a,p,r); | ||||||
|  |     AXPYTimer.Stop(); | ||||||
|      |      | ||||||
|     // Note to self - direction ps is iterated separately |     // Note to self - direction ps is iterated separately | ||||||
|     // for each shift. Does not appear to have any scope |     // for each shift. Does not appear to have any scope | ||||||
| @@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|     // However SAME r is used. Could load "r" and update |     // However SAME r is used. Could load "r" and update | ||||||
|     // ALL ps[s]. 2/3 Bandwidth saving |     // ALL ps[s]. 2/3 Bandwidth saving | ||||||
|     // New Kernel: Load r, vector of coeffs, vector of pointers ps |     // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||||
|  |     AXPYTimer.Start(); | ||||||
|     for(int s=0;s<nshift;s++){ |     for(int s=0;s<nshift;s++){ | ||||||
|       if ( ! converged[s] ) {  |       if ( ! converged[s] ) {  | ||||||
| 	if (s==0){ | 	if (s==0){ | ||||||
| @@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     AXPYTimer.Stop(); | ||||||
|      |      | ||||||
|     cp=c; |     cp=c; | ||||||
|  |     MatrixTimer.Start();   | ||||||
|  |     //Linop.HermOpAndNorm(p,mmp,d,qq); // d is used | ||||||
|  |     // The below is faster on KNL | ||||||
|  |     Linop.HermOp(p,mmp);  | ||||||
|  |     d=real(innerProduct(p,mmp)); | ||||||
|      |      | ||||||
|     Linop.HermOpAndNorm(p,mmp,d,qq); |     MatrixTimer.Stop();   | ||||||
|  |  | ||||||
|  |     AXPYTimer.Start(); | ||||||
|     axpy(mmp,mass[0],p,mmp); |     axpy(mmp,mass[0],p,mmp); | ||||||
|  |     AXPYTimer.Stop(); | ||||||
|     RealD rn = norm2(p); |     RealD rn = norm2(p); | ||||||
|     d += rn*mass[0]; |     d += rn*mass[0]; | ||||||
|      |      | ||||||
|     bp=b; |     bp=b; | ||||||
|     b=-cp/d; |     b=-cp/d; | ||||||
|      |      | ||||||
|  |     AXPYTimer.Start(); | ||||||
|     c=axpy_norm(r,b,mmp,r); |     c=axpy_norm(r,b,mmp,r); | ||||||
|  |     AXPYTimer.Stop(); | ||||||
|  |  | ||||||
|     // Toggle the recurrence history |     // Toggle the recurrence history | ||||||
|     bs[0] = b; |     bs[0] = b; | ||||||
|     iz = 1-iz; |     iz = 1-iz; | ||||||
|  |     ShiftTimer.Start(); | ||||||
|     for(int s=1;s<nshift;s++){ |     for(int s=1;s<nshift;s++){ | ||||||
|       if((!converged[s])){ |       if((!converged[s])){ | ||||||
| 	RealD z0 = z[s][1-iz]; | 	RealD z0 = z[s][1-iz]; | ||||||
| @@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
| 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     ShiftTimer.Stop(); | ||||||
|      |      | ||||||
|     for(int s=0;s<nshift;s++){ |     for(int s=0;s<nshift;s++){ | ||||||
|       int ss = s; |       int ss = s; | ||||||
| @@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|      |      | ||||||
|     if ( all_converged ){ |     if ( all_converged ){ | ||||||
|  |  | ||||||
|  |     SolverTimer.Stop(); | ||||||
|  |  | ||||||
|  |  | ||||||
|       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl; |       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl; | ||||||
|       std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; |       std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; | ||||||
|        |        | ||||||
| @@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
| 	RealD cn = norm2(src); | 	RealD cn = norm2(src); | ||||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage << "Time Breakdown "<<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tAXPY    " << AXPYTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tMarix    " << MatrixTimer.Elapsed()     <<std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tShift    " << ShiftTimer.Elapsed()     <<std::endl; | ||||||
|  |  | ||||||
|  |       IterationsToComplete = k;	 | ||||||
|  |  | ||||||
|       return; |       return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     | ||||||
|   } |   } | ||||||
|   // ugly hack |   // ugly hack | ||||||
|   std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; |   std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; | ||||||
|   | |||||||
							
								
								
									
										101
									
								
								lib/algorithms/iterative/Deflation.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								lib/algorithms/iterative/Deflation.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,101 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/algorithms/iterative/Deflation.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #ifndef GRID_DEFLATION_H | ||||||
|  | #define GRID_DEFLATION_H | ||||||
|  |  | ||||||
|  | namespace Grid {  | ||||||
|  |  | ||||||
|  | // Initial-guess functor that ignores the source and sets the guess to zero. | ||||||
|  | struct ZeroGuesser { | ||||||
|  |   template<class Field> | ||||||
|  |   void operator()(const Field &src,Field &guess) | ||||||
|  |   { | ||||||
|  |     guess = Zero(); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  | // Initial-guess functor that copies the source into the guess. | ||||||
|  | struct SourceGuesser { | ||||||
|  |   template<class Field> | ||||||
|  |   void operator()(const Field &src,Field &guess) | ||||||
|  |   { | ||||||
|  |     guess = src; | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | //////////////////////////////// | ||||||
|  | // Fine grid deflation | ||||||
|  | //////////////////////////////// | ||||||
|  | // Initial guess built from eigenpairs held by reference: | ||||||
|  | //   guess = sum_i evec[i] * innerProduct(evec[i],src) / eval[i] | ||||||
|  | // The referenced vectors must outlive the guesser. | ||||||
|  | template<class Field> | ||||||
|  | struct DeflatedGuesser { | ||||||
|  | private: | ||||||
|  |   const std::vector<Field> &evec; | ||||||
|  |   const std::vector<RealD> &eval; | ||||||
|  |  | ||||||
|  | public: | ||||||
|  |  | ||||||
|  |   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {} | ||||||
|  |  | ||||||
|  |   void operator()(const Field &src,Field &guess) { | ||||||
|  |     // validate the eigenpair count before the output is touched | ||||||
|  |     assert(evec.size()==eval.size()); | ||||||
|  |     // signed count, as in LocalCoherenceDeflatedGuesser, to avoid a signed/unsigned comparison | ||||||
|  |     const int N = (int)evec.size(); | ||||||
|  |     guess = zero; | ||||||
|  |     for (int i=0;i<N;i++) { | ||||||
|  |       const Field& tmp = evec[i]; | ||||||
|  |       axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // Initial guess built on a coarse grid: the source is passed through | ||||||
|  | // blockProject, deflated with the coarse eigenpairs, and the result is | ||||||
|  | // passed through blockPromote. All vectors are held by reference and must | ||||||
|  | // outlive the guesser. | ||||||
|  | template<class FineField, class CoarseField> | ||||||
|  | class LocalCoherenceDeflatedGuesser { | ||||||
|  | private: | ||||||
|  |   const std::vector<FineField>   &subspace; | ||||||
|  |   const std::vector<CoarseField> &evec_coarse; | ||||||
|  |   const std::vector<RealD>       &eval_coarse; | ||||||
|  | public: | ||||||
|  |    | ||||||
|  |   LocalCoherenceDeflatedGuesser(const std::vector<FineField>   &_subspace, | ||||||
|  |                                 const std::vector<CoarseField> &_evec_coarse, | ||||||
|  |                                 const std::vector<RealD>       &_eval_coarse) | ||||||
|  |     : subspace(_subspace),  | ||||||
|  |       evec_coarse(_evec_coarse),  | ||||||
|  |       eval_coarse(_eval_coarse)   | ||||||
|  |   { | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void operator()(const FineField &src,FineField &guess) {  | ||||||
|  |     // same consistency check as DeflatedGuesser | ||||||
|  |     assert(evec_coarse.size()==eval_coarse.size()); | ||||||
|  |     // with no eigenvectors there is no coarse grid to read from evec_coarse[0]; | ||||||
|  |     // the deflated guess is then zero, as DeflatedGuesser gives for an empty set | ||||||
|  |     if (evec_coarse.empty()) { | ||||||
|  |       guess = zero; | ||||||
|  |       return; | ||||||
|  |     } | ||||||
|  |     int N = (int)evec_coarse.size(); | ||||||
|  |     CoarseField src_coarse(evec_coarse[0]._grid); | ||||||
|  |     CoarseField guess_coarse(evec_coarse[0]._grid);    guess_coarse = zero; | ||||||
|  |     blockProject(src_coarse,src,subspace);     | ||||||
|  |     for (int i=0;i<N;i++) { | ||||||
|  |       const CoarseField & tmp = evec_coarse[i]; | ||||||
|  |       axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse); | ||||||
|  |     } | ||||||
|  |     blockPromote(guess_coarse,guess,subspace); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
|  | #endif | ||||||
| @@ -57,7 +57,8 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i | |||||||
|        |        | ||||||
|   parallel_region |   parallel_region | ||||||
|   { |   { | ||||||
|     std::vector < vobj > B(Nm); // Thread private |  | ||||||
|  |     std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private | ||||||
|         |         | ||||||
|     parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ |     parallel_for_internal(int ss=0;ss < grid->oSites();ss++){ | ||||||
|       for(int j=j0; j<j1; ++j) B[j]=0.; |       for(int j=j0; j<j1; ++j) B[j]=0.; | ||||||
| @@ -149,19 +150,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo | |||||||
|   basisReorderInPlace(_v,sort_vals,idx); |   basisReorderInPlace(_v,sort_vals,idx); | ||||||
| } | } | ||||||
|  |  | ||||||
| // PAB: faster to compute the inner products first then fuse loops. |  | ||||||
| // If performance critical can improve. |  | ||||||
| template<class Field> |  | ||||||
| void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) { |  | ||||||
|   result = zero; |  | ||||||
|   assert(_v.size()==eval.size()); |  | ||||||
|   int N = (int)_v.size(); |  | ||||||
|   for (int i=0;i<N;i++) { |  | ||||||
|     Field& tmp = _v[i]; |  | ||||||
|     axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////// | ||||||
| // Implicitly restarted lanczos | // Implicitly restarted lanczos | ||||||
| ///////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////// | ||||||
| @@ -181,6 +169,7 @@ enum IRLdiagonalisation { | |||||||
| template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field> | template<class Field> class ImplicitlyRestartedLanczosHermOpTester  : public ImplicitlyRestartedLanczosTester<Field> | ||||||
| { | { | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|   LinearFunction<Field>       &_HermOp; |   LinearFunction<Field>       &_HermOp; | ||||||
|   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  }; |   ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp)  {  }; | ||||||
|   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) |   int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox) | ||||||
| @@ -243,6 +232,7 @@ class ImplicitlyRestartedLanczos { | |||||||
|   ///////////////////////// |   ///////////////////////// | ||||||
|    |    | ||||||
| public:        | public:        | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////// | ||||||
|   // PAB: |   // PAB: | ||||||
|   ////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////// | ||||||
| @@ -490,15 +480,13 @@ until convergence | |||||||
| 	Field B(grid); B.checkerboard = evec[0].checkerboard; | 	Field B(grid); B.checkerboard = evec[0].checkerboard; | ||||||
|  |  | ||||||
| 	//  power of two search pattern;  not every evalue in eval2 is assessed. | 	//  power of two search pattern;  not every evalue in eval2 is assessed. | ||||||
|  | 	int allconv =1; | ||||||
| 	for(int jj = 1; jj<=Nstop; jj*=2){ | 	for(int jj = 1; jj<=Nstop; jj*=2){ | ||||||
| 	  int j = Nstop-jj; | 	  int j = Nstop-jj; | ||||||
| 	  RealD e = eval2_copy[j]; // Discard the evalue | 	  RealD e = eval2_copy[j]; // Discard the evalue | ||||||
| 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	     | 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	     | ||||||
| 	  if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) { | 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) { | ||||||
| 	    if ( j > Nconv ) { | 	    allconv=0; | ||||||
| 	      Nconv=j+1; |  | ||||||
| 	      jj=Nstop; // Terminate the scan |  | ||||||
| 	    } |  | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	// Do evec[0] for good measure | 	// Do evec[0] for good measure | ||||||
| @@ -506,8 +494,10 @@ until convergence | |||||||
| 	  int j=0; | 	  int j=0; | ||||||
| 	  RealD e = eval2_copy[0];  | 	  RealD e = eval2_copy[0];  | ||||||
| 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	     | 	  basisRotateJ(B,evec,Qt,j,0,Nk,Nm);	     | ||||||
| 	  _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox); | 	  if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0; | ||||||
| 	} | 	} | ||||||
|  | 	if ( allconv ) Nconv = Nstop; | ||||||
|  |  | ||||||
| 	// test if we converged, if so, terminate | 	// test if we converged, if so, terminate | ||||||
| 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl; | 	std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl; | ||||||
| 	//	if( Nconv>=Nstop || beta_k < betastp){ | 	//	if( Nconv>=Nstop || beta_k < betastp){ | ||||||
|   | |||||||
| @@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #ifndef GRID_LOCAL_COHERENCE_IRL_H | #ifndef GRID_LOCAL_COHERENCE_IRL_H | ||||||
| #define GRID_LOCAL_COHERENCE_IRL_H | #define GRID_LOCAL_COHERENCE_IRL_H | ||||||
|  |  | ||||||
| namespace Grid {  | namespace Grid {  | ||||||
|  |  | ||||||
|  |  | ||||||
| struct LanczosParams : Serializable { | struct LanczosParams : Serializable { | ||||||
|  public: |  public: | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, |   GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams, | ||||||
| @@ -45,6 +48,7 @@ struct LanczosParams : Serializable { | |||||||
| struct LocalCoherenceLanczosParams : Serializable { | struct LocalCoherenceLanczosParams : Serializable { | ||||||
|  public: |  public: | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, |   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, | ||||||
|  | 				  bool, saveEvecs, | ||||||
| 				  bool, doFine, | 				  bool, doFine, | ||||||
| 				  bool, doFineRead, | 				  bool, doFineRead, | ||||||
| 				  bool, doCoarse, | 				  bool, doCoarse, | ||||||
| @@ -70,21 +74,24 @@ public: | |||||||
|   typedef Lattice<Fobj>          FineField; |   typedef Lattice<Fobj>          FineField; | ||||||
|  |  | ||||||
|   LinearOperatorBase<FineField> &_Linop; |   LinearOperatorBase<FineField> &_Linop; | ||||||
|   Aggregation<Fobj,CComplex,nbasis> &_Aggregate; |   std::vector<FineField>        &subspace; | ||||||
|  |  | ||||||
|   ProjectedHermOp(LinearOperatorBase<FineField>& linop,  Aggregation<Fobj,CComplex,nbasis> &aggregate) :  |   ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :  | ||||||
|     _Linop(linop), |     _Linop(linop), subspace(_subspace) | ||||||
|     _Aggregate(aggregate)  {  }; |   {   | ||||||
|  |     assert(subspace.size() >0); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   void operator()(const CoarseField& in, CoarseField& out) { |   void operator()(const CoarseField& in, CoarseField& out) { | ||||||
|  |     GridBase *FineGrid = subspace[0]._grid;     | ||||||
|  |     int   checkerboard = subspace[0].checkerboard; | ||||||
|        |        | ||||||
|     GridBase *FineGrid = _Aggregate.FineGrid; |     FineField fin (FineGrid);     fin.checkerboard= checkerboard; | ||||||
|     FineField fin(FineGrid); |     FineField fout(FineGrid);   fout.checkerboard = checkerboard; | ||||||
|     FineField fout(FineGrid); |  | ||||||
|  |  | ||||||
|     _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; |     blockPromote(in,fin,subspace);       std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl; | ||||||
|     _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; |     _Linop.HermOp(fin,fout);             std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl; | ||||||
|     _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl; |     blockProject(out,fout,subspace);     std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl; | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -99,24 +106,27 @@ public: | |||||||
|  |  | ||||||
|   OperatorFunction<FineField>   & _poly; |   OperatorFunction<FineField>   & _poly; | ||||||
|   LinearOperatorBase<FineField> &_Linop; |   LinearOperatorBase<FineField> &_Linop; | ||||||
|   Aggregation<Fobj,CComplex,nbasis> &_Aggregate; |   std::vector<FineField>        &subspace; | ||||||
|  |  | ||||||
|   ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,  |   ProjectedFunctionHermOp(OperatorFunction<FineField> & poly, | ||||||
| 			  Aggregation<Fobj,CComplex,nbasis> &aggregate) :  | 			  LinearOperatorBase<FineField>& linop,  | ||||||
|  | 			  std::vector<FineField> & _subspace) : | ||||||
|     _poly(poly), |     _poly(poly), | ||||||
|     _Linop(linop), |     _Linop(linop), | ||||||
|     _Aggregate(aggregate)  {  }; |     subspace(_subspace) | ||||||
|  |   {  }; | ||||||
|  |  | ||||||
|   void operator()(const CoarseField& in, CoarseField& out) { |   void operator()(const CoarseField& in, CoarseField& out) { | ||||||
|      |      | ||||||
|     GridBase *FineGrid = _Aggregate.FineGrid; |     GridBase *FineGrid = subspace[0]._grid;     | ||||||
|  |     int   checkerboard = subspace[0].checkerboard; | ||||||
|  |  | ||||||
|     FineField fin(FineGrid) ;fin.checkerboard  =_Aggregate.checkerboard; |     FineField fin (FineGrid); fin.checkerboard =checkerboard; | ||||||
|     FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard; |     FineField fout(FineGrid);fout.checkerboard =checkerboard; | ||||||
|      |      | ||||||
|     _Aggregate.PromoteFromSubspace(in,fin);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; |     blockPromote(in,fin,subspace);             std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl; | ||||||
|     _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; |     _poly(_Linop,fin,fout);                    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl; | ||||||
|     _Aggregate.ProjectToSubspace(out,fout);    std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl; |     blockProject(out,fout,subspace);           std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl; | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -132,19 +142,23 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc | |||||||
|   LinearFunction<CoarseField> & _Poly; |   LinearFunction<CoarseField> & _Poly; | ||||||
|   OperatorFunction<FineField>   & _smoother; |   OperatorFunction<FineField>   & _smoother; | ||||||
|   LinearOperatorBase<FineField> &_Linop; |   LinearOperatorBase<FineField> &_Linop; | ||||||
|   Aggregation<Fobj,CComplex,nbasis> &_Aggregate; |  | ||||||
|   RealD                          _coarse_relax_tol; |   RealD                          _coarse_relax_tol; | ||||||
|  |   std::vector<FineField>        &_subspace; | ||||||
|  |    | ||||||
|   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly, |   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly, | ||||||
| 					   OperatorFunction<FineField>   &smoother, | 					   OperatorFunction<FineField>   &smoother, | ||||||
| 					   LinearOperatorBase<FineField> &Linop, | 					   LinearOperatorBase<FineField> &Linop, | ||||||
| 					   Aggregation<Fobj,CComplex,nbasis> &Aggregate, | 					   std::vector<FineField>        &subspace, | ||||||
| 					   RealD coarse_relax_tol=5.0e3)  | 					   RealD coarse_relax_tol=5.0e3)  | ||||||
|     : _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol)  {    }; |     : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace), | ||||||
|  |       _coarse_relax_tol(coarse_relax_tol)   | ||||||
|  |   {    }; | ||||||
|  |  | ||||||
|   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) |   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) | ||||||
|   { |   { | ||||||
|     CoarseField v(B); |     CoarseField v(B); | ||||||
|     RealD eval_poly = eval; |     RealD eval_poly = eval; | ||||||
|  |  | ||||||
|     // Apply operator |     // Apply operator | ||||||
|     _Poly(B,v); |     _Poly(B,v); | ||||||
|  |  | ||||||
| @@ -168,14 +182,13 @@ class ImplicitlyRestartedLanczosSmoothedTester  : public ImplicitlyRestartedLanc | |||||||
|   } |   } | ||||||
|   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) |   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) | ||||||
|   { |   { | ||||||
|     GridBase *FineGrid = _Aggregate.FineGrid; |     GridBase *FineGrid = _subspace[0]._grid;     | ||||||
|  |     int checkerboard   = _subspace[0].checkerboard; | ||||||
|     int checkerboard   = _Aggregate.checkerboard; |  | ||||||
|  |  | ||||||
|     FineField fB(FineGrid);fB.checkerboard =checkerboard; |     FineField fB(FineGrid);fB.checkerboard =checkerboard; | ||||||
|     FineField fv(FineGrid);fv.checkerboard =checkerboard; |     FineField fv(FineGrid);fv.checkerboard =checkerboard; | ||||||
|  |  | ||||||
|     _Aggregate.PromoteFromSubspace(B,fv); |     blockPromote(B,fv,_subspace);   | ||||||
|  |      | ||||||
|     _smoother(_Linop,fv,fB);  |     _smoother(_Linop,fv,fB);  | ||||||
|  |  | ||||||
|     RealD eval_poly = eval; |     RealD eval_poly = eval; | ||||||
| @@ -217,27 +230,65 @@ protected: | |||||||
|   int _checkerboard; |   int _checkerboard; | ||||||
|   LinearOperatorBase<FineField>                 & _FineOp; |   LinearOperatorBase<FineField>                 & _FineOp; | ||||||
|    |    | ||||||
|   // FIXME replace Aggregation with vector of fine; the code reuse is too small for |   std::vector<RealD>                              &evals_fine; | ||||||
|   // the hassle and complexity of cross coupling. |   std::vector<RealD>                              &evals_coarse;  | ||||||
|   Aggregation<Fobj,CComplex,nbasis>               _Aggregate;   |   std::vector<FineField>                          &subspace; | ||||||
|   std::vector<RealD>                              evals_fine; |   std::vector<CoarseField>                        &evec_coarse; | ||||||
|   std::vector<RealD>                              evals_coarse;  |  | ||||||
|   std::vector<CoarseField>                        evec_coarse; | private: | ||||||
|  |   std::vector<RealD>                              _evals_fine; | ||||||
|  |   std::vector<RealD>                              _evals_coarse;  | ||||||
|  |   std::vector<FineField>                          _subspace; | ||||||
|  |   std::vector<CoarseField>                        _evec_coarse; | ||||||
|  |  | ||||||
| public: | public: | ||||||
|  |  | ||||||
|   LocalCoherenceLanczos(GridBase *FineGrid, |   LocalCoherenceLanczos(GridBase *FineGrid, | ||||||
| 			GridBase *CoarseGrid, | 			GridBase *CoarseGrid, | ||||||
| 			LinearOperatorBase<FineField> &FineOp, | 			LinearOperatorBase<FineField> &FineOp, | ||||||
| 			int checkerboard) : | 			int checkerboard) : | ||||||
|     _CoarseGrid(CoarseGrid), |     _CoarseGrid(CoarseGrid), | ||||||
|     _FineGrid(FineGrid), |     _FineGrid(FineGrid), | ||||||
|     _Aggregate(CoarseGrid,FineGrid,checkerboard), |  | ||||||
|     _FineOp(FineOp), |     _FineOp(FineOp), | ||||||
|     _checkerboard(checkerboard) |     _checkerboard(checkerboard), | ||||||
|  |     evals_fine  (_evals_fine), | ||||||
|  |     evals_coarse(_evals_coarse), | ||||||
|  |     subspace    (_subspace), | ||||||
|  |     evec_coarse(_evec_coarse) | ||||||
|   { |   { | ||||||
|     evals_fine.resize(0); |     evals_fine.resize(0); | ||||||
|     evals_coarse.resize(0); |     evals_coarse.resize(0); | ||||||
|   }; |   }; | ||||||
|   void Orthogonalise(void ) { _Aggregate.Orthogonalise(); } |   ////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Alternate constructore, external storage for use by Hadrons module | ||||||
|  |   ////////////////////////////////////////////////////////////////////////// | ||||||
|  |   LocalCoherenceLanczos(GridBase *FineGrid, | ||||||
|  | 			GridBase *CoarseGrid, | ||||||
|  | 			LinearOperatorBase<FineField> &FineOp, | ||||||
|  | 			int checkerboard, | ||||||
|  | 			std::vector<FineField>   &ext_subspace, | ||||||
|  | 			std::vector<CoarseField> &ext_coarse, | ||||||
|  | 			std::vector<RealD>       &ext_eval_fine, | ||||||
|  | 			std::vector<RealD>       &ext_eval_coarse | ||||||
|  | 			) : | ||||||
|  |     _CoarseGrid(CoarseGrid), | ||||||
|  |     _FineGrid(FineGrid), | ||||||
|  |     _FineOp(FineOp), | ||||||
|  |     _checkerboard(checkerboard), | ||||||
|  |     evals_fine  (ext_eval_fine),  | ||||||
|  |     evals_coarse(ext_eval_coarse), | ||||||
|  |     subspace    (ext_subspace), | ||||||
|  |     evec_coarse (ext_coarse) | ||||||
|  |   { | ||||||
|  |     evals_fine.resize(0); | ||||||
|  |     evals_coarse.resize(0); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   void Orthogonalise(void ) { | ||||||
|  |     CoarseScalar InnerProd(_CoarseGrid);  | ||||||
|  |     blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl; | ||||||
|  |     blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   template<typename T>  static RealD normalise(T& v)  |   template<typename T>  static RealD normalise(T& v)  | ||||||
|   { |   { | ||||||
| @@ -246,43 +297,44 @@ public: | |||||||
|     v = v * (1.0/nn); |     v = v * (1.0/nn); | ||||||
|     return nn; |     return nn; | ||||||
|   } |   } | ||||||
|  |   /* | ||||||
|   void fakeFine(void) |   void fakeFine(void) | ||||||
|   { |   { | ||||||
|     int Nk = nbasis; |     int Nk = nbasis; | ||||||
|     _Aggregate.subspace.resize(Nk,_FineGrid); |     subspace.resize(Nk,_FineGrid); | ||||||
|     _Aggregate.subspace[0]=1.0; |     subspace[0]=1.0; | ||||||
|     _Aggregate.subspace[0].checkerboard=_checkerboard; |     subspace[0].checkerboard=_checkerboard; | ||||||
|     normalise(_Aggregate.subspace[0]); |     normalise(subspace[0]); | ||||||
|     PlainHermOp<FineField>    Op(_FineOp); |     PlainHermOp<FineField>    Op(_FineOp); | ||||||
|     for(int k=1;k<Nk;k++){ |     for(int k=1;k<Nk;k++){ | ||||||
|       _Aggregate.subspace[k].checkerboard=_checkerboard; |       subspace[k].checkerboard=_checkerboard; | ||||||
|       Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]); |       Op(subspace[k-1],subspace[k]); | ||||||
|       normalise(_Aggregate.subspace[k]); |       normalise(subspace[k]); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   */ | ||||||
|  |  | ||||||
|   void testFine(RealD resid)  |   void testFine(RealD resid)  | ||||||
|   { |   { | ||||||
|     assert(evals_fine.size() == nbasis); |     assert(evals_fine.size() == nbasis); | ||||||
|     assert(_Aggregate.subspace.size() == nbasis); |     assert(subspace.size() == nbasis); | ||||||
|     PlainHermOp<FineField>    Op(_FineOp); |     PlainHermOp<FineField>    Op(_FineOp); | ||||||
|     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); |     ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); | ||||||
|     for(int k=0;k<nbasis;k++){ |     for(int k=0;k<nbasis;k++){ | ||||||
|       assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1); |       assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)  |   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)  | ||||||
|   { |   { | ||||||
|     assert(evals_fine.size() == nbasis); |     assert(evals_fine.size() == nbasis); | ||||||
|     assert(_Aggregate.subspace.size() == nbasis); |     assert(subspace.size() == nbasis); | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL |     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     Chebyshev<FineField>                          ChebySmooth(cheby_smooth); |     Chebyshev<FineField>                          ChebySmooth(cheby_smooth); | ||||||
|     ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate); |     ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_subspace); | ||||||
|     ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); |     ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); | ||||||
|  |  | ||||||
|     for(int k=0;k<evec_coarse.size();k++){ |     for(int k=0;k<evec_coarse.size();k++){ | ||||||
|       if ( k < nbasis ) {  |       if ( k < nbasis ) {  | ||||||
| @@ -302,34 +354,34 @@ public: | |||||||
|     PlainHermOp<FineField>    Op(_FineOp); |     PlainHermOp<FineField>    Op(_FineOp); | ||||||
|  |  | ||||||
|     evals_fine.resize(Nm); |     evals_fine.resize(Nm); | ||||||
|     _Aggregate.subspace.resize(Nm,_FineGrid); |     subspace.resize(Nm,_FineGrid); | ||||||
|  |  | ||||||
|     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); |     ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); | ||||||
|  |  | ||||||
|     FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; |     FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard; | ||||||
|  |  | ||||||
|     int Nconv; |     int Nconv; | ||||||
|     IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false); |     IRL.calc(evals_fine,subspace,src,Nconv,false); | ||||||
|      |      | ||||||
|     // Shrink down to number saved |     // Shrink down to number saved | ||||||
|     assert(Nstop>=nbasis); |     assert(Nstop>=nbasis); | ||||||
|     assert(Nconv>=nbasis); |     assert(Nconv>=nbasis); | ||||||
|     evals_fine.resize(nbasis); |     evals_fine.resize(nbasis); | ||||||
|     _Aggregate.subspace.resize(nbasis,_FineGrid); |     subspace.resize(nbasis,_FineGrid); | ||||||
|   } |   } | ||||||
|   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, |   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, | ||||||
| 		  int Nstop, int Nk, int Nm,RealD resid,  | 		  int Nstop, int Nk, int Nm,RealD resid,  | ||||||
| 		  RealD MaxIt, RealD betastp, int MinRes) | 		  RealD MaxIt, RealD betastp, int MinRes) | ||||||
|   { |   { | ||||||
|     Chebyshev<FineField>                          Cheby(cheby_op); |     Chebyshev<FineField>                          Cheby(cheby_op); | ||||||
|     ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_Aggregate); |     ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,_subspace); | ||||||
|     ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate); |     ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_subspace); | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL |     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|     Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); |     Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); | ||||||
|     ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax); |     ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_subspace,relax); | ||||||
|  |  | ||||||
|     evals_coarse.resize(Nm); |     evals_coarse.resize(Nm); | ||||||
|     evec_coarse.resize(Nm,_CoarseGrid); |     evec_coarse.resize(Nm,_CoarseGrid); | ||||||
|   | |||||||
| @@ -108,6 +108,11 @@ namespace Grid { | |||||||
|  |  | ||||||
|     template<class Matrix> |     template<class Matrix> | ||||||
|     void operator() (Matrix & _Matrix,const Field &in, Field &out){ |     void operator() (Matrix & _Matrix,const Field &in, Field &out){ | ||||||
|  |       ZeroGuesser guess; | ||||||
|  |       (*this)(_Matrix,in,out,guess); | ||||||
|  |     } | ||||||
|  |     template<class Matrix, class Guesser> | ||||||
|  |     void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){ | ||||||
|  |  | ||||||
|       // FIXME CGdiagonalMee not implemented virtual function |       // FIXME CGdiagonalMee not implemented virtual function | ||||||
|       // FIXME use CBfactorise to control schur decomp |       // FIXME use CBfactorise to control schur decomp | ||||||
| @@ -129,7 +134,6 @@ namespace Grid { | |||||||
|       pickCheckerboard(Odd ,src_o,in); |       pickCheckerboard(Odd ,src_o,in); | ||||||
|       pickCheckerboard(Even,sol_e,out); |       pickCheckerboard(Even,sol_e,out); | ||||||
|       pickCheckerboard(Odd ,sol_o,out); |       pickCheckerboard(Odd ,sol_o,out); | ||||||
|  |  | ||||||
|       std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl; |       std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl; | ||||||
|      |      | ||||||
|       ///////////////////////////////////////////////////// |       ///////////////////////////////////////////////////// | ||||||
| @@ -146,6 +150,7 @@ namespace Grid { | |||||||
|       // Call the red-black solver |       // Call the red-black solver | ||||||
|       ////////////////////////////////////////////////////////////// |       ////////////////////////////////////////////////////////////// | ||||||
|       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl; |       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl; | ||||||
|  |       guess(src_o,sol_o); | ||||||
|       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); |       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||||
|       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl; |       std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called  the Mpc solver" <<std::endl; | ||||||
|  |  | ||||||
| @@ -190,6 +195,11 @@ namespace Grid { | |||||||
|   }; |   }; | ||||||
|     template<class Matrix> |     template<class Matrix> | ||||||
|     void operator() (Matrix & _Matrix,const Field &in, Field &out){ |     void operator() (Matrix & _Matrix,const Field &in, Field &out){ | ||||||
|  |       ZeroGuesser guess; | ||||||
|  |       (*this)(_Matrix,in,out,guess); | ||||||
|  |     } | ||||||
|  |     template<class Matrix, class Guesser> | ||||||
|  |     void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){ | ||||||
|  |  | ||||||
|       // FIXME CGdiagonalMee not implemented virtual function |       // FIXME CGdiagonalMee not implemented virtual function | ||||||
|       // FIXME use CBfactorise to control schur decomp |       // FIXME use CBfactorise to control schur decomp | ||||||
| @@ -225,6 +235,7 @@ namespace Grid { | |||||||
|       // Call the red-black solver |       // Call the red-black solver | ||||||
|       ////////////////////////////////////////////////////////////// |       ////////////////////////////////////////////////////////////// | ||||||
|       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; |       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; | ||||||
|  |       guess(src_o,sol_o); | ||||||
|       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); |       _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||||
|  |  | ||||||
|       /////////////////////////////////////////////////// |       /////////////////////////////////////////////////// | ||||||
| @@ -269,6 +280,11 @@ namespace Grid { | |||||||
|  |  | ||||||
|     template<class Matrix> |     template<class Matrix> | ||||||
|     void operator() (Matrix & _Matrix,const Field &in, Field &out){ |     void operator() (Matrix & _Matrix,const Field &in, Field &out){ | ||||||
|  |       ZeroGuesser guess; | ||||||
|  |       (*this)(_Matrix,in,out,guess); | ||||||
|  |     } | ||||||
|  |     template<class Matrix,class Guesser> | ||||||
|  |     void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){ | ||||||
|  |  | ||||||
|       // FIXME CGdiagonalMee not implemented virtual function |       // FIXME CGdiagonalMee not implemented virtual function | ||||||
|       // FIXME use CBfactorise to control schur decomp |       // FIXME use CBfactorise to control schur decomp | ||||||
| @@ -305,6 +321,7 @@ namespace Grid { | |||||||
|       ////////////////////////////////////////////////////////////// |       ////////////////////////////////////////////////////////////// | ||||||
|       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; |       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; | ||||||
| //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||||
|  |       guess(src_o,tmp); | ||||||
|       _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd); |       _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd); | ||||||
|       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd); |       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd); | ||||||
|  |  | ||||||
| @@ -348,6 +365,11 @@ namespace Grid { | |||||||
|  |  | ||||||
|     template<class Matrix> |     template<class Matrix> | ||||||
|     void operator() (Matrix & _Matrix,const Field &in, Field &out){ |     void operator() (Matrix & _Matrix,const Field &in, Field &out){ | ||||||
|  |       ZeroGuesser guess; | ||||||
|  |       (*this)(_Matrix,in,out,guess); | ||||||
|  |     } | ||||||
|  |     template<class Matrix, class Guesser> | ||||||
|  |     void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){ | ||||||
|  |  | ||||||
|       // FIXME CGdiagonalMee not implemented virtual function |       // FIXME CGdiagonalMee not implemented virtual function | ||||||
|       // FIXME use CBfactorise to control schur decomp |       // FIXME use CBfactorise to control schur decomp | ||||||
| @@ -385,6 +407,7 @@ namespace Grid { | |||||||
|       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; |       std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl; | ||||||
| //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | //      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd); | ||||||
| //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd); | //      _HermitianRBSolver(_HermOpEO,src_o,tmp);  assert(tmp.checkerboard==Odd); | ||||||
|  |       guess(src_o,tmp); | ||||||
|       _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd); |       _HermitianRBSolver(src_o,tmp);  assert(tmp.checkerboard==Odd); | ||||||
|       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd); |       _Matrix.MooeeInv(tmp,sol_o);        assert(  sol_o.checkerboard   ==Odd); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -277,7 +277,9 @@ public: | |||||||
|     uint8_t *cp = (uint8_t *)ptr; |     uint8_t *cp = (uint8_t *)ptr; | ||||||
|     if ( ptr ) {  |     if ( ptr ) {  | ||||||
|     // One touch per 4k page, static OMP loop to catch same loop order |     // One touch per 4k page, static OMP loop to catch same loop order | ||||||
|  | #ifdef GRID_OMP | ||||||
| #pragma omp parallel for schedule(static) | #pragma omp parallel for schedule(static) | ||||||
|  | #endif | ||||||
|       for(size_type n=0;n<bytes;n+=4096){ |       for(size_type n=0;n<bytes;n+=4096){ | ||||||
| 	cp[n]=0; | 	cp[n]=0; | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) | |||||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|   if ( !flag ) { |   if ( !flag ) { | ||||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); |     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||||
|     assert (provided == MPI_THREAD_MULTIPLE); |     //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE | ||||||
|  |     if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) || | ||||||
|  |         (nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) ) | ||||||
|  |       assert(0); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   Grid_quiesce_nodes(); |   Grid_quiesce_nodes(); | ||||||
|  |  | ||||||
|  |   // Never clean up as done once. | ||||||
|   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |  | ||||||
|   GlobalSharedMemory::Init(communicator_world); |   GlobalSharedMemory::Init(communicator_world); | ||||||
| @@ -85,9 +89,17 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c | |||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)  | ||||||
| { | { | ||||||
|   MPI_Comm optimal_comm; |   MPI_Comm optimal_comm; | ||||||
|   GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); // Remap using the shared memory optimising routine |   //////////////////////////////////////////////////// | ||||||
|  |   // Remap using the shared memory optimising routine | ||||||
|  |   // The remap creates a comm which must be freed | ||||||
|  |   //////////////////////////////////////////////////// | ||||||
|  |   GlobalSharedMemory::OptimalCommunicator    (processors,optimal_comm); | ||||||
|   InitFromMPICommunicator(processors,optimal_comm); |   InitFromMPICommunicator(processors,optimal_comm); | ||||||
|   SetCommunicator(optimal_comm); |   SetCommunicator(optimal_comm); | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  |   // Free the temp communicator | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  |   MPI_Comm_free(&optimal_comm); | ||||||
| } | } | ||||||
|  |  | ||||||
| ////////////////////////////////// | ////////////////////////////////// | ||||||
| @@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | |||||||
|  |  | ||||||
|   } else { |   } else { | ||||||
|     srank = 0; |     srank = 0; | ||||||
|     comm_split    = parent.communicator; |     int ierr = MPI_Comm_dup (parent.communicator,&comm_split); | ||||||
|     //    std::cout << " Inherited communicator " <<comm_split <<std::endl; |     assert(ierr==0); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -197,6 +209,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | |||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   SetCommunicator(comm_split); |   SetCommunicator(comm_split); | ||||||
|    |    | ||||||
|  |   /////////////////////////////////////////////// | ||||||
|  |   // Free the temp communicator  | ||||||
|  |   /////////////////////////////////////////////// | ||||||
|  |   MPI_Comm_free(&comm_split); | ||||||
|  |  | ||||||
|   if(0){  |   if(0){  | ||||||
|     std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; |     std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; | ||||||
|     for(int d=0;d<processors.size();d++){ |     for(int d=0;d<processors.size();d++){ | ||||||
| @@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors, | |||||||
|  |  | ||||||
| void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) | void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base) | ||||||
| { | { | ||||||
|  |   //////////////////////////////////////////////////// | ||||||
|  |   // Creates communicator, and the communicator_halo | ||||||
|  |   //////////////////////////////////////////////////// | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
|   _processor_coor.resize(_ndimension); |   _processor_coor.resize(_ndimension); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -133,6 +133,7 @@ class SharedMemory | |||||||
|  |  | ||||||
|  public: |  public: | ||||||
|   SharedMemory() {}; |   SharedMemory() {}; | ||||||
|  |   ~SharedMemory(); | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // set the buffers & sizes |   // set the buffers & sizes | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <pwd.h> | ||||||
|  |  | ||||||
| namespace Grid {  | namespace Grid {  | ||||||
|  |  | ||||||
| @@ -113,19 +114,151 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) | |||||||
|   assert(WorldNode!=-1); |   assert(WorldNode!=-1); | ||||||
|   _ShmSetup=1; |   _ShmSetup=1; | ||||||
| } | } | ||||||
|  | // Gray encode support  | ||||||
| void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | int BinaryToGray (int  binary) { | ||||||
|  |   int gray = (binary>>1)^binary; | ||||||
|  |   return gray; | ||||||
|  | } | ||||||
|  | int Log2Size(int TwoToPower,int MAXLOG2) | ||||||
| { | { | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Assert power of two shm_size. |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   int log2size = -1; |   int log2size = -1; | ||||||
|   for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){   |   for(int i=0;i<=MAXLOG2;i++){ | ||||||
|     if ( (0x1<<i) == WorldShmSize ) { |     if ( (0x1<<i) == TwoToPower ) { | ||||||
|       log2size = i; |       log2size = i; | ||||||
|       break; |       break; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   return log2size; | ||||||
|  | } | ||||||
|  | void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm) | ||||||
|  | { | ||||||
|  | #undef HYPERCUBE  | ||||||
|  | #ifdef HYPERCUBE | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Assert power of two shm_size. | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); | ||||||
|  |   assert(log2size != -1); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify the hypercube coordinate of this node using hostname | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // n runs 0...7 9...16 18...25 27...34     (8*4)  5 bits | ||||||
|  |   // i runs 0..7                                    3 bits | ||||||
|  |   // r runs 0..3                                    2 bits | ||||||
|  |   // 2^10 = 1024 nodes | ||||||
|  |   const int maxhdim = 10;  | ||||||
|  |   std::vector<int> HyperCubeCoords(maxhdim,0); | ||||||
|  |   std::vector<int> RootHyperCubeCoords(maxhdim,0); | ||||||
|  |   int R; | ||||||
|  |   int I; | ||||||
|  |   int N; | ||||||
|  |   const int namelen = _POSIX_HOST_NAME_MAX; | ||||||
|  |   char name[namelen]; | ||||||
|  |  | ||||||
|  |   // Parse ICE-XA hostname to get hypercube location | ||||||
|  |   gethostname(name,namelen); | ||||||
|  |   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; | ||||||
|  |   assert(nscan==3); | ||||||
|  |  | ||||||
|  |   int nlo = N%9; | ||||||
|  |   int nhi = N/9; | ||||||
|  |   uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ; | ||||||
|  |   uint32_t rootcoor  = hypercoor; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////// | ||||||
|  |   // Print debug info | ||||||
|  |   ////////////////////////////////////////////////////////////////// | ||||||
|  |   for(int d=0;d<maxhdim;d++){ | ||||||
|  |     HyperCubeCoords[d] = (hypercoor>>d)&0x1; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   std::string hname(name); | ||||||
|  |   std::cout << "hostname "<<hname<<std::endl; | ||||||
|  |   std::cout << "R " << R << " I " << I << " N "<< N<< | ||||||
|  |             << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////// | ||||||
|  |   // broadcast node 0's base coordinate for this partition. | ||||||
|  |   ////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);  | ||||||
|  |   hypercoor=hypercoor-rootcoor; | ||||||
|  |   assert(hypercoor<WorldSize); | ||||||
|  |   assert(hypercoor>=0); | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////// | ||||||
|  |   // Printing | ||||||
|  |   ////////////////////////////////////// | ||||||
|  |   for(int d=0;d<maxhdim;d++){ | ||||||
|  |     HyperCubeCoords[d] = (hypercoor>>d)&0x1; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify subblock of ranks on node spreading across dims | ||||||
|  |   // in a maximally symmetrical way | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int ndimension              = processors.size(); | ||||||
|  |   std::vector<int> processor_coor(ndimension); | ||||||
|  |   std::vector<int> WorldDims = processors;   std::vector<int> ShmDims  (ndimension,1);  std::vector<int> NodeDims (ndimension); | ||||||
|  |   std::vector<int> ShmCoor  (ndimension);    std::vector<int> NodeCoor (ndimension);    std::vector<int> WorldCoor(ndimension); | ||||||
|  |   std::vector<int> HyperCoor(ndimension); | ||||||
|  |   int dim = 0; | ||||||
|  |   for(int l2=0;l2<log2size;l2++){ | ||||||
|  |     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension; | ||||||
|  |     ShmDims[dim]*=2; | ||||||
|  |     dim=(dim+1)%ndimension; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish torus of processes and nodes with sub-blockings | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   for(int d=0;d<ndimension;d++){ | ||||||
|  |     NodeDims[d] = WorldDims[d]/ShmDims[d]; | ||||||
|  |   } | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Map Hcube according to physical lattice  | ||||||
|  |   // must partition. Loop over dims and find out who would join. | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int hcoor = hypercoor; | ||||||
|  |   for(int d=0;d<ndimension;d++){ | ||||||
|  |      int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE); | ||||||
|  |      int msk  = (0x1<<bits)-1; | ||||||
|  |      HyperCoor[d]=hcoor & msk;   | ||||||
|  |      HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic | ||||||
|  |      hcoor = hcoor >> bits; | ||||||
|  |   }  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Check processor counts match | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int Nprocessors=1; | ||||||
|  |   for(int i=0;i<ndimension;i++){ | ||||||
|  |     Nprocessors*=processors[i]; | ||||||
|  |   } | ||||||
|  |   assert(WorldSize==Nprocessors); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish mapping between lexico physics coord and WorldRank | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int rank; | ||||||
|  |  | ||||||
|  |   Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode   ,NodeDims); | ||||||
|  |  | ||||||
|  |   for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d]; | ||||||
|  |  | ||||||
|  |   Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims); | ||||||
|  |   for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d]; | ||||||
|  |   Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////////// | ||||||
|  |   // Build the new communicator | ||||||
|  |   ///////////////////////////////////////////////////////////////// | ||||||
|  |   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); | ||||||
|  |   assert(ierr==0); | ||||||
|  | #else  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Assert power of two shm_size. | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); | ||||||
|   assert(log2size != -1); |   assert(log2size != -1); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -174,7 +307,69 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors, | |||||||
|   ///////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////// | ||||||
|   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); |   int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // SHMGET | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef GRID_MPI3_SHMGET | ||||||
|  | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
|  | { | ||||||
|  |   std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; | ||||||
|  |   assert(_ShmSetup==1); | ||||||
|  |   assert(_ShmAlloc==0); | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // allocate the shared windows for our group | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |   WorldShmCommBufs.resize(WorldShmSize); | ||||||
|  |   std::vector<int> shmids(WorldShmSize); | ||||||
|  |  | ||||||
|  |   if ( WorldShmRank == 0 ) { | ||||||
|  |     for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |       size_t size = bytes; | ||||||
|  |       key_t key   = IPC_PRIVATE; | ||||||
|  |       int flags = IPC_CREAT | SHM_R | SHM_W; | ||||||
|  | #ifdef SHM_HUGETLB | ||||||
|  |       if (Hugepages) flags|=SHM_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       if ((shmids[r]= shmget(key,size, flags)) ==-1) { | ||||||
|  |         int errsv = errno; | ||||||
|  |         printf("Errno %d\n",errsv); | ||||||
|  |         printf("key   %d\n",key); | ||||||
|  |         printf("size  %lld\n",size); | ||||||
|  |         printf("flags %d\n",flags); | ||||||
|  |         perror("shmget"); | ||||||
|  |         exit(1); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |   MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm); | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |  | ||||||
|  |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |     WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0); | ||||||
|  |     if (WorldShmCommBufs[r] == (uint64_t *)-1) { | ||||||
|  |       perror("Shared memory attach failure"); | ||||||
|  |       shmctl(shmids[r], IPC_RMID, NULL); | ||||||
|  |       exit(2); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |   /////////////////////////////////// | ||||||
|  |   // Mark for clean up | ||||||
|  |   /////////////////////////////////// | ||||||
|  |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |     shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL); | ||||||
|  |   } | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |  | ||||||
|  |   _ShmAlloc=1; | ||||||
|  |   _ShmAllocBytes  = bytes; | ||||||
|  | } | ||||||
|  | #endif | ||||||
|   |   | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Hugetlbfs mapping intended | // Hugetlbfs mapping intended | ||||||
| @@ -182,6 +377,7 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors, | |||||||
| #ifdef GRID_MPI3_SHMMMAP | #ifdef GRID_MPI3_SHMMMAP | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|  |   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0); |   assert(_ShmAlloc==0); | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -191,7 +387,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   WorldShmCommBufs.resize(WorldShmSize); |   WorldShmCommBufs.resize(WorldShmSize); | ||||||
|    |    | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Hugetlbf and others map filesystems as mappable huge pages |   // Hugetlbfs and others map filesystems as mappable huge pages | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   char shm_name [NAME_MAX]; |   char shm_name [NAME_MAX]; | ||||||
|   for(int r=0;r<WorldShmSize;r++){ |   for(int r=0;r<WorldShmSize;r++){ | ||||||
| @@ -218,6 +414,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|     assert(((uint64_t)ptr&0x3F)==0); |     assert(((uint64_t)ptr&0x3F)==0); | ||||||
|     close(fd); |     close(fd); | ||||||
|     WorldShmCommBufs[r] =ptr; |     WorldShmCommBufs[r] =ptr; | ||||||
|  |     std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||||
|  |   } | ||||||
|  |   _ShmAlloc=1; | ||||||
|  |   _ShmAllocBytes  = bytes; | ||||||
|  | }; | ||||||
|  | #endif // MMAP | ||||||
|  |  | ||||||
|  | #ifdef GRID_MPI3_SHM_NONE | ||||||
|  | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
|  | { | ||||||
|  |   std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; | ||||||
|  |   assert(_ShmSetup==1); | ||||||
|  |   assert(_ShmAlloc==0); | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // allocate the shared windows for our group | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |   WorldShmCommBufs.resize(WorldShmSize); | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Hugetlbf and others map filesystems as mappable huge pages | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   char shm_name [NAME_MAX]; | ||||||
|  |   assert(WorldShmSize == 1); | ||||||
|  |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |      | ||||||
|  |     int fd=-1; | ||||||
|  |     int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ; | ||||||
|  | #ifdef MAP_POPULATE     | ||||||
|  |     mmap_flag|=MAP_POPULATE; | ||||||
|  | #endif | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |     if ( flags ) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |     void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);  | ||||||
|  |     if ( ptr == (void *)MAP_FAILED ) {     | ||||||
|  |       printf("mmap %s failed\n",shm_name); | ||||||
|  |       perror("failed mmap");      assert(0);     | ||||||
|  |     } | ||||||
|  |     assert(((uint64_t)ptr&0x3F)==0); | ||||||
|  |     close(fd); | ||||||
|  |     WorldShmCommBufs[r] =ptr; | ||||||
|  |     std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; | ||||||
|   } |   } | ||||||
|   _ShmAlloc=1; |   _ShmAlloc=1; | ||||||
|   _ShmAllocBytes  = bytes; |   _ShmAllocBytes  = bytes; | ||||||
| @@ -232,6 +471,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| {  | {  | ||||||
|  |   std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; | ||||||
|   assert(_ShmSetup==1); |   assert(_ShmSetup==1); | ||||||
|   assert(_ShmAlloc==0);  |   assert(_ShmAlloc==0);  | ||||||
|   MPI_Barrier(WorldShmComm); |   MPI_Barrier(WorldShmComm); | ||||||
| @@ -243,7 +483,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| 	 | 	 | ||||||
|       size_t size = bytes; |       size_t size = bytes; | ||||||
|        |        | ||||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r); |       struct passwd *pw = getpwuid (getuid()); | ||||||
|  |       sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r); | ||||||
|        |        | ||||||
|       shm_unlink(shm_name); |       shm_unlink(shm_name); | ||||||
|       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); |       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); | ||||||
| @@ -259,7 +500,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #endif | #endif | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); |       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); | ||||||
|        |        | ||||||
|       if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    } |       std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl; | ||||||
|  |       if ( ptr == (void * )MAP_FAILED ) {        | ||||||
|  | 	perror("failed mmap");      | ||||||
|  | 	assert(0);     | ||||||
|  |       } | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |       assert(((uint64_t)ptr&0x3F)==0); | ||||||
|        |        | ||||||
|       WorldShmCommBufs[r] =ptr; |       WorldShmCommBufs[r] =ptr; | ||||||
| @@ -274,7 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|  |  | ||||||
|       size_t size = bytes ; |       size_t size = bytes ; | ||||||
|        |        | ||||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r); |       struct passwd *pw = getpwuid (getuid()); | ||||||
|  |       sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r); | ||||||
|        |        | ||||||
|       int fd=shm_open(shm_name,O_RDWR,0666); |       int fd=shm_open(shm_name,O_RDWR,0666); | ||||||
|       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      } |       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      } | ||||||
| @@ -292,6 +538,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////// | ||||||
|   // Global shared functionality finished |   // Global shared functionality finished | ||||||
|   // Now move to per communicator functionality |   // Now move to per communicator functionality | ||||||
| @@ -318,11 +567,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | |||||||
|   heap_size = GlobalSharedMemory::ShmAllocBytes(); |   heap_size = GlobalSharedMemory::ShmAllocBytes(); | ||||||
|   for(int r=0;r<ShmSize;r++){ |   for(int r=0;r<ShmSize;r++){ | ||||||
|  |  | ||||||
|     uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ; |     uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ; | ||||||
|  |  | ||||||
|     MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm); |     MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); | ||||||
|  |  | ||||||
|     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr]; |     ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; | ||||||
|  |     //    std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< "  wsr = "<<wsr<<std::endl; | ||||||
|   } |   } | ||||||
|   ShmBufferFreeAll(); |   ShmBufferFreeAll(); | ||||||
|  |  | ||||||
| @@ -391,5 +641,12 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) | |||||||
|     return (void *) remote; |     return (void *) remote; | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | SharedMemory::~SharedMemory() | ||||||
|  | { | ||||||
|  |   int MPI_is_finalised;  MPI_Finalized(&MPI_is_finalised); | ||||||
|  |   if ( !MPI_is_finalised ) {  | ||||||
|  |     MPI_Comm_free(&ShmComm); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) | |||||||
| { | { | ||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
|  | SharedMemory::~SharedMemory() | ||||||
|  | {}; | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen | |||||||
|   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  |   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|  |   int ent = 0; | ||||||
|  |  | ||||||
|  |   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||||
|  |  | ||||||
|   int stride=rhs._grid->_slice_stride[dimension]; |   int stride=rhs._grid->_slice_stride[dimension]; | ||||||
|   if ( cbmask == 0x3 ) {  |   if ( cbmask == 0x3 ) {  | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o  = n*stride; | 	int o  = n*stride; | ||||||
| 	int bo = n*e2; | 	int bo = n*e2; | ||||||
| 	buffer[off+bo+b]=rhs._odata[so+o+b]; | 	table[ent++] = std::pair<int,int>(off+bo+b,so+o+b); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|      int bo=0; |      int bo=0; | ||||||
|      std::vector<std::pair<int,int> > table; |  | ||||||
|      for(int n=0;n<e1;n++){ |      for(int n=0;n<e1;n++){ | ||||||
|        for(int b=0;b<e2;b++){ |        for(int b=0;b<e2;b++){ | ||||||
| 	 int o  = n*stride; | 	 int o  = n*stride; | ||||||
| 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||||
| 	 if ( ocb &cbmask ) { | 	 if ( ocb &cbmask ) { | ||||||
| 	   table.push_back(std::pair<int,int> (bo++,o+b)); | 	   table[ent++]=std::pair<int,int> (off+bo++,so+o+b); | ||||||
| 	 } | 	 } | ||||||
|        } |        } | ||||||
|      } |      } | ||||||
|      parallel_for(int i=0;i<table.size();i++){ |  | ||||||
|        buffer[off+table[i].first]=rhs._odata[so+table[i].second]; |  | ||||||
|   } |   } | ||||||
|  |   parallel_for(int i=0;i<ent;i++){ | ||||||
|  |     buffer[table[i].first]=rhs._odata[table[i].second]; | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -141,32 +143,36 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | |||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int stride=rhs._grid->_slice_stride[dimension]; |   int stride=rhs._grid->_slice_stride[dimension]; | ||||||
|  |  | ||||||
|  |   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||||
|  |   int ent    =0; | ||||||
|  |  | ||||||
|   if ( cbmask ==0x3 ) { |   if ( cbmask ==0x3 ) { | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ |  | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
| 	int bo  =n*rhs._grid->_slice_block[dimension]; | 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||||
| 	rhs._odata[so+o+b]=buffer[bo+b]; | 	table[ent++] = std::pair<int,int>(so+o+b,bo+b); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   } else {  |   } else {  | ||||||
|     std::vector<std::pair<int,int> > table; |  | ||||||
|     int bo=0; |     int bo=0; | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
| 	if ( ocb & cbmask ) { | 	if ( ocb & cbmask ) { | ||||||
| 	  table.push_back(std::pair<int,int> (so+o+b,bo++)); | 	  table[ent++]=std::pair<int,int> (so+o+b,bo++); | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     parallel_for(int i=0;i<table.size();i++){ |   } | ||||||
|        //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl; |  | ||||||
|  |   parallel_for(int i=0;i<ent;i++){ | ||||||
|     rhs._odata[table[i].first]=buffer[table[i].second]; |     rhs._odata[table[i].first]=buffer[table[i].second]; | ||||||
|   } |   } | ||||||
| } | } | ||||||
| } |  | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Scatter for when there *is* need to SIMD split | // Scatter for when there *is* need to SIMD split | ||||||
| @@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc |   int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int stride = rhs._grid->_slice_stride[dimension]; |   int stride = rhs._grid->_slice_stride[dimension]; | ||||||
|   if(cbmask == 0x3 ){ |   static std::vector<std::pair<int,int> > table; table.resize(e1*e2); | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ |   int ent=0; | ||||||
|       for(int b=0;b<e2;b++){ |  | ||||||
|  |  | ||||||
|  |   if(cbmask == 0x3 ){ | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|         int o =n*stride+b; |         int o =n*stride+b; | ||||||
|   	//lhs._odata[lo+o]=rhs._odata[ro+o]; | 	table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||||
| 	vstream(lhs._odata[lo+o],rhs._odata[ro+o]); |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|   |  | ||||||
|         int o =n*stride+b; |         int o =n*stride+b; | ||||||
|         int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); |         int ocb=1<<lhs._grid->CheckerBoardFromOindex(o); | ||||||
|         if ( ocb&cbmask ) { |         if ( ocb&cbmask ) { | ||||||
|   	//lhs._odata[lo+o]=rhs._odata[ro+o]; | 	  table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||||
| 	  vstream(lhs._odata[lo+o],rhs._odata[ro+o]); |  | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   parallel_for(int i=0;i<ent;i++){ | ||||||
|  |     lhs._odata[table[i].first]=rhs._odata[table[i].second]; | ||||||
|  |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) | template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type) | ||||||
| @@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|   int e2=rhs._grid->_slice_block [dimension]; |   int e2=rhs._grid->_slice_block [dimension]; | ||||||
|   int stride = rhs._grid->_slice_stride[dimension]; |   int stride = rhs._grid->_slice_stride[dimension]; | ||||||
|  |  | ||||||
|   parallel_for_nest2(int n=0;n<e1;n++){ |   static std::vector<std::pair<int,int> > table;  table.resize(e1*e2); | ||||||
|   for(int b=0;b<e2;b++){ |   int ent=0; | ||||||
|  |  | ||||||
|  |   double t_tab,t_perm; | ||||||
|  |   if ( cbmask == 0x3 ) { | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |     for(int b=0;b<e2;b++){ | ||||||
|  |       int o  =n*stride; | ||||||
|  |       table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||||
|  |     }} | ||||||
|  |   } else { | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |     for(int b=0;b<e2;b++){ | ||||||
|       int o  =n*stride; |       int o  =n*stride; | ||||||
|       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b); |       int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b); | ||||||
|       if ( ocb&cbmask ) { |       if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||||
| 	permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type); |     }} | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   }} |   parallel_for(int i=0;i<ent;i++){ | ||||||
|  |     permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| @@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r | |||||||
|   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); |   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); | ||||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); |   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||||
|  |  | ||||||
|  |   double t_local; | ||||||
|  |    | ||||||
|   if ( sshift[0] == sshift[1] ) { |   if ( sshift[0] == sshift[1] ) { | ||||||
|     Cshift_local(ret,rhs,dimension,shift,0x3); |     Cshift_local(ret,rhs,dimension,shift,0x3); | ||||||
|   } else { |   } else { | ||||||
| @@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| { | { | ||||||
|   GridBase *grid = rhs._grid; |   GridBase *grid = rhs._grid; | ||||||
|   int fd = grid->_fdimensions[dimension]; |   int fd = grid->_fdimensions[dimension]; | ||||||
| @@ -326,10 +349,6 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); |     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||||
|     int sx     = (x+sshift)%rd; |     int sx     = (x+sshift)%rd; | ||||||
|      |      | ||||||
|     // FIXME : This must change where we have a  |  | ||||||
|     // Rotate slice. |  | ||||||
|      |  | ||||||
|     // Document how this works ; why didn't I do this when I first wrote it... |  | ||||||
|     // wrap is whether sshift > rd. |     // wrap is whether sshift > rd. | ||||||
|     //  num is sshift mod rd. |     //  num is sshift mod rd. | ||||||
|     //  |     //  | ||||||
| @@ -366,9 +385,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); |     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); | ||||||
|     else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |     else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||||
|    |    | ||||||
|    |  | ||||||
|   } |   } | ||||||
|   return ret; |  | ||||||
| } | } | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|  |  | ||||||
|  |  | ||||||
|   if ( !comm_dim ) { |   if ( !comm_dim ) { | ||||||
|     //    std::cout << "Cshift_local" <<std::endl; |     //std::cout << "CSHIFT: Cshift_local" <<std::endl; | ||||||
|     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding |     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding | ||||||
|   } else if ( splice_dim ) { |   } else if ( splice_dim ) { | ||||||
|     //    std::cout << "Cshift_comms_simd" <<std::endl; |     //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift); |     Cshift_comms_simd(ret,rhs,dimension,shift); | ||||||
|   } else { |   } else { | ||||||
|     //    std::cout << "Cshift_comms" <<std::endl; |     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms(ret,rhs,dimension,shift); |     Cshift_comms(ret,rhs,dimension,shift); | ||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| @@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob | |||||||
|   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); |   sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even); | ||||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); |   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||||
|  |  | ||||||
|  |   //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||||
|   if ( sshift[0] == sshift[1] ) { |   if ( sshift[0] == sshift[1] ) { | ||||||
|  |     //std::cout << "Single pass Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x3); |     Cshift_comms_simd(ret,rhs,dimension,shift,0x3); | ||||||
|   } else { |   } else { | ||||||
|  |     //std::cout << "Two pass Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes |     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration |     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration | ||||||
|   } |   } | ||||||
| @@ -175,6 +178,10 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   int simd_layout     = grid->_simd_layout[dimension]; |   int simd_layout     = grid->_simd_layout[dimension]; | ||||||
|   int comm_dim        = grid->_processors[dimension] >1 ; |   int comm_dim        = grid->_processors[dimension] >1 ; | ||||||
|  |  | ||||||
|  |   //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd | ||||||
|  |   //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout  | ||||||
|  |   //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; | ||||||
|  |  | ||||||
|   assert(comm_dim==1); |   assert(comm_dim==1); | ||||||
|   assert(simd_layout==2); |   assert(simd_layout==2); | ||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   | |||||||
| @@ -244,19 +244,11 @@ namespace Grid { | |||||||
|  |  | ||||||
|   template<class sobj,class vobj> strong_inline |   template<class sobj,class vobj> strong_inline | ||||||
|   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ |   RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){ | ||||||
|     ret.checkerboard = x.checkerboard; |     return axpy_norm_fast(ret,a,x,y); | ||||||
|     conformable(ret,x); |  | ||||||
|     conformable(x,y); |  | ||||||
|     axpy(ret,a,x,y); |  | ||||||
|     return norm2(ret); |  | ||||||
|   } |   } | ||||||
|   template<class sobj,class vobj> strong_inline |   template<class sobj,class vobj> strong_inline | ||||||
|   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ |   RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){ | ||||||
|     ret.checkerboard = x.checkerboard; |     return axpby_norm_fast(ret,a,b,x,y); | ||||||
|     conformable(ret,x); |  | ||||||
|     conformable(x,y); |  | ||||||
|     axpby(ret,a,b,x,y); |  | ||||||
|     return norm2(ret); // FIXME implement parallel norm in ss loop |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -257,7 +257,40 @@ public: | |||||||
|     }  	 |     }  	 | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   Lattice(Lattice&& r){ // move constructor | ||||||
|  |     _grid = r._grid; | ||||||
|  |     checkerboard = r.checkerboard; | ||||||
|  |     _odata=std::move(r._odata); | ||||||
|  |   } | ||||||
|    |    | ||||||
|  |   inline Lattice<vobj> & operator = (Lattice<vobj> && r) | ||||||
|  |   { | ||||||
|  |     _grid        = r._grid; | ||||||
|  |     checkerboard = r.checkerboard; | ||||||
|  |     _odata       =std::move(r._odata); | ||||||
|  |     return *this; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   inline Lattice<vobj> & operator = (const Lattice<vobj> & r){ | ||||||
|  |     _grid        = r._grid; | ||||||
|  |     checkerboard = r.checkerboard; | ||||||
|  |     _odata.resize(_grid->oSites());// essential | ||||||
|  |      | ||||||
|  |     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|  |       _odata[ss]=r._odata[ss]; | ||||||
|  |     }  	 | ||||||
|  |     return *this; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ | ||||||
|  |     this->checkerboard = r.checkerboard; | ||||||
|  |     conformable(*this,r); | ||||||
|  |      | ||||||
|  |     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|  |       this->_odata[ss]=r._odata[ss]; | ||||||
|  |     } | ||||||
|  |     return *this; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   virtual ~Lattice(void) = default; |   virtual ~Lattice(void) = default; | ||||||
|      |      | ||||||
| @@ -277,15 +310,6 @@ public: | |||||||
|     return *this; |     return *this; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ |  | ||||||
|     this->checkerboard = r.checkerboard; |  | ||||||
|     conformable(*this,r); |  | ||||||
|      |  | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ |  | ||||||
|       this->_odata[ss]=r._odata[ss]; |  | ||||||
|     } |  | ||||||
|     return *this; |  | ||||||
|   } |  | ||||||
|    |    | ||||||
|   // *=,+=,-= operators inherit behvour from correspond */+/- operation |   // *=,+=,-= operators inherit behvour from correspond */+/- operation | ||||||
|   template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { |   template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) { | ||||||
|   | |||||||
| @@ -179,7 +179,7 @@ namespace Grid { | |||||||
|       return ret; |       return ret; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| #define DECLARE_RELATIONAL(op,functor) \ | #define DECLARE_RELATIONAL_EQ(op,functor) \ | ||||||
|   template<class vsimd,IfSimd<vsimd> = 0>\ |   template<class vsimd,IfSimd<vsimd> = 0>\ | ||||||
|     inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ |     inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\ | ||||||
|     {\ |     {\ | ||||||
| @@ -198,11 +198,6 @@ namespace Grid { | |||||||
|       typedef typename vsimd::scalar_type scalar;\ |       typedef typename vsimd::scalar_type scalar;\ | ||||||
|       return Comparison(functor<scalar,scalar>(),lhs,rhs);\ |       return Comparison(functor<scalar,scalar>(),lhs,rhs);\ | ||||||
|     }\ |     }\ | ||||||
|   template<class vsimd>\ |  | ||||||
|     inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ |  | ||||||
|     {									\ |  | ||||||
|       return lhs._internal op rhs._internal;				\ |  | ||||||
|     }									\ |  | ||||||
|   template<class vsimd>\ |   template<class vsimd>\ | ||||||
|     inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ |     inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \ | ||||||
|     {									\ |     {									\ | ||||||
| @@ -212,14 +207,21 @@ namespace Grid { | |||||||
|     inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ |     inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \ | ||||||
|     {									\ |     {									\ | ||||||
|       return lhs op rhs._internal;					\ |       return lhs op rhs._internal;					\ | ||||||
|     }									 |     }									\ | ||||||
|  |  | ||||||
|  | #define DECLARE_RELATIONAL(op,functor) \ | ||||||
|  |   DECLARE_RELATIONAL_EQ(op,functor)    \ | ||||||
|  |   template<class vsimd>\ | ||||||
|  |     inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\ | ||||||
|  |     {									\ | ||||||
|  |       return lhs._internal op rhs._internal;				\ | ||||||
|  |     }									 | ||||||
|  |  | ||||||
| DECLARE_RELATIONAL(<,slt); | DECLARE_RELATIONAL(<,slt); | ||||||
| DECLARE_RELATIONAL(<=,sle); | DECLARE_RELATIONAL(<=,sle); | ||||||
| DECLARE_RELATIONAL(>,sgt); | DECLARE_RELATIONAL(>,sgt); | ||||||
| DECLARE_RELATIONAL(>=,sge); | DECLARE_RELATIONAL(>=,sge); | ||||||
| DECLARE_RELATIONAL(==,seq); | DECLARE_RELATIONAL_EQ(==,seq); | ||||||
| DECLARE_RELATIONAL(!=,sne); | DECLARE_RELATIONAL(!=,sne); | ||||||
|  |  | ||||||
| #undef DECLARE_RELATIONAL | #undef DECLARE_RELATIONAL | ||||||
|   | |||||||
| @@ -52,23 +52,5 @@ namespace Grid { | |||||||
|       } |       } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     // LatticeCoordinate(); |  | ||||||
|     // FIXME for debug; deprecate this; made obscelete by  |  | ||||||
|     template<class vobj> void lex_sites(Lattice<vobj> &l){ |  | ||||||
|       Real *v_ptr = (Real *)&l._odata[0]; |  | ||||||
|       size_t o_len = l._grid->oSites(); |  | ||||||
|       size_t v_len = sizeof(vobj)/sizeof(vRealF); |  | ||||||
|       size_t vec_len = vRealF::Nsimd(); |  | ||||||
|  |  | ||||||
|       for(int i=0;i<o_len;i++){ |  | ||||||
| 	for(int j=0;j<v_len;j++){ |  | ||||||
|           for(int vv=0;vv<vec_len;vv+=2){ |  | ||||||
| 	    v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500; |  | ||||||
| 	    v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500; |  | ||||||
| 	  } |  | ||||||
| 	}} |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -33,7 +33,7 @@ namespace Grid { | |||||||
|   // Deterministic Reduction operations |   // Deterministic Reduction operations | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ | ||||||
|   ComplexD nrm = innerProduct(arg,arg); |   auto nrm = innerProduct(arg,arg); | ||||||
|   return std::real(nrm);  |   return std::real(nrm);  | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -43,32 +43,85 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ | |||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type; |   typedef typename vobj::scalar_type scalar_type; | ||||||
|   typedef typename vobj::vector_typeD vector_type; |   typedef typename vobj::vector_typeD vector_type; | ||||||
|   scalar_type  nrm; |  | ||||||
|    |  | ||||||
|   GridBase *grid = left._grid; |   GridBase *grid = left._grid; | ||||||
|  |   const int pad = 8; | ||||||
|  |  | ||||||
|   std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize()); |   ComplexD  inner; | ||||||
|  |   Vector<ComplexD> sumarray(grid->SumArraySize()*pad); | ||||||
|  |  | ||||||
|   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ |   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||||
|     int nwork, mywork, myoff; |     int nwork, mywork, myoff; | ||||||
|     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); |     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||||
|      |      | ||||||
|     decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation |     decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation | ||||||
|     for(int ss=myoff;ss<mywork+myoff; ss++){ |     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||||
|       vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]); |       vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]); | ||||||
|     } |     } | ||||||
|     sumarray[thr]=TensorRemove(vnrm) ; |     // All threads sum across SIMD; reduce serial work at end | ||||||
|  |     // one write per cacheline with streaming store | ||||||
|  |     ComplexD tmp = Reduce(TensorRemove(vinner)) ; | ||||||
|  |     vstream(sumarray[thr*pad],tmp); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   vector_type vvnrm; vvnrm=zero;  // sum across threads |   inner=0.0; | ||||||
|   for(int i=0;i<grid->SumArraySize();i++){ |   for(int i=0;i<grid->SumArraySize();i++){ | ||||||
|     vvnrm = vvnrm+sumarray[i]; |     inner = inner+sumarray[i*pad]; | ||||||
|   }  |   }  | ||||||
|   nrm = Reduce(vvnrm);// sum across simd |   right._grid->GlobalSum(inner); | ||||||
|   right._grid->GlobalSum(nrm); |   return inner; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | ///////////////////////// | ||||||
|  | // Fast axpby_norm | ||||||
|  | // z = a x + b y | ||||||
|  | // return norm z | ||||||
|  | ///////////////////////// | ||||||
|  | template<class sobj,class vobj> strong_inline RealD  | ||||||
|  | axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)  | ||||||
|  | { | ||||||
|  |   sobj one(1.0); | ||||||
|  |   return axpby_norm_fast(z,a,one,x,y); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class sobj,class vobj> strong_inline RealD  | ||||||
|  | axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)  | ||||||
|  | { | ||||||
|  |   const int pad = 8; | ||||||
|  |   z.checkerboard = x.checkerboard; | ||||||
|  |   conformable(z,x); | ||||||
|  |   conformable(x,y); | ||||||
|  |  | ||||||
|  |   typedef typename vobj::scalar_type scalar_type; | ||||||
|  |   typedef typename vobj::vector_typeD vector_type; | ||||||
|  |   RealD  nrm; | ||||||
|  |    | ||||||
|  |   GridBase *grid = x._grid; | ||||||
|  |    | ||||||
|  |   Vector<RealD> sumarray(grid->SumArraySize()*pad); | ||||||
|  |    | ||||||
|  |   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||||
|  |     int nwork, mywork, myoff; | ||||||
|  |     GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff); | ||||||
|  |      | ||||||
|  |     // private to thread; sub summation | ||||||
|  |     decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;  | ||||||
|  |     for(int ss=myoff;ss<mywork+myoff; ss++){ | ||||||
|  |       vobj tmp = a*x._odata[ss]+b*y._odata[ss]; | ||||||
|  |       vnrm = vnrm + innerProductD(tmp,tmp); | ||||||
|  |       vstream(z._odata[ss],tmp); | ||||||
|  |     } | ||||||
|  |     vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   nrm = 0.0; // sum across threads; linear in thread count but fast | ||||||
|  |   for(int i=0;i<grid->SumArraySize();i++){ | ||||||
|  |     nrm = nrm+sumarray[i*pad]; | ||||||
|  |   }  | ||||||
|  |   z._grid->GlobalSum(nrm); | ||||||
|   return nrm;  |   return nrm;  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |   | ||||||
| template<class Op,class T1> | template<class Op,class T1> | ||||||
| inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) | inline auto sum(const LatticeUnaryExpression<Op,T1> & expr) | ||||||
|   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object |   ->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object | ||||||
|   | |||||||
| @@ -158,10 +158,19 @@ namespace Grid { | |||||||
|       // tens of seconds per trajectory so this is clean in all reasonable cases, |       // tens of seconds per trajectory so this is clean in all reasonable cases, | ||||||
|       // and margin of safety is orders of magnitude. |       // and margin of safety is orders of magnitude. | ||||||
|       // We could hack Sitmo to skip in the higher order words of state if necessary |       // We could hack Sitmo to skip in the higher order words of state if necessary | ||||||
|  |       // | ||||||
|  |       // Replace with 2^30 ; avoid problem on large volumes | ||||||
|  |       // | ||||||
|       ///////////////////////////////////////////////////////////////////////////////////// |       ///////////////////////////////////////////////////////////////////////////////////// | ||||||
|       //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init |       //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init | ||||||
|  |       const int shift = 30; | ||||||
|  |  | ||||||
|       uint64_t skip = site; |       uint64_t skip = site; | ||||||
|       skip = skip<<40; |  | ||||||
|  |       skip = skip<<shift; | ||||||
|  |  | ||||||
|  |       assert((skip >> shift)==site); // check for overflow | ||||||
|  |  | ||||||
|       eng.discard(skip); |       eng.discard(skip); | ||||||
|       //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; |       //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; | ||||||
|     }  |     }  | ||||||
|   | |||||||
| @@ -599,6 +599,51 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in) | |||||||
|     extract1(in_vobj, out_ptrs, 0); |     extract1(in_vobj, out_ptrs, 0); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<typename vobj, typename sobj> | ||||||
|  | typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type  | ||||||
|  | unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in) | ||||||
|  | { | ||||||
|  |  | ||||||
|  |   typedef typename vobj::vector_type vtype; | ||||||
|  |    | ||||||
|  |   GridBase* in_grid = in._grid; | ||||||
|  |   out.resize(in_grid->lSites()); | ||||||
|  |    | ||||||
|  |   int ndim = in_grid->Nd(); | ||||||
|  |   int in_nsimd = vtype::Nsimd(); | ||||||
|  |  | ||||||
|  |   std::vector<std::vector<int> > in_icoor(in_nsimd); | ||||||
|  |        | ||||||
|  |   for(int lane=0; lane < in_nsimd; lane++){ | ||||||
|  |     in_icoor[lane].resize(ndim); | ||||||
|  |     in_grid->iCoorFromIindex(in_icoor[lane], lane); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index | ||||||
|  |     //Assemble vector of pointers to output elements | ||||||
|  |     std::vector<sobj*> out_ptrs(in_nsimd); | ||||||
|  |  | ||||||
|  |     std::vector<int> in_ocoor(ndim); | ||||||
|  |     in_grid->oCoorFromOindex(in_ocoor, in_oidx); | ||||||
|  |  | ||||||
|  |     std::vector<int> lcoor(in_grid->Nd()); | ||||||
|  |        | ||||||
|  |     for(int lane=0; lane < in_nsimd; lane++){ | ||||||
|  |       for(int mu=0;mu<ndim;mu++) | ||||||
|  | 	lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu]; | ||||||
|  |  | ||||||
|  |       int lex; | ||||||
|  |       Lexicographic::IndexFromCoorReversed(lcoor, lex, in_grid->_ldimensions); | ||||||
|  |       out_ptrs[lane] = &out[lex]; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     //Unpack into those ptrs | ||||||
|  |     const vobj & in_vobj = in._odata[in_oidx]; | ||||||
|  |     extract1(in_vobj, out_ptrs, 0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order | //Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order | ||||||
| template<typename vobj, typename sobj> | template<typename vobj, typename sobj> | ||||||
| typename std::enable_if<isSIMDvectorized<vobj>::value  | typename std::enable_if<isSIMDvectorized<vobj>::value  | ||||||
| @@ -648,10 +693,59 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out) | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<typename vobj, typename sobj> | ||||||
|  | typename std::enable_if<isSIMDvectorized<vobj>::value  | ||||||
|  |                     && !isSIMDvectorized<sobj>::value, void>::type  | ||||||
|  | vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out) | ||||||
|  | { | ||||||
|  |  | ||||||
|  |   typedef typename vobj::vector_type vtype; | ||||||
|  |    | ||||||
|  |   GridBase* grid = out._grid; | ||||||
|  |   assert(in.size()==grid->lSites()); | ||||||
|  |    | ||||||
|  |   int ndim     = grid->Nd(); | ||||||
|  |   int nsimd    = vtype::Nsimd(); | ||||||
|  |  | ||||||
|  |   std::vector<std::vector<int> > icoor(nsimd); | ||||||
|  |        | ||||||
|  |   for(int lane=0; lane < nsimd; lane++){ | ||||||
|  |     icoor[lane].resize(ndim); | ||||||
|  |     grid->iCoorFromIindex(icoor[lane],lane); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index | ||||||
|  |     //Assemble vector of pointers to output elements | ||||||
|  |     std::vector<sobj*> ptrs(nsimd); | ||||||
|  |  | ||||||
|  |     std::vector<int> ocoor(ndim); | ||||||
|  |     grid->oCoorFromOindex(ocoor, oidx); | ||||||
|  |  | ||||||
|  |     std::vector<int> lcoor(grid->Nd()); | ||||||
|  |        | ||||||
|  |     for(int lane=0; lane < nsimd; lane++){ | ||||||
|  |  | ||||||
|  |       for(int mu=0;mu<ndim;mu++){ | ||||||
|  | 	lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu]; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       int lex; | ||||||
|  |       Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions); | ||||||
|  |       ptrs[lane] = &in[lex]; | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     //pack from those ptrs | ||||||
|  |     vobj vecobj; | ||||||
|  |     merge1(vecobj, ptrs, 0); | ||||||
|  |     out._odata[oidx] = vecobj;  | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| //Convert a Lattice from one precision to another | //Convert a Lattice from one precision to another | ||||||
| template<class VobjOut, class VobjIn> | template<class VobjOut, class VobjIn> | ||||||
| void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | ||||||
|   assert(out._grid->Nd() == in._grid->Nd()); |   assert(out._grid->Nd() == in._grid->Nd()); | ||||||
|  |   assert(out._grid->FullDimensions() == in._grid->FullDimensions()); | ||||||
|   out.checkerboard = in.checkerboard; |   out.checkerboard = in.checkerboard; | ||||||
|   GridBase *in_grid=in._grid; |   GridBase *in_grid=in._grid; | ||||||
|   GridBase *out_grid = out._grid; |   GridBase *out_grid = out._grid; | ||||||
|   | |||||||
| @@ -91,7 +91,7 @@ class BinaryIO { | |||||||
|     typedef typename vobj::scalar_object sobj; |     typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
|     GridBase *grid = lat._grid; |     GridBase *grid = lat._grid; | ||||||
|     int lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     std::vector<sobj> scalardata(lsites);  |     std::vector<sobj> scalardata(lsites);  | ||||||
|     unvectorizeToLexOrdArray(scalardata,lat);     |     unvectorizeToLexOrdArray(scalardata,lat);     | ||||||
| @@ -110,11 +110,11 @@ class BinaryIO { | |||||||
|       lsites = 1; |       lsites = 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #pragma omp parallel | PARALLEL_REGION | ||||||
|     { |     { | ||||||
|       uint32_t nersc_csum_thr = 0; |       uint32_t nersc_csum_thr = 0; | ||||||
|  |  | ||||||
|       #pragma omp for | PARALLEL_FOR_LOOP_INTERN | ||||||
|       for (uint64_t local_site = 0; local_site < lsites; local_site++) |       for (uint64_t local_site = 0; local_site < lsites; local_site++) | ||||||
|       { |       { | ||||||
|         uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; |         uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; | ||||||
| @@ -124,7 +124,7 @@ class BinaryIO { | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       #pragma omp critical | PARALLEL_CRITICAL | ||||||
|       { |       { | ||||||
|         nersc_csum += nersc_csum_thr; |         nersc_csum += nersc_csum_thr; | ||||||
|       } |       } | ||||||
| @@ -146,21 +146,23 @@ class BinaryIO { | |||||||
|     std::vector<int> local_start =grid->LocalStarts(); |     std::vector<int> local_start =grid->LocalStarts(); | ||||||
|     std::vector<int> global_vol  =grid->FullDimensions(); |     std::vector<int> global_vol  =grid->FullDimensions(); | ||||||
|  |  | ||||||
| #pragma omp parallel | PARALLEL_REGION | ||||||
|     {  |     {  | ||||||
|       std::vector<int> coor(nd); |       std::vector<int> coor(nd); | ||||||
|       uint32_t scidac_csuma_thr=0; |       uint32_t scidac_csuma_thr=0; | ||||||
|       uint32_t scidac_csumb_thr=0; |       uint32_t scidac_csumb_thr=0; | ||||||
|       uint32_t site_crc=0; |       uint32_t site_crc=0; | ||||||
|  |  | ||||||
| #pragma omp for | PARALLEL_FOR_LOOP_INTERN | ||||||
|       for(uint64_t local_site=0;local_site<lsites;local_site++){ |       for(uint64_t local_site=0;local_site<lsites;local_site++){ | ||||||
|  |  | ||||||
| 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; | 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; | ||||||
|  |  | ||||||
| 	/*  | 	/*  | ||||||
| 	 * Scidac csum  is rather more heavyweight | 	 * Scidac csum  is rather more heavyweight | ||||||
|  | 	 * FIXME -- 128^3 x 256 x 16 will overflow. | ||||||
| 	 */ | 	 */ | ||||||
|  | 	 | ||||||
| 	int global_site; | 	int global_site; | ||||||
|  |  | ||||||
| 	Lexicographic::CoorFromIndex(coor,local_site,local_vol); | 	Lexicographic::CoorFromIndex(coor,local_site,local_vol); | ||||||
| @@ -181,7 +183,7 @@ class BinaryIO { | |||||||
| 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31); | 	scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31); | ||||||
|       } |       } | ||||||
|  |  | ||||||
| #pragma omp critical | PARALLEL_CRITICAL | ||||||
|       { |       { | ||||||
| 	scidac_csuma^= scidac_csuma_thr; | 	scidac_csuma^= scidac_csuma_thr; | ||||||
| 	scidac_csumb^= scidac_csumb_thr; | 	scidac_csumb^= scidac_csumb_thr; | ||||||
| @@ -261,7 +263,7 @@ class BinaryIO { | |||||||
| 			      GridBase *grid, | 			      GridBase *grid, | ||||||
| 			      std::vector<fobj> &iodata, | 			      std::vector<fobj> &iodata, | ||||||
| 			      std::string file, | 			      std::string file, | ||||||
| 			      Integer offset, | 			      uint64_t& offset, | ||||||
| 			      const std::string &format, int control, | 			      const std::string &format, int control, | ||||||
| 			      uint32_t &nersc_csum, | 			      uint32_t &nersc_csum, | ||||||
| 			      uint32_t &scidac_csuma, | 			      uint32_t &scidac_csuma, | ||||||
| @@ -429,14 +431,20 @@ class BinaryIO { | |||||||
|           MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); |           MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl; |         std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl; | ||||||
|         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); |         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); | ||||||
|         assert(ierr == 0); |         assert(ierr == 0); | ||||||
|  |  | ||||||
|         std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl; |         std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl; | ||||||
|         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); |         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); | ||||||
|         assert(ierr == 0); |         assert(ierr == 0); | ||||||
|  |  | ||||||
|  |         MPI_Offset os; | ||||||
|  |         MPI_File_get_position(fh, &os); | ||||||
|  |         MPI_File_get_byte_offset(fh, os, &disp); | ||||||
|  |         offset = disp; | ||||||
|  |  | ||||||
|  |  | ||||||
|         MPI_File_close(&fh); |         MPI_File_close(&fh); | ||||||
|         MPI_Type_free(&fileArray); |         MPI_Type_free(&fileArray); | ||||||
|         MPI_Type_free(&localArray); |         MPI_Type_free(&localArray); | ||||||
| @@ -446,16 +454,20 @@ class BinaryIO { | |||||||
|       } else {  |       } else {  | ||||||
|  |  | ||||||
|         std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " |         std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : " | ||||||
|                   << iodata.size() * sizeof(fobj) << " bytes" << std::endl; |                   << iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl; | ||||||
|          |          | ||||||
| 	std::ofstream fout;  | 	std::ofstream fout;  | ||||||
| 	fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); | 	fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); | ||||||
| 	try { | 	try { | ||||||
|  | 	  if (offset) { // Must already exist and contain data | ||||||
| 	    fout.open(file,std::ios::binary|std::ios::out|std::ios::in); | 	    fout.open(file,std::ios::binary|std::ios::out|std::ios::in); | ||||||
|  | 	  } else {     // Allow create | ||||||
|  | 	    fout.open(file,std::ios::binary|std::ios::out); | ||||||
|  | 	  } | ||||||
| 	} catch (const std::fstream::failure& exc) { | 	} catch (const std::fstream::failure& exc) { | ||||||
| 	  std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl; | 	  std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl; | ||||||
| 	  std::cout << GridLogError << "Exception description: " << exc.what() << std::endl; | 	  std::cout << GridLogError << "Exception description: " << exc.what() << std::endl; | ||||||
| 	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl; | 	  //	  std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl; | ||||||
| #ifdef USE_MPI_IO | #ifdef USE_MPI_IO | ||||||
| 	  MPI_Abort(MPI_COMM_WORLD,1); | 	  MPI_Abort(MPI_COMM_WORLD,1); | ||||||
| #else | #else | ||||||
| @@ -489,6 +501,7 @@ class BinaryIO { | |||||||
| 	  exit(1); | 	  exit(1); | ||||||
| #endif | #endif | ||||||
| 	} | 	} | ||||||
|  |   offset  = fout.tellp(); | ||||||
| 	fout.close(); | 	fout.close(); | ||||||
|       } |       } | ||||||
|       timer.Stop(); |       timer.Stop(); | ||||||
| @@ -523,7 +536,7 @@ class BinaryIO { | |||||||
|   static inline void readLatticeObject(Lattice<vobj> &Umu, |   static inline void readLatticeObject(Lattice<vobj> &Umu, | ||||||
| 				       std::string file, | 				       std::string file, | ||||||
| 				       munger munge, | 				       munger munge, | ||||||
| 				       Integer offset, | 				       uint64_t offset, | ||||||
| 				       const std::string &format, | 				       const std::string &format, | ||||||
| 				       uint32_t &nersc_csum, | 				       uint32_t &nersc_csum, | ||||||
| 				       uint32_t &scidac_csuma, | 				       uint32_t &scidac_csuma, | ||||||
| @@ -533,7 +546,7 @@ class BinaryIO { | |||||||
|     typedef typename vobj::Realified::scalar_type word;    word w=0; |     typedef typename vobj::Realified::scalar_type word;    word w=0; | ||||||
|  |  | ||||||
|     GridBase *grid = Umu._grid; |     GridBase *grid = Umu._grid; | ||||||
|     int lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     std::vector<sobj> scalardata(lsites);  |     std::vector<sobj> scalardata(lsites);  | ||||||
|     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here |     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here | ||||||
| @@ -544,7 +557,7 @@ class BinaryIO { | |||||||
|     GridStopWatch timer;  |     GridStopWatch timer;  | ||||||
|     timer.Start(); |     timer.Start(); | ||||||
|  |  | ||||||
|     parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]); |     parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]); | ||||||
|  |  | ||||||
|     vectorizeFromLexOrdArray(scalardata,Umu);     |     vectorizeFromLexOrdArray(scalardata,Umu);     | ||||||
|     grid->Barrier(); |     grid->Barrier(); | ||||||
| @@ -560,7 +573,7 @@ class BinaryIO { | |||||||
|     static inline void writeLatticeObject(Lattice<vobj> &Umu, |     static inline void writeLatticeObject(Lattice<vobj> &Umu, | ||||||
| 					  std::string file, | 					  std::string file, | ||||||
| 					  munger munge, | 					  munger munge, | ||||||
| 					  Integer offset, | 					  uint64_t offset, | ||||||
| 					  const std::string &format, | 					  const std::string &format, | ||||||
| 					  uint32_t &nersc_csum, | 					  uint32_t &nersc_csum, | ||||||
| 					  uint32_t &scidac_csuma, | 					  uint32_t &scidac_csuma, | ||||||
| @@ -569,7 +582,7 @@ class BinaryIO { | |||||||
|     typedef typename vobj::scalar_object sobj; |     typedef typename vobj::scalar_object sobj; | ||||||
|     typedef typename vobj::Realified::scalar_type word;    word w=0; |     typedef typename vobj::Realified::scalar_type word;    word w=0; | ||||||
|     GridBase *grid = Umu._grid; |     GridBase *grid = Umu._grid; | ||||||
|     int lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     std::vector<sobj> scalardata(lsites);  |     std::vector<sobj> scalardata(lsites);  | ||||||
|     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here |     std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here | ||||||
| @@ -580,7 +593,7 @@ class BinaryIO { | |||||||
|     GridStopWatch timer; timer.Start(); |     GridStopWatch timer; timer.Start(); | ||||||
|     unvectorizeToLexOrdArray(scalardata,Umu);     |     unvectorizeToLexOrdArray(scalardata,Umu);     | ||||||
|  |  | ||||||
|     parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); |     parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]); | ||||||
|  |  | ||||||
|     grid->Barrier(); |     grid->Barrier(); | ||||||
|     timer.Stop(); |     timer.Stop(); | ||||||
| @@ -597,7 +610,7 @@ class BinaryIO { | |||||||
|   static inline void readRNG(GridSerialRNG &serial, |   static inline void readRNG(GridSerialRNG &serial, | ||||||
| 			     GridParallelRNG &parallel, | 			     GridParallelRNG &parallel, | ||||||
| 			     std::string file, | 			     std::string file, | ||||||
| 			     Integer offset, | 			     uint64_t offset, | ||||||
| 			     uint32_t &nersc_csum, | 			     uint32_t &nersc_csum, | ||||||
| 			     uint32_t &scidac_csuma, | 			     uint32_t &scidac_csuma, | ||||||
| 			     uint32_t &scidac_csumb) | 			     uint32_t &scidac_csumb) | ||||||
| @@ -610,8 +623,8 @@ class BinaryIO { | |||||||
|     std::string format = "IEEE32BIG"; |     std::string format = "IEEE32BIG"; | ||||||
|  |  | ||||||
|     GridBase *grid = parallel._grid; |     GridBase *grid = parallel._grid; | ||||||
|     int gsites = grid->gSites(); |     uint64_t gsites = grid->gSites(); | ||||||
|     int lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     uint32_t nersc_csum_tmp   = 0; |     uint32_t nersc_csum_tmp   = 0; | ||||||
|     uint32_t scidac_csuma_tmp = 0; |     uint32_t scidac_csuma_tmp = 0; | ||||||
| @@ -626,7 +639,7 @@ class BinaryIO { | |||||||
| 	     nersc_csum,scidac_csuma,scidac_csumb); | 	     nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |  | ||||||
|     timer.Start(); |     timer.Start(); | ||||||
|     parallel_for(int lidx=0;lidx<lsites;lidx++){ |     parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ | ||||||
|       std::vector<RngStateType> tmp(RngStateCount); |       std::vector<RngStateType> tmp(RngStateCount); | ||||||
|       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); |       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); | ||||||
|       parallel.SetState(tmp,lidx); |       parallel.SetState(tmp,lidx); | ||||||
| @@ -659,7 +672,7 @@ class BinaryIO { | |||||||
|   static inline void writeRNG(GridSerialRNG &serial, |   static inline void writeRNG(GridSerialRNG &serial, | ||||||
| 			      GridParallelRNG &parallel, | 			      GridParallelRNG &parallel, | ||||||
| 			      std::string file, | 			      std::string file, | ||||||
| 			      Integer offset, | 			      uint64_t offset, | ||||||
| 			      uint32_t &nersc_csum, | 			      uint32_t &nersc_csum, | ||||||
| 			      uint32_t &scidac_csuma, | 			      uint32_t &scidac_csuma, | ||||||
| 			      uint32_t &scidac_csumb) | 			      uint32_t &scidac_csumb) | ||||||
| @@ -670,8 +683,8 @@ class BinaryIO { | |||||||
|     typedef std::array<RngStateType,RngStateCount> RNGstate; |     typedef std::array<RngStateType,RngStateCount> RNGstate; | ||||||
|  |  | ||||||
|     GridBase *grid = parallel._grid; |     GridBase *grid = parallel._grid; | ||||||
|     int gsites = grid->gSites(); |     uint64_t gsites = grid->gSites(); | ||||||
|     int lsites = grid->lSites(); |     uint64_t lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     uint32_t nersc_csum_tmp; |     uint32_t nersc_csum_tmp; | ||||||
|     uint32_t scidac_csuma_tmp; |     uint32_t scidac_csuma_tmp; | ||||||
| @@ -684,7 +697,7 @@ class BinaryIO { | |||||||
|  |  | ||||||
|     timer.Start(); |     timer.Start(); | ||||||
|     std::vector<RNGstate> iodata(lsites); |     std::vector<RNGstate> iodata(lsites); | ||||||
|     parallel_for(int lidx=0;lidx<lsites;lidx++){ |     parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){ | ||||||
|       std::vector<RngStateType> tmp(RngStateCount); |       std::vector<RngStateType> tmp(RngStateCount); | ||||||
|       parallel.GetState(tmp,lidx); |       parallel.GetState(tmp,lidx); | ||||||
|       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); |       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); | ||||||
| @@ -693,7 +706,6 @@ class BinaryIO { | |||||||
|  |  | ||||||
|     IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, |     IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC, | ||||||
| 	     nersc_csum,scidac_csuma,scidac_csumb); | 	     nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |  | ||||||
|     iodata.resize(1); |     iodata.resize(1); | ||||||
|     { |     { | ||||||
|       std::vector<RngStateType> tmp(RngStateCount); |       std::vector<RngStateType> tmp(RngStateCount); | ||||||
|   | |||||||
| @@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO { | |||||||
|    { |    { | ||||||
|      filename= _filename; |      filename= _filename; | ||||||
|      File = fopen(filename.c_str(), "r"); |      File = fopen(filename.c_str(), "r"); | ||||||
|  |      if (File == nullptr) | ||||||
|  |      { | ||||||
|  |        std::cerr << "cannot open file '" << filename << "'" << std::endl; | ||||||
|  |        abort(); | ||||||
|  |      } | ||||||
|      LimeR = limeCreateReader(File); |      LimeR = limeCreateReader(File); | ||||||
|    } |    } | ||||||
|    ///////////////////////////////////////////// |    ///////////////////////////////////////////// | ||||||
| @@ -248,7 +253,6 @@ class GridLimeReader : public BinaryIO { | |||||||
|   template<class serialisable_object> |   template<class serialisable_object> | ||||||
|   void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name) |   void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name) | ||||||
|   { |   { | ||||||
|     std::string xmlstring; |  | ||||||
|     // should this be a do while; can we miss a first record?? |     // should this be a do while; can we miss a first record?? | ||||||
|     while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {  |     while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {  | ||||||
|  |  | ||||||
| @@ -262,7 +266,8 @@ class GridLimeReader : public BinaryIO { | |||||||
| 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);     | 	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);     | ||||||
| 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl; | 	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl; | ||||||
|  |  | ||||||
| 	XmlReader RD(&xmlc[0],""); |   std::string xmlstring(&xmlc[0]); | ||||||
|  | 	XmlReader RD(xmlstring, true, ""); | ||||||
| 	read(RD,object_name,object); | 	read(RD,object_name,object); | ||||||
| 	return; | 	return; | ||||||
|       } |       } | ||||||
| @@ -272,8 +277,10 @@ class GridLimeReader : public BinaryIO { | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| class GridLimeWriter : public BinaryIO { | class GridLimeWriter : public BinaryIO  | ||||||
|  | { | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|    /////////////////////////////////////////////////// |    /////////////////////////////////////////////////// | ||||||
|    // FIXME: format for RNG? Now just binary out instead |    // FIXME: format for RNG? Now just binary out instead | ||||||
|    // FIXME: collective calls or not ? |    // FIXME: collective calls or not ? | ||||||
| @@ -282,17 +289,24 @@ class GridLimeWriter : public BinaryIO { | |||||||
|    FILE       *File; |    FILE       *File; | ||||||
|    LimeWriter *LimeW; |    LimeWriter *LimeW; | ||||||
|    std::string filename; |    std::string filename; | ||||||
|  |    bool        boss_node; | ||||||
|  |    GridLimeWriter( bool isboss = true) { | ||||||
|  |      boss_node = isboss; | ||||||
|  |    } | ||||||
|    void open(const std::string &_filename) {  |    void open(const std::string &_filename) {  | ||||||
|      filename= _filename; |      filename= _filename; | ||||||
|  |      if ( boss_node ) { | ||||||
|        File = fopen(filename.c_str(), "w"); |        File = fopen(filename.c_str(), "w"); | ||||||
|        LimeW = limeCreateWriter(File); assert(LimeW != NULL ); |        LimeW = limeCreateWriter(File); assert(LimeW != NULL ); | ||||||
|      } |      } | ||||||
|  |    } | ||||||
|    ///////////////////////////////////////////// |    ///////////////////////////////////////////// | ||||||
|    // Close the file |    // Close the file | ||||||
|    ///////////////////////////////////////////// |    ///////////////////////////////////////////// | ||||||
|    void close(void) { |    void close(void) { | ||||||
|  |      if ( boss_node ) { | ||||||
|        fclose(File); |        fclose(File); | ||||||
|  |      } | ||||||
|      //  limeDestroyWriter(LimeW); |      //  limeDestroyWriter(LimeW); | ||||||
|    } |    } | ||||||
|   /////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////// | ||||||
| @@ -300,10 +314,12 @@ class GridLimeWriter : public BinaryIO { | |||||||
|   /////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////// | ||||||
|   int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize) |   int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize) | ||||||
|   { |   { | ||||||
|  |     if ( boss_node ) { | ||||||
|       LimeRecordHeader *h; |       LimeRecordHeader *h; | ||||||
|       h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize); |       h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize); | ||||||
|       assert(limeWriteRecordHeader(h, LimeW) >= 0); |       assert(limeWriteRecordHeader(h, LimeW) >= 0); | ||||||
|       limeDestroyHeader(h); |       limeDestroyHeader(h); | ||||||
|  |     } | ||||||
|     return LIME_SUCCESS; |     return LIME_SUCCESS; | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////////// |   //////////////////////////////////////////// | ||||||
| @@ -312,6 +328,7 @@ class GridLimeWriter : public BinaryIO { | |||||||
|   template<class serialisable_object> |   template<class serialisable_object> | ||||||
|   void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name) |   void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name) | ||||||
|   { |   { | ||||||
|  |     if ( boss_node ) { | ||||||
|       std::string xmlstring; |       std::string xmlstring; | ||||||
|       { |       { | ||||||
| 	XmlWriter WR("",""); | 	XmlWriter WR("",""); | ||||||
| @@ -329,48 +346,81 @@ class GridLimeWriter : public BinaryIO { | |||||||
|       err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0); |       err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0); | ||||||
|       err=limeWriterCloseRecord(LimeW);                       assert(err>=0); |       err=limeWriterCloseRecord(LimeW);                       assert(err>=0); | ||||||
|       limeDestroyHeader(h); |       limeDestroyHeader(h); | ||||||
|     //    std::cout << " File offset is now"<<ftello(File) << std::endl; |  | ||||||
|     } |     } | ||||||
|   //////////////////////////////////////////// |   } | ||||||
|  |   //////////////////////////////////////////////////// | ||||||
|   // Write a generic lattice field and csum |   // Write a generic lattice field and csum | ||||||
|   //////////////////////////////////////////// |   // This routine is Collectively called by all nodes | ||||||
|  |   // in communicator used by the field._grid | ||||||
|  |   //////////////////////////////////////////////////// | ||||||
|   template<class vobj> |   template<class vobj> | ||||||
|   void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) |   void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name) | ||||||
|   { |   { | ||||||
|     //////////////////////////////////////////// |  | ||||||
|     // Create record header |  | ||||||
|     //////////////////////////////////////////// |  | ||||||
|     typedef typename vobj::scalar_object sobj; |  | ||||||
|     int err; |  | ||||||
|     uint32_t nersc_csum,scidac_csuma,scidac_csumb; |  | ||||||
|     uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites; |  | ||||||
|     createLimeRecordHeader(record_name, 0, 0, PayloadSize); |  | ||||||
|  |  | ||||||
|     //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl; |  | ||||||
|     //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl; |  | ||||||
|     //    std::cout << "W Payload expected " <<PayloadSize<<std::endl; |  | ||||||
|  |  | ||||||
|     //////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////// | ||||||
|     // NB: FILE and iostream are jointly writing disjoint sequences in the |     // NB: FILE and iostream are jointly writing disjoint sequences in the | ||||||
|     // same file through different file handles (integer units). |     // same file through different file handles (integer units). | ||||||
|     //  |     //  | ||||||
|     // These are both buffered, so why I think this code is right is as follows. |     // These are both buffered, so why I think this code is right is as follows. | ||||||
|     // |     // | ||||||
|     // i)  write record header to FILE *File, telegraphing the size.  |     // i)  write record header to FILE *File, telegraphing the size; flush | ||||||
|     // ii) ftello reads the offset from FILE *File .  |     // ii) ftello reads the offset from FILE *File .  | ||||||
|     // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk. |     // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk. | ||||||
|     //      Closes iostream and flushes. |     //      Closes iostream and flushes. | ||||||
|     // iv) fseek on FILE * to end of this disjoint section. |     // iv) fseek on FILE * to end of this disjoint section. | ||||||
|     //  v) Continue writing scidac record. |     //  v) Continue writing scidac record. | ||||||
|     //////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////// | ||||||
|     uint64_t offset = ftello(File); |      | ||||||
|     //    std::cout << " Writing to offset "<<offset << std::endl; |     GridBase *grid = field._grid; | ||||||
|  |     assert(boss_node == field._grid->IsBoss() ); | ||||||
|  |  | ||||||
|  |     //////////////////////////////////////////// | ||||||
|  |     // Create record header | ||||||
|  |     //////////////////////////////////////////// | ||||||
|  |     typedef typename vobj::scalar_object sobj; | ||||||
|  |     int err; | ||||||
|  |     uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|  |     uint64_t PayloadSize = sizeof(sobj) * grid->_gsites; | ||||||
|  |     if ( boss_node ) { | ||||||
|  |       createLimeRecordHeader(record_name, 0, 0, PayloadSize); | ||||||
|  |       fflush(File); | ||||||
|  |     } | ||||||
|  |      | ||||||
|  |     //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl; | ||||||
|  |     //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl; | ||||||
|  |     //    std::cout << "W Payload expected " <<PayloadSize<<std::endl; | ||||||
|  |  | ||||||
|  |     //////////////////////////////////////////////// | ||||||
|  |     // Check all nodes agree on file position | ||||||
|  |     //////////////////////////////////////////////// | ||||||
|  |     uint64_t offset1; | ||||||
|  |     if ( boss_node ) { | ||||||
|  |       offset1 = ftello(File);     | ||||||
|  |     } | ||||||
|  |     grid->Broadcast(0,(void *)&offset1,sizeof(offset1)); | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////// | ||||||
|  |     // The above is collective. Write by other means into the binary record | ||||||
|  |     /////////////////////////////////////////// | ||||||
|     std::string format = getFormatString<vobj>(); |     std::string format = getFormatString<vobj>(); | ||||||
|     BinarySimpleMunger<sobj,sobj> munge; |     BinarySimpleMunger<sobj,sobj> munge; | ||||||
|     BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb); |     BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|     //    fseek(File,0,SEEK_END);    offset = ftello(File);std::cout << " offset now "<<offset << std::endl; |  | ||||||
|     err=limeWriterCloseRecord(LimeW);  assert(err>=0); |  | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////// | ||||||
|  |     // Wind forward and close the record | ||||||
|  |     /////////////////////////////////////////// | ||||||
|  |     if ( boss_node ) { | ||||||
|  |       fseek(File,0,SEEK_END);              | ||||||
|  |       uint64_t offset2 = ftello(File);     //    std::cout << " now at offset "<<offset2 << std::endl; | ||||||
|  |       assert( (offset2-offset1) == PayloadSize); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ///////////////////////////////////////////////////////////// | ||||||
|  |     // Check MPI-2 I/O did what we expect to file | ||||||
|  |     ///////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |     if ( boss_node ) {  | ||||||
|  |       err=limeWriterCloseRecord(LimeW);  assert(err>=0); | ||||||
|  |     } | ||||||
|     //////////////////////////////////////// |     //////////////////////////////////////// | ||||||
|     // Write checksum element, propagating forward from the BinaryIO |     // Write checksum element, propagating forward from the BinaryIO | ||||||
|     // Always pair a checksum with a binary object, and close message |     // Always pair a checksum with a binary object, and close message | ||||||
| @@ -380,21 +430,26 @@ class GridLimeWriter : public BinaryIO { | |||||||
|     std::stringstream streamb; streamb << std::hex << scidac_csumb; |     std::stringstream streamb; streamb << std::hex << scidac_csumb; | ||||||
|     checksum.suma= streama.str(); |     checksum.suma= streama.str(); | ||||||
|     checksum.sumb= streamb.str(); |     checksum.sumb= streamb.str(); | ||||||
|     //    std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl; |     if ( boss_node ) {  | ||||||
|       writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM)); |       writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM)); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| class ScidacWriter : public GridLimeWriter { | class ScidacWriter : public GridLimeWriter { | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|  |   ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss)  { }; | ||||||
|  |  | ||||||
|   template<class SerialisableUserFile> |   template<class SerialisableUserFile> | ||||||
|   void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) |   void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile) | ||||||
|   { |   { | ||||||
|     scidacFile    _scidacFile(grid); |     scidacFile    _scidacFile(grid); | ||||||
|  |     if ( this->boss_node ) { | ||||||
|       writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); |       writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML)); | ||||||
|       writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); |       writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML)); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
|   // Write generic lattice field in scidac format |   // Write generic lattice field in scidac format | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
| @@ -415,9 +470,12 @@ class ScidacWriter : public GridLimeWriter { | |||||||
|     ////////////////////////////////////////////// |     ////////////////////////////////////////////// | ||||||
|     // Fill the Lime file record by record |     // Fill the Lime file record by record | ||||||
|     ////////////////////////////////////////////// |     ////////////////////////////////////////////// | ||||||
|  |     if ( this->boss_node ) { | ||||||
|       writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message  |       writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message  | ||||||
|       writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); |       writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); | ||||||
|       writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); |       writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); | ||||||
|  |     } | ||||||
|  |     // Collective call | ||||||
|     writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum |     writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -485,6 +543,8 @@ class ScidacReader : public GridLimeReader { | |||||||
| class IldgWriter : public ScidacWriter { | class IldgWriter : public ScidacWriter { | ||||||
|  public: |  public: | ||||||
|    |    | ||||||
|  |   IldgWriter(bool isboss) : ScidacWriter(isboss) {}; | ||||||
|  |  | ||||||
|   /////////////////////////////////// |   /////////////////////////////////// | ||||||
|   // A little helper |   // A little helper | ||||||
|   /////////////////////////////////// |   /////////////////////////////////// | ||||||
| @@ -568,7 +628,6 @@ class IldgWriter : public ScidacWriter { | |||||||
|     writeLimeIldgLFN(header.ildg_lfn);                                                 // rec |     writeLimeIldgLFN(header.ildg_lfn);                                                 // rec | ||||||
|     writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum |     writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA));      // Closes message with checksum | ||||||
|     //    limeDestroyWriter(LimeW); |     //    limeDestroyWriter(LimeW); | ||||||
|     fclose(File); |  | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -644,9 +703,11 @@ class IldgReader : public GridLimeReader { | |||||||
|  |  | ||||||
| 	////////////////////////////////// | 	////////////////////////////////// | ||||||
| 	// ILDG format record | 	// ILDG format record | ||||||
|  |  | ||||||
|  |   std::string xmlstring(&xmlc[0]); | ||||||
| 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {  | 	if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {  | ||||||
|  |  | ||||||
| 	  XmlReader RD(&xmlc[0],""); | 	  XmlReader RD(xmlstring, true, ""); | ||||||
| 	  read(RD,"ildgFormat",ildgFormat_); | 	  read(RD,"ildgFormat",ildgFormat_); | ||||||
|  |  | ||||||
| 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG"); | 	  if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG"); | ||||||
| @@ -661,13 +722,13 @@ class IldgReader : public GridLimeReader { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) { | 	if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) { | ||||||
| 	  FieldMetaData_.ildg_lfn = std::string(&xmlc[0]); | 	  FieldMetaData_.ildg_lfn = xmlstring; | ||||||
| 	  found_ildgLFN = 1; | 	  found_ildgLFN = 1; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {  | 	if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {  | ||||||
|  |  | ||||||
| 	  XmlReader RD(&xmlc[0],""); | 	  XmlReader RD(xmlstring, true, ""); | ||||||
| 	  read(RD,"FieldMetaData",FieldMetaData_); | 	  read(RD,"FieldMetaData",FieldMetaData_); | ||||||
|  |  | ||||||
| 	  format = FieldMetaData_.floating_point; | 	  format = FieldMetaData_.floating_point; | ||||||
| @@ -681,18 +742,17 @@ class IldgReader : public GridLimeReader { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {  | 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {  | ||||||
| 	  std::string xmls(&xmlc[0]); |  | ||||||
| 	  // is it a USQCD info field | 	  // is it a USQCD info field | ||||||
| 	  if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {  | 	  if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {  | ||||||
| 	    //	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl; | 	    //	    std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl; | ||||||
| 	    XmlReader RD(&xmlc[0],""); | 	    XmlReader RD(xmlstring, true, ""); | ||||||
| 	    read(RD,"usqcdInfo",usqcdInfo_); | 	    read(RD,"usqcdInfo",usqcdInfo_); | ||||||
| 	    found_usqcdInfo = 1; | 	    found_usqcdInfo = 1; | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {  | 	if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {  | ||||||
| 	  XmlReader RD(&xmlc[0],""); | 	  XmlReader RD(xmlstring, true, ""); | ||||||
| 	  read(RD,"scidacChecksum",scidacChecksum_); | 	  read(RD,"scidacChecksum",scidacChecksum_); | ||||||
| 	  found_scidacChecksum = 1; | 	  found_scidacChecksum = 1; | ||||||
| 	} | 	} | ||||||
|   | |||||||
| @@ -136,8 +136,9 @@ struct scidacRecord : Serializable { | |||||||
| 				  int, typesize, | 				  int, typesize, | ||||||
| 				  int, datacount); | 				  int, datacount); | ||||||
|  |  | ||||||
|   scidacRecord() { version =1.0; } |   scidacRecord() | ||||||
|  |   : version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0) | ||||||
|  |   {} | ||||||
| }; | }; | ||||||
|  |  | ||||||
| //////////////////////// | //////////////////////// | ||||||
|   | |||||||
| @@ -81,18 +81,16 @@ namespace Grid { | |||||||
| 				      std::string, creation_date, | 				      std::string, creation_date, | ||||||
| 				      std::string, archive_date, | 				      std::string, archive_date, | ||||||
| 				      std::string, floating_point); | 				      std::string, floating_point); | ||||||
|       FieldMetaData(void) {  |       // WARNING: non-initialised values might lead to twisted parallel IO | ||||||
| 	nd=4; |       // issues, std::string are fine because they initialise to size 0 | ||||||
| 	dimension.resize(4); |       // as per C++ standard. | ||||||
| 	boundary.resize(4); |       FieldMetaData(void)  | ||||||
| 	scidac_checksuma=0; |       : nd(4), dimension(4,0), boundary(4, ""), data_start(0), | ||||||
| 	scidac_checksumb=0; |       link_trace(0.), plaquette(0.), checksum(0), | ||||||
| 	checksum=0; |       scidac_checksuma(0), scidac_checksumb(0), sequence_number(0) | ||||||
|       } |       {} | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   namespace QCD { |   namespace QCD { | ||||||
|  |  | ||||||
|     using namespace Grid; |     using namespace Grid; | ||||||
|   | |||||||
| @@ -57,7 +57,7 @@ namespace Grid { | |||||||
|       // for the header-reader |       // for the header-reader | ||||||
|       static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field) |       static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field) | ||||||
|       { |       { | ||||||
|       int offset=0; |       uint64_t offset=0; | ||||||
|       std::map<std::string,std::string> header; |       std::map<std::string,std::string> header; | ||||||
|       std::string line; |       std::string line; | ||||||
|  |  | ||||||
| @@ -139,7 +139,7 @@ namespace Grid { | |||||||
|       typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; |       typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField; | ||||||
|  |  | ||||||
|       GridBase *grid = Umu._grid; |       GridBase *grid = Umu._grid; | ||||||
|       int offset = readHeader(file,Umu._grid,header); |       uint64_t offset = readHeader(file,Umu._grid,header); | ||||||
|  |  | ||||||
|       FieldMetaData clone(header); |       FieldMetaData clone(header); | ||||||
|  |  | ||||||
| @@ -236,21 +236,25 @@ namespace Grid { | |||||||
| 	GaugeStatistics(Umu,header); | 	GaugeStatistics(Umu,header); | ||||||
| 	MachineCharacteristics(header); | 	MachineCharacteristics(header); | ||||||
|  |  | ||||||
| 	int offset; | 	uint64_t offset; | ||||||
|    |  | ||||||
| 	truncate(file); |  | ||||||
|  |  | ||||||
| 	// Sod it -- always write 3x3 double | 	// Sod it -- always write 3x3 double | ||||||
| 	header.floating_point = std::string("IEEE64BIG"); | 	header.floating_point = std::string("IEEE64BIG"); | ||||||
| 	header.data_type      = std::string("4D_SU3_GAUGE_3x3"); | 	header.data_type      = std::string("4D_SU3_GAUGE_3x3"); | ||||||
| 	GaugeSimpleUnmunger<fobj3D,sobj> munge; | 	GaugeSimpleUnmunger<fobj3D,sobj> munge; | ||||||
|  | 	if ( grid->IsBoss() ) {  | ||||||
|  | 	  truncate(file); | ||||||
| 	  offset = writeHeader(header,file); | 	  offset = writeHeader(header,file); | ||||||
|  | 	} | ||||||
|  | 	grid->Broadcast(0,(void *)&offset,sizeof(offset)); | ||||||
|  |  | ||||||
| 	uint32_t nersc_csum,scidac_csuma,scidac_csumb; | 	uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
| 	BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point, | 	BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point, | ||||||
| 								  nersc_csum,scidac_csuma,scidac_csumb); | 								  nersc_csum,scidac_csuma,scidac_csumb); | ||||||
| 	header.checksum = nersc_csum; | 	header.checksum = nersc_csum; | ||||||
|  | 	if ( grid->IsBoss() ) {  | ||||||
| 	  writeHeader(header,file); | 	  writeHeader(header,file); | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum " | 	std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum " | ||||||
| 		 <<std::hex<<header.checksum | 		 <<std::hex<<header.checksum | ||||||
| @@ -278,7 +282,7 @@ namespace Grid { | |||||||
| 	header.plaquette=0.0; | 	header.plaquette=0.0; | ||||||
| 	MachineCharacteristics(header); | 	MachineCharacteristics(header); | ||||||
|  |  | ||||||
| 	int offset; | 	uint64_t offset; | ||||||
|    |    | ||||||
| #ifdef RNG_RANLUX | #ifdef RNG_RANLUX | ||||||
| 	header.floating_point = std::string("UINT64"); | 	header.floating_point = std::string("UINT64"); | ||||||
| @@ -293,12 +297,18 @@ namespace Grid { | |||||||
| 	header.data_type      = std::string("SITMO"); | 	header.data_type      = std::string("SITMO"); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | 	if ( grid->IsBoss() ) {  | ||||||
| 	  truncate(file); | 	  truncate(file); | ||||||
| 	  offset = writeHeader(header,file); | 	  offset = writeHeader(header,file); | ||||||
|  | 	} | ||||||
|  | 	grid->Broadcast(0,(void *)&offset,sizeof(offset)); | ||||||
|  | 	 | ||||||
| 	uint32_t nersc_csum,scidac_csuma,scidac_csumb; | 	uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
| 	BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); | 	BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
| 	header.checksum = nersc_csum; | 	header.checksum = nersc_csum; | ||||||
|  | 	if ( grid->IsBoss() ) {  | ||||||
| 	  offset = writeHeader(header,file); | 	  offset = writeHeader(header,file); | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	std::cout<<GridLogMessage  | 	std::cout<<GridLogMessage  | ||||||
| 		 <<"Written NERSC RNG STATE "<<file<< " checksum " | 		 <<"Written NERSC RNG STATE "<<file<< " checksum " | ||||||
| @@ -313,7 +323,7 @@ namespace Grid { | |||||||
|  |  | ||||||
| 	GridBase *grid = parallel._grid; | 	GridBase *grid = parallel._grid; | ||||||
|  |  | ||||||
| 	int offset = readHeader(file,grid,header); | 	uint64_t offset = readHeader(file,grid,header); | ||||||
|  |  | ||||||
| 	FieldMetaData clone(header); | 	FieldMetaData clone(header); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -49,7 +49,8 @@ inline double usecond(void) { | |||||||
|  |  | ||||||
| typedef  std::chrono::system_clock          GridClock; | typedef  std::chrono::system_clock          GridClock; | ||||||
| typedef  std::chrono::time_point<GridClock> GridTimePoint; | typedef  std::chrono::time_point<GridClock> GridTimePoint; | ||||||
| typedef  std::chrono::milliseconds          GridTime; | typedef  std::chrono::milliseconds          GridMillisecs; | ||||||
|  | typedef  std::chrono::microseconds          GridTime; | ||||||
| typedef  std::chrono::microseconds          GridUsecs; | typedef  std::chrono::microseconds          GridUsecs; | ||||||
|  |  | ||||||
| inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time) | inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time) | ||||||
| @@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli | |||||||
|   stream << time.count()<<" ms"; |   stream << time.count()<<" ms"; | ||||||
|   return stream; |   return stream; | ||||||
| } | } | ||||||
|  | inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time) | ||||||
|  | { | ||||||
|  |   stream << time.count()<<" usec"; | ||||||
|  |   return stream; | ||||||
|  | } | ||||||
|   |   | ||||||
| class GridStopWatch { | class GridStopWatch { | ||||||
| private: | private: | ||||||
|   | |||||||
| @@ -1,44 +0,0 @@ | |||||||
| pugixml [Travis CI build status](https://travis-ci.org/zeux/pugixml) [AppVeyor build status](https://ci.appveyor.com/project/zeux/pugixml) |  | ||||||
| ======= |  | ||||||
|  |  | ||||||
| pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification |  | ||||||
| capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0 |  | ||||||
| implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface |  | ||||||
| variants and conversions between different Unicode encodings (which happen automatically during parsing/saving). |  | ||||||
|  |  | ||||||
| pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface. |  | ||||||
|  |  | ||||||
| ## Documentation |  | ||||||
|  |  | ||||||
| Documentation for the current release of pugixml is available on-line as two separate documents: |  | ||||||
|  |  | ||||||
| * [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library; |  | ||||||
| * [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail. |  | ||||||
|  |  | ||||||
| You’re advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual. |  | ||||||
|  |  | ||||||
| ## License |  | ||||||
| This library is available to anybody free of charge, under the terms of MIT License: |  | ||||||
|  |  | ||||||
| Copyright (c) 2006-2015 Arseny Kapoulkine |  | ||||||
|  |  | ||||||
| Permission is hereby granted, free of charge, to any person |  | ||||||
| obtaining a copy of this software and associated documentation |  | ||||||
| files (the "Software"), to deal in the Software without |  | ||||||
| restriction, including without limitation the rights to use, |  | ||||||
| copy, modify, merge, publish, distribute, sublicense, and/or sell |  | ||||||
| copies of the Software, and to permit persons to whom the |  | ||||||
| Software is furnished to do so, subject to the following |  | ||||||
| conditions: |  | ||||||
|  |  | ||||||
| The above copyright notice and this permission notice shall be |  | ||||||
| included in all copies or substantial portions of the Software. |  | ||||||
|  |  | ||||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |  | ||||||
| EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |  | ||||||
| OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |  | ||||||
| NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |  | ||||||
| HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |  | ||||||
| WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |  | ||||||
| FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |  | ||||||
| OTHER DEALINGS IN THE SOFTWARE. |  | ||||||
| @@ -1,7 +1,7 @@ | |||||||
| /** | /** | ||||||
|  * pugixml parser - version 1.6 |  * pugixml parser - version 1.9 | ||||||
|  * -------------------------------------------------------- |  * -------------------------------------------------------- | ||||||
|  * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) |  * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) | ||||||
|  * Report bugs and download new versions at http://pugixml.org/ |  * Report bugs and download new versions at http://pugixml.org/ | ||||||
|  * |  * | ||||||
|  * This library is distributed under the MIT License. See notice at the end |  * This library is distributed under the MIT License. See notice at the end | ||||||
| @@ -17,6 +17,9 @@ | |||||||
| // Uncomment this to enable wchar_t mode | // Uncomment this to enable wchar_t mode | ||||||
| // #define PUGIXML_WCHAR_MODE | // #define PUGIXML_WCHAR_MODE | ||||||
|  |  | ||||||
|  | // Uncomment this to enable compact mode | ||||||
|  | // #define PUGIXML_COMPACT | ||||||
|  |  | ||||||
| // Uncomment this to disable XPath | // Uncomment this to disable XPath | ||||||
| // #define PUGIXML_NO_XPATH | // #define PUGIXML_NO_XPATH | ||||||
|  |  | ||||||
| @@ -46,7 +49,7 @@ | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Copyright (c) 2006-2015 Arseny Kapoulkine |  * Copyright (c) 2006-2018 Arseny Kapoulkine | ||||||
|  * |  * | ||||||
|  * Permission is hereby granted, free of charge, to any person |  * Permission is hereby granted, free of charge, to any person | ||||||
|  * obtaining a copy of this software and associated documentation |  * obtaining a copy of this software and associated documentation | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,7 +1,7 @@ | |||||||
| /** | /** | ||||||
|  * pugixml parser - version 1.6 |  * pugixml parser - version 1.9 | ||||||
|  * -------------------------------------------------------- |  * -------------------------------------------------------- | ||||||
|  * Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) |  * Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) | ||||||
|  * Report bugs and download new versions at http://pugixml.org/ |  * Report bugs and download new versions at http://pugixml.org/ | ||||||
|  * |  * | ||||||
|  * This library is distributed under the MIT License. See notice at the end |  * This library is distributed under the MIT License. See notice at the end | ||||||
| @@ -13,7 +13,7 @@ | |||||||
|  |  | ||||||
| #ifndef PUGIXML_VERSION | #ifndef PUGIXML_VERSION | ||||||
| // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons | // Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons | ||||||
| #	define PUGIXML_VERSION 160 | #	define PUGIXML_VERSION 190 | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| // Include user configuration file (this can define various configuration macros) | // Include user configuration file (this can define various configuration macros) | ||||||
| @@ -72,6 +72,44 @@ | |||||||
| #	endif | #	endif | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | // If the platform is known to have move semantics support, compile move ctor/operator implementation | ||||||
|  | #ifndef PUGIXML_HAS_MOVE | ||||||
|  | #	if __cplusplus >= 201103 | ||||||
|  | #		define PUGIXML_HAS_MOVE | ||||||
|  | #	elif defined(_MSC_VER) && _MSC_VER >= 1600 | ||||||
|  | #		define PUGIXML_HAS_MOVE | ||||||
|  | #	endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | // If C++ is 2011 or higher, add 'noexcept' specifiers | ||||||
|  | #ifndef PUGIXML_NOEXCEPT | ||||||
|  | #	if __cplusplus >= 201103 | ||||||
|  | #		define PUGIXML_NOEXCEPT noexcept | ||||||
|  | #	elif defined(_MSC_VER) && _MSC_VER >= 1900 | ||||||
|  | #		define PUGIXML_NOEXCEPT noexcept | ||||||
|  | #	else | ||||||
|  | #		define PUGIXML_NOEXCEPT | ||||||
|  | #	endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | // Some functions can not be noexcept in compact mode | ||||||
|  | #ifdef PUGIXML_COMPACT | ||||||
|  | #	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT | ||||||
|  | #else | ||||||
|  | #	define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | // If C++ is 2011 or higher, add 'override' qualifiers | ||||||
|  | #ifndef PUGIXML_OVERRIDE | ||||||
|  | #	if __cplusplus >= 201103 | ||||||
|  | #		define PUGIXML_OVERRIDE override | ||||||
|  | #	elif defined(_MSC_VER) && _MSC_VER >= 1700 | ||||||
|  | #		define PUGIXML_OVERRIDE override | ||||||
|  | #	else | ||||||
|  | #		define PUGIXML_OVERRIDE | ||||||
|  | #	endif | ||||||
|  | #endif | ||||||
|  |  | ||||||
| // Character interface macros | // Character interface macros | ||||||
| #ifdef PUGIXML_WCHAR_MODE | #ifdef PUGIXML_WCHAR_MODE | ||||||
| #	define PUGIXML_TEXT(t) L ## t | #	define PUGIXML_TEXT(t) L ## t | ||||||
| @@ -158,6 +196,11 @@ namespace pugi | |||||||
| 	// is a valid document. This flag is off by default. | 	// is a valid document. This flag is off by default. | ||||||
| 	const unsigned int parse_fragment = 0x1000; | 	const unsigned int parse_fragment = 0x1000; | ||||||
|  |  | ||||||
|  | 	// This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of | ||||||
|  | 	// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments. | ||||||
|  | 	// This flag is off by default. | ||||||
|  | 	const unsigned int parse_embed_pcdata = 0x2000; | ||||||
|  |  | ||||||
| 	// The default parsing mode. | 	// The default parsing mode. | ||||||
| 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, | 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, | ||||||
| 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. | 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. | ||||||
| @@ -206,6 +249,9 @@ namespace pugi | |||||||
| 	// Write every attribute on a new line with appropriate indentation. This flag is off by default. | 	// Write every attribute on a new line with appropriate indentation. This flag is off by default. | ||||||
| 	const unsigned int format_indent_attributes = 0x40; | 	const unsigned int format_indent_attributes = 0x40; | ||||||
|  |  | ||||||
|  | 	// Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default. | ||||||
|  | 	const unsigned int format_no_empty_element_tags = 0x80; | ||||||
|  |  | ||||||
| 	// The default set of formatting flags. | 	// The default set of formatting flags. | ||||||
| 	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none. | 	// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none. | ||||||
| 	const unsigned int format_default = format_indent; | 	const unsigned int format_default = format_indent; | ||||||
| @@ -268,7 +314,7 @@ namespace pugi | |||||||
| 		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio | 		// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio | ||||||
| 		xml_writer_file(void* file); | 		xml_writer_file(void* file); | ||||||
|  |  | ||||||
| 		virtual void write(const void* data, size_t size); | 		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE; | ||||||
|  |  | ||||||
| 	private: | 	private: | ||||||
| 		void* file; | 		void* file; | ||||||
| @@ -283,7 +329,7 @@ namespace pugi | |||||||
| 		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream); | 		xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream); | ||||||
| 		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream); | 		xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream); | ||||||
|  |  | ||||||
| 		virtual void write(const void* data, size_t size); | 		virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE; | ||||||
|  |  | ||||||
| 	private: | 	private: | ||||||
| 		std::basic_ostream<char, std::char_traits<char> >* narrow_stream; | 		std::basic_ostream<char, std::char_traits<char> >* narrow_stream; | ||||||
| @@ -354,6 +400,8 @@ namespace pugi | |||||||
| 		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") | 		// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") | ||||||
| 		bool set_value(int rhs); | 		bool set_value(int rhs); | ||||||
| 		bool set_value(unsigned int rhs); | 		bool set_value(unsigned int rhs); | ||||||
|  | 		bool set_value(long rhs); | ||||||
|  | 		bool set_value(unsigned long rhs); | ||||||
| 		bool set_value(double rhs); | 		bool set_value(double rhs); | ||||||
| 		bool set_value(float rhs); | 		bool set_value(float rhs); | ||||||
| 		bool set_value(bool rhs); | 		bool set_value(bool rhs); | ||||||
| @@ -367,6 +415,8 @@ namespace pugi | |||||||
| 		xml_attribute& operator=(const char_t* rhs); | 		xml_attribute& operator=(const char_t* rhs); | ||||||
| 		xml_attribute& operator=(int rhs); | 		xml_attribute& operator=(int rhs); | ||||||
| 		xml_attribute& operator=(unsigned int rhs); | 		xml_attribute& operator=(unsigned int rhs); | ||||||
|  | 		xml_attribute& operator=(long rhs); | ||||||
|  | 		xml_attribute& operator=(unsigned long rhs); | ||||||
| 		xml_attribute& operator=(double rhs); | 		xml_attribute& operator=(double rhs); | ||||||
| 		xml_attribute& operator=(float rhs); | 		xml_attribute& operator=(float rhs); | ||||||
| 		xml_attribute& operator=(bool rhs); | 		xml_attribute& operator=(bool rhs); | ||||||
| @@ -601,8 +651,8 @@ namespace pugi | |||||||
| 		xpath_node_set select_nodes(const xpath_query& query) const; | 		xpath_node_set select_nodes(const xpath_query& query) const; | ||||||
|  |  | ||||||
| 		// (deprecated: use select_node instead) Select single node by evaluating XPath query. | 		// (deprecated: use select_node instead) Select single node by evaluating XPath query. | ||||||
| 		xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; | 		PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const; | ||||||
| 		xpath_node select_single_node(const xpath_query& query) const; | 		PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const; | ||||||
|  |  | ||||||
| 	#endif | 	#endif | ||||||
|  |  | ||||||
| @@ -701,6 +751,8 @@ namespace pugi | |||||||
| 		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") | 		// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false") | ||||||
| 		bool set(int rhs); | 		bool set(int rhs); | ||||||
| 		bool set(unsigned int rhs); | 		bool set(unsigned int rhs); | ||||||
|  | 		bool set(long rhs); | ||||||
|  | 		bool set(unsigned long rhs); | ||||||
| 		bool set(double rhs); | 		bool set(double rhs); | ||||||
| 		bool set(float rhs); | 		bool set(float rhs); | ||||||
| 		bool set(bool rhs); | 		bool set(bool rhs); | ||||||
| @@ -714,6 +766,8 @@ namespace pugi | |||||||
| 		xml_text& operator=(const char_t* rhs); | 		xml_text& operator=(const char_t* rhs); | ||||||
| 		xml_text& operator=(int rhs); | 		xml_text& operator=(int rhs); | ||||||
| 		xml_text& operator=(unsigned int rhs); | 		xml_text& operator=(unsigned int rhs); | ||||||
|  | 		xml_text& operator=(long rhs); | ||||||
|  | 		xml_text& operator=(unsigned long rhs); | ||||||
| 		xml_text& operator=(double rhs); | 		xml_text& operator=(double rhs); | ||||||
| 		xml_text& operator=(float rhs); | 		xml_text& operator=(float rhs); | ||||||
| 		xml_text& operator=(bool rhs); | 		xml_text& operator=(bool rhs); | ||||||
| @@ -945,10 +999,11 @@ namespace pugi | |||||||
|  |  | ||||||
| 		// Non-copyable semantics | 		// Non-copyable semantics | ||||||
| 		xml_document(const xml_document&); | 		xml_document(const xml_document&); | ||||||
| 		const xml_document& operator=(const xml_document&); | 		xml_document& operator=(const xml_document&); | ||||||
|  |  | ||||||
| 		void create(); | 		void _create(); | ||||||
| 		void destroy(); | 		void _destroy(); | ||||||
|  | 		void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; | ||||||
|  |  | ||||||
| 	public: | 	public: | ||||||
| 		// Default constructor, makes empty document | 		// Default constructor, makes empty document | ||||||
| @@ -957,6 +1012,12 @@ namespace pugi | |||||||
| 		// Destructor, invalidates all node/attribute handles to this document | 		// Destructor, invalidates all node/attribute handles to this document | ||||||
| 		~xml_document(); | 		~xml_document(); | ||||||
|  |  | ||||||
|  | 	#ifdef PUGIXML_HAS_MOVE | ||||||
|  | 		// Move semantics support | ||||||
|  | 		xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; | ||||||
|  | 		xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT; | ||||||
|  | 	#endif | ||||||
|  |  | ||||||
| 		// Removes all nodes, leaving the empty document | 		// Removes all nodes, leaving the empty document | ||||||
| 		void reset(); | 		void reset(); | ||||||
|  |  | ||||||
| @@ -970,7 +1031,7 @@ namespace pugi | |||||||
| 	#endif | 	#endif | ||||||
|  |  | ||||||
| 		// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. | 		// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. | ||||||
| 		xml_parse_result load(const char_t* contents, unsigned int options = parse_default); | 		PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default); | ||||||
|  |  | ||||||
| 		// Load document from zero-terminated string. No encoding conversions are applied. | 		// Load document from zero-terminated string. No encoding conversions are applied. | ||||||
| 		xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); | 		xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default); | ||||||
| @@ -1095,10 +1156,10 @@ namespace pugi | |||||||
| 		xpath_variable_set(const xpath_variable_set& rhs); | 		xpath_variable_set(const xpath_variable_set& rhs); | ||||||
| 		xpath_variable_set& operator=(const xpath_variable_set& rhs); | 		xpath_variable_set& operator=(const xpath_variable_set& rhs); | ||||||
|  |  | ||||||
| 	#if __cplusplus >= 201103 | 	#ifdef PUGIXML_HAS_MOVE | ||||||
| 		// Move semantics support | 		// Move semantics support | ||||||
| 		xpath_variable_set(xpath_variable_set&& rhs); | 		xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 		xpath_variable_set& operator=(xpath_variable_set&& rhs); | 		xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 	#endif | 	#endif | ||||||
|  |  | ||||||
| 		// Add a new variable or get the existing one, if the types match | 		// Add a new variable or get the existing one, if the types match | ||||||
| @@ -1139,10 +1200,10 @@ namespace pugi | |||||||
| 		// Destructor | 		// Destructor | ||||||
| 		~xpath_query(); | 		~xpath_query(); | ||||||
|  |  | ||||||
| 	#if __cplusplus >= 201103 | 	#ifdef PUGIXML_HAS_MOVE | ||||||
| 		// Move semantics support | 		// Move semantics support | ||||||
| 		xpath_query(xpath_query&& rhs); | 		xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 		xpath_query& operator=(xpath_query&& rhs); | 		xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 	#endif | 	#endif | ||||||
|  |  | ||||||
| 		// Get query expression return type | 		// Get query expression return type | ||||||
| @@ -1201,7 +1262,7 @@ namespace pugi | |||||||
| 		explicit xpath_exception(const xpath_parse_result& result); | 		explicit xpath_exception(const xpath_parse_result& result); | ||||||
|  |  | ||||||
| 		// Get error message | 		// Get error message | ||||||
| 		virtual const char* what() const throw(); | 		virtual const char* what() const throw() PUGIXML_OVERRIDE; | ||||||
|  |  | ||||||
| 		// Get parse result | 		// Get parse result | ||||||
| 		const xpath_parse_result& result() const; | 		const xpath_parse_result& result() const; | ||||||
| @@ -1280,10 +1341,10 @@ namespace pugi | |||||||
| 		xpath_node_set(const xpath_node_set& ns); | 		xpath_node_set(const xpath_node_set& ns); | ||||||
| 		xpath_node_set& operator=(const xpath_node_set& ns); | 		xpath_node_set& operator=(const xpath_node_set& ns); | ||||||
|  |  | ||||||
| 	#if __cplusplus >= 201103 | 	#ifdef PUGIXML_HAS_MOVE | ||||||
| 		// Move semantics support | 		// Move semantics support | ||||||
| 		xpath_node_set(xpath_node_set&& rhs); | 		xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 		xpath_node_set& operator=(xpath_node_set&& rhs); | 		xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT; | ||||||
| 	#endif | 	#endif | ||||||
|  |  | ||||||
| 		// Get collection type | 		// Get collection type | ||||||
| @@ -1317,7 +1378,7 @@ namespace pugi | |||||||
| 		xpath_node* _end; | 		xpath_node* _end; | ||||||
|  |  | ||||||
| 		void _assign(const_iterator begin, const_iterator end, type_t type); | 		void _assign(const_iterator begin, const_iterator end, type_t type); | ||||||
| 		void _move(xpath_node_set& rhs); | 		void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT; | ||||||
| 	}; | 	}; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -1375,7 +1436,7 @@ namespace std | |||||||
| #endif | #endif | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Copyright (c) 2006-2015 Arseny Kapoulkine |  * Copyright (c) 2006-2018 Arseny Kapoulkine | ||||||
|  * |  * | ||||||
|  * Permission is hereby granted, free of charge, to any person |  * Permission is hereby granted, free of charge, to any person | ||||||
|  * obtaining a copy of this software and associated documentation |  * obtaining a copy of this software and associated documentation | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| pugixml 1.6 - an XML processing library | pugixml 1.9 - an XML processing library | ||||||
|  |  | ||||||
| Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) | Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) | ||||||
| Report bugs and download new versions at http://pugixml.org/ | Report bugs and download new versions at http://pugixml.org/ | ||||||
|  |  | ||||||
| This is the distribution of pugixml, which is a C++ XML processing library, | This is the distribution of pugixml, which is a C++ XML processing library, | ||||||
| @@ -28,7 +28,7 @@ The distribution contains the following folders: | |||||||
|  |  | ||||||
| This library is distributed under the MIT License: | This library is distributed under the MIT License: | ||||||
|  |  | ||||||
| Copyright (c) 2006-2015 Arseny Kapoulkine | Copyright (c) 2006-2018 Arseny Kapoulkine | ||||||
|  |  | ||||||
| Permission is hereby granted, free of charge, to any person | Permission is hereby granted, free of charge, to any person | ||||||
| obtaining a copy of this software and associated documentation | obtaining a copy of this software and associated documentation | ||||||
|   | |||||||
| @@ -52,6 +52,35 @@ namespace QCD { | |||||||
|  {  |  {  | ||||||
|  } |  } | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | // Physical surface field utilities | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | template<class Impl>   | ||||||
|  | void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d) | ||||||
|  | { | ||||||
|  |   int Ls = this->Ls; | ||||||
|  |   FermionField tmp(this->FermionGrid()); | ||||||
|  |   tmp = solution5d; | ||||||
|  |   conformable(solution5d._grid,this->FermionGrid()); | ||||||
|  |   conformable(exported4d._grid,this->GaugeGrid()); | ||||||
|  |   axpby_ssp_pminus(tmp, 0., solution5d, 1., solution5d, 0, 0); | ||||||
|  |   axpby_ssp_pplus (tmp, 1., tmp       , 1., solution5d, 0, Ls-1); | ||||||
|  |   ExtractSlice(exported4d, tmp, 0, 0); | ||||||
|  | } | ||||||
|  | template<class Impl>   | ||||||
|  | void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) | ||||||
|  | { | ||||||
|  |   int Ls = this->Ls; | ||||||
|  |   FermionField tmp(this->FermionGrid()); | ||||||
|  |   conformable(imported5d._grid,this->FermionGrid()); | ||||||
|  |   conformable(input4d._grid   ,this->GaugeGrid()); | ||||||
|  |   tmp = zero; | ||||||
|  |   InsertSlice(input4d, tmp, 0   , 0); | ||||||
|  |   InsertSlice(input4d, tmp, Ls-1, 0); | ||||||
|  |   axpby_ssp_pplus (tmp, 0., tmp, 1., tmp, 0, 0); | ||||||
|  |   axpby_ssp_pminus(tmp, 0., tmp, 1., tmp, Ls-1, Ls-1); | ||||||
|  |   Dminus(tmp,imported5d); | ||||||
|  | } | ||||||
| template<class Impl>   | template<class Impl>   | ||||||
| void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
| @@ -73,7 +102,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi | |||||||
|   this->DW(psi,tmp_f,DaggerYes); |   this->DW(psi,tmp_f,DaggerYes); | ||||||
|  |  | ||||||
|   for(int s=0;s<Ls;s++){ |   for(int s=0;s<Ls;s++){ | ||||||
|     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi |     axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -83,8 +83,13 @@ namespace Grid { | |||||||
|       virtual void   M5D   (const FermionField &psi, FermionField &chi); |       virtual void   M5D   (const FermionField &psi, FermionField &chi); | ||||||
|       virtual void   M5Ddag(const FermionField &psi, FermionField &chi); |       virtual void   M5Ddag(const FermionField &psi, FermionField &chi); | ||||||
|  |  | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|  |       // Physical surface field utilities | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|       virtual void   Dminus(const FermionField &psi, FermionField &chi); |       virtual void   Dminus(const FermionField &psi, FermionField &chi); | ||||||
|       virtual void   DminusDag(const FermionField &psi, FermionField &chi); |       virtual void   DminusDag(const FermionField &psi, FermionField &chi); | ||||||
|  |       virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); | ||||||
|  |       virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d); | ||||||
|  |  | ||||||
|       ///////////////////////////////////////////////////// |       ///////////////////////////////////////////////////// | ||||||
|       // Instantiate different versions depending on Impl |       // Instantiate different versions depending on Impl | ||||||
|   | |||||||
| @@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie | |||||||
| 	} | 	} | ||||||
| 	a0 = a0+incr; | 	a0 = a0+incr; | ||||||
| 	a1 = a1+incr; | 	a1 = a1+incr; | ||||||
| 	a2 = a2+sizeof(Simd::scalar_type); | 	a2 = a2+sizeof(typename Simd::scalar_type); | ||||||
|       }} |       }} | ||||||
|     { |     { | ||||||
|       int lexa = s1+LLs*site; |       int lexa = s1+LLs*site; | ||||||
| @@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi | |||||||
| 	} | 	} | ||||||
| 	a0 = a0+incr; | 	a0 = a0+incr; | ||||||
| 	a1 = a1+incr; | 	a1 = a1+incr; | ||||||
| 	a2 = a2+sizeof(Simd::scalar_type); | 	a2 = a2+sizeof(typename Simd::scalar_type); | ||||||
|       }} |       }} | ||||||
|     { |     { | ||||||
|       int lexa = s1+LLs*site; |       int lexa = s1+LLs*site; | ||||||
|   | |||||||
| @@ -295,6 +295,27 @@ namespace Grid { | |||||||
|       assert((Ls&0x1)==1); // Odd Ls required |       assert((Ls&0x1)==1); // Odd Ls required | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     template<class Impl> | ||||||
|  |     void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d) | ||||||
|  |     { | ||||||
|  |       int Ls = this->Ls; | ||||||
|  |       conformable(solution5d._grid,this->FermionGrid()); | ||||||
|  |       conformable(exported4d._grid,this->GaugeGrid()); | ||||||
|  |       ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); | ||||||
|  |     } | ||||||
|  |     template<class Impl> | ||||||
|  |     void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) | ||||||
|  |     { | ||||||
|  |       int Ls = this->Ls; | ||||||
|  |       conformable(imported5d._grid,this->FermionGrid()); | ||||||
|  |       conformable(input4d._grid   ,this->GaugeGrid()); | ||||||
|  |       FermionField tmp(this->FermionGrid()); | ||||||
|  |       tmp=zero; | ||||||
|  |       InsertSlice(input4d, tmp, Ls-1, Ls-1); | ||||||
|  |       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; | ||||||
|  |       this->Dminus(tmp,imported5d); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     FermOpTemplateInstantiate(ContinuedFractionFermion5D); |     FermOpTemplateInstantiate(ContinuedFractionFermion5D); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -65,6 +65,14 @@ namespace Grid { | |||||||
|       // Efficient support for multigrid coarsening |       // Efficient support for multigrid coarsening | ||||||
|       virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); |       virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); | ||||||
|  |  | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|  |       // Physical surface field utilities | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|  |       //      virtual void Dminus(const FermionField &psi, FermionField &chi);     // Inherit trivial case | ||||||
|  |       //      virtual void DminusDag(const FermionField &psi, FermionField &chi);  // Inherit trivial case | ||||||
|  |       virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); | ||||||
|  |       virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d); | ||||||
|  |  | ||||||
|       // Constructors |       // Constructors | ||||||
|       ContinuedFractionFermion5D(GaugeField &_Umu, |       ContinuedFractionFermion5D(GaugeField &_Umu, | ||||||
| 				 GridCartesian         &FiveDimGrid, | 				 GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -475,7 +475,7 @@ namespace QCD { | |||||||
|                         } |                         } | ||||||
|                         a0 = a0 + incr; |                         a0 = a0 + incr; | ||||||
|                         a1 = a1 + incr; |                         a1 = a1 + incr; | ||||||
|                         a2 = a2 + sizeof(Simd::scalar_type); |                         a2 = a2 + sizeof(typename Simd::scalar_type); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -63,9 +63,12 @@ namespace Grid { | |||||||
|       virtual RealD  M    (const FermionField &in, FermionField &out)=0; |       virtual RealD  M    (const FermionField &in, FermionField &out)=0; | ||||||
|       virtual RealD  Mdag (const FermionField &in, FermionField &out)=0; |       virtual RealD  Mdag (const FermionField &in, FermionField &out)=0; | ||||||
|  |  | ||||||
|       // half checkerboard operations |       // Query the even-even properties to make algorithmic decisions | ||||||
|       virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field |       virtual int    ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field | ||||||
|  |       virtual int    isTrivialEE(void) { return 0; }; | ||||||
|  |       virtual RealD  Mass(void) {return 0.0;}; | ||||||
|  |  | ||||||
|  |       // half checkerboard operations | ||||||
|       virtual void   Meooe       (const FermionField &in, FermionField &out)=0; |       virtual void   Meooe       (const FermionField &in, FermionField &out)=0; | ||||||
|       virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0; |       virtual void   MeooeDag    (const FermionField &in, FermionField &out)=0; | ||||||
|       virtual void   Mooee       (const FermionField &in, FermionField &out)=0; |       virtual void   Mooee       (const FermionField &in, FermionField &out)=0; | ||||||
| @@ -128,6 +131,19 @@ namespace Grid { | |||||||
|                                        std::vector<Real> mom, |                                        std::vector<Real> mom, | ||||||
|                                        unsigned int tmin,  |                                        unsigned int tmin,  | ||||||
|                                        unsigned int tmax)=0; |                                        unsigned int tmax)=0; | ||||||
|  |       /////////////////////////////////////////////// | ||||||
|  |       // Physical field import/export | ||||||
|  |       /////////////////////////////////////////////// | ||||||
|  |       virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; } | ||||||
|  |       virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; } | ||||||
|  |       virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported) | ||||||
|  |       { | ||||||
|  | 	imported = input; | ||||||
|  |       }; | ||||||
|  |       virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) | ||||||
|  |       { | ||||||
|  | 	exported=solution; | ||||||
|  |       }; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -765,6 +765,11 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
|       reg = memory; |       reg = memory; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     inline void InsertGaugeField(DoubledGaugeField &U_ds, | ||||||
|  | 				 const GaugeLinkField &U,int mu) | ||||||
|  |     { | ||||||
|  |       PokeIndex<LorentzIndex>(U_ds, U, mu); | ||||||
|  |     } | ||||||
|     inline void DoubleStore(GridBase *GaugeGrid, |     inline void DoubleStore(GridBase *GaugeGrid, | ||||||
| 			    DoubledGaugeField &UUUds, // for Naik term | 			    DoubledGaugeField &UUUds, // for Naik term | ||||||
| 			    DoubledGaugeField &Uds, | 			    DoubledGaugeField &Uds, | ||||||
| @@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
| 	U    = U    *phases; | 	U    = U    *phases; | ||||||
| 	Udag = Udag *phases; | 	Udag = Udag *phases; | ||||||
|  |  | ||||||
| 	PokeIndex<LorentzIndex>(Uds, U, mu); | 	InsertGaugeField(Uds,U,mu); | ||||||
| 	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4); | 	InsertGaugeField(Uds,Udag,mu+4); | ||||||
|  | 	//	PokeIndex<LorentzIndex>(Uds, U, mu); | ||||||
|  | 	//	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4); | ||||||
|  |  | ||||||
| 	// 3 hop based on thin links. Crazy huh ? | 	// 3 hop based on thin links. Crazy huh ? | ||||||
| 	U  = PeekIndex<LorentzIndex>(Uthin, mu); | 	U  = PeekIndex<LorentzIndex>(Uthin, mu); | ||||||
| @@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
| 	UUU    = UUU    *phases; | 	UUU    = UUU    *phases; | ||||||
| 	UUUdag = UUUdag *phases; | 	UUUdag = UUUdag *phases; | ||||||
|  |  | ||||||
| 	PokeIndex<LorentzIndex>(UUUds, UUU, mu); | 	InsertGaugeField(UUUds,UUU,mu); | ||||||
| 	PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4); | 	InsertGaugeField(UUUds,UUUdag,mu+4); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
| @@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
|       mac(&phi(), &UU(), &chi()); |       mac(&phi(), &UU(), &chi()); | ||||||
|     } |     } | ||||||
|        |        | ||||||
|  |     inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) | ||||||
|  |     { | ||||||
|  |       GridBase *GaugeGrid = U_ds._grid; | ||||||
|  |       parallel_for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { | ||||||
|  |  | ||||||
|  | 	SiteScalarGaugeLink   ScalarU; | ||||||
|  | 	SiteDoubledGaugeField ScalarUds; | ||||||
|  | 	 | ||||||
|  | 	std::vector<int> lcoor; | ||||||
|  | 	GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); | ||||||
|  | 	peekLocalSite(ScalarUds, U_ds, lcoor); | ||||||
|  | 	 | ||||||
|  | 	peekLocalSite(ScalarU, U, lcoor); | ||||||
|  | 	ScalarUds(mu) = ScalarU(); | ||||||
|  | 	 | ||||||
|  |       } | ||||||
|  |     } | ||||||
|     inline void DoubleStore(GridBase *GaugeGrid, |     inline void DoubleStore(GridBase *GaugeGrid, | ||||||
| 			    DoubledGaugeField &UUUds, // for Naik term | 			    DoubledGaugeField &UUUds, // for Naik term | ||||||
| 			    DoubledGaugeField &Uds, | 			    DoubledGaugeField &Uds, | ||||||
| @@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
| 	U    = U    *phases; | 	U    = U    *phases; | ||||||
| 	Udag = Udag *phases; | 	Udag = Udag *phases; | ||||||
|  |  | ||||||
|  | 	InsertGaugeField(Uds,U,mu); | ||||||
| 	for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { | 	InsertGaugeField(Uds,Udag,mu+4); | ||||||
| 	  SiteScalarGaugeLink   ScalarU; |  | ||||||
| 	  SiteDoubledGaugeField ScalarUds; |  | ||||||
| 	   |  | ||||||
| 	  std::vector<int> lcoor; |  | ||||||
| 	  GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); |  | ||||||
| 	  peekLocalSite(ScalarUds, Uds, lcoor); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, U, lcoor); |  | ||||||
| 	  ScalarUds(mu) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, Udag, lcoor); |  | ||||||
| 	  ScalarUds(mu + 4) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  pokeLocalSite(ScalarUds, Uds, lcoor); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	// 3 hop based on thin links. Crazy huh ? | 	// 3 hop based on thin links. Crazy huh ? | ||||||
| 	U  = PeekIndex<LorentzIndex>(Uthin, mu); | 	U  = PeekIndex<LorentzIndex>(Uthin, mu); | ||||||
| @@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation: | |||||||
| 	UUU    = UUU    *phases; | 	UUU    = UUU    *phases; | ||||||
| 	UUUdag = UUUdag *phases; | 	UUUdag = UUUdag *phases; | ||||||
|  |  | ||||||
| 	for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { | 	InsertGaugeField(UUUds,UUU,mu); | ||||||
|  | 	InsertGaugeField(UUUds,UUUdag,mu+4); | ||||||
| 	  SiteScalarGaugeLink  ScalarU; |  | ||||||
| 	  SiteDoubledGaugeField ScalarUds; |  | ||||||
| 	   |  | ||||||
| 	  std::vector<int> lcoor; |  | ||||||
| 	  GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); |  | ||||||
|        |  | ||||||
| 	  peekLocalSite(ScalarUds, UUUds, lcoor); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, UUU, lcoor); |  | ||||||
| 	  ScalarUds(mu) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, UUUdag, lcoor); |  | ||||||
| 	  ScalarUds(mu + 4) = ScalarU(); |  | ||||||
| 	   |  | ||||||
| 	  pokeLocalSite(ScalarUds, UUUds, lcoor); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, | |||||||
| template <class Impl> | template <class Impl> | ||||||
| ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,  | ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,  | ||||||
| 							 RealD _mass, | 							 RealD _mass, | ||||||
|  | 							 RealD _c1, RealD _c2,RealD _u0, | ||||||
| 							 const ImplParams &p) | 							 const ImplParams &p) | ||||||
|     : Kernels(p), |     : Kernels(p), | ||||||
|       _grid(&Fgrid), |       _grid(&Fgrid), | ||||||
| @@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G | |||||||
|       UUUmuOdd(&Hgrid) , |       UUUmuOdd(&Hgrid) , | ||||||
|       _tmp(&Hgrid) |       _tmp(&Hgrid) | ||||||
| { | { | ||||||
|  |   int vol4; | ||||||
|  |   int LLs=1; | ||||||
|  |   c1=_c1; | ||||||
|  |   c2=_c2; | ||||||
|  |   u0=_u0; | ||||||
|  |   vol4= _grid->oSites(); | ||||||
|  |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |   vol4= _cbgrid->oSites(); | ||||||
|  |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|  |   StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau | |||||||
| 							 GridRedBlackCartesian &Hgrid, RealD _mass, | 							 GridRedBlackCartesian &Hgrid, RealD _mass, | ||||||
| 							 RealD _c1, RealD _c2,RealD _u0, | 							 RealD _c1, RealD _c2,RealD _u0, | ||||||
| 							 const ImplParams &p) | 							 const ImplParams &p) | ||||||
|   : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) |   : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p) | ||||||
| { | { | ||||||
|   c1=_c1; |  | ||||||
|   c2=_c2; |  | ||||||
|   u0=_u0; |  | ||||||
|   ImportGauge(_Uthin,_Ufat); |   ImportGauge(_Uthin,_Ufat); | ||||||
| } | } | ||||||
| template <class Impl> |  | ||||||
| ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 							 GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 							 const ImplParams &p) |  | ||||||
|   : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) |  | ||||||
| { |  | ||||||
|   ImportGaugeSimple(_Utriple,_Ufat); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Momentum space propagator should be  |   // Momentum space propagator should be  | ||||||
| @@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug | |||||||
|   // of above link to implement Fourier based solver. |   // of above link to implement Fourier based solver. | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)  |  | ||||||
| { |  | ||||||
|   ImportGauge(_Uthin,_Uthin); |  | ||||||
| }; |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)  | void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)  | ||||||
| { | { | ||||||
|   ///////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////// | ||||||
| @@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl | |||||||
|     PokeIndex<LorentzIndex>(Umu, -U, mu+4); |     PokeIndex<LorentzIndex>(Umu, -U, mu+4); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |   CopyGaugeCheckerboards(); | ||||||
|  | } | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)  | ||||||
|  | { | ||||||
|  |  | ||||||
|  |   Umu   = _U; | ||||||
|  |   UUUmu = _UUU; | ||||||
|  |   CopyGaugeCheckerboards(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void) | ||||||
|  | { | ||||||
|   pickCheckerboard(Even, UmuEven,  Umu); |   pickCheckerboard(Even, UmuEven,  Umu); | ||||||
|   pickCheckerboard(Odd,  UmuOdd ,  Umu); |   pickCheckerboard(Odd,  UmuOdd ,  Umu); | ||||||
|   pickCheckerboard(Even, UUUmuEven,UUUmu); |   pickCheckerboard(Even, UUUmuEven,UUUmu); | ||||||
| @@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const | |||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); |     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   pickCheckerboard(Even, UmuEven, Umu); |   CopyGaugeCheckerboards(); | ||||||
|   pickCheckerboard(Odd,  UmuOdd , Umu); |  | ||||||
|   pickCheckerboard(Even, UUUmuEven, UUUmu); |  | ||||||
|   pickCheckerboard(Odd,   UUUmuOdd, UUUmu); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| ///////////////////////////// | ///////////////////////////// | ||||||
| @@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF | |||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) { | void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) { | ||||||
|  |   DhopCalls+=2; | ||||||
|   conformable(in._grid, _grid);  // verifies full grid |   conformable(in._grid, _grid);  // verifies full grid | ||||||
|   conformable(in._grid, out._grid); |   conformable(in._grid, out._grid); | ||||||
|  |  | ||||||
| @@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField & | |||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) { | void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid, _cbgrid);    // verifies half grid |   conformable(in._grid, _cbgrid);    // verifies half grid | ||||||
|   conformable(in._grid, out._grid);  // drops the cb check |   conformable(in._grid, out._grid);  // drops the cb check | ||||||
|  |  | ||||||
| @@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField | |||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) { | void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid, _cbgrid);    // verifies half grid |   conformable(in._grid, _cbgrid);    // verifies half grid | ||||||
|   conformable(in._grid, out._grid);  // drops the cb check |   conformable(in._grid, out._grid);  // drops the cb check | ||||||
|  |  | ||||||
| @@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder | |||||||
| 						  DoubledGaugeField &U, | 						  DoubledGaugeField &U, | ||||||
| 						  DoubledGaugeField &UUU, | 						  DoubledGaugeField &UUU, | ||||||
| 						  const FermionField &in, | 						  const FermionField &in, | ||||||
| 						  FermionField &out, int dag) { | 						  FermionField &out, int dag)  | ||||||
|  | { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) | ||||||
|  |     DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); | ||||||
|  |   else | ||||||
|  | #endif | ||||||
|  |     DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); | ||||||
|  | } | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, | ||||||
|  | 								 DoubledGaugeField &U, | ||||||
|  | 								 DoubledGaugeField &UUU, | ||||||
|  | 								 const FermionField &in, | ||||||
|  | 								 FermionField &out, int dag)  | ||||||
|  | { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   Compressor compressor;  | ||||||
|  |   int len =  U._grid->oSites(); | ||||||
|  |   const int LLs =  1; | ||||||
|  |  | ||||||
|  |   DhopTotalTime   -= usecond(); | ||||||
|  |  | ||||||
|  |   DhopFaceTime    -= usecond(); | ||||||
|  |   st.Prepare(); | ||||||
|  |   st.HaloGather(in,compressor); | ||||||
|  |   st.CommsMergeSHM(compressor); | ||||||
|  |   DhopFaceTime    += usecond(); | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Ugly explicit thread mapping introduced for OPA reasons. | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   DhopComputeTime    -= usecond(); | ||||||
|  | #pragma omp parallel  | ||||||
|  |   { | ||||||
|  |     int tid = omp_get_thread_num(); | ||||||
|  |     int nthreads = omp_get_num_threads(); | ||||||
|  |     int ncomms = CartesianCommunicator::nCommThreads; | ||||||
|  |     if (ncomms == -1) ncomms = 1; | ||||||
|  |     assert(nthreads > ncomms); | ||||||
|  |  | ||||||
|  |     if (tid >= ncomms) { | ||||||
|  |       nthreads -= ncomms; | ||||||
|  |       int ttid  = tid - ncomms; | ||||||
|  |       int n     = len; | ||||||
|  |       int chunk = n / nthreads; | ||||||
|  |       int rem   = n % nthreads; | ||||||
|  |       int myblock, myn; | ||||||
|  |       if (ttid < rem) { | ||||||
|  |         myblock = ttid * chunk + ttid; | ||||||
|  |         myn = chunk+1; | ||||||
|  |       } else { | ||||||
|  |         myblock = ttid*chunk + rem; | ||||||
|  |         myn = chunk; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       // do the compute | ||||||
|  |       if (dag == DaggerYes) { | ||||||
|  |         for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  |           int sU = ss; | ||||||
|  | 	  // Interior = 1; Exterior = 0; must implement for staggered | ||||||
|  |           Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);  | ||||||
|  |         } | ||||||
|  |       } else { | ||||||
|  |         for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  // Interior = 1; Exterior = 0; | ||||||
|  |           int sU = ss; | ||||||
|  |           Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |     } else { | ||||||
|  |       st.CommunicateThreaded(); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopComputeTime    += usecond(); | ||||||
|  |  | ||||||
|  |   // First to enter, last to leave timing | ||||||
|  |   DhopFaceTime    -= usecond(); | ||||||
|  |   st.CommsMerge(compressor); | ||||||
|  |   DhopFaceTime    -= usecond(); | ||||||
|  |  | ||||||
|  |   DhopComputeTime2    -= usecond(); | ||||||
|  |   if (dag == DaggerYes) { | ||||||
|  |     int sz=st.surface_list.size(); | ||||||
|  |     parallel_for (int ss = 0; ss < sz; ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     int sz=st.surface_list.size(); | ||||||
|  |     parallel_for (int ss = 0; ss < sz; ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopComputeTime2    += usecond(); | ||||||
|  | #else | ||||||
|  |   assert(0); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, | ||||||
|  | 							     DoubledGaugeField &U, | ||||||
|  | 							     DoubledGaugeField &UUU, | ||||||
|  | 							     const FermionField &in, | ||||||
|  | 							     FermionField &out, int dag)  | ||||||
|  | { | ||||||
|   assert((dag == DaggerNo) || (dag == DaggerYes)); |   assert((dag == DaggerNo) || (dag == DaggerYes)); | ||||||
|  |  | ||||||
|  |   DhopTotalTime   -= usecond(); | ||||||
|  |  | ||||||
|  |   DhopCommTime    -= usecond(); | ||||||
|   Compressor compressor; |   Compressor compressor; | ||||||
|   st.HaloExchange(in, compressor); |   st.HaloExchange(in, compressor); | ||||||
|  |   DhopCommTime    += usecond(); | ||||||
|  |  | ||||||
|  |   DhopComputeTime -= usecond(); | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     PARALLEL_FOR_LOOP |     parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |  | ||||||
|       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); |       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|     PARALLEL_FOR_LOOP |     parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |  | ||||||
|       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); |       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   DhopComputeTime += usecond(); | ||||||
|  |   DhopTotalTime   += usecond(); | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Reporting | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::Report(void)  | ||||||
|  | { | ||||||
|  |   std::vector<int> latt = GridDefaultLatt();           | ||||||
|  |   RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; | ||||||
|  |   RealD NP = _grid->_Nprocessors; | ||||||
|  |   RealD NN = _grid->NodeCount(); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : "  | ||||||
|  | 	    << DhopCalls   << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : "  | ||||||
|  | 	    << DhopTotalTime   / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : "  | ||||||
|  | 	    << DhopCommTime    / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : "  | ||||||
|  | 	    << DhopComputeTime / DhopCalls << " us" << std::endl; | ||||||
|  |  | ||||||
|  |   // Average the compute time | ||||||
|  |   _grid->GlobalSum(DhopComputeTime); | ||||||
|  |   DhopComputeTime/=NP; | ||||||
|  |  | ||||||
|  |   RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl; | ||||||
|  |    | ||||||
|  |   RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"    <<std::endl;  Stencil.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;  StencilEven.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;  StencilOdd.Report(); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)  | ||||||
|  | { | ||||||
|  |   DhopCalls       = 0; | ||||||
|  |   DhopTotalTime   = 0; | ||||||
|  |   DhopCommTime    = 0; | ||||||
|  |   DhopComputeTime = 0; | ||||||
|  |   DhopFaceTime    = 0; | ||||||
|  |  | ||||||
|  |   Stencil.ZeroCounters(); | ||||||
|  |   StencilEven.ZeroCounters(); | ||||||
|  |   StencilOdd.ZeroCounters(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////////  | ////////////////////////////////////////////////////////  | ||||||
| // Conserved current - not yet implemented. | // Conserved current - not yet implemented. | ||||||
| //////////////////////////////////////////////////////// | //////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS | |||||||
|   FermionField _tmp; |   FermionField _tmp; | ||||||
|   FermionField &tmp(void) { return _tmp; } |   FermionField &tmp(void) { return _tmp; } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////// | ||||||
|  |   // Performance monitoring | ||||||
|  |   //////////////////////////////////////// | ||||||
|  |   void Report(void); | ||||||
|  |   void ZeroCounters(void); | ||||||
|  |   double DhopTotalTime; | ||||||
|  |   double DhopCalls; | ||||||
|  |   double DhopCommTime; | ||||||
|  |   double DhopComputeTime; | ||||||
|  |   double DhopComputeTime2; | ||||||
|  |   double DhopFaceTime; | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   // Implement the abstract base |   // Implement the abstract base | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
| @@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS | |||||||
|  |  | ||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, |   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |   void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|  |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |   void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|  |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   // Constructor |   ////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Grid own interface Constructor | ||||||
|  |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, |   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, | ||||||
| 			   GridRedBlackCartesian &Hgrid, RealD _mass, | 			   GridRedBlackCartesian &Hgrid, RealD _mass, | ||||||
| 			   RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, | 			   RealD _c1, RealD _c2,RealD _u0, | ||||||
| 			   const ImplParams &p = ImplParams()); |  | ||||||
|  |  | ||||||
|   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 			   GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 			   const ImplParams &p = ImplParams()); | 			   const ImplParams &p = ImplParams()); | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // MILC constructor no gauge fields | ||||||
|  |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, |   ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, | ||||||
|  | 			   RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, | ||||||
| 			   const ImplParams &p = ImplParams()); | 			   const ImplParams &p = ImplParams()); | ||||||
|  |  | ||||||
|  |  | ||||||
|   // DoubleStore impl dependent |   // DoubleStore impl dependent | ||||||
|   void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); |   void ImportGauge      (const GaugeField &_Uthin ) { assert(0); } | ||||||
|   void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat); |   void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat); | ||||||
|   void ImportGauge(const GaugeField &_Uthin); |   void ImportGaugeSimple(const GaugeField &_UUU    ,const GaugeField &_U); | ||||||
|  |   void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); | ||||||
|  |   DoubledGaugeField &GetU(void)   { return Umu ; } ; | ||||||
|  |   DoubledGaugeField &GetUUU(void) { return UUUmu; }; | ||||||
|  |   void CopyGaugeCheckerboards(void); | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   // Data members require to support the functionality |   // Data members require to support the functionality | ||||||
| @@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS | |||||||
|   //    protected: |   //    protected: | ||||||
|  public: |  public: | ||||||
|   // any other parameters of action ??? |   // any other parameters of action ??? | ||||||
|  |   virtual int   isTrivialEE(void) { return 1; }; | ||||||
|  |   virtual RealD Mass(void) { return mass; } | ||||||
|   RealD mass; |   RealD mass; | ||||||
|   RealD u0; |   RealD u0; | ||||||
|   RealD c1; |   RealD c1; | ||||||
|   | |||||||
| @@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, | |||||||
|  |  | ||||||
|   // 5d lattice for DWF. |   // 5d lattice for DWF. | ||||||
| template<class Impl> | template<class Impl> | ||||||
| ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, | ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid, | ||||||
| 							     GridCartesian         &FiveDimGrid, |  | ||||||
| 							     GridRedBlackCartesian &FiveDimRedBlackGrid, | 							     GridRedBlackCartesian &FiveDimRedBlackGrid, | ||||||
| 							     GridCartesian         &FourDimGrid, | 							     GridCartesian         &FourDimGrid, | ||||||
| 							     GridRedBlackCartesian &FourDimRedBlackGrid, | 							     GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
| @@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin, | |||||||
|     assert(FiveDimGrid._simd_layout[0]        ==1); |     assert(FiveDimGrid._simd_layout[0]        ==1); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |   int LLs = FiveDimGrid._rdimensions[0]; | ||||||
|  |   int vol4= FourDimGrid.oSites(); | ||||||
|  |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   // Allocate the required comms buffer |   vol4=FourDimRedBlackGrid.oSites(); | ||||||
|  |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|  |   StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
|  | } | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void) | ||||||
|  | { | ||||||
|  |   pickCheckerboard(Even, UmuEven,  Umu); | ||||||
|  |   pickCheckerboard(Odd,  UmuOdd ,  Umu); | ||||||
|  |   pickCheckerboard(Even, UUUmuEven,UUUmu); | ||||||
|  |   pickCheckerboard(Odd,  UUUmuOdd, UUUmu); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, | ||||||
|  | 							     GridCartesian         &FiveDimGrid, | ||||||
|  | 							     GridRedBlackCartesian &FiveDimRedBlackGrid, | ||||||
|  | 							     GridCartesian         &FourDimGrid, | ||||||
|  | 							     GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
|  | 							     RealD _mass, | ||||||
|  | 							     RealD _c1,RealD _c2, RealD _u0, | ||||||
|  | 							     const ImplParams &p) : | ||||||
|  |   ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid, | ||||||
|  | 			     FourDimGrid,FourDimRedBlackGrid, | ||||||
|  | 			     _mass,_c1,_c2,_u0,p) | ||||||
|  | { | ||||||
|   ImportGauge(_Uthin,_Ufat); |   ImportGauge(_Uthin,_Ufat); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////// | ||||||
|  | // For MILC use; pass three link U's and 1 link U | ||||||
|  | /////////////////////////////////////////////////// | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)  | void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)  | ||||||
| { | { | ||||||
|   ImportGauge(_Uthin,_Uthin); |   ///////////////////////////////////////////////////////////////// | ||||||
| }; |   // Trivial import; phases and fattening and such like preapplied | ||||||
|  |   ///////////////////////////////////////////////////////////////// | ||||||
|  |   for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |  | ||||||
|  |     auto U = PeekIndex<LorentzIndex>(_Utriple, mu); | ||||||
|  |     Impl::InsertGaugeField(UUUmu,U,mu); | ||||||
|  |  | ||||||
|  |     U = adj( Cshift(U, mu, -3)); | ||||||
|  |     Impl::InsertGaugeField(UUUmu,-U,mu+4); | ||||||
|  |  | ||||||
|  |     U = PeekIndex<LorentzIndex>(_Ufat, mu); | ||||||
|  |     Impl::InsertGaugeField(Umu,U,mu); | ||||||
|  |  | ||||||
|  |     U = adj( Cshift(U, mu, -1)); | ||||||
|  |     Impl::InsertGaugeField(Umu,-U,mu+4); | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |   CopyGaugeCheckerboards(); | ||||||
|  | } | ||||||
|  | template <class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)  | ||||||
|  | { | ||||||
|  |   ///////////////////////////////////////////////////////////////// | ||||||
|  |   // Trivial import; phases and fattening and such like preapplied | ||||||
|  |   ///////////////////////////////////////////////////////////////// | ||||||
|  |   Umu   = _U; | ||||||
|  |   UUUmu = _UUU; | ||||||
|  |   CopyGaugeCheckerboards(); | ||||||
|  | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) | void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) | ||||||
| { | { | ||||||
| @@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons | |||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); |     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   pickCheckerboard(Even, UmuEven, Umu); |   CopyGaugeCheckerboards(); | ||||||
|   pickCheckerboard(Odd,  UmuOdd , Umu); |  | ||||||
|   pickCheckerboard(Even, UUUmuEven, UUUmu); |  | ||||||
|   pickCheckerboard(Odd,  UUUmuOdd, UUUmu); |  | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) | void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) | ||||||
| @@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat, | |||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /*CHANGE */ | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | ||||||
|  | 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
|  | 						    const FermionField &in, FermionField &out,int dag) | ||||||
|  | { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) | ||||||
|  |     DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); | ||||||
|  |   else | ||||||
|  | #endif | ||||||
|  |     DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, | ||||||
|  | 								   DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
|  | 								   const FermionField &in, FermionField &out,int dag) | ||||||
|  | { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||||
|  |  | ||||||
|  |   Compressor compressor;  | ||||||
|  |  | ||||||
|  |   int LLs = in._grid->_rdimensions[0]; | ||||||
|  |   int len =  U._grid->oSites(); | ||||||
|  |  | ||||||
|  |   DhopFaceTime-=usecond(); | ||||||
|  |   st.Prepare(); | ||||||
|  |   st.HaloGather(in,compressor); | ||||||
|  |   //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor | ||||||
|  |   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms | ||||||
|  |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|  |   double ctime=0; | ||||||
|  |   double ptime=0; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Ugly explicit thread mapping introduced for OPA reasons. | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #pragma omp parallel reduction(max:ctime) reduction(max:ptime) | ||||||
|  |   { | ||||||
|  |     int tid = omp_get_thread_num(); | ||||||
|  |     int nthreads = omp_get_num_threads(); | ||||||
|  |     int ncomms = CartesianCommunicator::nCommThreads; | ||||||
|  |     if (ncomms == -1) ncomms = 1; | ||||||
|  |     assert(nthreads > ncomms); | ||||||
|  |     if (tid >= ncomms) { | ||||||
|  |       double start = usecond(); | ||||||
|  |       nthreads -= ncomms; | ||||||
|  |       int ttid  = tid - ncomms; | ||||||
|  |       int n     = U._grid->oSites(); // 4d vol | ||||||
|  |       int chunk = n / nthreads; | ||||||
|  |       int rem   = n % nthreads; | ||||||
|  |       int myblock, myn; | ||||||
|  |       if (ttid < rem) { | ||||||
|  |         myblock = ttid * chunk + ttid; | ||||||
|  |         myn = chunk+1; | ||||||
|  |       } else { | ||||||
|  |         myblock = ttid*chunk + rem; | ||||||
|  |         myn = chunk; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       // do the compute | ||||||
|  |       if (dag == DaggerYes) { | ||||||
|  |         for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  |           int sU = ss; | ||||||
|  | 	  // Interior = 1; Exterior = 0; must implement for staggered | ||||||
|  |           Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<--------- | ||||||
|  |         } | ||||||
|  |       } else { | ||||||
|  |         for (int ss = myblock; ss < myblock+myn; ++ss) { | ||||||
|  | 	  // Interior = 1; Exterior = 0; | ||||||
|  |           int sU = ss; | ||||||
|  |           Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------ | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |         ptime = usecond() - start; | ||||||
|  |     } else { | ||||||
|  |       double start = usecond(); | ||||||
|  |       st.CommunicateThreaded(); | ||||||
|  |       ctime = usecond() - start; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopCommTime += ctime; | ||||||
|  |   DhopComputeTime+=ptime; | ||||||
|  |  | ||||||
|  |   // First to enter, last to leave timing | ||||||
|  |   st.CollateThreads(); | ||||||
|  |  | ||||||
|  |   DhopFaceTime-=usecond(); | ||||||
|  |   st.CommsMerge(compressor); | ||||||
|  |   DhopFaceTime+=usecond(); | ||||||
|  |  | ||||||
|  |   DhopComputeTime2-=usecond(); | ||||||
|  |   if (dag == DaggerYes) { | ||||||
|  |     int sz=st.surface_list.size(); | ||||||
|  |     parallel_for (int ss = 0; ss < sz; ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<---------- | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     int sz=st.surface_list.size(); | ||||||
|  |     parallel_for (int ss = 0; ss < sz; ss++) { | ||||||
|  |       int sU = st.surface_list[ss]; | ||||||
|  |       Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<---------- | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopComputeTime2+=usecond(); | ||||||
|  | #else | ||||||
|  |   assert(0); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | ||||||
|  | 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
|  | 						    const FermionField &in, FermionField &out,int dag) | ||||||
|  | { | ||||||
|  |   Compressor compressor; | ||||||
|  |   int LLs = in._grid->_rdimensions[0]; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  //double t1=usecond(); | ||||||
|  |   DhopTotalTime -= usecond(); | ||||||
|  |   DhopCommTime -= usecond(); | ||||||
|  |   st.HaloExchange(in,compressor); | ||||||
|  |   DhopCommTime += usecond(); | ||||||
|  |    | ||||||
|  |   DhopComputeTime -= usecond(); | ||||||
|  |   // Dhop takes the 4d grid from U, and makes a 5d index for fermion | ||||||
|  |   if (dag == DaggerYes) { | ||||||
|  |     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|  |       int sU=ss; | ||||||
|  |       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|  |       int sU=ss; | ||||||
|  |       Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   DhopComputeTime += usecond(); | ||||||
|  |   DhopTotalTime   += usecond(); | ||||||
|  |  //double t2=usecond(); | ||||||
|  |  //std::cout << __FILE__ << " " << __func__  << " Total Time " << DhopTotalTime << std::endl; | ||||||
|  |  //std::cout << __FILE__ << " " << __func__  << " Total Time Org " << t2-t1 << std::endl; | ||||||
|  |  //std::cout << __FILE__ << " " << __func__  << " Comml Time " << DhopCommTime << std::endl; | ||||||
|  |  //std::cout << __FILE__ << " " << __func__  << " Compute Time " << DhopComputeTime << std::endl; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | /*CHANGE END*/ | ||||||
|  |  | ||||||
|  | /* ORG | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | ||||||
| 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
| @@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr | |||||||
|   DhopComputeTime += usecond(); |   DhopComputeTime += usecond(); | ||||||
|   DhopTotalTime   += usecond(); |   DhopTotalTime   += usecond(); | ||||||
| } | } | ||||||
|  | */ | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) | |||||||
|   DhopTotalTime    = 0; |   DhopTotalTime    = 0; | ||||||
|   DhopCommTime    = 0; |   DhopCommTime    = 0; | ||||||
|   DhopComputeTime = 0; |   DhopComputeTime = 0; | ||||||
|  |   DhopFaceTime    = 0; | ||||||
|  |  | ||||||
|  |  | ||||||
|   Stencil.ZeroCounters(); |   Stencil.ZeroCounters(); | ||||||
|   StencilEven.ZeroCounters(); |   StencilEven.ZeroCounters(); | ||||||
|   StencilOdd.ZeroCounters(); |   StencilOdd.ZeroCounters(); | ||||||
|   | |||||||
| @@ -64,6 +64,8 @@ namespace QCD { | |||||||
|       double DhopCalls; |       double DhopCalls; | ||||||
|       double DhopCommTime; |       double DhopCommTime; | ||||||
|       double DhopComputeTime; |       double DhopComputeTime; | ||||||
|  |       double DhopComputeTime2; | ||||||
|  |       double DhopFaceTime; | ||||||
|  |  | ||||||
|       /////////////////////////////////////////////////////////////// |       /////////////////////////////////////////////////////////////// | ||||||
|       // Implement the abstract base |       // Implement the abstract base | ||||||
| @@ -119,7 +121,27 @@ namespace QCD { | |||||||
| 		      FermionField &out, | 		      FermionField &out, | ||||||
| 		      int dag); | 		      int dag); | ||||||
|      |      | ||||||
|  |     void DhopInternalOverlappedComms(StencilImpl & st, | ||||||
|  | 		      LebesgueOrder &lo, | ||||||
|  | 		      DoubledGaugeField &U, | ||||||
|  | 		      DoubledGaugeField &UUU, | ||||||
|  | 		      const FermionField &in,  | ||||||
|  | 		      FermionField &out, | ||||||
|  | 		      int dag); | ||||||
|  |  | ||||||
|  |     void DhopInternalSerialComms(StencilImpl & st, | ||||||
|  | 		      LebesgueOrder &lo, | ||||||
|  | 		      DoubledGaugeField &U, | ||||||
|  | 		      DoubledGaugeField &UUU, | ||||||
|  | 		      const FermionField &in,  | ||||||
|  | 		      FermionField &out, | ||||||
|  | 		      int dag); | ||||||
|  |      | ||||||
|  |      | ||||||
|     // Constructors |     // Constructors | ||||||
|  |     //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     // Grid internal interface -- Thin link and fat link, with coefficients | ||||||
|  |     //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     ImprovedStaggeredFermion5D(GaugeField &_Uthin, |     ImprovedStaggeredFermion5D(GaugeField &_Uthin, | ||||||
| 			       GaugeField &_Ufat, | 			       GaugeField &_Ufat, | ||||||
| 			       GridCartesian         &FiveDimGrid, | 			       GridCartesian         &FiveDimGrid, | ||||||
| @@ -127,18 +149,38 @@ namespace QCD { | |||||||
| 			       GridCartesian         &FourDimGrid, | 			       GridCartesian         &FourDimGrid, | ||||||
| 			       GridRedBlackCartesian &FourDimRedBlackGrid, | 			       GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
| 			       double _mass, | 			       double _mass, | ||||||
| 			       RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, | 			       RealD _c1, RealD _c2,RealD _u0, | ||||||
|  | 			       const ImplParams &p= ImplParams()); | ||||||
|  |     //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     // MILC constructor ; triple links, no rescale factors; must be externally pre multiplied | ||||||
|  |     //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |     ImprovedStaggeredFermion5D(GridCartesian         &FiveDimGrid, | ||||||
|  | 			       GridRedBlackCartesian &FiveDimRedBlackGrid, | ||||||
|  | 			       GridCartesian         &FourDimGrid, | ||||||
|  | 			       GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
|  | 			       double _mass, | ||||||
|  | 			       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0, | ||||||
| 			       const ImplParams &p= ImplParams()); | 			       const ImplParams &p= ImplParams()); | ||||||
|  |  | ||||||
|     // DoubleStore |     // DoubleStore gauge field in operator | ||||||
|     void ImportGauge(const GaugeField &_U); |     void ImportGauge      (const GaugeField &_Uthin ) { assert(0); } | ||||||
|     void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat); |     void ImportGauge      (const GaugeField &_Uthin  ,const GaugeField &_Ufat); | ||||||
|  |     void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U); | ||||||
|  |     void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U); | ||||||
|  |     // Give a reference; can be used to do an assignment or copy back out after import | ||||||
|  |     // if Carleton wants to cache them and not use the ImportSimple | ||||||
|  |     DoubledGaugeField &GetU(void)   { return Umu ; } ; | ||||||
|  |     DoubledGaugeField &GetUUU(void) { return UUUmu; }; | ||||||
|  |     void CopyGaugeCheckerboards(void); | ||||||
|      |      | ||||||
|     /////////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////////// | ||||||
|     // Data members require to support the functionality |     // Data members require to support the functionality | ||||||
|     /////////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////////// | ||||||
|   public: |   public: | ||||||
|  |  | ||||||
|  |     virtual int   isTrivialEE(void) { return 1; }; | ||||||
|  |     virtual RealD Mass(void) { return mass; } | ||||||
|  |      | ||||||
|     GridBase *_FourDimGrid; |     GridBase *_FourDimGrid; | ||||||
|     GridBase *_FourDimRedBlackGrid; |     GridBase *_FourDimRedBlackGrid; | ||||||
|     GridBase *_FiveDimGrid; |     GridBase *_FiveDimGrid; | ||||||
|   | |||||||
| @@ -853,7 +853,7 @@ namespace QCD { | |||||||
|  |  | ||||||
|               a0 = a0 + incr; |               a0 = a0 + incr; | ||||||
|               a1 = a1 + incr; |               a1 = a1 + incr; | ||||||
|               a2 = a2 + sizeof(Simd::scalar_type); |               a2 = a2 + sizeof(typename Simd::scalar_type); | ||||||
|             } |             } | ||||||
|           } |           } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -396,6 +396,27 @@ namespace Grid { | |||||||
|       amax=zolo_hi; |       amax=zolo_hi; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     template<class Impl> | ||||||
|  |     void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d) | ||||||
|  |     { | ||||||
|  |       int Ls = this->Ls; | ||||||
|  |       conformable(solution5d._grid,this->FermionGrid()); | ||||||
|  |       conformable(exported4d._grid,this->GaugeGrid()); | ||||||
|  |       ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); | ||||||
|  |     } | ||||||
|  |     template<class Impl> | ||||||
|  |     void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) | ||||||
|  |     { | ||||||
|  |       int Ls = this->Ls; | ||||||
|  |       conformable(imported5d._grid,this->FermionGrid()); | ||||||
|  |       conformable(input4d._grid   ,this->GaugeGrid()); | ||||||
|  |       FermionField tmp(this->FermionGrid()); | ||||||
|  |       tmp=zero; | ||||||
|  |       InsertSlice(input4d, tmp, Ls-1, Ls-1); | ||||||
|  |       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; | ||||||
|  |       this->Dminus(tmp,imported5d); | ||||||
|  |     } | ||||||
|  |  | ||||||
|       // Constructors |       // Constructors | ||||||
|     template<class Impl> |     template<class Impl> | ||||||
|     PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu, |     PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu, | ||||||
|   | |||||||
| @@ -70,6 +70,12 @@ namespace Grid { | |||||||
|       // Efficient support for multigrid coarsening |       // Efficient support for multigrid coarsening | ||||||
|       virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); |       virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); | ||||||
|  |  | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|  |       // Physical surface field utilities | ||||||
|  |       /////////////////////////////////////////////////////////////// | ||||||
|  |       virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d); | ||||||
|  |       virtual void ImportPhysicalFermionSource  (const FermionField &input4d,FermionField &imported5d); | ||||||
|  |  | ||||||
|       // Constructors |       // Constructors | ||||||
|       PartialFractionFermion5D(GaugeField &_Umu, |       PartialFractionFermion5D(GaugeField &_Umu, | ||||||
| 			       GridCartesian         &FiveDimGrid, | 			       GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -32,223 +32,241 @@ namespace Grid { | |||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
| int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; | int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric; | ||||||
|  | int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute; | ||||||
|  |  | ||||||
|  | #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)		\ | ||||||
|  |   SE = st.GetEntry(ptype, Dir+skew, sF);			\ | ||||||
|  |   if (SE->_is_local ) {						\ | ||||||
|  |     if (SE->_permute) {						\ | ||||||
|  |       chi_p = &chi;						\ | ||||||
|  |       permute(chi,  in._odata[SE->_offset], ptype);		\ | ||||||
|  |     } else {							\ | ||||||
|  |       chi_p = &in._odata[SE->_offset];				\ | ||||||
|  |     }								\ | ||||||
|  |   } else {							\ | ||||||
|  |     chi_p = &buf[SE->_offset];					\ | ||||||
|  |   }								\ | ||||||
|  |   multLink(Uchi, U._odata[sU], *chi_p, Dir);			 | ||||||
|  |  | ||||||
|  | #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)		\ | ||||||
|  |   SE = st.GetEntry(ptype, Dir+skew, sF);			\ | ||||||
|  |   if (SE->_is_local ) {						\ | ||||||
|  |     if (SE->_permute) {						\ | ||||||
|  |       chi_p = &chi;						\ | ||||||
|  |       permute(chi,  in._odata[SE->_offset], ptype);		\ | ||||||
|  |     } else {							\ | ||||||
|  |       chi_p = &in._odata[SE->_offset];				\ | ||||||
|  |     }								\ | ||||||
|  |   } else if ( st.same_node[Dir] ) {				\ | ||||||
|  |     chi_p = &buf[SE->_offset];					\ | ||||||
|  |   }								\ | ||||||
|  |   if (SE->_is_local || st.same_node[Dir] ) {			\ | ||||||
|  |     multLink(Uchi, U._odata[sU], *chi_p, Dir);			\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)		\ | ||||||
|  |   SE = st.GetEntry(ptype, Dir+skew, sF);			\ | ||||||
|  |   if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\ | ||||||
|  |     nmu++;							\ | ||||||
|  |     chi_p = &buf[SE->_offset];					\ | ||||||
|  |     multLink(Uchi, U._odata[sU], *chi_p, Dir);			\ | ||||||
|  |   } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){}; | StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){}; | ||||||
|  |  | ||||||
| //////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Generic implementation; move to different file? | // Generic implementation; move to different file? | ||||||
| //////////////////////////////////////////// | // Int, Ext, Int+Ext cases for comms overlap | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////// | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 					   SiteSpinor *buf, int sF, | 					     DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 					   int sU, const FermionField &in, SiteSpinor &out,int threeLink) { | 					     SiteSpinor *buf, int LLs, int sU,  | ||||||
|  | 					     const FermionField &in, FermionField &out, int dag) { | ||||||
|   const SiteSpinor *chi_p; |   const SiteSpinor *chi_p; | ||||||
|   SiteSpinor chi; |   SiteSpinor chi; | ||||||
|   SiteSpinor Uchi; |   SiteSpinor Uchi; | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|   int ptype; |   int ptype; | ||||||
|   int skew = 0; |   int skew; | ||||||
|   if (threeLink) skew=8; |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Xp |  | ||||||
|   /////////////////////////// |  | ||||||
|  |  | ||||||
|   SE = st.GetEntry(ptype, Xp+skew, sF); |   for(int s=0;s<LLs;s++){ | ||||||
|   if (SE->_is_local) { |     int sF=LLs*sU+s; | ||||||
|     if (SE->_permute) { |     skew = 0; | ||||||
|       chi_p = &chi; |     GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink); | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |     GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd); | ||||||
|     } else { |     GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd); | ||||||
|       chi_p = &in._odata[SE->_offset]; |     GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd); | ||||||
|  |     skew=8; | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd); | ||||||
|  |     if ( dag ) {  | ||||||
|  |       Uchi = - Uchi; | ||||||
|     }  |     }  | ||||||
|   } else { |     vstream(out._odata[sF], Uchi); | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |   } | ||||||
|   Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Yp |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Yp+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Zp |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Zp+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Tp |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Tp+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Xm |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Xm+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Ym |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Ym+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Zm |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Zm+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm); |  | ||||||
|  |  | ||||||
|   /////////////////////////// |  | ||||||
|   // Tm |  | ||||||
|   /////////////////////////// |  | ||||||
|   SE = st.GetEntry(ptype, Tm+skew, sF); |  | ||||||
|   if (SE->_is_local) { |  | ||||||
|     if (SE->_permute) { |  | ||||||
|       chi_p = &chi; |  | ||||||
|       permute(chi,  in._odata[SE->_offset], ptype); |  | ||||||
|     } else { |  | ||||||
|       chi_p = &in._odata[SE->_offset]; |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     chi_p = &buf[SE->_offset]; |  | ||||||
|   } |  | ||||||
|   Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm); |  | ||||||
|  |  | ||||||
|   vstream(out, Uchi); |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  |   // Only contributions from interior of our node | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  | template <class Impl> | ||||||
|  | void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 						DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|  | 						SiteSpinor *buf, int LLs, int sU,  | ||||||
|  | 						const FermionField &in, FermionField &out,int dag) { | ||||||
|  |   const SiteSpinor *chi_p; | ||||||
|  |   SiteSpinor chi; | ||||||
|  |   SiteSpinor Uchi; | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int ptype; | ||||||
|  |   int skew ; | ||||||
|  |  | ||||||
|  |   for(int s=0;s<LLs;s++){ | ||||||
|  |     int sF=LLs*sU+s; | ||||||
|  |     skew = 0; | ||||||
|  |     Uchi=zero; | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd); | ||||||
|  |     skew=8; | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd); | ||||||
|  |     if ( dag ) { | ||||||
|  |       Uchi = - Uchi; | ||||||
|  |     } | ||||||
|  |     vstream(out._odata[sF], Uchi); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  |   // Only contributions from exterior of our node | ||||||
|  |   /////////////////////////////////////////////////// | ||||||
|  | template <class Impl> | ||||||
|  | void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 						DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|  | 						SiteSpinor *buf, int LLs, int sU, | ||||||
|  | 						const FermionField &in, FermionField &out,int dag) { | ||||||
|  |   const SiteSpinor *chi_p; | ||||||
|  |   SiteSpinor chi; | ||||||
|  |   SiteSpinor Uchi; | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int ptype; | ||||||
|  |   int nmu=0; | ||||||
|  |   int skew ; | ||||||
|  |  | ||||||
|  |   for(int s=0;s<LLs;s++){ | ||||||
|  |     int sF=LLs*sU+s; | ||||||
|  |     skew = 0; | ||||||
|  |     Uchi=zero; | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd); | ||||||
|  |     skew=8; | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); | ||||||
|  |     GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); | ||||||
|  |  | ||||||
|  |     if ( nmu ) {  | ||||||
|  |       if ( dag ) {  | ||||||
|  | 	out._odata[sF] = out._odata[sF] - Uchi; | ||||||
|  |       } else {  | ||||||
|  | 	out._odata[sF] = out._odata[sF] + Uchi; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Driving / wrapping routine to select right kernel | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, | void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 					 SiteSpinor *buf, int LLs, int sU, | 					 SiteSpinor *buf, int LLs, int sU, | ||||||
| 						  const FermionField &in, FermionField &out) { | 					 const FermionField &in, FermionField &out, | ||||||
|   SiteSpinor naik; | 					 int interior,int exterior) | ||||||
|   SiteSpinor naive; | { | ||||||
|   int oneLink  =0; |  | ||||||
|   int threeLink=1; |  | ||||||
|   int dag=1; |   int dag=1; | ||||||
|   switch(Opt) { |   DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); | ||||||
| #ifdef AVX512 | }; | ||||||
|   //FIXME; move the sign into the Asm routine |  | ||||||
|   case OptInlineAsm: | template <class Impl> | ||||||
|     DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); | void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|     for(int s=0;s<LLs;s++) { | 				      SiteSpinor *buf, int LLs, int sU, | ||||||
|       int sF=s+LLs*sU; | 				      const FermionField &in, FermionField &out, | ||||||
|       out._odata[sF]=-out._odata[sF]; | 				      int interior,int exterior) | ||||||
|     } | { | ||||||
|     break; |   int dag=0; | ||||||
| #endif |   DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior); | ||||||
|   case OptHandUnroll: |  | ||||||
|     DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag); |  | ||||||
|     break; |  | ||||||
|   case OptGeneric: |  | ||||||
|     for(int s=0;s<LLs;s++){ |  | ||||||
|        int sF=s+LLs*sU; |  | ||||||
|        DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); |  | ||||||
|        DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); |  | ||||||
|        out._odata[sF] =-naive-naik;  |  | ||||||
|      } |  | ||||||
|     break; |  | ||||||
|   default: |  | ||||||
|     std::cout<<"Oops Opt = "<<Opt<<std::endl; |  | ||||||
|     assert(0); |  | ||||||
|     break; |  | ||||||
|   } |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, | void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 				      SiteSpinor *buf, int LLs, | 				      SiteSpinor *buf, int LLs, | ||||||
| 				      int sU, const FermionField &in, FermionField &out)  | 				      int sU, const FermionField &in, FermionField &out, | ||||||
|  | 				      int dag,int interior,int exterior)  | ||||||
| { | { | ||||||
|   int oneLink  =0; |  | ||||||
|   int threeLink=1; |  | ||||||
|   SiteSpinor naik; |  | ||||||
|   SiteSpinor naive; |  | ||||||
|   int dag=0; |  | ||||||
|   switch(Opt) { |   switch(Opt) { | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   case OptInlineAsm: |   case OptInlineAsm: | ||||||
|     DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out); |     if ( interior && exterior ) { | ||||||
|  |       DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|  |     } else {  | ||||||
|  |       std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl; | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|     break; |     break; | ||||||
| #endif | #endif | ||||||
|   case OptHandUnroll: |   case OptHandUnroll: | ||||||
|  |     if ( interior && exterior ) { | ||||||
|       DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag); |       DhopSiteHand   (st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|  |     } else if ( interior ) { | ||||||
|  |       DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|  |     } else if ( exterior ) { | ||||||
|  |       DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|  |     } | ||||||
|     break; |     break; | ||||||
|   case OptGeneric: |   case OptGeneric: | ||||||
|     for(int s=0;s<LLs;s++){ |     if ( interior && exterior ) { | ||||||
|       int sF=LLs*sU+s; |       DhopSiteGeneric   (st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|       //      assert(sF<in._odata.size()); |     } else if ( interior ) { | ||||||
|       //      assert(sU< U._odata.size()); |       DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|       //      assert(sF>=0);      assert(sU>=0); |     } else if ( exterior ) { | ||||||
|       DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink); |       DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag); | ||||||
|       DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink); |  | ||||||
|       out._odata[sF] =naive+naik; |  | ||||||
|     } |     } | ||||||
|     break; |     break; | ||||||
|   default: |   default: | ||||||
|   | |||||||
| @@ -38,8 +38,9 @@ namespace QCD { | |||||||
| class StaggeredKernelsStatic {  | class StaggeredKernelsStatic {  | ||||||
|  public: |  public: | ||||||
|   enum { OptGeneric, OptHandUnroll, OptInlineAsm }; |   enum { OptGeneric, OptHandUnroll, OptInlineAsm }; | ||||||
|   // S-direction is INNERMOST and takes no part in the parity. |   enum { CommsAndCompute, CommsThenCompute }; | ||||||
|   static int Opt;  // these are a temporary hack |   static int Opt; | ||||||
|  |   static int Comms; | ||||||
| }; | }; | ||||||
|   |   | ||||||
| template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {  | template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {  | ||||||
| @@ -53,24 +54,62 @@ public: | |||||||
|    void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, |    void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, | ||||||
| 		      int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp); | 		      int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp); | ||||||
|  |  | ||||||
|    void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
| 		     int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink); |    // Generic Nc kernels | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 			DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
|  | 			SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 			const FermionField &in, FermionField &out,int dag); | ||||||
|  |    void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 			   DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
|  | 			   SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 			   const FermionField &in, FermionField &out,int dag); | ||||||
|  |    void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 			   DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|  | 			   SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 			   const FermionField &in, FermionField &out,int dag); | ||||||
|  |  | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    // Nc=3 specific kernels | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 		     DoubledGaugeField &U,DoubledGaugeField &UUU,  | ||||||
|  | 		     SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 		     const FermionField &in, FermionField &out,int dag); | ||||||
|  |    void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 			DoubledGaugeField &U,DoubledGaugeField &UUU,  | ||||||
|  | 			SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 			const FermionField &in, FermionField &out,int dag); | ||||||
|  |    void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 			DoubledGaugeField &U,DoubledGaugeField &UUU,  | ||||||
|  | 			SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 			const FermionField &in, FermionField &out,int dag); | ||||||
|  |  | ||||||
|    void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf, |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
| 		     int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink); |    // Asm Nc=3 specific kernels | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 		    DoubledGaugeField &U,DoubledGaugeField &UUU,  | ||||||
|  | 		    SiteSpinor * buf, int LLs, int sU,  | ||||||
|  | 		    const FermionField &in, FermionField &out,int dag); | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    // Generic interface; fan out to right routine | ||||||
|  |    /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    void DhopSite(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 		 DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
|  | 		 SiteSpinor * buf, int LLs, int sU, | ||||||
|  | 		 const FermionField &in, FermionField &out, int interior=1,int exterior=1); | ||||||
|  |  | ||||||
|    void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf, |    void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 		     int LLs, int sU, const FermionField &in, FermionField &out, int dag); | 		    DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
|  | 		    SiteSpinor * buf, int LLs, int sU, | ||||||
|  | 		    const FermionField &in, FermionField &out, int interior=1,int exterior=1); | ||||||
|  |  | ||||||
|    void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf, |    void DhopSite(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 			 int LLs, int sU, const FermionField &in, FermionField &out); | 		 DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
|        | 		 SiteSpinor * buf, int LLs, int sU, | ||||||
|    void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf, | 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior); | ||||||
| 		int sF, int sU, const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|    void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,  |  | ||||||
|                    int LLs, int sU, const FermionField &in, FermionField &out); |  | ||||||
|    |    | ||||||
| public: | public: | ||||||
|  |  | ||||||
|   | |||||||
| @@ -560,6 +560,27 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|        VSTORE(2,%0,pUChi_02)					\ |        VSTORE(2,%0,pUChi_02)					\ | ||||||
|        : : "r" (out) : "memory" ); |        : : "r" (out) : "memory" ); | ||||||
|  |  | ||||||
|  | #define nREDUCE(out)							\ | ||||||
|  |   asm (									\ | ||||||
|  |        VADD(UChi_00,UChi_10,UChi_00)					\ | ||||||
|  |        VADD(UChi_01,UChi_11,UChi_01)					\ | ||||||
|  |        VADD(UChi_02,UChi_12,UChi_02)					\ | ||||||
|  |        VADD(UChi_30,UChi_20,UChi_30)					\ | ||||||
|  |        VADD(UChi_31,UChi_21,UChi_31)					\ | ||||||
|  |        VADD(UChi_32,UChi_22,UChi_32)					\ | ||||||
|  |        VADD(UChi_00,UChi_30,UChi_00)					\ | ||||||
|  |        VADD(UChi_01,UChi_31,UChi_01)					\ | ||||||
|  |        VADD(UChi_02,UChi_32,UChi_02)				);	\ | ||||||
|  |   asm (VZERO(Chi_00)							\ | ||||||
|  |        VSUB(UChi_00,Chi_00,UChi_00)					\ | ||||||
|  |        VSUB(UChi_01,Chi_00,UChi_01)					\ | ||||||
|  |        VSUB(UChi_02,Chi_00,UChi_02)				);	\ | ||||||
|  |   asm (								\ | ||||||
|  |        VSTORE(0,%0,pUChi_00)					\ | ||||||
|  |        VSTORE(1,%0,pUChi_01)					\ | ||||||
|  |        VSTORE(2,%0,pUChi_02)					\ | ||||||
|  |        : : "r" (out) : "memory" ); | ||||||
|  |  | ||||||
| #define REDUCEa(out)					\ | #define REDUCEa(out)					\ | ||||||
|   asm (							\ |   asm (							\ | ||||||
|   VADD(UChi_00,UChi_10,UChi_00)				\ |   VADD(UChi_00,UChi_10,UChi_00)				\ | ||||||
| @@ -571,6 +592,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|        VSTORE(2,%0,pUChi_02)						\ |        VSTORE(2,%0,pUChi_02)						\ | ||||||
|        : : "r" (out) : "memory" ); |        : : "r" (out) : "memory" ); | ||||||
|  |  | ||||||
|  | // FIXME is sign right in the VSUB ? | ||||||
|  | #define nREDUCEa(out)					\ | ||||||
|  |   asm (							\ | ||||||
|  |   VADD(UChi_00,UChi_10,UChi_00)				\ | ||||||
|  |   VADD(UChi_01,UChi_11,UChi_01)				\ | ||||||
|  |   VADD(UChi_02,UChi_12,UChi_02)	);			\ | ||||||
|  |   asm (VZERO(Chi_00)							\ | ||||||
|  |        VSUB(UChi_00,Chi_00,UChi_00)					\ | ||||||
|  |        VSUB(UChi_01,Chi_00,UChi_01)					\ | ||||||
|  |        VSUB(UChi_02,Chi_00,UChi_02)				);	\ | ||||||
|  |   asm (									\ | ||||||
|  |        VSTORE(0,%0,pUChi_00)				\ | ||||||
|  |        VSTORE(1,%0,pUChi_01)				\ | ||||||
|  |        VSTORE(2,%0,pUChi_02)				\ | ||||||
|  |        : : "r" (out) : "memory" ); | ||||||
|  |  | ||||||
| #define PERMUTE_DIR(dir)			\ | #define PERMUTE_DIR(dir)			\ | ||||||
|       permute##dir(Chi_0,Chi_0);\ |       permute##dir(Chi_0,Chi_0);\ | ||||||
|       permute##dir(Chi_1,Chi_1);\ |       permute##dir(Chi_1,Chi_1);\ | ||||||
| @@ -581,10 +618,9 @@ namespace QCD { | |||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 					 DoubledGaugeField &U, | 					 DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 					 DoubledGaugeField &UUU, | 					 SiteSpinor *buf, int LLs, int sU,  | ||||||
| 					 SiteSpinor *buf, int LLs, | 					 const FermionField &in, FermionField &out,int dag)  | ||||||
| 					 int sU, const FermionField &in, FermionField &out)  |  | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| }; | }; | ||||||
| @@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, | |||||||
|   // This is the single precision 5th direction vectorised kernel |   // This is the single precision 5th direction vectorised kernel | ||||||
| #include <simd/Intel512single.h> | #include <simd/Intel512single.h> | ||||||
| template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 								    DoubledGaugeField &U, | 								    DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 								    DoubledGaugeField &UUU, | 								    SiteSpinor *buf, int LLs, int sU,  | ||||||
| 								    SiteSpinor *buf, int LLs, | 								    const FermionField &in, FermionField &out,int dag)  | ||||||
| 								    int sU, const FermionField &in, FermionField &out)  |  | ||||||
| { | { | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   uint64_t gauge0,gauge1,gauge2,gauge3; |   uint64_t gauge0,gauge1,gauge2,gauge3; | ||||||
| @@ -685,8 +720,12 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl | |||||||
|     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); |     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); | ||||||
|  |  | ||||||
|     addr0 = (uint64_t) &out._odata[sF]; |     addr0 = (uint64_t) &out._odata[sF]; | ||||||
|  |     if ( dag ) { | ||||||
|  |       nREDUCE(addr0); | ||||||
|  |     } else {  | ||||||
|       REDUCE(addr0); |       REDUCE(addr0); | ||||||
|     } |     } | ||||||
|  |    } | ||||||
| #else  | #else  | ||||||
|     assert(0); |     assert(0); | ||||||
| #endif | #endif | ||||||
| @@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl | |||||||
|  |  | ||||||
| #include <simd/Intel512double.h> | #include <simd/Intel512double.h> | ||||||
| template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 								    DoubledGaugeField &U, | 								    DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 								    DoubledGaugeField &UUU, | 								    SiteSpinor *buf, int LLs, int sU,  | ||||||
| 								    SiteSpinor *buf, int LLs, | 								    const FermionField &in, FermionField &out,int dag)  | ||||||
| 								    int sU, const FermionField &in, FermionField &out)  |  | ||||||
| { | { | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   uint64_t gauge0,gauge1,gauge2,gauge3; |   uint64_t gauge0,gauge1,gauge2,gauge3; | ||||||
| @@ -734,8 +772,12 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl | |||||||
|     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); |     MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3); | ||||||
|  |  | ||||||
|     addr0 = (uint64_t) &out._odata[sF]; |     addr0 = (uint64_t) &out._odata[sF]; | ||||||
|  |     if ( dag ) { | ||||||
|  |       nREDUCE(addr0); | ||||||
|  |     } else {  | ||||||
|       REDUCE(addr0); |       REDUCE(addr0); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
| #else  | #else  | ||||||
|   assert(0); |   assert(0); | ||||||
| #endif | #endif | ||||||
| @@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl | |||||||
|  |  | ||||||
| #include <simd/Intel512single.h> | #include <simd/Intel512single.h> | ||||||
| template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 								    DoubledGaugeField &U, | 							       DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 								    DoubledGaugeField &UUU, | 							       SiteSpinor *buf, int LLs, int sU,  | ||||||
| 								    SiteSpinor *buf, int LLs, | 							       const FermionField &in, FermionField &out,int dag)  | ||||||
| 								    int sU, const FermionField &in, FermionField &out)  |  | ||||||
| { | { | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   uint64_t gauge0,gauge1,gauge2,gauge3; |   uint64_t gauge0,gauge1,gauge2,gauge3; | ||||||
| @@ -832,8 +873,12 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, | |||||||
|     MULT_ADD_XYZT(gauge2,gauge3);   |     MULT_ADD_XYZT(gauge2,gauge3);   | ||||||
|  |  | ||||||
|     addr0 = (uint64_t) &out._odata[sF]; |     addr0 = (uint64_t) &out._odata[sF]; | ||||||
|  |     if ( dag ) {  | ||||||
|  |       nREDUCEa(addr0); | ||||||
|  |     } else {  | ||||||
|       REDUCEa(addr0); |       REDUCEa(addr0); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
| #else  | #else  | ||||||
|   assert(0); |   assert(0); | ||||||
| #endif | #endif | ||||||
| @@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, | |||||||
|  |  | ||||||
| #include <simd/Intel512double.h> | #include <simd/Intel512double.h> | ||||||
| template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 								    DoubledGaugeField &U, | 							       DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
| 								    DoubledGaugeField &UUU, | 							       SiteSpinor *buf, int LLs, int sU,  | ||||||
| 								    SiteSpinor *buf, int LLs, | 							       const FermionField &in, FermionField &out,int dag)  | ||||||
| 								    int sU, const FermionField &in, FermionField &out)  |  | ||||||
| { | { | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   uint64_t gauge0,gauge1,gauge2,gauge3; |   uint64_t gauge0,gauge1,gauge2,gauge3; | ||||||
| @@ -897,8 +941,12 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, | |||||||
|     MULT_ADD_XYZT(gauge2,gauge3);   |     MULT_ADD_XYZT(gauge2,gauge3);   | ||||||
|      |      | ||||||
|     addr0 = (uint64_t) &out._odata[sF]; |     addr0 = (uint64_t) &out._odata[sF]; | ||||||
|  |     if ( dag ) { | ||||||
|  |       nREDUCEa(addr0); | ||||||
|  |     } else {  | ||||||
|       REDUCEa(addr0); |       REDUCEa(addr0); | ||||||
|     } |     } | ||||||
|  |   } | ||||||
| #else  | #else  | ||||||
|   assert(0); |   assert(0); | ||||||
| #endif | #endif | ||||||
| @@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, | |||||||
| 				  DoubledGaugeField &U,			\ | 				  DoubledGaugeField &U,			\ | ||||||
| 				  DoubledGaugeField &UUU,		\ | 				  DoubledGaugeField &UUU,		\ | ||||||
| 				  SiteSpinor *buf, int LLs,		\ | 				  SiteSpinor *buf, int LLs,		\ | ||||||
| 				  int sU, const FermionField &in, FermionField &out); | 				  int sU, const FermionField &in, FermionField &out,int dag); | ||||||
|  |  | ||||||
| KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); | KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD); | ||||||
| KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); | KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF); | ||||||
|   | |||||||
| @@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid.h> | #include <Grid.h> | ||||||
|  |  | ||||||
| #define REGISTER |  | ||||||
|  |  | ||||||
| #define LOAD_CHI(b)		\ | #define LOAD_CHI(b)		\ | ||||||
|   const SiteSpinor & ref (b[offset]);	\ |   const SiteSpinor & ref (b[offset]);	\ | ||||||
| @@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     UChi ## _1 += U_12*Chi_2;\ |     UChi ## _1 += U_12*Chi_2;\ | ||||||
|     UChi ## _2 += U_22*Chi_2; |     UChi ## _2 += U_22*Chi_2; | ||||||
|  |  | ||||||
| #define MULT_ADD(A,UChi)				\ | #define MULT_ADD(U,A,UChi)			\ | ||||||
|   auto & ref(U._odata[sU](A));			\ |   auto & ref(U._odata[sU](A));			\ | ||||||
|    Impl::loadLinkElement(U_00,ref()(0,0));      \ |    Impl::loadLinkElement(U_00,ref()(0,0));      \ | ||||||
|    Impl::loadLinkElement(U_10,ref()(1,0));      \ |    Impl::loadLinkElement(U_10,ref()(1,0));      \ | ||||||
| @@ -86,237 +85,315 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|   permute##dir(Chi_1,Chi_1);			\ |   permute##dir(Chi_1,Chi_1);			\ | ||||||
|   permute##dir(Chi_2,Chi_2); |   permute##dir(Chi_2,Chi_2); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew)	\ | ||||||
|  |   SE=st.GetEntry(ptype,Dir+skew,sF);	\ | ||||||
|  |   offset = SE->_offset;			\ | ||||||
|  |   local  = SE->_is_local;		\ | ||||||
|  |   perm   = SE->_permute;		\ | ||||||
|  |   if ( local ) {						\ | ||||||
|  |     LOAD_CHI(in._odata);					\ | ||||||
|  |     if ( perm) {						\ | ||||||
|  |       PERMUTE_DIR(Perm);					\ | ||||||
|  |     }								\ | ||||||
|  |   } else {							\ | ||||||
|  |     LOAD_CHI(buf);						\ | ||||||
|  |   }								 | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even)		\ | ||||||
|  |   HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\ | ||||||
|  |   {								\ | ||||||
|  |     MULT(Dir,even);						\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG(U,Dir,Perm,skew,even)			\ | ||||||
|  |   HAND_STENCIL_LEG_BASE(Dir,Perm,skew)				\ | ||||||
|  |   {								\ | ||||||
|  |     MULT_ADD(U,Dir,even);					\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even)	\ | ||||||
|  |   SE=st.GetEntry(ptype,Dir+skew,sF);			\ | ||||||
|  |   offset = SE->_offset;					\ | ||||||
|  |   local  = SE->_is_local;				\ | ||||||
|  |   perm   = SE->_permute;				\ | ||||||
|  |   if ( local ) {					\ | ||||||
|  |     LOAD_CHI(in._odata);				\ | ||||||
|  |     if ( perm) {					\ | ||||||
|  |       PERMUTE_DIR(Perm);				\ | ||||||
|  |     }							\ | ||||||
|  |   } else if ( st.same_node[Dir] ) {			\ | ||||||
|  |     LOAD_CHI(buf);					\ | ||||||
|  |   }							\ | ||||||
|  |   if (SE->_is_local || st.same_node[Dir] ) {		\ | ||||||
|  |     MULT_ADD(U,Dir,even);				\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even)	\ | ||||||
|  |   SE=st.GetEntry(ptype,Dir+skew,sF);			\ | ||||||
|  |   offset = SE->_offset;					\ | ||||||
|  |   local  = SE->_is_local;				\ | ||||||
|  |   perm   = SE->_permute;				\ | ||||||
|  |   if ((!SE->_is_local) && (!st.same_node[Dir]) ) {		\ | ||||||
|  |     nmu++;							\ | ||||||
|  |     { LOAD_CHI(buf);	  }					\ | ||||||
|  |     { MULT_ADD(U,Dir,even); }					\ | ||||||
|  |   }								 | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, | void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,  | ||||||
| 					  SiteSpinor *buf, int LLs, | 					  DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
| 					  int sU, const FermionField &in, FermionField &out, int dag)  | 					  SiteSpinor *buf, int LLs, int sU,  | ||||||
| { | 					  const FermionField &in, FermionField &out,int dag)  | ||||||
|   SiteSpinor naik;  |  | ||||||
|   SiteSpinor naive; |  | ||||||
|   int oneLink  =0; |  | ||||||
|   int threeLink=1; |  | ||||||
|   int skew(0); |  | ||||||
|   Real scale(1.0); |  | ||||||
|    |  | ||||||
|   if(dag) scale = -1.0; |  | ||||||
|    |  | ||||||
|   for(int s=0;s<LLs;s++){ |  | ||||||
|     int sF=s+LLs*sU; |  | ||||||
|     DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink); |  | ||||||
|     DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink); |  | ||||||
|     out._odata[sF] =scale*(naive+naik); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 					       SiteSpinor *buf, int sF, |  | ||||||
| 					       int sU, const FermionField &in, SiteSpinor &out,int threeLink)  |  | ||||||
| { | { | ||||||
|   typedef typename Simd::scalar_type S; |   typedef typename Simd::scalar_type S; | ||||||
|   typedef typename Simd::vector_type V; |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|   REGISTER Simd even_0; // 12 regs on knc |   Simd even_0; // 12 regs on knc | ||||||
|   REGISTER Simd even_1; |   Simd even_1; | ||||||
|   REGISTER Simd even_2; |   Simd even_2; | ||||||
|   REGISTER Simd odd_0; // 12 regs on knc |   Simd odd_0; // 12 regs on knc | ||||||
|   REGISTER Simd odd_1; |   Simd odd_1; | ||||||
|   REGISTER Simd odd_2; |   Simd odd_2; | ||||||
|  |  | ||||||
|   REGISTER Simd Chi_0;    // two spinor; 6 regs |   Simd Chi_0;    // two spinor; 6 regs | ||||||
|   REGISTER Simd Chi_1; |   Simd Chi_1; | ||||||
|   REGISTER Simd Chi_2; |   Simd Chi_2; | ||||||
|    |    | ||||||
|   REGISTER Simd U_00;  // two rows of U matrix |   Simd U_00;  // two rows of U matrix | ||||||
|   REGISTER Simd U_10; |   Simd U_10; | ||||||
|   REGISTER Simd U_20;   |   Simd U_20;   | ||||||
|   REGISTER Simd U_01; |   Simd U_01; | ||||||
|   REGISTER Simd U_11; |   Simd U_11; | ||||||
|   REGISTER Simd U_21;  // 2 reg left. |   Simd U_21;  // 2 reg left. | ||||||
|   REGISTER Simd U_02; |   Simd U_02; | ||||||
|   REGISTER Simd U_12; |   Simd U_12; | ||||||
|   REGISTER Simd U_22;  |   Simd U_22;  | ||||||
|  |  | ||||||
|   int skew = 0; |  | ||||||
|   if (threeLink) skew=8; |  | ||||||
|  |  | ||||||
|  |   SiteSpinor result; | ||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|  |  | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|  |   int skew; | ||||||
|  |  | ||||||
|   // Xp |   for(int s=0;s<LLs;s++){ | ||||||
|   SE=st.GetEntry(ptype,Xp+skew,sF); |     int sF=s+LLs*sU; | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|  |  | ||||||
|   if ( local ) { |     skew = 0; | ||||||
|     LOAD_CHI(in._odata); |     HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);   | ||||||
|     if ( perm) { |     HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);    | ||||||
|       PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |     HAND_STENCIL_LEG      (U,Zp,1,skew,even);   | ||||||
|     } |     HAND_STENCIL_LEG      (U,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG      (U,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG      (U,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG      (U,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG      (U,Tm,0,skew,odd);   | ||||||
|  |     skew = 8; | ||||||
|  |     HAND_STENCIL_LEG(UUU,Xp,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG(UUU,Zp,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG(UUU,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG(UUU,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);   | ||||||
|  |      | ||||||
|  |     if ( dag ) { | ||||||
|  |       result()()(0) = - even_0 - odd_0; | ||||||
|  |       result()()(1) = - even_1 - odd_1; | ||||||
|  |       result()()(2) = - even_2 - odd_2; | ||||||
|     } else {  |     } else {  | ||||||
|     LOAD_CHI(buf); |       result()()(0) = even_0 + odd_0; | ||||||
|  |       result()()(1) = even_1 + odd_1; | ||||||
|  |       result()()(2) = even_2 + odd_2; | ||||||
|     } |     } | ||||||
|  |     vstream(out._odata[sF],result); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 					     DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|  | 					     SiteSpinor *buf, int LLs, int sU,  | ||||||
|  | 					     const FermionField &in, FermionField &out,int dag)  | ||||||
| { | { | ||||||
|     MULT(Xp,even); |   typedef typename Simd::scalar_type S; | ||||||
|   } |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|   // Yp |   Simd even_0; // 12 regs on knc | ||||||
|   SE=st.GetEntry(ptype,Yp+skew,sF); |   Simd even_1; | ||||||
|   offset = SE->_offset; |   Simd even_2; | ||||||
|   local  = SE->_is_local; |   Simd odd_0; // 12 regs on knc | ||||||
|   perm   = SE->_permute; |   Simd odd_1; | ||||||
|  |   Simd odd_2; | ||||||
|  |  | ||||||
|   if ( local ) { |   Simd Chi_0;    // two spinor; 6 regs | ||||||
|     LOAD_CHI(in._odata); |   Simd Chi_1; | ||||||
|     if ( perm) { |   Simd Chi_2; | ||||||
|       PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |    | ||||||
|     } |   Simd U_00;  // two rows of U matrix | ||||||
|  |   Simd U_10; | ||||||
|  |   Simd U_20;   | ||||||
|  |   Simd U_01; | ||||||
|  |   Simd U_11; | ||||||
|  |   Simd U_21;  // 2 reg left. | ||||||
|  |   Simd U_02; | ||||||
|  |   Simd U_12; | ||||||
|  |   Simd U_22;  | ||||||
|  |  | ||||||
|  |   SiteSpinor result; | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |  | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int skew; | ||||||
|  |  | ||||||
|  |   for(int s=0;s<LLs;s++){ | ||||||
|  |     int sF=s+LLs*sU; | ||||||
|  |  | ||||||
|  |     even_0 = zero;    even_1 = zero;    even_2 = zero; | ||||||
|  |      odd_0 = zero;     odd_1 = zero;     odd_2 = zero; | ||||||
|  |  | ||||||
|  |     skew = 0; | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);   | ||||||
|  |     skew = 8; | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);   | ||||||
|  |  | ||||||
|  |     // Assume every site must be connected to at least one interior point. No 1^4 subvols. | ||||||
|  |     if ( dag ) { | ||||||
|  |       result()()(0) = - even_0 - odd_0; | ||||||
|  |       result()()(1) = - even_1 - odd_1; | ||||||
|  |       result()()(2) = - even_2 - odd_2; | ||||||
|     } else {  |     } else {  | ||||||
|     LOAD_CHI(buf); |       result()()(0) = even_0 + odd_0; | ||||||
|  |       result()()(1) = even_1 + odd_1; | ||||||
|  |       result()()(2) = even_2 + odd_2; | ||||||
|     } |     } | ||||||
|  |     vstream(out._odata[sF],result); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,  | ||||||
|  | 					     DoubledGaugeField &U, DoubledGaugeField &UUU, | ||||||
|  | 					     SiteSpinor *buf, int LLs, int sU,  | ||||||
|  | 					     const FermionField &in, FermionField &out,int dag)  | ||||||
| { | { | ||||||
|     MULT(Yp,odd); |   typedef typename Simd::scalar_type S; | ||||||
|   } |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   Simd even_0; // 12 regs on knc | ||||||
|  |   Simd even_1; | ||||||
|  |   Simd even_2; | ||||||
|  |   Simd odd_0; // 12 regs on knc | ||||||
|  |   Simd odd_1; | ||||||
|  |   Simd odd_2; | ||||||
|  |  | ||||||
|   // Zp |   Simd Chi_0;    // two spinor; 6 regs | ||||||
|   SE=st.GetEntry(ptype,Zp+skew,sF); |   Simd Chi_1; | ||||||
|   offset = SE->_offset; |   Simd Chi_2; | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |    | ||||||
|   if ( local ) { |   Simd U_00;  // two rows of U matrix | ||||||
|     LOAD_CHI(in._odata); |   Simd U_10; | ||||||
|     if ( perm) { |   Simd U_20;   | ||||||
|       PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |   Simd U_01; | ||||||
|     } |   Simd U_11; | ||||||
|  |   Simd U_21;  // 2 reg left. | ||||||
|  |   Simd U_02; | ||||||
|  |   Simd U_12; | ||||||
|  |   Simd U_22;  | ||||||
|  |  | ||||||
|  |   SiteSpinor result; | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |  | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int skew; | ||||||
|  |  | ||||||
|  |   for(int s=0;s<LLs;s++){ | ||||||
|  |     int sF=s+LLs*sU; | ||||||
|  |  | ||||||
|  |     even_0 = zero;    even_1 = zero;    even_2 = zero; | ||||||
|  |      odd_0 = zero;     odd_1 = zero;     odd_2 = zero; | ||||||
|  |     int nmu=0; | ||||||
|  |     skew = 0; | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);   | ||||||
|  |     skew = 8; | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);    | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);   | ||||||
|  |     HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);   | ||||||
|  |  | ||||||
|  |     // Add sum of all exterior connected stencil legs | ||||||
|  |     if ( nmu ) {  | ||||||
|  |       if ( dag ) { | ||||||
|  | 	result()()(0) = - even_0 - odd_0; | ||||||
|  | 	result()()(1) = - even_1 - odd_1; | ||||||
|  | 	result()()(2) = - even_2 - odd_2; | ||||||
|       } else {  |       } else {  | ||||||
|     LOAD_CHI(buf); | 	result()()(0) = even_0 + odd_0; | ||||||
|  | 	result()()(1) = even_1 + odd_1; | ||||||
|  | 	result()()(2) = even_2 + odd_2; | ||||||
|  |       } | ||||||
|  |       out._odata[sF] = out._odata[sF] + result; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|   { |  | ||||||
|     MULT_ADD(Zp,even); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|   // Tp |  | ||||||
|   SE=st.GetEntry(ptype,Tp+skew,sF); |  | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |  | ||||||
|   if ( local ) { |  | ||||||
|     LOAD_CHI(in._odata); |  | ||||||
|     if ( perm) { |  | ||||||
|       PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |  | ||||||
|     } |  | ||||||
|   } else {  |  | ||||||
|     LOAD_CHI(buf); |  | ||||||
|   } |  | ||||||
|   { |  | ||||||
|     MULT_ADD(Tp,odd); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Xm |  | ||||||
|   SE=st.GetEntry(ptype,Xm+skew,sF); |  | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |  | ||||||
|   if ( local ) { |  | ||||||
|     LOAD_CHI(in._odata); |  | ||||||
|     if ( perm) { |  | ||||||
|       PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |  | ||||||
|     } |  | ||||||
|   } else {  |  | ||||||
|     LOAD_CHI(buf); |  | ||||||
|   } |  | ||||||
|   { |  | ||||||
|     MULT_ADD(Xm,even); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|    |  | ||||||
|   // Ym |  | ||||||
|   SE=st.GetEntry(ptype,Ym+skew,sF); |  | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |  | ||||||
|   if ( local ) { |  | ||||||
|     LOAD_CHI(in._odata); |  | ||||||
|     if ( perm) { |  | ||||||
|       PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |  | ||||||
|     } |  | ||||||
|   } else {  |  | ||||||
|     LOAD_CHI(buf); |  | ||||||
|   } |  | ||||||
|   { |  | ||||||
|     MULT_ADD(Ym,odd); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Zm |  | ||||||
|   SE=st.GetEntry(ptype,Zm+skew,sF); |  | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |  | ||||||
|   if ( local ) { |  | ||||||
|     LOAD_CHI(in._odata); |  | ||||||
|     if ( perm) { |  | ||||||
|       PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |  | ||||||
|     } |  | ||||||
|   } else {  |  | ||||||
|     LOAD_CHI(buf); |  | ||||||
|   } |  | ||||||
|   { |  | ||||||
|     MULT_ADD(Zm,even); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Tm |  | ||||||
|   SE=st.GetEntry(ptype,Tm+skew,sF); |  | ||||||
|   offset = SE->_offset; |  | ||||||
|   local  = SE->_is_local; |  | ||||||
|   perm   = SE->_permute; |  | ||||||
|    |  | ||||||
|   if ( local ) { |  | ||||||
|     LOAD_CHI(in._odata); |  | ||||||
|     if ( perm) { |  | ||||||
|       PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... |  | ||||||
|     } |  | ||||||
|   } else {  |  | ||||||
|     LOAD_CHI(buf); |  | ||||||
|   } |  | ||||||
|   { |  | ||||||
|     MULT_ADD(Tm,odd); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   vstream(out()()(0),even_0+odd_0); |  | ||||||
|   vstream(out()()(1),even_1+odd_1); |  | ||||||
|   vstream(out()()(2),even_2+odd_2); |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\ | #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\ | ||||||
|   template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ |   template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ | ||||||
| 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \ | 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \ | ||||||
| 						     SiteSpinor *buf, int LLs, \ | 						     SiteSpinor *buf, int LLs, int sU, \ | ||||||
| 						     int sU, const FermionField &in, FermionField &out, int dag); | 						     const FermionField &in, FermionField &out, int dag); \ | ||||||
|  | 									\ | ||||||
|  |   template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ | ||||||
|  | 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \ | ||||||
|  | 						     SiteSpinor *buf, int LLs, int sU, \ | ||||||
|  | 						     const FermionField &in, FermionField &out, int dag); \ | ||||||
|  | 									\ | ||||||
|  |   template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ | ||||||
|  | 						     DoubledGaugeField &U,DoubledGaugeField &UUU, \ | ||||||
|  | 						     SiteSpinor *buf, int LLs, int sU, \ | ||||||
|  | 						     const FermionField &in, FermionField &out, int dag); \ | ||||||
|  |  | ||||||
| #define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL)				\ |  | ||||||
|   template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \ |  | ||||||
| 							  SiteSpinor *buf, int sF, \ |  | ||||||
| 							  int sU, const FermionField &in, SiteSpinor &out,int threeLink) ; |  | ||||||
| DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); | DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD); | ||||||
| DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); | DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF); | ||||||
| DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); | DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD); | ||||||
| DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); | DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF); | ||||||
|  |  | ||||||
| DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD); |  | ||||||
| DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF); |  | ||||||
| DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD); |  | ||||||
| DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF); |  | ||||||
|  |  | ||||||
| }} | } | ||||||
|  | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, | |||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   /* Compress includes precision change if mpi data is not same */ |   /* Compress includes precision change if mpi data is not same */ | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) { |   inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) { | ||||||
|     projector::Proj(buf[o],in,mu,dag); |     SiteHalfSpinor tmp; | ||||||
|  |     projector::Proj(tmp,in,mu,dag); | ||||||
|  |     vstream(buf[o],tmp); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   /* Exchange includes precision change if mpi data is not same */ |   /* Exchange includes precision change if mpi data is not same */ | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   inline void Exchange(SiteHalfSpinor *mp, |   inline void Exchange(SiteHalfSpinor * __restrict__ mp, | ||||||
|                        SiteHalfSpinor *vp0, |                        const SiteHalfSpinor * __restrict__ vp0, | ||||||
|                        SiteHalfSpinor *vp1, |                        const SiteHalfSpinor * __restrict__ vp1, | ||||||
| 		       Integer type,Integer o){ | 		       Integer type,Integer o){ | ||||||
|     exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); |     SiteHalfSpinor tmp1; | ||||||
|  |     SiteHalfSpinor tmp2; | ||||||
|  |     exchange(tmp1,tmp2,vp0[o],vp1[o],type); | ||||||
|  |     vstream(mp[2*o  ],tmp1); | ||||||
|  |     vstream(mp[2*o+1],tmp2); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   /* Have a decompression step if mpi data is not same */ |   /* Have a decompression step if mpi data is not same */ | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   inline void Decompress(SiteHalfSpinor *out, |   inline void Decompress(SiteHalfSpinor * __restrict__ out, | ||||||
| 			 SiteHalfSpinor *in, Integer o) {     | 			 SiteHalfSpinor * __restrict__ in, Integer o) {     | ||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   /* Compress Exchange                                 */ |   /* Compress Exchange                                 */ | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
|   inline void CompressExchange(SiteHalfSpinor *out0, |   inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, | ||||||
| 			       SiteHalfSpinor *out1, | 			       SiteHalfSpinor * __restrict__ out1, | ||||||
| 			       const SiteSpinor *in, | 			       const SiteSpinor * __restrict__ in, | ||||||
| 			       Integer j,Integer k, Integer m,Integer type){ | 			       Integer j,Integer k, Integer m,Integer type){ | ||||||
|     SiteHalfSpinor temp1, temp2,temp3,temp4; |     SiteHalfSpinor temp1, temp2,temp3,temp4; | ||||||
|     projector::Proj(temp1,in[k],mu,dag); |     projector::Proj(temp1,in[k],mu,dag); | ||||||
|     projector::Proj(temp2,in[m],mu,dag); |     projector::Proj(temp2,in[m],mu,dag); | ||||||
|     exchange(out0[j],out1[j],temp1,temp2,type); |     exchange(temp3,temp4,temp1,temp2,type); | ||||||
|  |     vstream(out0[j],temp3); | ||||||
|  |     vstream(out1[j],temp4); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /*****************************************************/ |   /*****************************************************/ | ||||||
| @@ -266,41 +274,16 @@ public: | |||||||
|     if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; |     if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   std::vector<int> same_node; |  | ||||||
|   std::vector<int> surface_list; |  | ||||||
|  |  | ||||||
|   WilsonStencil(GridBase *grid, |   WilsonStencil(GridBase *grid, | ||||||
| 		int npoints, | 		int npoints, | ||||||
| 		int checkerboard, | 		int checkerboard, | ||||||
| 		const std::vector<int> &directions, | 		const std::vector<int> &directions, | ||||||
| 		const std::vector<int> &distances)   | 		const std::vector<int> &distances)   | ||||||
|     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , |     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)  | ||||||
|     same_node(npoints) |  | ||||||
|   {  |   {  | ||||||
|     ZeroCountersi(); |     ZeroCountersi(); | ||||||
|     surface_list.resize(0); |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   void BuildSurfaceList(int Ls,int vol4){ |  | ||||||
|  |  | ||||||
|     // find same node for SHM |  | ||||||
|     // Here we know the distance is 1 for WilsonStencil |  | ||||||
|     for(int point=0;point<this->_npoints;point++){ |  | ||||||
|       same_node[point] = this->SameNode(point); |  | ||||||
|     } |  | ||||||
|      |  | ||||||
|     for(int site = 0 ;site< vol4;site++){ |  | ||||||
|       int local = 1; |  | ||||||
|       for(int point=0;point<this->_npoints;point++){ |  | ||||||
| 	if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){  |  | ||||||
| 	  local = 0; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|       if(local == 0) {  |  | ||||||
| 	surface_list.push_back(site); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   template < class compressor> |   template < class compressor> | ||||||
|   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  | ||||||
| @@ -361,23 +344,23 @@ public: | |||||||
|     int dag = compress.dag; |     int dag = compress.dag; | ||||||
|     int face_idx=0; |     int face_idx=0; | ||||||
|     if ( dag ) {  |     if ( dag ) {  | ||||||
|       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); |       assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); | ||||||
|       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); |       assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); | ||||||
|       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); |       assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); | ||||||
|       assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); |       assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); | ||||||
|       assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); |       assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); | ||||||
|       assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); |       assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); | ||||||
|       assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); |       assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); | ||||||
|       assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); |       assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); | ||||||
|     } else { |     } else { | ||||||
|       assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); |       assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); | ||||||
|       assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); |       assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); | ||||||
|       assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); |       assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); | ||||||
|       assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); |       assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); | ||||||
|       assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); |       assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); | ||||||
|       assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); |       assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); | ||||||
|       assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); |       assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); | ||||||
|       assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); |       assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); | ||||||
|     } |     } | ||||||
|     this->face_table_computed=1; |     this->face_table_computed=1; | ||||||
|     assert(this->u_comm_offset==this->_unified_buffer_size); |     assert(this->u_comm_offset==this->_unified_buffer_size); | ||||||
|   | |||||||
| @@ -348,15 +348,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out, | |||||||
|   parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { |   parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|     Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); |     Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma); | ||||||
|   } |   } | ||||||
| }; | }  | ||||||
|  | /*Change starts*/ | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | ||||||
|                                        DoubledGaugeField &U, |                                        DoubledGaugeField &U, | ||||||
|                                        const FermionField &in, |                                        const FermionField &in, | ||||||
|                                        FermionField &out, int dag) { |                                        FermionField &out, int dag) { | ||||||
|   assert((dag == DaggerNo) || (dag == DaggerYes)); | #ifdef GRID_OMP | ||||||
|  |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) | ||||||
|  |     DhopInternalOverlappedComms(st,lo,U,in,out,dag); | ||||||
|  |   else | ||||||
|  | #endif  | ||||||
|  |     DhopInternalSerial(st,lo,U,in,out,dag); | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, | ||||||
|  |                                        DoubledGaugeField &U, | ||||||
|  |                                        const FermionField &in, | ||||||
|  |                                        FermionField &out, int dag) { | ||||||
|  |   assert((dag == DaggerNo) || (dag == DaggerYes)); | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |   Compressor compressor; | ||||||
|  |   int len =  U._grid->oSites(); | ||||||
|  |   const int LLs =  1; | ||||||
|  |  | ||||||
|  |   st.Prepare(); | ||||||
|  |   st.HaloGather(in,compressor); | ||||||
|  |   st.CommsMergeSHM(compressor); | ||||||
|  | #pragma omp parallel | ||||||
|  |   { | ||||||
|  |     int tid = omp_get_thread_num(); | ||||||
|  |     int nthreads = omp_get_num_threads(); | ||||||
|  |     int ncomms = CartesianCommunicator::nCommThreads; | ||||||
|  |     if (ncomms == -1) ncomms = 1; | ||||||
|  |     assert(nthreads > ncomms); | ||||||
|  |     if (tid >= ncomms) { | ||||||
|  |       nthreads -= ncomms; | ||||||
|  |       int ttid  = tid - ncomms; | ||||||
|  |       int n     = len; | ||||||
|  |       int chunk = n / nthreads; | ||||||
|  |       int rem   = n % nthreads; | ||||||
|  |       int myblock, myn; | ||||||
|  |       if (ttid < rem) { | ||||||
|  |         myblock = ttid * chunk + ttid; | ||||||
|  |         myn = chunk+1; | ||||||
|  |       } else { | ||||||
|  |         myblock = ttid*chunk + rem; | ||||||
|  |         myn = chunk; | ||||||
|  |       } | ||||||
|  |       // do the compute | ||||||
|  |      if (dag == DaggerYes) { | ||||||
|  |  | ||||||
|  |         for (int sss = myblock; sss < myblock+myn; ++sss) { | ||||||
|  |          Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); | ||||||
|  |        } | ||||||
|  |      } else { | ||||||
|  |         for (int sss = myblock; sss < myblock+myn; ++sss) { | ||||||
|  |          Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); | ||||||
|  |        } | ||||||
|  |     } //else | ||||||
|  |  | ||||||
|  |     } else { | ||||||
|  |       st.CommunicateThreaded(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   Compressor compressor(dag); | ||||||
|  |  | ||||||
|  |   if (dag == DaggerYes) { | ||||||
|  |     parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|  |       Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|  |       Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   }  //pragma | ||||||
|  | #else | ||||||
|  |   assert(0); | ||||||
|  | #endif | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, | ||||||
|  |                                        DoubledGaugeField &U, | ||||||
|  |                                        const FermionField &in, | ||||||
|  |                                        FermionField &out, int dag) { | ||||||
|  |   assert((dag == DaggerNo) || (dag == DaggerYes)); | ||||||
|   Compressor compressor(dag); |   Compressor compressor(dag); | ||||||
|   st.HaloExchange(in, compressor); |   st.HaloExchange(in, compressor); | ||||||
|  |  | ||||||
| @@ -370,6 +453,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  | /*Change ends */ | ||||||
|  |  | ||||||
| /******************************************************************************* | /******************************************************************************* | ||||||
|  * Conserved current utilities for Wilson fermions, for contracting propagators |  * Conserved current utilities for Wilson fermions, for contracting propagators | ||||||
|   | |||||||
| @@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic { | |||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|  |   void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|  |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|  |   void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|  |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   // Constructor |   // Constructor | ||||||
|   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, |   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, | ||||||
|                 GridRedBlackCartesian &Hgrid, RealD _mass,  |                 GridRedBlackCartesian &Hgrid, RealD _mass,  | ||||||
| @@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic { | |||||||
|  |  | ||||||
|   //    protected: |   //    protected: | ||||||
|  public: |  public: | ||||||
|  |   virtual RealD Mass(void) { return mass; } | ||||||
|  |   virtual int   isTrivialEE(void) { return 1; }; | ||||||
|   RealD mass; |   RealD mass; | ||||||
|   RealD diag_mass; |   RealD diag_mass; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -445,8 +445,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
| 	ptime = usecond() - start; | 	ptime = usecond() - start; | ||||||
|     } |     } else { | ||||||
|     { |  | ||||||
|       double start = usecond(); |       double start = usecond(); | ||||||
|       st.CommunicateThreaded(); |       st.CommunicateThreaded(); | ||||||
|       ctime = usecond() - start; |       ctime = usecond() - start; | ||||||
|   | |||||||
| @@ -232,6 +232,7 @@ private: | |||||||
|   void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, |   void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			     int sF, int sU, const FermionField &in, FermionField &out); | 			     int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
|  |  | ||||||
|   void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, |   void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); | 		   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -30,147 +30,33 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| #define REGISTER | #define REGISTER | ||||||
|  |  | ||||||
| #define LOAD_CHIMU_BODY(F)			\ | #define LOAD_CHIMU \ | ||||||
|   Chimu_00=ref(F)(0)(0);			\ |  | ||||||
|   Chimu_01=ref(F)(0)(1);			\ |  | ||||||
|   Chimu_02=ref(F)(0)(2);			\ |  | ||||||
|   Chimu_10=ref(F)(1)(0);			\ |  | ||||||
|   Chimu_11=ref(F)(1)(1);			\ |  | ||||||
|   Chimu_12=ref(F)(1)(2);			\ |  | ||||||
|   Chimu_20=ref(F)(2)(0);			\ |  | ||||||
|   Chimu_21=ref(F)(2)(1);			\ |  | ||||||
|   Chimu_22=ref(F)(2)(2);			\ |  | ||||||
|   Chimu_30=ref(F)(3)(0);			\ |  | ||||||
|   Chimu_31=ref(F)(3)(1);			\ |  | ||||||
|   Chimu_32=ref(F)(3)(2) |  | ||||||
|  |  | ||||||
| #define LOAD_CHIMU(DIR,F,PERM)						\ |  | ||||||
|   { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); } |  | ||||||
|  |  | ||||||
| #define LOAD_CHI_BODY(F)				\ |  | ||||||
|     Chi_00 = ref(F)(0)(0);\ |  | ||||||
|     Chi_01 = ref(F)(0)(1);\ |  | ||||||
|     Chi_02 = ref(F)(0)(2);\ |  | ||||||
|     Chi_10 = ref(F)(1)(0);\ |  | ||||||
|     Chi_11 = ref(F)(1)(1);\ |  | ||||||
|     Chi_12 = ref(F)(1)(2) |  | ||||||
|  |  | ||||||
| #define LOAD_CHI(DIR,F,PERM)					\ |  | ||||||
|   {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| //G-parity implementations using in-place intrinsic ops |  | ||||||
|  |  | ||||||
| //1l 1h -> 1h 1l |  | ||||||
| //0l 0h , 1h 1l -> 0l 1h 0h,1l |  | ||||||
| //0h,1l -> 1l,0h |  | ||||||
| //if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) ) |  | ||||||
| //Pulled fermion through forwards face, GPBC on upper component |  | ||||||
| //Need 0= 0l 1h   1= 1l 0h |  | ||||||
| //else if( (distance == -1 && !perm) || (distance == 1 && perm) ) |  | ||||||
| //Pulled fermion through backwards face, GPBC on lower component |  | ||||||
| //Need 0= 1l 0h   1= 0l 1h |  | ||||||
|  |  | ||||||
| //1l 1h -> 1h 1l |  | ||||||
| //0l 0h , 1h 1l -> 0l 1h 0h,1l |  | ||||||
| #define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\ |  | ||||||
|   permute##PERM(tmp1, ref(1)(S)(C));				\ |  | ||||||
|   exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\ |  | ||||||
|   INTO = tmp2; |  | ||||||
|  |  | ||||||
| //0l 0h -> 0h 0l |  | ||||||
| //1l 1h, 0h 0l -> 1l 0h, 1h 0l |  | ||||||
| #define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\ |  | ||||||
|   permute##PERM(tmp1, ref(0)(S)(C));				\ |  | ||||||
|   exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\ |  | ||||||
|   INTO = tmp2; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #define LOAD_CHI_SETUP(DIR,F)						\ |  | ||||||
|   g = F;								\ |  | ||||||
|   direction = st._directions[DIR];				\ |  | ||||||
|   distance = st._distances[DIR];				\ |  | ||||||
|   sl = st._grid->_simd_layout[direction];			\ |  | ||||||
|   inplace_twist = 0;						\ |  | ||||||
|   if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\ |  | ||||||
|     if(sl == 1){							\ |  | ||||||
|       g = (F+1) % 2;							\ |  | ||||||
|     }else{								\ |  | ||||||
|       inplace_twist = 1;						\ |  | ||||||
|     }									\ |  | ||||||
|   }   |  | ||||||
|  |  | ||||||
| #define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\ |  | ||||||
|   {const SiteSpinor & ref (in._odata[offset]);	\ |   {const SiteSpinor & ref (in._odata[offset]);	\ | ||||||
|     LOAD_CHI_SETUP(DIR,F);						\ |     Chimu_00=ref()(0)(0);\ | ||||||
|     if(!inplace_twist){							\ |     Chimu_01=ref()(0)(1);\ | ||||||
|       LOAD_CHIMU_BODY(g);						\ |     Chimu_02=ref()(0)(2);\ | ||||||
|     }else{								\ |     Chimu_10=ref()(1)(0);\ | ||||||
|       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \ |     Chimu_11=ref()(1)(1);\ | ||||||
| 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \ |     Chimu_12=ref()(1)(2);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\ |     Chimu_20=ref()(2)(0);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\ |     Chimu_21=ref()(2)(1);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\ |     Chimu_22=ref()(2)(2);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\ |     Chimu_30=ref()(3)(0);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\ |     Chimu_31=ref()(3)(1);\ | ||||||
| 	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\ |     Chimu_32=ref()(3)(2);} | ||||||
| 	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
|       }else{								\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\ |  | ||||||
|       } \ |  | ||||||
|     } \ |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  | #define LOAD_CHI\ | ||||||
| #define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\ |  | ||||||
|   {const SiteHalfSpinor &ref(buf[offset]);	\ |   {const SiteHalfSpinor &ref(buf[offset]);	\ | ||||||
|     LOAD_CHI_SETUP(DIR,F);						\ |     Chi_00 = ref()(0)(0);\ | ||||||
|     if(!inplace_twist){							\ |     Chi_01 = ref()(0)(1);\ | ||||||
|       LOAD_CHI_BODY(g);							\ |     Chi_02 = ref()(0)(2);\ | ||||||
|     }else{								\ |     Chi_10 = ref()(1)(0);\ | ||||||
|       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \ |     Chi_11 = ref()(1)(1);\ | ||||||
| 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \ |     Chi_12 = ref()(1)(2);} | ||||||
| 	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\ |  | ||||||
| 	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\ |  | ||||||
| 	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\ |  | ||||||
| 	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\ |  | ||||||
| 	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\ |  | ||||||
|       }else{								\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\ |  | ||||||
| 	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\ |  | ||||||
|       }									\ |  | ||||||
|     }									\ |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) |  | ||||||
| #define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) |  | ||||||
|  |  | ||||||
| // To splat or not to splat depends on the implementation | // To splat or not to splat depends on the implementation | ||||||
| #define MULT_2SPIN_BODY \ | #define MULT_2SPIN(A)\ | ||||||
|  |   {auto & ref(U._odata[sU](A));			\ | ||||||
|    Impl::loadLinkElement(U_00,ref()(0,0));	\ |    Impl::loadLinkElement(U_00,ref()(0,0));	\ | ||||||
|    Impl::loadLinkElement(U_10,ref()(1,0));	\ |    Impl::loadLinkElement(U_10,ref()(1,0));	\ | ||||||
|    Impl::loadLinkElement(U_20,ref()(2,0));	\ |    Impl::loadLinkElement(U_20,ref()(2,0));	\ | ||||||
| @@ -197,14 +83,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     UChi_01+= U_10*Chi_02;\ |     UChi_01+= U_10*Chi_02;\ | ||||||
|     UChi_11+= U_10*Chi_12;\ |     UChi_11+= U_10*Chi_12;\ | ||||||
|     UChi_02+= U_20*Chi_02;\ |     UChi_02+= U_20*Chi_02;\ | ||||||
|   UChi_12+= U_20*Chi_12 |     UChi_12+= U_20*Chi_12;} | ||||||
|  |  | ||||||
|  |  | ||||||
| #define MULT_2SPIN(A,F)					\ |  | ||||||
|   {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; } |  | ||||||
|  |  | ||||||
| #define MULT_2SPIN_GPARITY(A,F)				\ |  | ||||||
|   {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #define PERMUTE_DIR(dir)			\ | #define PERMUTE_DIR(dir)			\ | ||||||
| @@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|   result_31-= UChi_11;	\ |   result_31-= UChi_11;	\ | ||||||
|   result_32-= UChi_12; |   result_32-= UChi_12; | ||||||
|  |  | ||||||
| #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON)	\ | ||||||
|   SE=st.GetEntry(ptype,DIR,ss);			\ |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|   offset = SE->_offset;				\ |   offset = SE->_offset;				\ | ||||||
|   local  = SE->_is_local;			\ |   local  = SE->_is_local;			\ | ||||||
|   perm   = SE->_permute;			\ |   perm   = SE->_permute;			\ | ||||||
|   if ( local ) {				\ |   if ( local ) {				\ | ||||||
|     LOAD_CHIMU_IMPL(DIR,F,PERM);			\ |     LOAD_CHIMU;					\ | ||||||
|     PROJ;					\ |     PROJ;					\ | ||||||
|     if ( perm) {				\ |     if ( perm) {				\ | ||||||
|       PERMUTE_DIR(PERM);			\ |       PERMUTE_DIR(PERM);			\ | ||||||
|     }						\ |     }						\ | ||||||
|   } else {					\ |   } else {					\ | ||||||
|     LOAD_CHI_IMPL(DIR,F,PERM);			\ |     LOAD_CHI;					\ | ||||||
|   }						\ |   }						\ | ||||||
|   MULT_2SPIN_IMPL(DIR,F);			\ |   MULT_2SPIN(DIR);				\ | ||||||
|   RECON;					 |   RECON;					 | ||||||
|  |  | ||||||
|  | #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON)	\ | ||||||
| #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\ |  | ||||||
|   SE=st.GetEntry(ptype,DIR,ss);			\ |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|   offset = SE->_offset;				\ |   offset = SE->_offset;				\ | ||||||
|   local  = SE->_is_local;			\ |   local  = SE->_is_local;			\ | ||||||
|   perm   = SE->_permute;			\ |   perm   = SE->_permute;			\ | ||||||
|   if ( local ) {				\ |   if ( local ) {				\ | ||||||
|     LOAD_CHIMU_IMPL(DIR,F,PERM);			\ |     LOAD_CHIMU;					\ | ||||||
|     PROJ;					\ |     PROJ;					\ | ||||||
|     if ( perm) {				\ |     if ( perm) {				\ | ||||||
|       PERMUTE_DIR(PERM);			\ |       PERMUTE_DIR(PERM);			\ | ||||||
|     }						\ |     }						\ | ||||||
|   } else if ( st.same_node[DIR] ) {		\ |   } else if ( st.same_node[DIR] ) {		\ | ||||||
|     LOAD_CHI_IMPL(DIR,F,PERM);			\ |     LOAD_CHI;					\ | ||||||
|   }						\ |   }						\ | ||||||
|   if (local || st.same_node[DIR] ) {		\ |   if (local || st.same_node[DIR] ) {		\ | ||||||
|     MULT_2SPIN_IMPL(DIR,F);			\ |     MULT_2SPIN(DIR);				\ | ||||||
|     RECON;					\ |     RECON;					\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\ | #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON)	\ | ||||||
|   SE=st.GetEntry(ptype,DIR,ss);			\ |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|   offset = SE->_offset;				\ |   offset = SE->_offset;				\ | ||||||
|   local  = SE->_is_local;			\ |  | ||||||
|   perm   = SE->_permute;			\ |  | ||||||
|   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\ |   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\ | ||||||
|     LOAD_CHI_IMPL(DIR,F,PERM);			\ |     LOAD_CHI;					\ | ||||||
|     MULT_2SPIN_IMPL(DIR,F);			\ |     MULT_2SPIN(DIR);				\ | ||||||
|     RECON;					\ |     RECON;					\ | ||||||
|     nmu++;					\ |     nmu++;					\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define HAND_RESULT(ss,F)			\ | #define HAND_RESULT(ss)				\ | ||||||
|   {						\ |   {						\ | ||||||
|     SiteSpinor & ref (out._odata[ss]);		\ |     SiteSpinor & ref (out._odata[ss]);		\ | ||||||
|     vstream(ref(F)(0)(0),result_00);		\ |     vstream(ref()(0)(0),result_00);		\ | ||||||
|     vstream(ref(F)(0)(1),result_01);		\ |     vstream(ref()(0)(1),result_01);		\ | ||||||
|     vstream(ref(F)(0)(2),result_02);		\ |     vstream(ref()(0)(2),result_02);		\ | ||||||
|     vstream(ref(F)(1)(0),result_10);		\ |     vstream(ref()(1)(0),result_10);		\ | ||||||
|     vstream(ref(F)(1)(1),result_11);		\ |     vstream(ref()(1)(1),result_11);		\ | ||||||
|     vstream(ref(F)(1)(2),result_12);		\ |     vstream(ref()(1)(2),result_12);		\ | ||||||
|     vstream(ref(F)(2)(0),result_20);		\ |     vstream(ref()(2)(0),result_20);		\ | ||||||
|     vstream(ref(F)(2)(1),result_21);		\ |     vstream(ref()(2)(1),result_21);		\ | ||||||
|     vstream(ref(F)(2)(2),result_22);		\ |     vstream(ref()(2)(2),result_22);		\ | ||||||
|     vstream(ref(F)(3)(0),result_30);		\ |     vstream(ref()(3)(0),result_30);		\ | ||||||
|     vstream(ref(F)(3)(1),result_31);		\ |     vstream(ref()(3)(1),result_31);		\ | ||||||
|     vstream(ref(F)(3)(2),result_32);		\ |     vstream(ref()(3)(2),result_32);		\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define HAND_RESULT_EXT(ss,F)			\ | #define HAND_RESULT_EXT(ss)			\ | ||||||
|   if (nmu){					\ |   if (nmu){					\ | ||||||
|     SiteSpinor & ref (out._odata[ss]);		\ |     SiteSpinor & ref (out._odata[ss]);		\ | ||||||
|     ref(F)(0)(0)+=result_00;		\ |     ref()(0)(0)+=result_00;		\ | ||||||
|     ref(F)(0)(1)+=result_01;		\ |     ref()(0)(1)+=result_01;		\ | ||||||
|     ref(F)(0)(2)+=result_02;		\ |     ref()(0)(2)+=result_02;		\ | ||||||
|     ref(F)(1)(0)+=result_10;		\ |     ref()(1)(0)+=result_10;		\ | ||||||
|     ref(F)(1)(1)+=result_11;		\ |     ref()(1)(1)+=result_11;		\ | ||||||
|     ref(F)(1)(2)+=result_12;		\ |     ref()(1)(2)+=result_12;		\ | ||||||
|     ref(F)(2)(0)+=result_20;		\ |     ref()(2)(0)+=result_20;		\ | ||||||
|     ref(F)(2)(1)+=result_21;		\ |     ref()(2)(1)+=result_21;		\ | ||||||
|     ref(F)(2)(2)+=result_22;		\ |     ref()(2)(2)+=result_22;		\ | ||||||
|     ref(F)(3)(0)+=result_30;		\ |     ref()(3)(0)+=result_30;		\ | ||||||
|     ref(F)(3)(1)+=result_31;		\ |     ref()(3)(1)+=result_31;		\ | ||||||
|     ref(F)(3)(2)+=result_32;		\ |     ref()(3)(2)+=result_32;		\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge | |||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|  |  | ||||||
| #define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ |   HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); | ||||||
|   HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\ |   HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_RESULT(ss); | ||||||
|   HAND_RESULT(ss,F) |  | ||||||
|  |  | ||||||
|   HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -613,18 +486,15 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub | |||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|    |    | ||||||
| #define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ |   HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); | ||||||
|   HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_RESULT(ss); | ||||||
|   HAND_RESULT(ss,F) |  | ||||||
|  |  | ||||||
|   HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> void  | template<class Impl> void  | ||||||
| @@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa | |||||||
|  |  | ||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|  |   ZERO_RESULT; | ||||||
| #define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ |   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); | ||||||
|   ZERO_RESULT; \ |   HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_RESULT(ss); | ||||||
|   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |  | ||||||
|   HAND_RESULT(ss,F) |  | ||||||
|  |  | ||||||
|   HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D | |||||||
|  |  | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|  |   ZERO_RESULT; | ||||||
| #define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\ |   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); | ||||||
|   ZERO_RESULT;							\ |   HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |   HAND_RESULT(ss); | ||||||
|   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ |  | ||||||
|   HAND_RESULT(ss,F) |  | ||||||
|    |  | ||||||
|   HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> void  | template<class Impl> void  | ||||||
| @@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa | |||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|   int nmu=0; |   int nmu=0; | ||||||
|  |   ZERO_RESULT; | ||||||
| #define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ |   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM); | ||||||
|   ZERO_RESULT; \ |   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_RESULT_EXT(ss); | ||||||
|   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |  | ||||||
|   HAND_RESULT_EXT(ss,F) |  | ||||||
|  |  | ||||||
|   HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D | |||||||
|   StencilEntry *SE; |   StencilEntry *SE; | ||||||
|   int offset,local,perm, ptype; |   int offset,local,perm, ptype; | ||||||
|   int nmu=0; |   int nmu=0; | ||||||
|  |   ZERO_RESULT; | ||||||
| #define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ |   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); | ||||||
|   ZERO_RESULT; \ |   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM); | ||||||
|   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |   HAND_RESULT_EXT(ss); | ||||||
|   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ |  | ||||||
|   HAND_RESULT_EXT(ss,F) |  | ||||||
|  |  | ||||||
|   HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////// |  | ||||||
|   // Specialise Gparity to simple implementation |  | ||||||
|   //////////////////////////////////////////////// |  | ||||||
| #define HAND_SPECIALISE_EMPTY(IMPL)					\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,			\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,			\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,			\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,			\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,	       	\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,	       	\ |  | ||||||
| 				    LebesgueOrder &lo,			\ |  | ||||||
| 				    DoubledGaugeField &U,		\ |  | ||||||
| 				    SiteHalfSpinor *buf,		\ |  | ||||||
| 				    int sF,int sU,			\ |  | ||||||
| 				    const FermionField &in,		\ |  | ||||||
| 				    FermionField &out){ assert(0); }	\ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #define HAND_SPECIALISE_GPARITY(IMPL)					\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ |  | ||||||
| 				    int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   }									\ |  | ||||||
| 									\ |  | ||||||
|   template<>								\ |  | ||||||
|   void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ |  | ||||||
| 					    int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\ |  | ||||||
|     HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   }									\ |  | ||||||
| 									\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ |  | ||||||
| 						     int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   }									\ |  | ||||||
| 									\ |  | ||||||
|   template<>								\ |  | ||||||
|   void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ |  | ||||||
| 							     int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ |  | ||||||
|     HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   }									\ |  | ||||||
| 									\ |  | ||||||
|   template<> void							\ |  | ||||||
|   WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ |  | ||||||
| 						     int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     int nmu=0;								\ |  | ||||||
|     HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     nmu = 0;								\ |  | ||||||
|     HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   }									\ |  | ||||||
|   template<>								\ |  | ||||||
|   void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ |  | ||||||
| 							     int ss,int sU,const FermionField &in, FermionField &out) \ |  | ||||||
|   {									\ |  | ||||||
|     typedef IMPL Impl;							\ |  | ||||||
|     typedef typename Simd::scalar_type S;				\ |  | ||||||
|     typedef typename Simd::vector_type V;				\ |  | ||||||
| 									\ |  | ||||||
|     HAND_DECLARATIONS(ignore);						\ |  | ||||||
| 									\ |  | ||||||
|     StencilEntry *SE;							\ |  | ||||||
|     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ |  | ||||||
|     int nmu=0;								\ |  | ||||||
|     HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|     nmu = 0;								\ |  | ||||||
|     HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| HAND_SPECIALISE_GPARITY(GparityWilsonImplF); |  | ||||||
| HAND_SPECIALISE_GPARITY(GparityWilsonImplD); |  | ||||||
| HAND_SPECIALISE_GPARITY(GparityWilsonImplFH); |  | ||||||
| HAND_SPECIALISE_GPARITY(GparityWilsonImplDF); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|    |  | ||||||
| ////////////// Wilson ; uses this implementation ///////////////////// | ////////////// Wilson ; uses this implementation ///////////////////// | ||||||
|  |  | ||||||
| #define INSTANTIATE_THEM(A) \ | #define INSTANTIATE_THEM(A) \ | ||||||
| @@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF); | |||||||
| INSTANTIATE_THEM(WilsonImplD); | INSTANTIATE_THEM(WilsonImplD); | ||||||
| INSTANTIATE_THEM(ZWilsonImplF); | INSTANTIATE_THEM(ZWilsonImplF); | ||||||
| INSTANTIATE_THEM(ZWilsonImplD); | INSTANTIATE_THEM(ZWilsonImplD); | ||||||
| INSTANTIATE_THEM(GparityWilsonImplF); |  | ||||||
| INSTANTIATE_THEM(GparityWilsonImplD); |  | ||||||
| INSTANTIATE_THEM(DomainWallVec5dImplF); | INSTANTIATE_THEM(DomainWallVec5dImplF); | ||||||
| INSTANTIATE_THEM(DomainWallVec5dImplD); | INSTANTIATE_THEM(DomainWallVec5dImplD); | ||||||
| INSTANTIATE_THEM(ZDomainWallVec5dImplF); | INSTANTIATE_THEM(ZDomainWallVec5dImplF); | ||||||
| @@ -940,12 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH); | |||||||
| INSTANTIATE_THEM(WilsonImplDF); | INSTANTIATE_THEM(WilsonImplDF); | ||||||
| INSTANTIATE_THEM(ZWilsonImplFH); | INSTANTIATE_THEM(ZWilsonImplFH); | ||||||
| INSTANTIATE_THEM(ZWilsonImplDF); | INSTANTIATE_THEM(ZWilsonImplDF); | ||||||
| INSTANTIATE_THEM(GparityWilsonImplFH); |  | ||||||
| INSTANTIATE_THEM(GparityWilsonImplDF); |  | ||||||
| INSTANTIATE_THEM(DomainWallVec5dImplFH); | INSTANTIATE_THEM(DomainWallVec5dImplFH); | ||||||
| INSTANTIATE_THEM(DomainWallVec5dImplDF); | INSTANTIATE_THEM(DomainWallVec5dImplDF); | ||||||
| INSTANTIATE_THEM(ZDomainWallVec5dImplFH); | INSTANTIATE_THEM(ZDomainWallVec5dImplFH); | ||||||
| INSTANTIATE_THEM(ZDomainWallVec5dImplDF); | INSTANTIATE_THEM(ZDomainWallVec5dImplDF); | ||||||
| INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF); | INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF); | ||||||
| INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD); | INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD); | ||||||
|  |  | ||||||
| }} | }} | ||||||
|   | |||||||
							
								
								
									
										878
									
								
								lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										878
									
								
								lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,878 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/qcd/action/fermion/FermionCore.h> | ||||||
|  |  | ||||||
|  | #define REGISTER | ||||||
|  |  | ||||||
|  | #define LOAD_CHIMU_BODY(F)			\ | ||||||
|  |   Chimu_00=ref(F)(0)(0);			\ | ||||||
|  |   Chimu_01=ref(F)(0)(1);			\ | ||||||
|  |   Chimu_02=ref(F)(0)(2);			\ | ||||||
|  |   Chimu_10=ref(F)(1)(0);			\ | ||||||
|  |   Chimu_11=ref(F)(1)(1);			\ | ||||||
|  |   Chimu_12=ref(F)(1)(2);			\ | ||||||
|  |   Chimu_20=ref(F)(2)(0);			\ | ||||||
|  |   Chimu_21=ref(F)(2)(1);			\ | ||||||
|  |   Chimu_22=ref(F)(2)(2);			\ | ||||||
|  |   Chimu_30=ref(F)(3)(0);			\ | ||||||
|  |   Chimu_31=ref(F)(3)(1);			\ | ||||||
|  |   Chimu_32=ref(F)(3)(2) | ||||||
|  |  | ||||||
|  | #define LOAD_CHIMU(DIR,F,PERM)						\ | ||||||
|  |   { const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); } | ||||||
|  |  | ||||||
|  | #define LOAD_CHI_BODY(F)				\ | ||||||
|  |     Chi_00 = ref(F)(0)(0);\ | ||||||
|  |     Chi_01 = ref(F)(0)(1);\ | ||||||
|  |     Chi_02 = ref(F)(0)(2);\ | ||||||
|  |     Chi_10 = ref(F)(1)(0);\ | ||||||
|  |     Chi_11 = ref(F)(1)(1);\ | ||||||
|  |     Chi_12 = ref(F)(1)(2) | ||||||
|  |  | ||||||
|  | #define LOAD_CHI(DIR,F,PERM)					\ | ||||||
|  |   {const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | //G-parity implementations using in-place intrinsic ops | ||||||
|  |  | ||||||
|  | //1l 1h -> 1h 1l | ||||||
|  | //0l 0h , 1h 1l -> 0l 1h 0h,1l | ||||||
|  | //0h,1l -> 1l,0h | ||||||
|  | //if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) ) | ||||||
|  | //Pulled fermion through forwards face, GPBC on upper component | ||||||
|  | //Need 0= 0l 1h   1= 1l 0h | ||||||
|  | //else if( (distance == -1 && !perm) || (distance == 1 && perm) ) | ||||||
|  | //Pulled fermion through backwards face, GPBC on lower component | ||||||
|  | //Need 0= 1l 0h   1= 0l 1h | ||||||
|  |  | ||||||
|  | //1l 1h -> 1h 1l | ||||||
|  | //0l 0h , 1h 1l -> 0l 1h 0h,1l | ||||||
|  | #define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\ | ||||||
|  |   permute##PERM(tmp1, ref(1)(S)(C));				\ | ||||||
|  |   exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1);		\ | ||||||
|  |   INTO = tmp2; | ||||||
|  |  | ||||||
|  | //0l 0h -> 0h 0l | ||||||
|  | //1l 1h, 0h 0l -> 1l 0h, 1h 0l | ||||||
|  | #define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3)			\ | ||||||
|  |   permute##PERM(tmp1, ref(0)(S)(C));				\ | ||||||
|  |   exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1);		\ | ||||||
|  |   INTO = tmp2; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define LOAD_CHI_SETUP(DIR,F)						\ | ||||||
|  |   g = F;								\ | ||||||
|  |   direction = st._directions[DIR];				\ | ||||||
|  |   distance = st._distances[DIR];				\ | ||||||
|  |   sl = st._grid->_simd_layout[direction];			\ | ||||||
|  |   inplace_twist = 0;						\ | ||||||
|  |   if(SE->_around_the_world && this->Params.twists[DIR % 4]){		\ | ||||||
|  |     if(sl == 1){							\ | ||||||
|  |       g = (F+1) % 2;							\ | ||||||
|  |     }else{								\ | ||||||
|  |       inplace_twist = 1;						\ | ||||||
|  |     }									\ | ||||||
|  |   }   | ||||||
|  |  | ||||||
|  | #define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)			\ | ||||||
|  |   { const SiteSpinor &ref(in._odata[offset]);				\ | ||||||
|  |     LOAD_CHI_SETUP(DIR,F);						\ | ||||||
|  |     if(!inplace_twist){							\ | ||||||
|  |       LOAD_CHIMU_BODY(g);						\ | ||||||
|  |     }else{								\ | ||||||
|  |       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \ | ||||||
|  | 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  |       }else{								\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM,  U_00,U_01,U_10);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM,  U_11,U_20,U_21);		\ | ||||||
|  |       } \ | ||||||
|  |     } \ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)				\ | ||||||
|  |   { const SiteHalfSpinor &ref(buf[offset]);				\ | ||||||
|  |     LOAD_CHI_SETUP(DIR,F);						\ | ||||||
|  |     if(!inplace_twist){							\ | ||||||
|  |       LOAD_CHI_BODY(g);							\ | ||||||
|  |     }else{								\ | ||||||
|  |       if(  ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \ | ||||||
|  | 	   ( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\ | ||||||
|  | 	DO_TWIST_0L_1H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\ | ||||||
|  |       }else{								\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_00,0,0,F,PERM,  U_00,U_01,U_10);			\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_01,0,1,F,PERM,  U_11,U_20,U_21);			\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_02,0,2,F,PERM,  UChi_00,UChi_01,UChi_02);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_10,1,0,F,PERM,  UChi_10,UChi_11,UChi_12);		\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_11,1,1,F,PERM,  U_00,U_01,U_10);			\ | ||||||
|  | 	DO_TWIST_1L_0H(Chi_12,1,2,F,PERM,  U_11,U_20,U_21);			\ | ||||||
|  |       }									\ | ||||||
|  |     }									\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) | ||||||
|  | #define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) | ||||||
|  |  | ||||||
|  | // To splat or not to splat depends on the implementation | ||||||
|  | #define MULT_2SPIN_BODY \ | ||||||
|  |   Impl::loadLinkElement(U_00,ref()(0,0));	\ | ||||||
|  |   Impl::loadLinkElement(U_10,ref()(1,0));	\ | ||||||
|  |   Impl::loadLinkElement(U_20,ref()(2,0));	\ | ||||||
|  |   Impl::loadLinkElement(U_01,ref()(0,1));	\ | ||||||
|  |   Impl::loadLinkElement(U_11,ref()(1,1));	\ | ||||||
|  |   Impl::loadLinkElement(U_21,ref()(2,1));	\ | ||||||
|  |   UChi_00 = U_00*Chi_00;			\ | ||||||
|  |   UChi_10 = U_00*Chi_10;			\ | ||||||
|  |   UChi_01 = U_10*Chi_00;			\ | ||||||
|  |   UChi_11 = U_10*Chi_10;			\ | ||||||
|  |   UChi_02 = U_20*Chi_00;			\ | ||||||
|  |   UChi_12 = U_20*Chi_10;			\ | ||||||
|  |   UChi_00+= U_01*Chi_01;			\ | ||||||
|  |   UChi_10+= U_01*Chi_11;			\ | ||||||
|  |   UChi_01+= U_11*Chi_01;			\ | ||||||
|  |   UChi_11+= U_11*Chi_11;			\ | ||||||
|  |   UChi_02+= U_21*Chi_01;			\ | ||||||
|  |   UChi_12+= U_21*Chi_11;			\ | ||||||
|  |   Impl::loadLinkElement(U_00,ref()(0,2));	\ | ||||||
|  |   Impl::loadLinkElement(U_10,ref()(1,2));	\ | ||||||
|  |   Impl::loadLinkElement(U_20,ref()(2,2));	\ | ||||||
|  |   UChi_00+= U_00*Chi_02;			\ | ||||||
|  |   UChi_10+= U_00*Chi_12;			\ | ||||||
|  |   UChi_01+= U_10*Chi_02;			\ | ||||||
|  |   UChi_11+= U_10*Chi_12;			\ | ||||||
|  |   UChi_02+= U_20*Chi_02;			\ | ||||||
|  |   UChi_12+= U_20*Chi_12 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define MULT_2SPIN(A,F)					\ | ||||||
|  |   {auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; } | ||||||
|  |  | ||||||
|  | #define MULT_2SPIN_GPARITY(A,F)				\ | ||||||
|  |   {auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; } | ||||||
|  |  | ||||||
|  | /* PERMUTE_DIR(dir): token-paste dir onto "permute" and apply permute<dir> in place to all six Chi_* registers. */ | ||||||
|  | #define PERMUTE_DIR(dir)			\ | ||||||
|  |       permute##dir(Chi_00,Chi_00);\ | ||||||
|  |       permute##dir(Chi_01,Chi_01);\ | ||||||
|  |       permute##dir(Chi_02,Chi_02);\ | ||||||
|  |       permute##dir(Chi_10,Chi_10);\ | ||||||
|  |       permute##dir(Chi_11,Chi_11);\ | ||||||
|  |       permute##dir(Chi_12,Chi_12); | ||||||
|  | /* XP_PROJ: four-spinor Chimu_* -> two-spinor Chi_*, for each c=0..2: Chi_0c = Chimu_0c + timesI(Chimu_3c), Chi_1c = Chimu_1c + timesI(Chimu_2c). */ | ||||||
|  | //      hspin(0)=fspin(0)+timesI(fspin(3)); | ||||||
|  | //      hspin(1)=fspin(1)+timesI(fspin(2)); | ||||||
|  | #define XP_PROJ \ | ||||||
|  |     Chi_00 = Chimu_00+timesI(Chimu_30);\ | ||||||
|  |     Chi_01 = Chimu_01+timesI(Chimu_31);\ | ||||||
|  |     Chi_02 = Chimu_02+timesI(Chimu_32);\ | ||||||
|  |     Chi_10 = Chimu_10+timesI(Chimu_20);\ | ||||||
|  |     Chi_11 = Chimu_11+timesI(Chimu_21);\ | ||||||
|  |     Chi_12 = Chimu_12+timesI(Chimu_22); | ||||||
|  | /* YP_PROJ: for each c=0..2: Chi_0c = Chimu_0c - Chimu_3c, Chi_1c = Chimu_1c + Chimu_2c. */ | ||||||
|  | #define YP_PROJ \ | ||||||
|  |     Chi_00 = Chimu_00-Chimu_30;\ | ||||||
|  |     Chi_01 = Chimu_01-Chimu_31;\ | ||||||
|  |     Chi_02 = Chimu_02-Chimu_32;\ | ||||||
|  |     Chi_10 = Chimu_10+Chimu_20;\ | ||||||
|  |     Chi_11 = Chimu_11+Chimu_21;\ | ||||||
|  |     Chi_12 = Chimu_12+Chimu_22; | ||||||
|  | /* ZP_PROJ: for each c=0..2: Chi_0c = Chimu_0c + timesI(Chimu_2c), Chi_1c = Chimu_1c - timesI(Chimu_3c). */ | ||||||
|  | #define ZP_PROJ \ | ||||||
|  |   Chi_00 = Chimu_00+timesI(Chimu_20);		\ | ||||||
|  |   Chi_01 = Chimu_01+timesI(Chimu_21);		\ | ||||||
|  |   Chi_02 = Chimu_02+timesI(Chimu_22);		\ | ||||||
|  |   Chi_10 = Chimu_10-timesI(Chimu_30);		\ | ||||||
|  |   Chi_11 = Chimu_11-timesI(Chimu_31);		\ | ||||||
|  |   Chi_12 = Chimu_12-timesI(Chimu_32); | ||||||
|  | /* TP_PROJ: for each c=0..2: Chi_0c = Chimu_0c + Chimu_2c, Chi_1c = Chimu_1c + Chimu_3c. */ | ||||||
|  | #define TP_PROJ \ | ||||||
|  |   Chi_00 = Chimu_00+Chimu_20;		\ | ||||||
|  |   Chi_01 = Chimu_01+Chimu_21;		\ | ||||||
|  |   Chi_02 = Chimu_02+Chimu_22;		\ | ||||||
|  |   Chi_10 = Chimu_10+Chimu_30;		\ | ||||||
|  |   Chi_11 = Chimu_11+Chimu_31;		\ | ||||||
|  |   Chi_12 = Chimu_12+Chimu_32; | ||||||
|  |  | ||||||
|  | /* XM_PROJ: sign-flipped partner of XP_PROJ, for each c=0..2: Chi_0c = Chimu_0c - timesI(Chimu_3c), Chi_1c = Chimu_1c - timesI(Chimu_2c). */ | ||||||
|  | //      hspin(0)=fspin(0)-timesI(fspin(3)); | ||||||
|  | //      hspin(1)=fspin(1)-timesI(fspin(2)); | ||||||
|  | #define XM_PROJ \ | ||||||
|  |     Chi_00 = Chimu_00-timesI(Chimu_30);\ | ||||||
|  |     Chi_01 = Chimu_01-timesI(Chimu_31);\ | ||||||
|  |     Chi_02 = Chimu_02-timesI(Chimu_32);\ | ||||||
|  |     Chi_10 = Chimu_10-timesI(Chimu_20);\ | ||||||
|  |     Chi_11 = Chimu_11-timesI(Chimu_21);\ | ||||||
|  |     Chi_12 = Chimu_12-timesI(Chimu_22); | ||||||
|  | /* YM_PROJ: for each c=0..2: Chi_0c = Chimu_0c + Chimu_3c, Chi_1c = Chimu_1c - Chimu_2c. */ | ||||||
|  | #define YM_PROJ \ | ||||||
|  |     Chi_00 = Chimu_00+Chimu_30;\ | ||||||
|  |     Chi_01 = Chimu_01+Chimu_31;\ | ||||||
|  |     Chi_02 = Chimu_02+Chimu_32;\ | ||||||
|  |     Chi_10 = Chimu_10-Chimu_20;\ | ||||||
|  |     Chi_11 = Chimu_11-Chimu_21;\ | ||||||
|  |     Chi_12 = Chimu_12-Chimu_22; | ||||||
|  | /* ZM_PROJ: for each c=0..2: Chi_0c = Chimu_0c - timesI(Chimu_2c), Chi_1c = Chimu_1c + timesI(Chimu_3c). */ | ||||||
|  | #define ZM_PROJ \ | ||||||
|  |   Chi_00 = Chimu_00-timesI(Chimu_20);		\ | ||||||
|  |   Chi_01 = Chimu_01-timesI(Chimu_21);		\ | ||||||
|  |   Chi_02 = Chimu_02-timesI(Chimu_22);		\ | ||||||
|  |   Chi_10 = Chimu_10+timesI(Chimu_30);		\ | ||||||
|  |   Chi_11 = Chimu_11+timesI(Chimu_31);		\ | ||||||
|  |   Chi_12 = Chimu_12+timesI(Chimu_32); | ||||||
|  | /* TM_PROJ: for each c=0..2: Chi_0c = Chimu_0c - Chimu_2c, Chi_1c = Chimu_1c - Chimu_3c. */ | ||||||
|  | #define TM_PROJ \ | ||||||
|  |   Chi_00 = Chimu_00-Chimu_20;		\ | ||||||
|  |   Chi_01 = Chimu_01-Chimu_21;		\ | ||||||
|  |   Chi_02 = Chimu_02-Chimu_22;		\ | ||||||
|  |   Chi_10 = Chimu_10-Chimu_30;		\ | ||||||
|  |   Chi_11 = Chimu_11-Chimu_31;		\ | ||||||
|  |   Chi_12 = Chimu_12-Chimu_32; | ||||||
|  | /* XP_RECON: rebuild the four-spinor result_* from UChi_* by ASSIGNMENT (overwrites result_*): rows 0,1 copy UChi, row 2 = timesMinusI(UChi_1c), row 3 = timesMinusI(UChi_0c). */ | ||||||
|  | //      fspin(0)=hspin(0); | ||||||
|  | //      fspin(1)=hspin(1); | ||||||
|  | //      fspin(2)=timesMinusI(hspin(1)); | ||||||
|  | //      fspin(3)=timesMinusI(hspin(0)); | ||||||
|  | #define XP_RECON\ | ||||||
|  |   result_00 = UChi_00;\ | ||||||
|  |   result_01 = UChi_01;\ | ||||||
|  |   result_02 = UChi_02;\ | ||||||
|  |   result_10 = UChi_10;\ | ||||||
|  |   result_11 = UChi_11;\ | ||||||
|  |   result_12 = UChi_12;\ | ||||||
|  |   result_20 = timesMinusI(UChi_10);\ | ||||||
|  |   result_21 = timesMinusI(UChi_11);\ | ||||||
|  |   result_22 = timesMinusI(UChi_12);\ | ||||||
|  |   result_30 = timesMinusI(UChi_00);\ | ||||||
|  |   result_31 = timesMinusI(UChi_01);\ | ||||||
|  |   result_32 = timesMinusI(UChi_02); | ||||||
|  | /* XP_RECON_ACCUM: accumulating form of XP_RECON: result_0c += UChi_0c, result_1c += UChi_1c, result_2c -= timesI(UChi_1c), result_3c -= timesI(UChi_0c). */ | ||||||
|  | #define XP_RECON_ACCUM\ | ||||||
|  |   result_00+=UChi_00;\ | ||||||
|  |   result_01+=UChi_01;\ | ||||||
|  |   result_02+=UChi_02;\ | ||||||
|  |   result_10+=UChi_10;\ | ||||||
|  |   result_11+=UChi_11;\ | ||||||
|  |   result_12+=UChi_12;\ | ||||||
|  |   result_20-=timesI(UChi_10);\ | ||||||
|  |   result_21-=timesI(UChi_11);\ | ||||||
|  |   result_22-=timesI(UChi_12);\ | ||||||
|  |   result_30-=timesI(UChi_00);\ | ||||||
|  |   result_31-=timesI(UChi_01);\ | ||||||
|  |   result_32-=timesI(UChi_02); | ||||||
|  | /* XM_RECON: ASSIGNING reconstruct (overwrites result_*): rows 0,1 copy UChi, row 2 = timesI(UChi_1c), row 3 = timesI(UChi_0c). */ | ||||||
|  | #define XM_RECON\ | ||||||
|  |   result_00 = UChi_00;\ | ||||||
|  |   result_01 = UChi_01;\ | ||||||
|  |   result_02 = UChi_02;\ | ||||||
|  |   result_10 = UChi_10;\ | ||||||
|  |   result_11 = UChi_11;\ | ||||||
|  |   result_12 = UChi_12;\ | ||||||
|  |   result_20 = timesI(UChi_10);\ | ||||||
|  |   result_21 = timesI(UChi_11);\ | ||||||
|  |   result_22 = timesI(UChi_12);\ | ||||||
|  |   result_30 = timesI(UChi_00);\ | ||||||
|  |   result_31 = timesI(UChi_01);\ | ||||||
|  |   result_32 = timesI(UChi_02); | ||||||
|  | /* XM_RECON_ACCUM: accumulating form of XM_RECON: result_0c += UChi_0c, result_1c += UChi_1c, result_2c += timesI(UChi_1c), result_3c += timesI(UChi_0c). */ | ||||||
|  | #define XM_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20+= timesI(UChi_10);\ | ||||||
|  |   result_21+= timesI(UChi_11);\ | ||||||
|  |   result_22+= timesI(UChi_12);\ | ||||||
|  |   result_30+= timesI(UChi_00);\ | ||||||
|  |   result_31+= timesI(UChi_01);\ | ||||||
|  |   result_32+= timesI(UChi_02); | ||||||
|  | /* YP_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c += UChi_1c, result_3c -= UChi_0c. */ | ||||||
|  | #define YP_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20+= UChi_10;\ | ||||||
|  |   result_21+= UChi_11;\ | ||||||
|  |   result_22+= UChi_12;\ | ||||||
|  |   result_30-= UChi_00;\ | ||||||
|  |   result_31-= UChi_01;\ | ||||||
|  |   result_32-= UChi_02; | ||||||
|  | /* YM_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c -= UChi_1c, result_3c += UChi_0c. */ | ||||||
|  | #define YM_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20-= UChi_10;\ | ||||||
|  |   result_21-= UChi_11;\ | ||||||
|  |   result_22-= UChi_12;\ | ||||||
|  |   result_30+= UChi_00;\ | ||||||
|  |   result_31+= UChi_01;\ | ||||||
|  |   result_32+= UChi_02; | ||||||
|  | /* ZP_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c -= timesI(UChi_0c), result_3c += timesI(UChi_1c). */ | ||||||
|  | #define ZP_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20-= timesI(UChi_00);			\ | ||||||
|  |   result_21-= timesI(UChi_01);			\ | ||||||
|  |   result_22-= timesI(UChi_02);			\ | ||||||
|  |   result_30+= timesI(UChi_10);			\ | ||||||
|  |   result_31+= timesI(UChi_11);			\ | ||||||
|  |   result_32+= timesI(UChi_12); | ||||||
|  | /* ZM_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c += timesI(UChi_0c), result_3c -= timesI(UChi_1c). */ | ||||||
|  | #define ZM_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20+= timesI(UChi_00);			\ | ||||||
|  |   result_21+= timesI(UChi_01);			\ | ||||||
|  |   result_22+= timesI(UChi_02);			\ | ||||||
|  |   result_30-= timesI(UChi_10);			\ | ||||||
|  |   result_31-= timesI(UChi_11);			\ | ||||||
|  |   result_32-= timesI(UChi_12); | ||||||
|  | /* TP_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c += UChi_0c, result_3c += UChi_1c. */ | ||||||
|  | #define TP_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20+= UChi_00;			\ | ||||||
|  |   result_21+= UChi_01;			\ | ||||||
|  |   result_22+= UChi_02;			\ | ||||||
|  |   result_30+= UChi_10;			\ | ||||||
|  |   result_31+= UChi_11;			\ | ||||||
|  |   result_32+= UChi_12; | ||||||
|  | /* TM_RECON_ACCUM: result_0c += UChi_0c, result_1c += UChi_1c, result_2c -= UChi_0c, result_3c -= UChi_1c. */ | ||||||
|  | #define TM_RECON_ACCUM\ | ||||||
|  |   result_00+= UChi_00;\ | ||||||
|  |   result_01+= UChi_01;\ | ||||||
|  |   result_02+= UChi_02;\ | ||||||
|  |   result_10+= UChi_10;\ | ||||||
|  |   result_11+= UChi_11;\ | ||||||
|  |   result_12+= UChi_12;\ | ||||||
|  |   result_20-= UChi_00;	\ | ||||||
|  |   result_21-= UChi_01;	\ | ||||||
|  |   result_22-= UChi_02;	\ | ||||||
|  |   result_30-= UChi_10;	\ | ||||||
|  |   result_31-= UChi_11;	\ | ||||||
|  |   result_32-= UChi_12; | ||||||
|  | /* HAND_STENCIL_LEG: one stencil leg. Fetch the entry for DIR; if local, load via LOAD_CHIMU_IMPL, PROJ, and permute when SE->_permute is set; otherwise load via LOAD_CHI_IMPL (presumably an already-projected half spinor - confirm). Then always multiply and RECON. Expands to several statements and writes the caller's SE/offset/local/perm. */ | ||||||
|  | #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|  |   offset = SE->_offset;				\ | ||||||
|  |   local  = SE->_is_local;			\ | ||||||
|  |   perm   = SE->_permute;			\ | ||||||
|  |   if ( local ) {				\ | ||||||
|  |     LOAD_CHIMU_IMPL(DIR,F,PERM);			\ | ||||||
|  |     PROJ;					\ | ||||||
|  |     if ( perm) {				\ | ||||||
|  |       PERMUTE_DIR(PERM);			\ | ||||||
|  |     }						\ | ||||||
|  |   } else {					\ | ||||||
|  |     LOAD_CHI_IMPL(DIR,F,PERM);			\ | ||||||
|  |   }						\ | ||||||
|  |   MULT_2SPIN_IMPL(DIR,F);			\ | ||||||
|  |   RECON;					 | ||||||
|  |  | ||||||
|  | /* HAND_STENCIL_LEG_INT: "interior" leg. Same loads as HAND_STENCIL_LEG, but the leg is multiplied and reconstructed only when the entry is local or st.same_node[DIR] is set; otherwise it contributes nothing, so RECON should be an accumulating form. */ | ||||||
|  | #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\ | ||||||
|  |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|  |   offset = SE->_offset;				\ | ||||||
|  |   local  = SE->_is_local;			\ | ||||||
|  |   perm   = SE->_permute;			\ | ||||||
|  |   if ( local ) {				\ | ||||||
|  |     LOAD_CHIMU_IMPL(DIR,F,PERM);			\ | ||||||
|  |     PROJ;					\ | ||||||
|  |     if ( perm) {				\ | ||||||
|  |       PERMUTE_DIR(PERM);			\ | ||||||
|  |     }						\ | ||||||
|  |   } else if ( st.same_node[DIR] ) {		\ | ||||||
|  |     LOAD_CHI_IMPL(DIR,F,PERM);			\ | ||||||
|  |   }						\ | ||||||
|  |   if (local || st.same_node[DIR] ) {		\ | ||||||
|  |     MULT_2SPIN_IMPL(DIR,F);			\ | ||||||
|  |     RECON;					\ | ||||||
|  |   } | ||||||
|  | /* HAND_STENCIL_LEG_EXT: "exterior" leg, the complement of HAND_STENCIL_LEG_INT. Acts only when the entry is neither local nor same-node: load via LOAD_CHI_IMPL, multiply, RECON, and increment the caller's nmu. PROJ is accepted but never expanded here. */ | ||||||
|  | #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)	\ | ||||||
|  |   SE=st.GetEntry(ptype,DIR,ss);			\ | ||||||
|  |   offset = SE->_offset;				\ | ||||||
|  |   local  = SE->_is_local;			\ | ||||||
|  |   perm   = SE->_permute;			\ | ||||||
|  |   if((!SE->_is_local)&&(!st.same_node[DIR]) ) {	\ | ||||||
|  |     LOAD_CHI_IMPL(DIR,F,PERM);			\ | ||||||
|  |     MULT_2SPIN_IMPL(DIR,F);			\ | ||||||
|  |     RECON;					\ | ||||||
|  |     nmu++;					\ | ||||||
|  |   } | ||||||
|  | /* HAND_RESULT(ss,F): write the twelve result_* registers to out._odata[ss] with vstream, OVERWRITING ref(F)(s)(c) for s=0..3, c=0..2. */ | ||||||
|  | #define HAND_RESULT(ss,F)			\ | ||||||
|  |   {						\ | ||||||
|  |     SiteSpinor & ref (out._odata[ss]);		\ | ||||||
|  |     vstream(ref(F)(0)(0),result_00);		\ | ||||||
|  |     vstream(ref(F)(0)(1),result_01);		\ | ||||||
|  |     vstream(ref(F)(0)(2),result_02);		\ | ||||||
|  |     vstream(ref(F)(1)(0),result_10);		\ | ||||||
|  |     vstream(ref(F)(1)(1),result_11);		\ | ||||||
|  |     vstream(ref(F)(1)(2),result_12);		\ | ||||||
|  |     vstream(ref(F)(2)(0),result_20);		\ | ||||||
|  |     vstream(ref(F)(2)(1),result_21);		\ | ||||||
|  |     vstream(ref(F)(2)(2),result_22);		\ | ||||||
|  |     vstream(ref(F)(3)(0),result_30);		\ | ||||||
|  |     vstream(ref(F)(3)(1),result_31);		\ | ||||||
|  |     vstream(ref(F)(3)(2),result_32);		\ | ||||||
|  |   } | ||||||
|  | /* HAND_RESULT_EXT(ss,F): when nmu is non-zero, ADD result_* into out._odata[ss] (+=, not overwrite); when nmu==0, out is left untouched. Expands to a bare if-statement, so do not follow it with an else. */ | ||||||
|  | #define HAND_RESULT_EXT(ss,F)			\ | ||||||
|  |   if (nmu){					\ | ||||||
|  |     SiteSpinor & ref (out._odata[ss]);		\ | ||||||
|  |     ref(F)(0)(0)+=result_00;		\ | ||||||
|  |     ref(F)(0)(1)+=result_01;		\ | ||||||
|  |     ref(F)(0)(2)+=result_02;		\ | ||||||
|  |     ref(F)(1)(0)+=result_10;		\ | ||||||
|  |     ref(F)(1)(1)+=result_11;		\ | ||||||
|  |     ref(F)(1)(2)+=result_12;		\ | ||||||
|  |     ref(F)(2)(0)+=result_20;		\ | ||||||
|  |     ref(F)(2)(1)+=result_21;		\ | ||||||
|  |     ref(F)(2)(2)+=result_22;		\ | ||||||
|  |     ref(F)(3)(0)+=result_30;		\ | ||||||
|  |     ref(F)(3)(1)+=result_31;		\ | ||||||
|  |     ref(F)(3)(2)+=result_32;		\ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | /* HAND_DECLARATIONS(a): declare the Simd locals used by the kernel macros: 12 result_*, 6 Chi_*, 6 UChi_* and 6 U_* (columns 0 and 1 only). The argument a is ignored. */ | ||||||
|  | #define HAND_DECLARATIONS(a)			\ | ||||||
|  |   Simd result_00;				\ | ||||||
|  |   Simd result_01;				\ | ||||||
|  |   Simd result_02;				\ | ||||||
|  |   Simd result_10;				\ | ||||||
|  |   Simd result_11;				\ | ||||||
|  |   Simd result_12;				\ | ||||||
|  |   Simd result_20;				\ | ||||||
|  |   Simd result_21;				\ | ||||||
|  |   Simd result_22;				\ | ||||||
|  |   Simd result_30;				\ | ||||||
|  |   Simd result_31;				\ | ||||||
|  |   Simd result_32;				\ | ||||||
|  |   Simd Chi_00;					\ | ||||||
|  |   Simd Chi_01;					\ | ||||||
|  |   Simd Chi_02;					\ | ||||||
|  |   Simd Chi_10;					\ | ||||||
|  |   Simd Chi_11;					\ | ||||||
|  |   Simd Chi_12;					\ | ||||||
|  |   Simd UChi_00;					\ | ||||||
|  |   Simd UChi_01;					\ | ||||||
|  |   Simd UChi_02;					\ | ||||||
|  |   Simd UChi_10;					\ | ||||||
|  |   Simd UChi_11;					\ | ||||||
|  |   Simd UChi_12;					\ | ||||||
|  |   Simd U_00;					\ | ||||||
|  |   Simd U_10;					\ | ||||||
|  |   Simd U_20;					\ | ||||||
|  |   Simd U_01;					\ | ||||||
|  |   Simd U_11;					\ | ||||||
|  |   Simd U_21; | ||||||
|  | /* ZERO_RESULT: assign zero to all twelve result_* accumulators; used by the Int/Ext site macros, whose legs only ever accumulate. */ | ||||||
|  | #define ZERO_RESULT				\ | ||||||
|  |   result_00=zero;				\ | ||||||
|  |   result_01=zero;				\ | ||||||
|  |   result_02=zero;				\ | ||||||
|  |   result_10=zero;				\ | ||||||
|  |   result_11=zero;				\ | ||||||
|  |   result_12=zero;				\ | ||||||
|  |   result_20=zero;				\ | ||||||
|  |   result_21=zero;				\ | ||||||
|  |   result_22=zero;				\ | ||||||
|  |   result_30=zero;				\ | ||||||
|  |   result_31=zero;				\ | ||||||
|  |   result_32=zero;			 | ||||||
|  | /* Chimu_* are not separate variables: rows 0,1 alias Chi_* and rows 2,3 alias UChi_*. The *_PROJ macros therefore update Chi_* in place, and statement order in macros touching both name sets must be preserved. */ | ||||||
|  | #define Chimu_00 Chi_00 | ||||||
|  | #define Chimu_01 Chi_01 | ||||||
|  | #define Chimu_02 Chi_02 | ||||||
|  | #define Chimu_10 Chi_10 | ||||||
|  | #define Chimu_11 Chi_11 | ||||||
|  | #define Chimu_12 Chi_12 | ||||||
|  | #define Chimu_20 UChi_00 | ||||||
|  | #define Chimu_21 UChi_01 | ||||||
|  | #define Chimu_22 UChi_02 | ||||||
|  | #define Chimu_30 UChi_10 | ||||||
|  | #define Chimu_31 UChi_11 | ||||||
|  | #define Chimu_32 UChi_12 | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  | namespace QCD { | ||||||
|  | /* HandDhopSite: hand-unrolled kernel for site ss with F left empty. Runs all eight stencil legs (minus-projectors on the +mu legs, plus-projectors on the -mu legs) and overwrites out._odata[ss] via HAND_RESULT. */ | ||||||
|  | template<class Impl> void  | ||||||
|  | WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, | ||||||
|  | 					  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  | // Permute index per direction (the PERM argument of each leg): T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc... | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  | /* Scratch state assigned by every HAND_STENCIL_LEG expansion (ptype is passed to st.GetEntry). */ | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |   StencilEntry *SE; | ||||||
|  | /* The first leg uses the assigning XM_RECON, so result_* need no prior zeroing. The macro stays defined after this function and is reused by HAND_SPECIALISE_GPARITY. */ | ||||||
|  | #define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);	\ | ||||||
|  |   HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_RESULT(ss,F) | ||||||
|  |  | ||||||
|  |   HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HandDhopSiteDag: same structure as HandDhopSite with the projector/reconstruct pairs swapped: plus-projectors on the +mu legs, minus-projectors on the -mu legs. */ | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|  | 						  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  |  | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  | /* The first leg uses the assigning XP_RECON, so result_* need no prior zeroing. The macro is reused by HAND_SPECIALISE_GPARITY. */ | ||||||
|  | #define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_RESULT(ss,F) | ||||||
|  |  | ||||||
|  |   HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HandDhopSiteInt: "interior" variant of HandDhopSite. Only legs that are local or same-node contribute (see HAND_STENCIL_LEG_INT); the partial sum overwrites out._odata[ss]. */ | ||||||
|  | template<class Impl> void  | ||||||
|  | WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, | ||||||
|  | 					  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  | // Permute index per direction (the PERM argument of each leg): T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc... | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  |  | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |   StencilEntry *SE; | ||||||
|  | /* ZERO_RESULT comes first because any leg may be skipped, so every leg (including the first) uses an accumulating RECON. */ | ||||||
|  | #define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   ZERO_RESULT; \ | ||||||
|  |   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_RESULT(ss,F) | ||||||
|  |  | ||||||
|  |   HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HandDhopSiteDagInt: "interior" variant of HandDhopSiteDag. Zeroes result_*, accumulates only the local/same-node legs, then overwrites out._odata[ss] via HAND_RESULT. */ | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|  | 						  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  |  | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |  | ||||||
|  | #define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL)				\ | ||||||
|  |   ZERO_RESULT;							\ | ||||||
|  |   HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL);		\ | ||||||
|  |   HAND_RESULT(ss,F) | ||||||
|  |    | ||||||
|  |   HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HandDhopSiteExt: "exterior" variant of HandDhopSite. Only legs that are neither local nor same-node contribute; their sum is ADDED to out._odata[ss], and only if at least one such leg exists. */ | ||||||
|  | template<class Impl> void  | ||||||
|  | WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, | ||||||
|  | 					  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  | // Permute index per direction (the PERM argument of each leg): T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc... | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  |  | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int nmu=0; | ||||||
|  | /* nmu counts the legs HAND_STENCIL_LEG_EXT actually processed; HAND_RESULT_EXT leaves out untouched when it is zero. */ | ||||||
|  | #define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   ZERO_RESULT; \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_RESULT_EXT(ss,F) | ||||||
|  |  | ||||||
|  |   HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HandDhopSiteDagExt: "exterior" variant of HandDhopSiteDag. Accumulates only legs that are neither local nor same-node and adds them into out._odata[ss] when nmu is non-zero. */ | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|  | 						  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   typedef typename Simd::scalar_type S; | ||||||
|  |   typedef typename Simd::vector_type V; | ||||||
|  |  | ||||||
|  |   HAND_DECLARATIONS(ignore); | ||||||
|  |  | ||||||
|  |   StencilEntry *SE; | ||||||
|  |   int offset,local,perm, ptype; | ||||||
|  |   int nmu=0; | ||||||
|  |  | ||||||
|  | #define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \ | ||||||
|  |   ZERO_RESULT; \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \ | ||||||
|  |   HAND_RESULT_EXT(ss,F) | ||||||
|  |  | ||||||
|  |   HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN); | ||||||
|  | } | ||||||
|  | /* HAND_SPECIALISE_GPARITY(IMPL): emit explicit specialisations of the six HandDhop* kernels for IMPL. Each runs its site macro twice, with F=0 then F=1, using the *_GPARITY load/multiply hooks; the Ext variants reset nmu between the passes. The extra ints (g, direction, distance, sl, inplace_twist) are presumably scratch for the *_INPLACE_TWIST loaders - confirm. */ | ||||||
|  | #define HAND_SPECIALISE_GPARITY(IMPL)					\ | ||||||
|  |   template<> void							\ | ||||||
|  |   WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ | ||||||
|  | 				    int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   }									\ | ||||||
|  | 									\ | ||||||
|  |   template<>								\ | ||||||
|  |   void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 					    int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\ | ||||||
|  |     HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   }									\ | ||||||
|  | 									\ | ||||||
|  |   template<> void							\ | ||||||
|  |   WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ | ||||||
|  | 						     int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist;					\ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   }									\ | ||||||
|  | 									\ | ||||||
|  |   template<>								\ | ||||||
|  |   void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 							     int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ | ||||||
|  |     HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   }									\ | ||||||
|  | 									\ | ||||||
|  |   template<> void							\ | ||||||
|  |   WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, \ | ||||||
|  | 						     int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     int nmu=0;								\ | ||||||
|  |     HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     nmu = 0;								\ | ||||||
|  |     HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   }									\ | ||||||
|  |   template<>								\ | ||||||
|  |   void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 							     int ss,int sU,const FermionField &in, FermionField &out) \ | ||||||
|  |   {									\ | ||||||
|  |     typedef IMPL Impl;							\ | ||||||
|  |     typedef typename Simd::scalar_type S;				\ | ||||||
|  |     typedef typename Simd::vector_type V;				\ | ||||||
|  | 									\ | ||||||
|  |     HAND_DECLARATIONS(ignore);						\ | ||||||
|  | 									\ | ||||||
|  |     StencilEntry *SE;							\ | ||||||
|  |     int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \ | ||||||
|  |     int nmu=0;								\ | ||||||
|  |     HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |     nmu = 0;								\ | ||||||
|  |     HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | /* Emit the two-pass (F=0, F=1) specialisations for each of the four GparityWilsonImpl* types. */ | ||||||
|  | HAND_SPECIALISE_GPARITY(GparityWilsonImplF); | ||||||
|  | HAND_SPECIALISE_GPARITY(GparityWilsonImplD); | ||||||
|  | HAND_SPECIALISE_GPARITY(GparityWilsonImplFH); | ||||||
|  | HAND_SPECIALISE_GPARITY(GparityWilsonImplDF); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |    | ||||||
|  | ////////////// Wilson ; uses this implementation (only the G-parity types are instantiated below) ///////////////////// | ||||||
|  | /* INSTANTIATE_THEM(A): explicit instantiation declarations for the six HandDhop* members of WilsonKernels<A>. */ | ||||||
|  | #define INSTANTIATE_THEM(A) \ | ||||||
|  | template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
|  | 					     int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
|  | template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 						int ss,int sU,const FermionField &in, FermionField &out);\ | ||||||
|  | template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
|  | 						int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
|  | template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 						   int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
|  | template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
|  | 						int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
|  | template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \ | ||||||
|  | 						   int ss,int sU,const FermionField &in, FermionField &out);  | ||||||
|  |  | ||||||
|  | INSTANTIATE_THEM(GparityWilsonImplF); | ||||||
|  | INSTANTIATE_THEM(GparityWilsonImplD); | ||||||
|  | INSTANTIATE_THEM(GparityWilsonImplFH); | ||||||
|  | INSTANTIATE_THEM(GparityWilsonImplDF); | ||||||
|  | }} | ||||||
| @@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> { | |||||||
|  |  | ||||||
|     RealD factor = 0.5 * beta / RealD(Nc); |     RealD factor = 0.5 * beta / RealD(Nc); | ||||||
|  |  | ||||||
|     //GaugeLinkField Umu(U._grid); |     GaugeLinkField Umu(U._grid); | ||||||
|     GaugeLinkField dSdU_mu(U._grid); |     GaugeLinkField dSdU_mu(U._grid); | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|       //Umu = PeekIndex<LorentzIndex>(U, mu); |       Umu = PeekIndex<LorentzIndex>(U, mu); | ||||||
|  |  | ||||||
|       // Staple in direction mu |       // Staple in direction mu | ||||||
|       //WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu); |       WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu); | ||||||
|       //dSdU_mu = Ta(Umu * dSdU_mu) * factor; |       dSdU_mu = Ta(Umu * dSdU_mu) * factor; | ||||||
|  |  | ||||||
|    |  | ||||||
|       WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu); |  | ||||||
|       dSdU_mu = Ta(dSdU_mu) * factor; |  | ||||||
|  |  | ||||||
|       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); |       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -32,7 +32,6 @@ directory | |||||||
| #include <Grid/qcd/action/scalar/ScalarImpl.h> | #include <Grid/qcd/action/scalar/ScalarImpl.h> | ||||||
| #include <Grid/qcd/action/scalar/ScalarAction.h> | #include <Grid/qcd/action/scalar/ScalarAction.h> | ||||||
| #include <Grid/qcd/action/scalar/ScalarInteractionAction.h> | #include <Grid/qcd/action/scalar/ScalarInteractionAction.h> | ||||||
| #include <Grid/qcd/action/scalar/shGordonAction.h> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
| @@ -45,9 +44,6 @@ namespace QCD { | |||||||
|   template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>; |   template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>; | ||||||
|   template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>; |   template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>; | ||||||
|    |    | ||||||
|   typedef shGordonAction<ScalarImplR>                 shGordonActionR; |  | ||||||
|   typedef shGordonAction<ScalarImplF>                 shGordonActionF; |  | ||||||
|   typedef shGordonAction<ScalarImplD>                 shGordonActionD; |  | ||||||
| } | } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,9 +29,7 @@ class ScalarImplTypes { | |||||||
|     static inline Field projectForce(Field& P){return P;} |     static inline Field projectForce(Field& P){return P;} | ||||||
|  |  | ||||||
|     static inline void update_field(Field& P, Field& U, double ep) { |     static inline void update_field(Field& P, Field& U, double ep) { | ||||||
|       //std::cout << GridLogDebug << "P:\n" << P << std::endl; |  | ||||||
|       U += P*ep; |       U += P*ep; | ||||||
|       //std::cout << GridLogDebug << "U:\n" << U << std::endl; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     static inline RealD FieldSquareNorm(Field& U) { |     static inline RealD FieldSquareNorm(Field& U) { | ||||||
| @@ -39,17 +37,15 @@ class ScalarImplTypes { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { |     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||||
|      random(pRNG, U); |       gaussian(pRNG, U); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { |     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||||
|       random(pRNG, U); |       gaussian(pRNG, U); | ||||||
|       U *= 0.01; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { |     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { | ||||||
|       U = 0.0; |       U = 1.0; | ||||||
|       //std::cout << GridLogDebug << "Initial U:\n" << U << std::endl; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     static void MomentumSpacePropagator(Field &out, RealD m) |     static void MomentumSpacePropagator(Field &out, RealD m) | ||||||
|   | |||||||
| @@ -1,80 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|   Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
|   Source file: ./lib/qcd/action/scalar/shGordonAction.h |  | ||||||
|  |  | ||||||
|   Copyright (C) 2018 |  | ||||||
|  |  | ||||||
|   Author: Guido Cossu <guido.cossu@ed.ac.uk> |  | ||||||
|  |  | ||||||
|   This program is free software; you can redistribute it and/or modify |  | ||||||
|   it under the terms of the GNU General Public License as published by |  | ||||||
|   the Free Software Foundation; either version 2 of the License, or |  | ||||||
|   (at your option) any later version. |  | ||||||
|  |  | ||||||
|   This program is distributed in the hope that it will be useful, |  | ||||||
|   but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|   GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|   You should have received a copy of the GNU General Public License along |  | ||||||
|   with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|   See the full license in the file "LICENSE" in the top level distribution |  | ||||||
|   directory |  | ||||||
|   *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
|  |  | ||||||
| #ifndef SHGORDON_ACTION_H |  | ||||||
| #define SHGORDON_ACTION_H |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| class shGordonAction : public QCD::Action<typename Impl::Field> { |  | ||||||
|  public: |  | ||||||
|     INHERIT_FIELD_TYPES(Impl); |  | ||||||
|  |  | ||||||
|  private: |  | ||||||
|     RealD mass_square; |  | ||||||
|     RealD g; |  | ||||||
|  |  | ||||||
|  public: |  | ||||||
|     shGordonAction(RealD ms, RealD g) : mass_square(ms), g(g) {} |  | ||||||
|  |  | ||||||
|     virtual std::string LogParameters() { |  | ||||||
|       std::stringstream sstream; |  | ||||||
|       sstream << GridLogMessage << "[shGordonAction] g           : " << g           << std::endl; |  | ||||||
|       sstream << GridLogMessage << "[shGordonAction] mass_square : " << mass_square << std::endl; |  | ||||||
|       return sstream.str(); |  | ||||||
|     } |  | ||||||
|     virtual std::string action_name() {return "shGordonAction";} |  | ||||||
|  |  | ||||||
|     virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms |  | ||||||
|  |  | ||||||
|     virtual RealD S(const Field &phi) { |  | ||||||
|       return QCD::Nd * ScalarObs<Impl>::sumphisquared(phi) + ScalarObs<Impl>::sumphider(phi) + 0.5*mass_square/(g*g)*sum(trace(exp(g*phi) + exp(-g*phi)))   ; |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     virtual void deriv(const Field &phi, |  | ||||||
|                        Field &force) { |  | ||||||
|         //std::cout << GridLogDebug << "Force total before :\n" << force << std::endl; |  | ||||||
|         Field tmp(phi._grid); |  | ||||||
|         tmp = 2.0*QCD::Nd*phi; |  | ||||||
|         for (int mu = 0; mu < QCD::Nd; mu++) tmp -= Cshift(phi, mu, 1) + Cshift(phi, mu, -1); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|         //std::cout << GridLogDebug << "Phi norm : " << norm2(phi) << std::endl; |  | ||||||
|         force += tmp + 0.5*mass_square/g*(exp(g*phi) - exp(-g*phi)); |  | ||||||
|         //std::cout << GridLogDebug << "Force tmp :\n" << tmp << std::endl; |  | ||||||
|         //std::cout << GridLogDebug << "Force total after :\n" << force << std::endl; |  | ||||||
|     } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| }  // namespace Grid |  | ||||||
|  |  | ||||||
| #endif // SHGORDON_ACTION_H |  | ||||||
| @@ -48,6 +48,22 @@ with this program; if not, write to the Free Software Foundation, Inc., | |||||||
|     }                                                                    \ |     }                                                                    \ | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | #define RegisterLoadCheckPointerMetadataFunction(NAME)                   \ | ||||||
|  |   template < class Metadata >                                            \ | ||||||
|  |   void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \ | ||||||
|  |     if (!have_CheckPointer) {                                            \ | ||||||
|  |       std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME      \ | ||||||
|  |                 << std::endl;                                            \ | ||||||
|  |       CP = std::unique_ptr<CheckpointerBaseModule>(                      \ | ||||||
|  |         new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_));   \ | ||||||
|  |       have_CheckPointer = true;                                          \ | ||||||
|  |     } else {                                                             \ | ||||||
|  |       std::cout << GridLogError << "Checkpointer already loaded "        \ | ||||||
|  |                 << std::endl;                                            \ | ||||||
|  |       exit(1);                                                           \ | ||||||
|  |     }                                                                    \ | ||||||
|  |   } | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
| @@ -77,7 +93,7 @@ class HMCResourceManager { | |||||||
|   bool have_CheckPointer; |   bool have_CheckPointer; | ||||||
|  |  | ||||||
|   // NOTE: operator << is not overloaded for std::vector<string>  |   // NOTE: operator << is not overloaded for std::vector<string>  | ||||||
|   // so thsi function is necessary |   // so this function is necessary | ||||||
|   void output_vector_string(const std::vector<std::string> &vs){ |   void output_vector_string(const std::vector<std::string> &vs){ | ||||||
|     for (auto &i: vs) |     for (auto &i: vs) | ||||||
|       std::cout << i << " "; |       std::cout << i << " "; | ||||||
| @@ -254,6 +270,7 @@ class HMCResourceManager { | |||||||
|   RegisterLoadCheckPointerFunction(Nersc); |   RegisterLoadCheckPointerFunction(Nersc); | ||||||
|   #ifdef HAVE_LIME |   #ifdef HAVE_LIME | ||||||
|   RegisterLoadCheckPointerFunction(ILDG); |   RegisterLoadCheckPointerFunction(ILDG); | ||||||
|  |   RegisterLoadCheckPointerMetadataFunction(Scidac); | ||||||
|   #endif |   #endif | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -76,6 +76,14 @@ class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> { | |||||||
|     } |     } | ||||||
|  	}  |  	}  | ||||||
|  |  | ||||||
|  |   void check_filename(const std::string &filename){ | ||||||
|  |     std::ifstream f(filename.c_str()); | ||||||
|  |     if(!f.good()){ | ||||||
|  |       std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl; | ||||||
|  |       abort(); | ||||||
|  |     }; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   virtual void initialize(const CheckpointerParameters &Params) = 0; |   virtual void initialize(const CheckpointerParameters &Params) = 0; | ||||||
|  |  | ||||||
|   virtual void CheckpointRestore(int traj, typename Impl::Field &U, |   virtual void CheckpointRestore(int traj, typename Impl::Field &U, | ||||||
|   | |||||||
| @@ -93,6 +93,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> { | |||||||
|   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { |   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { | ||||||
|     std::string config, rng; |     std::string config, rng; | ||||||
|     this->build_filenames(traj, Params, config, rng); |     this->build_filenames(traj, Params, config, rng); | ||||||
|  |     this->check_filename(rng); | ||||||
|  |     this->check_filename(config); | ||||||
|  |  | ||||||
|  |  | ||||||
|     BinarySimpleMunger<sobj_double, sobj> munge; |     BinarySimpleMunger<sobj_double, sobj> munge; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -136,6 +136,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> { | |||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | template<class ImplementationPolicy, class Metadata> | ||||||
|  | class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> { | ||||||
|  |   typedef CheckPointerModule< ImplementationPolicy> CPBase; | ||||||
|  |   Metadata M; | ||||||
|  |  | ||||||
|  |   // acquire resource | ||||||
|  |   virtual void initialize(){ | ||||||
|  |      this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M)); | ||||||
|  |   } | ||||||
|  | public: | ||||||
|  |   ScidacCPModule(typename CPBase::APar Par, Metadata M_):M(M_), CPBase(Par) {} | ||||||
|  |   template <class ReaderClass> | ||||||
|  |   ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){}; | ||||||
|  | }; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,6 +34,7 @@ directory | |||||||
| #include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h> | #include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h> | ||||||
| #include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h> | #include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h> | ||||||
| #include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h> | #include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h> | ||||||
|  | #include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h> | ||||||
| //#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h> | //#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h> | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -74,10 +74,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> { | |||||||
|     if ((traj % Params.saveInterval) == 0) { |     if ((traj % Params.saveInterval) == 0) { | ||||||
|       std::string config, rng; |       std::string config, rng; | ||||||
|       this->build_filenames(traj, Params, config, rng); |       this->build_filenames(traj, Params, config, rng); | ||||||
|        |       GridBase *grid = U._grid; | ||||||
|       uint32_t nersc_csum,scidac_csuma,scidac_csumb; |       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); |       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|       IldgWriter _IldgWriter; |       IldgWriter _IldgWriter(grid->IsBoss()); | ||||||
|       _IldgWriter.open(config); |       _IldgWriter.open(config); | ||||||
|       _IldgWriter.writeConfiguration(U, traj, config, config); |       _IldgWriter.writeConfiguration(U, traj, config, config); | ||||||
|       _IldgWriter.close(); |       _IldgWriter.close(); | ||||||
| @@ -95,6 +95,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> { | |||||||
|                          GridParallelRNG &pRNG) { |                          GridParallelRNG &pRNG) { | ||||||
|     std::string config, rng; |     std::string config, rng; | ||||||
|     this->build_filenames(traj, Params, config, rng); |     this->build_filenames(traj, Params, config, rng); | ||||||
|  |     this->check_filename(rng); | ||||||
|  |     this->check_filename(config); | ||||||
|  |  | ||||||
|  |      | ||||||
|  |  | ||||||
|     uint32_t nersc_csum,scidac_csuma,scidac_csumb; |     uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|     BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); |     BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|   | |||||||
| @@ -69,6 +69,9 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> { | |||||||
|                          GridParallelRNG &pRNG) { |                          GridParallelRNG &pRNG) { | ||||||
|     std::string config, rng; |     std::string config, rng; | ||||||
|     this->build_filenames(traj, Params, config, rng); |     this->build_filenames(traj, Params, config, rng); | ||||||
|  |     this->check_filename(rng); | ||||||
|  |     this->check_filename(config); | ||||||
|  |  | ||||||
|  |  | ||||||
|     FieldMetaData header; |     FieldMetaData header; | ||||||
|     NerscIO::readRNGState(sRNG, pRNG, header, rng); |     NerscIO::readRNGState(sRNG, pRNG, header, rng); | ||||||
|   | |||||||
							
								
								
									
										125
									
								
								lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,125 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: ./lib/qcd/hmc/checkpointers/ScidacCheckpointer.h | ||||||
|  |  | ||||||
|  | Copyright (C) 2018 | ||||||
|  |  | ||||||
|  | Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef SCIDAC_CHECKPOINTER | ||||||
|  | #define SCIDAC_CHECKPOINTER | ||||||
|  |  | ||||||
|  | #ifdef HAVE_LIME | ||||||
|  |  | ||||||
|  | #include <iostream> | ||||||
|  | #include <sstream> | ||||||
|  | #include <string> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  | namespace QCD { | ||||||
|  |  | ||||||
|  | // For generic fields | ||||||
|  | template <class Implementation, class Metadata> | ||||||
|  | class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> { | ||||||
|  |  private: | ||||||
|  |   CheckpointerParameters Params; | ||||||
|  |   Metadata MData; | ||||||
|  |  | ||||||
|  |   typedef typename Implementation::Field Field; | ||||||
|  |  | ||||||
|  |  public: | ||||||
|  |   //INHERIT_GIMPL_TYPES(Implementation); | ||||||
|  |  | ||||||
|  |   ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); } | ||||||
|  |   ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); } | ||||||
|  |  | ||||||
|  |   void initialize(const CheckpointerParameters &Params_) { | ||||||
|  |     Params = Params_; | ||||||
|  |  | ||||||
|  |     // check here that the format is valid | ||||||
|  |     int ieee32big = (Params.format == std::string("IEEE32BIG")); | ||||||
|  |     int ieee32    = (Params.format == std::string("IEEE32")); | ||||||
|  |     int ieee64big = (Params.format == std::string("IEEE64BIG")); | ||||||
|  |     int ieee64    = (Params.format == std::string("IEEE64")); | ||||||
|  |  | ||||||
|  |     if (!(ieee64big || ieee32 || ieee32big || ieee64)) { | ||||||
|  |       std::cout << GridLogError << "Unrecognized file format " << Params.format | ||||||
|  |                 << std::endl; | ||||||
|  |       std::cout << GridLogError | ||||||
|  |                 << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64" | ||||||
|  |                 << std::endl; | ||||||
|  |  | ||||||
|  |       exit(1); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, | ||||||
|  |                           GridParallelRNG &pRNG) { | ||||||
|  |     if ((traj % Params.saveInterval) == 0) { | ||||||
|  |       std::string config, rng; | ||||||
|  |       this->build_filenames(traj, Params, config, rng); | ||||||
|  |       GridBase *grid = U._grid; | ||||||
|  |       uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|  |       BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |       ScidacWriter _ScidacWriter(grid->IsBoss()); | ||||||
|  |       _ScidacWriter.open(config); | ||||||
|  |       _ScidacWriter.writeScidacFieldRecord(U, MData); | ||||||
|  |       _ScidacWriter.close(); | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage << "Written Scidac Configuration on " << config | ||||||
|  |                 << " checksum " << std::hex << nersc_csum<<"/" | ||||||
|  | 		            << scidac_csuma<<"/" << scidac_csumb | ||||||
|  | 		            << std::dec << std::endl; | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, | ||||||
|  |                          GridParallelRNG &pRNG) { | ||||||
|  |     std::string config, rng; | ||||||
|  |     this->build_filenames(traj, Params, config, rng); | ||||||
|  |     this->check_filename(rng); | ||||||
|  |     this->check_filename(config); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     uint32_t nersc_csum,scidac_csuma,scidac_csumb; | ||||||
|  |     BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); | ||||||
|  |  | ||||||
|  |     Metadata md_content; | ||||||
|  |     ScidacReader _ScidacReader; | ||||||
|  |     _ScidacReader.open(config); | ||||||
|  |     _ScidacReader.readScidacFieldRecord(U,md_content);  // format from the header | ||||||
|  |     _ScidacReader.close(); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "Read Scidac Configuration from " << config | ||||||
|  |               << " checksum " << std::hex  | ||||||
|  | 	      << nersc_csum<<"/" | ||||||
|  | 	      << scidac_csuma<<"/" | ||||||
|  | 	      << scidac_csumb | ||||||
|  | 	      << std::dec << std::endl; | ||||||
|  |   }; | ||||||
|  | }; | ||||||
|  | } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif  // HAVE_LIME | ||||||
|  | #endif  // SCIDAC_CHECKPOINTER | ||||||
| @@ -103,7 +103,7 @@ class Integrator { | |||||||
|         // Implement smearing only for the fundamental representation now |         // Implement smearing only for the fundamental representation now | ||||||
|         repr_set.at(a)->deriv(Rep.U, forceR); |         repr_set.at(a)->deriv(Rep.U, forceR); | ||||||
|         GF force = Rep.RtoFundamentalProject(forceR);  // Ta for the fundamental rep |         GF force = Rep.RtoFundamentalProject(forceR);  // Ta for the fundamental rep | ||||||
|         Real force_abs = std::sqrt(norm2(force))/(U._grid->gSites()); |         Real force_abs = std::sqrt(norm2(force)/(U._grid->gSites())); | ||||||
|         std::cout << GridLogIntegrator << "Hirep Force average: " << force_abs << std::endl; |         std::cout << GridLogIntegrator << "Hirep Force average: " << force_abs << std::endl; | ||||||
|         Mom -= force * ep ; |         Mom -= force * ep ; | ||||||
|       } |       } | ||||||
| @@ -115,18 +115,25 @@ class Integrator { | |||||||
|     // Fundamental updates, include smearing |     // Fundamental updates, include smearing | ||||||
|  |  | ||||||
|     for (int a = 0; a < as[level].actions.size(); ++a) { |     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||||
|  |       double start_full = usecond(); | ||||||
|       Field force(U._grid); |       Field force(U._grid); | ||||||
|       force = zero; |  | ||||||
|       conformable(U._grid, Mom._grid); |       conformable(U._grid, Mom._grid); | ||||||
|  |  | ||||||
|       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); |       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); | ||||||
|  |       double start_force = usecond(); | ||||||
|       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta |       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||||
|  |  | ||||||
|       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; |       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||||
|       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); |       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); | ||||||
|       force = FieldImplementation::projectForce(force); // Ta for gauge fields |       force = FieldImplementation::projectForce(force); // Ta for gauge fields | ||||||
|       Real force_abs = std::sqrt(norm2(force))/U._grid->gSites(); |       double end_force = usecond(); | ||||||
|       std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl; |       Real force_abs = std::sqrt(norm2(force)/U._grid->gSites()); | ||||||
|  |       std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl; | ||||||
|       Mom -= force * ep;  |       Mom -= force * ep;  | ||||||
|  |       double end_full = usecond(); | ||||||
|  |       double time_full  = (end_full - start_full) / 1e3; | ||||||
|  |       double time_force = (end_force - start_force) / 1e3; | ||||||
|  |       std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Force from the other representations |     // Force from the other representations | ||||||
|   | |||||||
| @@ -92,20 +92,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters> | |||||||
|   PlaquetteMod(): ObsBase(NoParameters()){} |   PlaquetteMod(): ObsBase(NoParameters()){} | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template < class Impl > |  | ||||||
| class ExpScalarMod: public ObservableModule<ExpScalarLogger<Impl>, ExpScalarParameters>{ |  | ||||||
|   typedef ObservableModule<ExpScalarLogger<Impl>, ExpScalarParameters> ObsBase; |  | ||||||
|   using ObsBase::ObsBase; // for constructors |  | ||||||
|  |  | ||||||
|   // acquire resource |  | ||||||
|   virtual void initialize(){ |  | ||||||
|     this->ObservablePtr.reset(new ExpScalarLogger<Impl>(this->Par_)); |  | ||||||
|   } |  | ||||||
|   public: |  | ||||||
|   ExpScalarMod(ExpScalarParameters P): ObsBase(P){} |  | ||||||
|   ExpScalarMod():ObsBase(){}; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template < class Impl > | template < class Impl > | ||||||
| class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{ | class PolyakovMod: public ObservableModule<PolyakovLogger<Impl>, NoParameters>{ | ||||||
|   typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase; |   typedef ObservableModule<PolyakovLogger<Impl>, NoParameters> ObsBase; | ||||||
|   | |||||||
| @@ -1,80 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./lib/qcd/modules/exp_scalar.h |  | ||||||
|  |  | ||||||
| Copyright (C) 2018 |  | ||||||
|  |  | ||||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
|  |  | ||||||
| #ifndef HMC_EXP_SCALAR_H |  | ||||||
| #define HMC_EXP_SCALAR_H |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
| namespace QCD { |  | ||||||
|  |  | ||||||
| struct ExpScalarParameters : Serializable { |  | ||||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS(ExpScalarParameters, |  | ||||||
|     double, a) |  | ||||||
|  |  | ||||||
|     ExpScalarParameters(double _a = 0.0):a(_a){} |  | ||||||
|  |  | ||||||
|     template < class ReaderClass > |  | ||||||
|     ExpScalarParameters(Reader<ReaderClass>& Reader){ |  | ||||||
|         read(Reader, "ExpScalar", *this);   |  | ||||||
|     }   |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| class ExpScalarLogger : public HmcObservable<typename Impl::Field> { |  | ||||||
|   ExpScalarParameters Pars; |  | ||||||
|  public: |  | ||||||
|  |  | ||||||
|   // necessary for HmcObservable compatibility |  | ||||||
|   typedef typename Impl::Field Field; |  | ||||||
|  |  | ||||||
|   ExpScalarLogger(double _a):Pars(_a){} |  | ||||||
|  |  | ||||||
|   ExpScalarLogger(ExpScalarParameters P):Pars(P){} |  | ||||||
|  |  | ||||||
|   void TrajectoryComplete(int traj, typename Impl::Field &U, |  | ||||||
|                           GridSerialRNG &sRNG, |  | ||||||
|                           GridParallelRNG &pRNG) { |  | ||||||
|  |  | ||||||
|     double e = sum(trace(exp(Pars.a*U))); |  | ||||||
|  |  | ||||||
|     int def_prec = std::cout.precision(); |  | ||||||
|  |  | ||||||
|     std::cout << GridLogMessage |  | ||||||
|         << std::setprecision(std::numeric_limits<Real>::digits10 + 1) |  | ||||||
|         << "ExpScalar: [ " << traj << " ] "<< e << std::endl; |  | ||||||
|  |  | ||||||
|     std::cout.precision(def_prec); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| }  // namespace QCD |  | ||||||
| }  // namespace Grid |  | ||||||
|  |  | ||||||
| #endif  // HMC_PLAQUETTE_H |  | ||||||
| @@ -46,6 +46,6 @@ class HmcObservable { | |||||||
| #include "plaquette.h" | #include "plaquette.h" | ||||||
| #include "topological_charge.h" | #include "topological_charge.h" | ||||||
| #include "polyakov_loop.h" | #include "polyakov_loop.h" | ||||||
| #include "exp_scalar.h" |  | ||||||
|  |  | ||||||
| #endif  //  HMC_OBSERVABLE_H | #endif  //  HMC_OBSERVABLE_H | ||||||
|   | |||||||
| @@ -6,13 +6,16 @@ | |||||||
| #ifndef GAUGE_CONFIG_ | #ifndef GAUGE_CONFIG_ | ||||||
| #define GAUGE_CONFIG_ | #define GAUGE_CONFIG_ | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid | ||||||
|  | { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD | ||||||
|  | { | ||||||
|  |  | ||||||
| //trivial class for no smearing | //trivial class for no smearing | ||||||
| template <class Impl> | template <class Impl> | ||||||
| class NoSmearing { | class NoSmearing | ||||||
|  | { | ||||||
| public: | public: | ||||||
|   INHERIT_FIELD_TYPES(Impl); |   INHERIT_FIELD_TYPES(Impl); | ||||||
|  |  | ||||||
| @@ -26,10 +29,10 @@ public: | |||||||
|  |  | ||||||
|   Field &get_SmearedU() { return *ThinField; } |   Field &get_SmearedU() { return *ThinField; } | ||||||
|  |  | ||||||
|   Field& get_U(bool smeared = false) { |   Field &get_U(bool smeared = false) | ||||||
|  |   { | ||||||
|     return *ThinField; |     return *ThinField; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| /*! | /*! | ||||||
| @@ -44,7 +47,8 @@ public: | |||||||
|   It stores a list of smeared configurations. |   It stores a list of smeared configurations. | ||||||
| */ | */ | ||||||
| template <class Gimpl> | template <class Gimpl> | ||||||
| class SmearedConfiguration { | class SmearedConfiguration | ||||||
|  | { | ||||||
| public: | public: | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |  | ||||||
| @@ -55,7 +59,8 @@ class SmearedConfiguration { | |||||||
|  |  | ||||||
|   // Member functions |   // Member functions | ||||||
|   //==================================================================== |   //==================================================================== | ||||||
|   void fill_smearedSet(GaugeField& U) { |   void fill_smearedSet(GaugeField &U) | ||||||
|  |   { | ||||||
|     ThinLinks = &U; // attach the smearing routine to the field U |     ThinLinks = &U; // attach the smearing routine to the field U | ||||||
|  |  | ||||||
|     // check the pointer is not null |     // check the pointer is not null | ||||||
| @@ -63,13 +68,15 @@ class SmearedConfiguration { | |||||||
|       std::cout << GridLogError |       std::cout << GridLogError | ||||||
|                 << "[SmearedConfiguration] Error in ThinLinks pointer\n"; |                 << "[SmearedConfiguration] Error in ThinLinks pointer\n"; | ||||||
|  |  | ||||||
|     if (smearingLevels > 0) { |     if (smearingLevels > 0) | ||||||
|  |     { | ||||||
|       std::cout << GridLogDebug |       std::cout << GridLogDebug | ||||||
|                 << "[SmearedConfiguration] Filling SmearedSet\n"; |                 << "[SmearedConfiguration] Filling SmearedSet\n"; | ||||||
|       GaugeField previous_u(ThinLinks->_grid); |       GaugeField previous_u(ThinLinks->_grid); | ||||||
|  |  | ||||||
|       previous_u = *ThinLinks; |       previous_u = *ThinLinks; | ||||||
|       for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) { |       for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) | ||||||
|  |       { | ||||||
|         StoutSmearing.smear(SmearedSet[smearLvl], previous_u); |         StoutSmearing.smear(SmearedSet[smearLvl], previous_u); | ||||||
|         previous_u = SmearedSet[smearLvl]; |         previous_u = SmearedSet[smearLvl]; | ||||||
|  |  | ||||||
| @@ -82,7 +89,8 @@ class SmearedConfiguration { | |||||||
|   } |   } | ||||||
|   //==================================================================== |   //==================================================================== | ||||||
|   GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime, |   GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime, | ||||||
|                                   const GaugeField& GaugeK) const { |                                   const GaugeField &GaugeK) const | ||||||
|  |   { | ||||||
|     GridBase *grid = GaugeK._grid; |     GridBase *grid = GaugeK._grid; | ||||||
|     GaugeField C(grid), SigmaK(grid), iLambda(grid); |     GaugeField C(grid), SigmaK(grid), iLambda(grid); | ||||||
|     GaugeLinkField iLambda_mu(grid); |     GaugeLinkField iLambda_mu(grid); | ||||||
| @@ -94,7 +102,8 @@ class SmearedConfiguration { | |||||||
|     SigmaK = zero; |     SigmaK = zero; | ||||||
|     iLambda = zero; |     iLambda = zero; | ||||||
|  |  | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) | ||||||
|  |     { | ||||||
|       Cmu = peekLorentz(C, mu); |       Cmu = peekLorentz(C, mu); | ||||||
|       GaugeKmu = peekLorentz(GaugeK, mu); |       GaugeKmu = peekLorentz(GaugeK, mu); | ||||||
|       SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu); |       SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu); | ||||||
| @@ -109,14 +118,16 @@ class SmearedConfiguration { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   /*! @brief Returns smeared configuration at level 'Level' */ |   /*! @brief Returns smeared configuration at level 'Level' */ | ||||||
|   const GaugeField& get_smeared_conf(int Level) const { |   const GaugeField &get_smeared_conf(int Level) const | ||||||
|  |   { | ||||||
|     return SmearedSet[Level]; |     return SmearedSet[Level]; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //==================================================================== |   //==================================================================== | ||||||
|   void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ, |   void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ, | ||||||
|                    const GaugeLinkField &iQ, const GaugeLinkField &Sigmap, |                    const GaugeLinkField &iQ, const GaugeLinkField &Sigmap, | ||||||
|                    const GaugeLinkField& GaugeK) const { |                    const GaugeLinkField &GaugeK) const | ||||||
|  |   { | ||||||
|     GridBase *grid = iQ._grid; |     GridBase *grid = iQ._grid; | ||||||
|     GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid); |     GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid); | ||||||
|     GaugeLinkField unity(grid); |     GaugeLinkField unity(grid); | ||||||
| @@ -208,13 +219,13 @@ class SmearedConfiguration { | |||||||
|   //==================================================================== |   //==================================================================== | ||||||
| public: | public: | ||||||
|   GaugeField * |   GaugeField * | ||||||
|       ThinLinks; /*!< @brief Pointer to the thin |       ThinLinks; /* Pointer to the thin links configuration */ | ||||||
|                                                          links configuration */ |  | ||||||
|  |  | ||||||
|   /*! @brief Standard constructor */ |   /* Standard constructor */ | ||||||
|   SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear, |   SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear, | ||||||
|                        Smear_Stout<Gimpl> &Stout) |                        Smear_Stout<Gimpl> &Stout) | ||||||
|       : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) { |       : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) | ||||||
|  |   { | ||||||
|     for (unsigned int i = 0; i < smearingLevels; ++i) |     for (unsigned int i = 0; i < smearingLevels; ++i) | ||||||
|       SmearedSet.push_back(*(new GaugeField(UGrid))); |       SmearedSet.push_back(*(new GaugeField(UGrid))); | ||||||
|   } |   } | ||||||
| @@ -223,21 +234,29 @@ class SmearedConfiguration { | |||||||
|   SmearedConfiguration() |   SmearedConfiguration() | ||||||
|       : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} |       : smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {} | ||||||
|  |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   // attach the smeared routines to the thin links U and fill the smeared set |   // attach the smeared routines to the thin links U and fill the smeared set | ||||||
|   void set_Field(GaugeField& U) { fill_smearedSet(U); } |   void set_Field(GaugeField &U) | ||||||
|  |   { | ||||||
|  |     double start = usecond(); | ||||||
|  |     fill_smearedSet(U); | ||||||
|  |     double end = usecond(); | ||||||
|  |     double time = (end - start)/ 1e3; | ||||||
|  |     std::cout << GridLogMessage << "Smearing in " << time << " ms" << std::endl;   | ||||||
|  |   } | ||||||
|  |  | ||||||
|   //==================================================================== |   //==================================================================== | ||||||
|   void smeared_force(GaugeField& SigmaTilde) const { |   void smeared_force(GaugeField &SigmaTilde) const | ||||||
|     if (smearingLevels > 0) { |   { | ||||||
|  |     if (smearingLevels > 0) | ||||||
|  |     { | ||||||
|  |       double start = usecond(); | ||||||
|       GaugeField force = SigmaTilde; // actually = U*SigmaTilde |       GaugeField force = SigmaTilde; // actually = U*SigmaTilde | ||||||
|       GaugeLinkField tmp_mu(SigmaTilde._grid); |       GaugeLinkField tmp_mu(SigmaTilde._grid); | ||||||
|  |  | ||||||
|       for (int mu = 0; mu < Nd; mu++) { |       for (int mu = 0; mu < Nd; mu++) | ||||||
|  |       { | ||||||
|         // to get just SigmaTilde |         // to get just SigmaTilde | ||||||
|         tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * |         tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu); | ||||||
|                  peekLorentz(force, mu); |  | ||||||
|         pokeLorentz(force, tmp_mu, mu); |         pokeLorentz(force, tmp_mu, mu); | ||||||
|       } |       } | ||||||
|  |  | ||||||
| @@ -246,33 +265,43 @@ class SmearedConfiguration { | |||||||
|  |  | ||||||
|       force = AnalyticSmearedForce(force, *ThinLinks); |       force = AnalyticSmearedForce(force, *ThinLinks); | ||||||
|  |  | ||||||
|       for (int mu = 0; mu < Nd; mu++) { |       for (int mu = 0; mu < Nd; mu++) | ||||||
|  |       { | ||||||
|         tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu); |         tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu); | ||||||
|         pokeLorentz(SigmaTilde, tmp_mu, mu); |         pokeLorentz(SigmaTilde, tmp_mu, mu); | ||||||
|       } |       } | ||||||
|  |       double end = usecond(); | ||||||
|  |       double time = (end - start)/ 1e3; | ||||||
|  |       std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;   | ||||||
|     } // if smearingLevels = 0 do nothing |     } // if smearingLevels = 0 do nothing | ||||||
|   } |   } | ||||||
|   //==================================================================== |   //==================================================================== | ||||||
|  |  | ||||||
|   GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; } |   GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; } | ||||||
|  |  | ||||||
|   GaugeField& get_U(bool smeared = false) { |   GaugeField &get_U(bool smeared = false) | ||||||
|  |   { | ||||||
|     // get the config, thin links by default |     // get the config, thin links by default | ||||||
|     if (smeared) { |     if (smeared) | ||||||
|       if (smearingLevels) { |     { | ||||||
|  |       if (smearingLevels) | ||||||
|  |       { | ||||||
|         RealD impl_plaq = |         RealD impl_plaq = | ||||||
|             WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]); |             WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]); | ||||||
|         std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq |         std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq | ||||||
|                   << std::endl; |                   << std::endl; | ||||||
|         return get_SmearedU(); |         return get_SmearedU(); | ||||||
|  |       } | ||||||
|       } else { |       else | ||||||
|  |       { | ||||||
|         RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks); |         RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks); | ||||||
|         std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq |         std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq | ||||||
|                   << std::endl; |                   << std::endl; | ||||||
|         return *ThinLinks; |         return *ThinLinks; | ||||||
|       } |       } | ||||||
|     } else { |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|       RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks); |       RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks); | ||||||
|       std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq |       std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq | ||||||
|                 << std::endl; |                 << std::endl; | ||||||
|   | |||||||
| @@ -173,7 +173,7 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const { | |||||||
|         std::cout << "Time to evolve " << diff.count() << " s\n"; |         std::cout << "Time to evolve " << diff.count() << " s\n"; | ||||||
|         #endif |         #endif | ||||||
|         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " |         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " | ||||||
|             << step << "  " | 		  << step << "  " << tau(step) << "  "  | ||||||
| 		  << energyDensityPlaquette(step,out) << std::endl; | 		  << energyDensityPlaquette(step,out) << std::endl; | ||||||
|          if( step % measure_interval == 0){ |          if( step % measure_interval == 0){ | ||||||
|          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : " |          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : " | ||||||
| @@ -193,7 +193,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re | |||||||
|         //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; |         //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; | ||||||
|         evolve_step_adaptive(out, maxTau); |         evolve_step_adaptive(out, maxTau); | ||||||
|         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " |         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " | ||||||
|             << step << "  " | 		  << step << "  " << taus << "  " | ||||||
| 		  << energyDensityPlaquette(out) << std::endl; | 		  << energyDensityPlaquette(out) << std::endl; | ||||||
|          if( step % measure_interval == 0){ |          if( step % measure_interval == 0){ | ||||||
|          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : " |          std::cout << GridLogMessage << "[WilsonFlow] Top. charge           : " | ||||||
|   | |||||||
| @@ -212,6 +212,7 @@ public: | |||||||
|  |  | ||||||
|  |  | ||||||
| // For the force term | // For the force term | ||||||
|  | /* | ||||||
| static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { | static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { | ||||||
|     GridBase *grid = Umu._grid; |     GridBase *grid = Umu._grid; | ||||||
|     std::vector<GaugeMat> U(Nd, grid); |     std::vector<GaugeMat> U(Nd, grid); | ||||||
| @@ -225,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { | |||||||
|  |  | ||||||
|     for (int nu = 0; nu < Nd; nu++) { |     for (int nu = 0; nu < Nd; nu++) { | ||||||
|       if (nu != mu) { |       if (nu != mu) { | ||||||
|         // this is ~10% faster than the Staple |         // this is ~10% faster than the Staple  -- PAB: so what it gives the WRONG answers for other BC's! | ||||||
|         tmp1 = Cshift(U[nu], mu, 1); |         tmp1 = Cshift(U[nu], mu, 1); | ||||||
|         tmp2 = Cshift(U[mu], nu, 1); |         tmp2 = Cshift(U[mu], nu, 1); | ||||||
|         staple += tmp1* adj(U[nu]*tmp2); |         staple += tmp1* adj(U[nu]*tmp2); | ||||||
| @@ -235,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { | |||||||
|     } |     } | ||||||
|     staple = U[mu]*staple; |     staple = U[mu]*staple; | ||||||
| } | } | ||||||
|  | */ | ||||||
|   ////////////////////////////////////////////////// |   ////////////////////////////////////////////////// | ||||||
|   // the sum over all staples on each site |   // the sum over all staples on each site | ||||||
|   ////////////////////////////////////////////////// |   ////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -31,113 +31,10 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk> | |||||||
| #define GRID_SERIALISATION_ABSTRACT_READER_H | #define GRID_SERIALISATION_ABSTRACT_READER_H | ||||||
|  |  | ||||||
| #include <type_traits> | #include <type_traits> | ||||||
|  | #include <Grid/tensors/Tensors.h> | ||||||
|  | #include <Grid/serialisation/VectorUtils.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|   // Vector IO utilities /////////////////////////////////////////////////////// |  | ||||||
|   // helper function to read space-separated values |  | ||||||
|   template <typename T> |  | ||||||
|   std::vector<T> strToVec(const std::string s) |  | ||||||
|   { |  | ||||||
|     std::istringstream sstr(s); |  | ||||||
|     T                  buf; |  | ||||||
|     std::vector<T>     v; |  | ||||||
|      |  | ||||||
|     while(!sstr.eof()) |  | ||||||
|     { |  | ||||||
|       sstr >> buf; |  | ||||||
|       v.push_back(buf); |  | ||||||
|     } |  | ||||||
|      |  | ||||||
|     return v; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // output to streams for vectors |  | ||||||
|   template < class T > |  | ||||||
|   inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v) |  | ||||||
|   { |  | ||||||
|     os << "["; |  | ||||||
|     for (auto &x: v) |  | ||||||
|     { |  | ||||||
|       os << x << " "; |  | ||||||
|     } |  | ||||||
|     if (v.size() > 0) |  | ||||||
|     { |  | ||||||
|       os << "\b"; |  | ||||||
|     } |  | ||||||
|     os << "]"; |  | ||||||
|      |  | ||||||
|     return os; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Vector element trait //////////////////////////////////////////////////////   |  | ||||||
|   template <typename T> |  | ||||||
|   struct element |  | ||||||
|   { |  | ||||||
|     typedef T type; |  | ||||||
|     static constexpr bool is_number = false; |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   template <typename T> |  | ||||||
|   struct element<std::vector<T>> |  | ||||||
|   { |  | ||||||
|     typedef typename element<T>::type type; |  | ||||||
|     static constexpr bool is_number = std::is_arithmetic<T>::value |  | ||||||
|                                       or is_complex<T>::value |  | ||||||
|                                       or element<T>::is_number; |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   // Vector flattening utility class //////////////////////////////////////////// |  | ||||||
|   // Class to flatten a multidimensional std::vector |  | ||||||
|   template <typename V> |  | ||||||
|   class Flatten |  | ||||||
|   { |  | ||||||
|   public: |  | ||||||
|     typedef typename element<V>::type Element; |  | ||||||
|   public: |  | ||||||
|     explicit                     Flatten(const V &vector); |  | ||||||
|     const V &                    getVector(void); |  | ||||||
|     const std::vector<Element> & getFlatVector(void); |  | ||||||
|     const std::vector<size_t>  & getDim(void); |  | ||||||
|   private: |  | ||||||
|     void accumulate(const Element &e); |  | ||||||
|     template <typename W> |  | ||||||
|     void accumulate(const W &v); |  | ||||||
|     void accumulateDim(const Element &e); |  | ||||||
|     template <typename W> |  | ||||||
|     void accumulateDim(const W &v); |  | ||||||
|   private: |  | ||||||
|     const V              &vector_; |  | ||||||
|     std::vector<Element> flatVector_; |  | ||||||
|     std::vector<size_t>  dim_; |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   // Class to reconstruct a multidimensional std::vector |  | ||||||
|   template <typename V> |  | ||||||
|   class Reconstruct |  | ||||||
|   { |  | ||||||
|   public: |  | ||||||
|     typedef typename element<V>::type Element; |  | ||||||
|   public: |  | ||||||
|     Reconstruct(const std::vector<Element> &flatVector, |  | ||||||
|                 const std::vector<size_t> &dim); |  | ||||||
|     const V &                    getVector(void); |  | ||||||
|     const std::vector<Element> & getFlatVector(void); |  | ||||||
|     const std::vector<size_t>  & getDim(void); |  | ||||||
|   private: |  | ||||||
|     void fill(std::vector<Element> &v); |  | ||||||
|     template <typename W> |  | ||||||
|     void fill(W &v); |  | ||||||
|     void resize(std::vector<Element> &v, const unsigned int dim); |  | ||||||
|     template <typename W> |  | ||||||
|     void resize(W &v, const unsigned int dim); |  | ||||||
|   private: |  | ||||||
|     V                          vector_; |  | ||||||
|     const std::vector<Element> &flatVector_; |  | ||||||
|     std::vector<size_t>        dim_; |  | ||||||
|     size_t                     ind_{0}; |  | ||||||
|     unsigned int               dimInd_{0}; |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   // Pair IO utilities ///////////////////////////////////////////////////////// |   // Pair IO utilities ///////////////////////////////////////////////////////// | ||||||
|   // helper function to parse input in the format "<obj1 obj2>" |   // helper function to parse input in the format "<obj1 obj2>" | ||||||
|   template <typename T1, typename T2> |   template <typename T1, typename T2> | ||||||
| @@ -151,15 +48,15 @@ namespace Grid { | |||||||
|     do |     do | ||||||
|     { |     { | ||||||
|       is.get(c); |       is.get(c); | ||||||
|     } while (c != '<' && !is.eof()); |     } while (c != '(' && !is.eof()); | ||||||
|     if (c == '<') |     if (c == '(') | ||||||
|     { |     { | ||||||
|       int start = is.tellg(); |       int start = is.tellg(); | ||||||
|       do |       do | ||||||
|       { |       { | ||||||
|         is.get(c); |         is.get(c); | ||||||
|       } while (c != '>' && !is.eof()); |       } while (c != ')' && !is.eof()); | ||||||
|       if (c == '>') |       if (c == ')') | ||||||
|       { |       { | ||||||
|         int end = is.tellg(); |         int end = is.tellg(); | ||||||
|         int psize = end - start - 1; |         int psize = end - start - 1; | ||||||
| @@ -182,7 +79,7 @@ namespace Grid { | |||||||
|   template <class T1, class T2> |   template <class T1, class T2> | ||||||
|   inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p) |   inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p) | ||||||
|   { |   { | ||||||
|     os << "<" << p.first << " " << p.second << ">"; |     os << "(" << p.first << " " << p.second << ")"; | ||||||
|     return os; |     return os; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -205,6 +102,12 @@ namespace Grid { | |||||||
|     template <typename U> |     template <typename U> | ||||||
|     typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type |     typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type | ||||||
|     write(const std::string& s, const U &output); |     write(const std::string& s, const U &output); | ||||||
|  |     template <typename U> | ||||||
|  |     void write(const std::string &s, const iScalar<U> &output); | ||||||
|  |     template <typename U, int N> | ||||||
|  |     void write(const std::string &s, const iVector<U, N> &output); | ||||||
|  |     template <typename U, int N> | ||||||
|  |     void write(const std::string &s, const iMatrix<U, N> &output); | ||||||
|   private: |   private: | ||||||
|     T *upcast; |     T *upcast; | ||||||
|   }; |   }; | ||||||
| @@ -224,6 +127,12 @@ namespace Grid { | |||||||
|     template <typename U> |     template <typename U> | ||||||
|     typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type |     typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type | ||||||
|     read(const std::string& s, U &output); |     read(const std::string& s, U &output); | ||||||
|  |     template <typename U> | ||||||
|  |     void read(const std::string &s, iScalar<U> &output); | ||||||
|  |     template <typename U, int N> | ||||||
|  |     void read(const std::string &s, iVector<U, N> &output); | ||||||
|  |     template <typename U, int N> | ||||||
|  |     void read(const std::string &s, iMatrix<U, N> &output); | ||||||
|   protected: |   protected: | ||||||
|     template <typename U> |     template <typename U> | ||||||
|     void fromString(U &output, const std::string &s); |     void fromString(U &output, const std::string &s); | ||||||
| @@ -239,201 +148,7 @@ namespace Grid { | |||||||
|     static const bool value = false; |     static const bool value = false; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   // Writer template implementation | ||||||
|  |  | ||||||
|   // Generic writer interface |  | ||||||
|   // serializable base class |  | ||||||
|   class Serializable |  | ||||||
|   { |  | ||||||
|   public: |  | ||||||
|     template <typename T> |  | ||||||
|     static inline void write(Writer<T> &WR,const std::string &s, |  | ||||||
|                              const Serializable &obj) |  | ||||||
|     {} |  | ||||||
|      |  | ||||||
|     template <typename T> |  | ||||||
|     static inline void read(Reader<T> &RD,const std::string &s, |  | ||||||
|                             Serializable &obj) |  | ||||||
|     {} |  | ||||||
|      |  | ||||||
|     friend inline std::ostream & operator<<(std::ostream &os, |  | ||||||
|                                             const Serializable &obj) |  | ||||||
|     { |  | ||||||
|       return os; |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   // Flatten class template implementation ///////////////////////////////////// |  | ||||||
|   template <typename V> |  | ||||||
|   void Flatten<V>::accumulate(const Element &e) |  | ||||||
|   { |  | ||||||
|     flatVector_.push_back(e); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   template <typename W> |  | ||||||
|   void Flatten<V>::accumulate(const W &v) |  | ||||||
|   { |  | ||||||
|     for (auto &e: v) |  | ||||||
|     { |  | ||||||
|       accumulate(e); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   void Flatten<V>::accumulateDim(const Element &e) {}; |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   template <typename W> |  | ||||||
|   void Flatten<V>::accumulateDim(const W &v) |  | ||||||
|   { |  | ||||||
|     dim_.push_back(v.size()); |  | ||||||
|     accumulateDim(v[0]); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   Flatten<V>::Flatten(const V &vector) |  | ||||||
|   : vector_(vector) |  | ||||||
|   { |  | ||||||
|     accumulate(vector_); |  | ||||||
|     accumulateDim(vector_); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const V & Flatten<V>::getVector(void) |  | ||||||
|   { |  | ||||||
|     return vector_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const std::vector<typename Flatten<V>::Element> & |  | ||||||
|   Flatten<V>::getFlatVector(void) |  | ||||||
|   { |  | ||||||
|     return flatVector_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const std::vector<size_t> & Flatten<V>::getDim(void) |  | ||||||
|   { |  | ||||||
|     return dim_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Reconstruct class template implementation ///////////////////////////////// |  | ||||||
|   template <typename V> |  | ||||||
|   void Reconstruct<V>::fill(std::vector<Element> &v) |  | ||||||
|   { |  | ||||||
|     for (auto &e: v) |  | ||||||
|     { |  | ||||||
|       e = flatVector_[ind_++]; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   template <typename W> |  | ||||||
|   void Reconstruct<V>::fill(W &v) |  | ||||||
|   { |  | ||||||
|     for (auto &e: v) |  | ||||||
|     { |  | ||||||
|       fill(e); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim) |  | ||||||
|   { |  | ||||||
|     v.resize(dim_[dim]); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   template <typename W> |  | ||||||
|   void Reconstruct<V>::resize(W &v, const unsigned int dim) |  | ||||||
|   { |  | ||||||
|     v.resize(dim_[dim]); |  | ||||||
|     for (auto &e: v) |  | ||||||
|     { |  | ||||||
|       resize(e, dim + 1); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector, |  | ||||||
|                               const std::vector<size_t> &dim) |  | ||||||
|   : flatVector_(flatVector) |  | ||||||
|   , dim_(dim) |  | ||||||
|   { |  | ||||||
|     resize(vector_, 0); |  | ||||||
|     fill(vector_); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const V & Reconstruct<V>::getVector(void) |  | ||||||
|   { |  | ||||||
|     return vector_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const std::vector<typename Reconstruct<V>::Element> & |  | ||||||
|   Reconstruct<V>::getFlatVector(void) |  | ||||||
|   { |  | ||||||
|     return flatVector_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename V> |  | ||||||
|   const std::vector<size_t> & Reconstruct<V>::getDim(void) |  | ||||||
|   { |  | ||||||
|     return dim_; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Generic writer interface ////////////////////////////////////////////////// |  | ||||||
|   template <typename T> |  | ||||||
|   inline void push(Writer<T> &w, const std::string &s) { |  | ||||||
|     w.push(s); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T> |  | ||||||
|   inline void push(Writer<T> &w, const char *s) |  | ||||||
|   { |  | ||||||
|     w.push(std::string(s)); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T> |  | ||||||
|   inline void pop(Writer<T> &w) |  | ||||||
|   { |  | ||||||
|     w.pop(); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T, typename U> |  | ||||||
|   inline void write(Writer<T> &w, const std::string& s, const U &output) |  | ||||||
|   { |  | ||||||
|     w.write(s, output); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Generic reader interface |  | ||||||
|   template <typename T> |  | ||||||
|   inline bool push(Reader<T> &r, const std::string &s) |  | ||||||
|   { |  | ||||||
|     return r.push(s); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T> |  | ||||||
|   inline bool push(Reader<T> &r, const char *s) |  | ||||||
|   { |  | ||||||
|     return r.push(std::string(s)); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T> |  | ||||||
|   inline void pop(Reader<T> &r) |  | ||||||
|   { |  | ||||||
|     r.pop(); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   template <typename T, typename U> |  | ||||||
|   inline void read(Reader<T> &r, const std::string &s, U &output) |  | ||||||
|   { |  | ||||||
|     r.read(s, output); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Writer template implementation //////////////////////////////////////////// |  | ||||||
|   template <typename T> |   template <typename T> | ||||||
|   Writer<T>::Writer(void) |   Writer<T>::Writer(void) | ||||||
|   { |   { | ||||||
| @@ -468,6 +183,27 @@ namespace Grid { | |||||||
|     upcast->writeDefault(s, output); |     upcast->writeDefault(s, output); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U> | ||||||
|  |   void Writer<T>::write(const std::string &s, const iScalar<U> &output) | ||||||
|  |   { | ||||||
|  |     upcast->writeDefault(s, tensorToVec(output)); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U, int N> | ||||||
|  |   void Writer<T>::write(const std::string &s, const iVector<U, N> &output) | ||||||
|  |   { | ||||||
|  |     upcast->writeDefault(s, tensorToVec(output)); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U, int N> | ||||||
|  |   void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output) | ||||||
|  |   { | ||||||
|  |     upcast->writeDefault(s, tensorToVec(output)); | ||||||
|  |   } | ||||||
|  |    | ||||||
|   // Reader template implementation |   // Reader template implementation | ||||||
|   template <typename T> |   template <typename T> | ||||||
|   Reader<T>::Reader(void) |   Reader<T>::Reader(void) | ||||||
| @@ -503,6 +239,36 @@ namespace Grid { | |||||||
|     upcast->readDefault(s, output); |     upcast->readDefault(s, output); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U> | ||||||
|  |   void Reader<T>::read(const std::string &s, iScalar<U> &output) | ||||||
|  |   { | ||||||
|  |     typename TensorToVec<iScalar<U>>::type v; | ||||||
|  |  | ||||||
|  |     upcast->readDefault(s, v); | ||||||
|  |     vecToTensor(output, v); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U, int N> | ||||||
|  |   void Reader<T>::read(const std::string &s, iVector<U, N> &output) | ||||||
|  |   { | ||||||
|  |     typename TensorToVec<iVector<U, N>>::type v; | ||||||
|  |      | ||||||
|  |     upcast->readDefault(s, v); | ||||||
|  |     vecToTensor(output, v); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T> | ||||||
|  |   template <typename U, int N> | ||||||
|  |   void Reader<T>::read(const std::string &s, iMatrix<U, N> &output) | ||||||
|  |   { | ||||||
|  |     typename TensorToVec<iMatrix<U, N>>::type v; | ||||||
|  |      | ||||||
|  |     upcast->readDefault(s, v); | ||||||
|  |     vecToTensor(output, v); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   template <typename T> |   template <typename T> | ||||||
|   template <typename U> |   template <typename U> | ||||||
|   void Reader<T>::fromString(U &output, const std::string &s) |   void Reader<T>::fromString(U &output, const std::string &s) | ||||||
| @@ -521,6 +287,76 @@ namespace Grid { | |||||||
|       abort(); |       abort(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   // serializable base class /////////////////////////////////////////////////// | ||||||
|  |   class Serializable | ||||||
|  |   { | ||||||
|  |   public: | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void write(Writer<T> &WR,const std::string &s, | ||||||
|  |                              const Serializable &obj) | ||||||
|  |     {} | ||||||
|  |      | ||||||
|  |     template <typename T> | ||||||
|  |     static inline void read(Reader<T> &RD,const std::string &s, | ||||||
|  |                             Serializable &obj) | ||||||
|  |     {} | ||||||
|  |      | ||||||
|  |     friend inline std::ostream & operator<<(std::ostream &os, | ||||||
|  |                                             const Serializable &obj) | ||||||
|  |     { | ||||||
|  |       return os; | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  |   // Generic writer interface ////////////////////////////////////////////////// | ||||||
|  |   template <typename T> | ||||||
|  |   inline void push(Writer<T> &w, const std::string &s) { | ||||||
|  |     w.push(s); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T> | ||||||
|  |   inline void push(Writer<T> &w, const char *s) | ||||||
|  |   { | ||||||
|  |     w.push(std::string(s)); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T> | ||||||
|  |   inline void pop(Writer<T> &w) | ||||||
|  |   { | ||||||
|  |     w.pop(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T, typename U> | ||||||
|  |   inline void write(Writer<T> &w, const std::string& s, const U &output) | ||||||
|  |   { | ||||||
|  |     w.write(s, output); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   // Generic reader interface ////////////////////////////////////////////////// | ||||||
|  |   template <typename T> | ||||||
|  |   inline bool push(Reader<T> &r, const std::string &s) | ||||||
|  |   { | ||||||
|  |     return r.push(s); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T> | ||||||
|  |   inline bool push(Reader<T> &r, const char *s) | ||||||
|  |   { | ||||||
|  |     return r.push(std::string(s)); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T> | ||||||
|  |   inline void pop(Reader<T> &r) | ||||||
|  |   { | ||||||
|  |     r.pop(); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <typename T, typename U> | ||||||
|  |   inline void read(Reader<T> &r, const std::string &s, U &output) | ||||||
|  |   { | ||||||
|  |     r.read(s, output); | ||||||
|  |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user