Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: feature/sh...feature/mi (3 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 1e3fb32572 |  |
|  | 0d5af667d8 |  |
|  | e9712bc7fb |  |
.travis.yml
@@ -102,5 +102,5 @@ script:
     - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
     - make -j4
     - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
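For orientation: the `--mpi 2.1.1.1` argument above requests a 2.1.1.1 rank decomposition, i.e. two ranks along x and one along each of y, z, t. A minimal sketch of decoding such a dotted geometry string; `decode_mpi_layout` is an illustrative helper of ours, not part of Grid:

```cpp
// Hedged sketch: turn "2.1.1.1" into a per-dimension rank layout.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> decode_mpi_layout(const std::string &s) {
  std::vector<int> layout;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, '.')) layout.push_back(std::stoi(item));
  return layout;                        // "2.1.1.1" -> {2,1,1,1}
}

int main() {
  for (int n : decode_mpi_layout("2.1.1.1")) std::cout << n << " ";
  std::cout << std::endl;               // prints: 2 1 1 1
  return 0;
}
```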

benchmarks/Benchmark_comms.cc
@@ -48,9 +48,9 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
-  int maxlat=24;
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  int maxlat=16;
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
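The new bounds trade fewer, larger messages (L up to 24, Ls from 8) for a denser scan of small volumes (L up to 16 in steps of 2, Ls from 1). A hedged sketch of how such benchmarks typically convert message size and elapsed time into the "MB/s uni"/"MB/s bidi" columns printed above; the function and its exact formula are illustrative, not Grid's bookkeeping:

```cpp
// Bytes moved per direction per iteration, divided by elapsed microseconds,
// gives MB/s directly (bytes/usec == MB/s).
#include <cstdio>

int main() {
  double Nloop = 100, nmu = 4, bytes = 1.5e5, t_usec = 2.0e5;  // made-up inputs
  double xbytes    = Nloop * bytes * 2 * nmu;  // one-way traffic, both faces per dim
  double bidibytes = xbytes * 2;               // count sends and receives
  std::printf("MB/s uni  %f\n", xbytes / t_usec);
  std::printf("MB/s bidi %f\n", bidibytes / t_usec);
  return 0;
}
```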
@@ -124,8 +124,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
 
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat,lat,lat,lat});
 
@@ -194,14 +194,14 @@ int main (int argc, char ** argv)
   }   
 
 
-  Nloop=10;
+  Nloop=100;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -281,8 +281,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
+    for(int Ls=1;Ls<=16;Ls*=2){
 
       std::vector<int> latt_size  ({lat*mpi_layout[0],
       				    lat*mpi_layout[1],
@@ -324,8 +324,8 @@ int main (int argc, char ** argv)
 					    (void *)&rbuf[mu][0],
 					    recv_from_rank,
 					    bytes);
-	    Grid.StencilSendToRecvFromComplete(requests);
-	    requests.resize(0);
+	    //	    Grid.StencilSendToRecvFromComplete(requests);
+	    //	    requests.resize(0);
 
 	    comm_proc = mpi_layout[mu]-1;
 	   
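The hunk above comments out the per-direction completion, so all stencil exchanges stay in flight until a single completion later, rather than being waited on one direction at a time. A toy sketch of that posted-then-completed pattern, with stand-in names rather than Grid's API:

```cpp
// Post all non-blocking exchanges first, complete them once.
// `stencil_send_recv` and `complete_all` are stubs, not Grid calls.
#include <vector>

struct Request { int mu; };
static Request stencil_send_recv(int mu) { return Request{mu}; }       // post (stub)
static void complete_all(std::vector<Request> &reqs) { reqs.clear(); } // wait (stub)

int main() {
  const int nmu = 4;
  std::vector<Request> reqs;
  for (int mu = 0; mu < nmu; mu++) {
    reqs.push_back(stencil_send_recv(mu));      // post, don't wait
    // before the change: complete_all(reqs);   // serialised per direction
  }
  complete_all(reqs);                           // one completion, overlapped
  return 0;
}
```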

benchmarks/Benchmark_dwf.cc
@@ -48,16 +48,16 @@ typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 
 
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
 
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
+  const int Ls=8;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -72,65 +72,34 @@ int main (int argc, char ** argv)
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
 
-  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
-  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
-  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
   LatticeFermion src   (FGrid); random(RNG5,src);
-#if 0
-  src = zero;
-  {
-    std::vector<int> origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=zero;
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp<<std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#else
-  RealD N2 = 1.0/::sqrt(norm2(src));
-  src = src*N2;
-#endif
 
 
   LatticeFermion result(FGrid); result=zero;
   LatticeFermion    ref(FGrid);    ref=zero;
   LatticeFermion    tmp(FGrid);
   LatticeFermion    err(FGrid);
 
-  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
   LatticeGaugeField Umu(UGrid); 
-  SU3::HotConfiguration(RNG4,Umu); 
+  random(RNG4,Umu);
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrix ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //    if (mu !=2 ) ttmp = 0;
-    //    ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif
 
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  // replicate across fifth dimension
   LatticeGaugeField Umu5d(FGrid); 
-  std::vector<LatticeColourMatrix> U(4,FGrid);
+  // replicate across fifth dimension
   for(int ss=0;ss<Umu._grid->oSites();ss++){
     for(int s=0;s<Ls;s++){
       Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
     }
   }
 
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
   for(int mu=0;mu<Nd;mu++){
     U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
   }
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
 
   if (1)
   {
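The replication loop relies on the 5d layout putting s fastest, so 4d outer site ss owns the Ls consecutive 5d sites Ls*ss+s. A self-contained sketch of that index arithmetic, on generic containers rather than Grid types:

```cpp
// Sketch of the s-fastest index map used above: 5d outer site = Ls*ss + s.
#include <cstddef>
#include <vector>

template <class T>
void replicate_5d(std::vector<T> &u5d, const std::vector<T> &u4d, int Ls) {
  // precondition: u5d.size() == Ls * u4d.size()
  for (std::size_t ss = 0; ss < u4d.size(); ss++)
    for (int s = 0; s < Ls; s++)
      u5d[Ls * ss + s] = u4d[ss];     // every s-slice sees the same 4d link
}

int main() {
  std::vector<int> u4d = {10, 20, 30};
  std::vector<int> u5d(2 * u4d.size());
  replicate_5d(u5d, u4d, 2);          // u5d == {10,10,20,20,30,30}
  return 0;
}
```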
@@ -152,7 +121,6 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
-  std::cout << GridLogMessage << "Creating action operator " << std::endl;
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@@ -168,11 +136,10 @@ int main (int argc, char ** argv)
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
-  int ncall =1000;
+  int ncall =100;
   if (1) {
     FGrid->Barrier();
     Dw.ZeroCounters();
-    Dw.Dhop(src,result,0);
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       __SSC_START;
@@ -186,22 +153,12 @@ int main (int argc, char ** argv)
     double flops=1344*volume*ncall;
 
     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result; 
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 
-    /*
-    if(( norm2(err)>1.0e-4) ) { 
-      std::cout << "RESULT\n " << result<<std::endl;
-      std::cout << "REF   \n " << ref   <<std::endl;
-      std::cout << "ERR   \n " << err   <<std::endl;
-      FGrid->Barrier();
-      exit(-1);
-    }
-    */
     assert (norm2(err)< 1.0e-4 );
     Dw.Report();
   }
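Since usecond() returns microseconds, flops/(t1-t0) is already Mflop/s; 1344 is the flop count Grid charges per 5d site for one Dhop application. A toy check of the arithmetic with made-up numbers:

```cpp
// Rate arithmetic used above: flops / microseconds == Mflop/s directly.
#include <cstdio>

int main() {
  double volume = 8.0 * 16 * 16 * 16 * 16;  // example: Ls=8 on a 16^4 lattice
  int    ncall  = 100;
  double flops  = 1344.0 * volume * ncall;  // Grid's per-site Dslash flop count
  double t_us   = 2.0e6;                    // pretend the loop took 2 seconds
  std::printf("mflop/s = %f\n", flops / t_us);
  return 0;
}
```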
@@ -226,12 +183,20 @@ int main (int argc, char ** argv)
 
     WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
   
-    localConvert(src,ssrc);
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
     std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
     FGrid->Barrier();
-    sDw.Dhop(ssrc,sresult,0);
-    sDw.ZeroCounters();
     double t0=usecond();
+    sDw.ZeroCounters();
     for(int i=0;i<ncall;i++){
       __SSC_START;
       sDw.Dhop(ssrc,sresult,0);
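Here localConvert is replaced by an explicit coordinate-addressed copy: peekSite/pokeSite index by global coordinate {s,x,y,z,t}, so the transfer is independent of how either grid lays sites out internally. A toy analogue moving data between two different memory orders:

```cpp
// Coordinate-addressed copy between two layouts of the same logical sites:
// neither container's internal ordering leaks into the transfer.
#include <cassert>
#include <vector>

int main() {
  const int L = 4, Ls = 2;
  std::vector<int> a(L * Ls), b(L * Ls);
  for (int i = 0; i < L * Ls; i++) a[i] = i;
  // a stores s-fastest {s + Ls*x}; b stores x-fastest {x + L*s}
  for (int x = 0; x < L; x++)
    for (int s = 0; s < Ls; s++)
      b[x + L * s] = a[s + Ls * x];    // "peek" from a, "poke" into b
  assert(b[1 + L * 1] == 1 + Ls * 1);  // site (x=1,s=1) carried across
  return 0;
}
```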
@@ -245,47 +210,46 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
-    //    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
     sDw.Report();
+  
+    if(0){
+      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+	sDw.Dhop(ssrc,sresult,0);
+	PerformanceCounter Counter(i);
+	Counter.Start();
+	sDw.Dhop(ssrc,sresult,0);
+	Counter.Stop();
+	Counter.Report();
+      }
+    }
 
+    std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
 
     RealD sum=0;
-    err=zero;
-    localConvert(sresult,err);
-    err = err - ref;
-    sum = norm2(err);
-    std::cout<<GridLogMessage<<" difference between normal ref and simd is "<<sum<<std::endl;
-    if(sum > 1.0e-4 ){
-      std::cout<< "sD REF\n " <<ref << std::endl;
-      std::cout<< "sD ERR   \n " <<err  <<std::endl;
-    }
-    //    assert(sum < 1.0e-4);
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector normal, simd;
+      peekSite(normal,result,site);
+      peekSite(simd,sresult,site);
+      sum=sum+norm2(normal-simd);
+      if (norm2(normal-simd) > 1.0e-6 ) {
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
+	std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd   "<<simd<<std::endl;
+      }
+    }}}}}
+    std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+    assert (sum< 1.0e-4 );
 
-    err=zero;
-    localConvert(sresult,err);
-    err = err - result;
-    sum = norm2(err);
-    std::cout<<GridLogMessage<<" difference between normal result and simd is "<<sum<<std::endl;
-    if(sum > 1.0e-4 ){
-      std::cout<< "sD REF\n " <<result << std::endl;
-      std::cout<< "sD ERR   \n " << err  <<std::endl;
-    }
-    assert(sum < 1.0e-4);
 
-    if(1){
+    if (1) {
-      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
-      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
-      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) 
-	std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) 
-	std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) 
-	std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 
       LatticeFermion sr_eo(sFGrid);
 
       LatticeFermion ssrc_e (sFrbGrid);
       LatticeFermion ssrc_o (sFrbGrid);
       LatticeFermion sr_e   (sFrbGrid);
@@ -293,23 +257,33 @@ int main (int argc, char ** argv)
 
       pickCheckerboard(Even,ssrc_e,ssrc);
       pickCheckerboard(Odd,ssrc_o,ssrc);
-      //      setCheckerboard(sr_eo,ssrc_o);
-      //      setCheckerboard(sr_eo,ssrc_e);
+      setCheckerboard(sr_eo,ssrc_o);
+      setCheckerboard(sr_eo,ssrc_e);
 
       sr_e = zero;
       sr_o = zero;
 
+      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
+      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
+      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+
       FGrid->Barrier();
-      sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
       sDw.ZeroCounters();
-      //      sDw.stat.init("DhopEO");
+      sDw.stat.init("DhopEO");
       double t0=usecond();
       for (int i = 0; i < ncall; i++) {
         sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
       }
       double t1=usecond();
       FGrid->Barrier();
-      //      sDw.stat.print();
+      sDw.stat.print();
 
       double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
       double flops=(1344.0*volume*ncall)/2;
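The flop count is halved because DhopEO touches only one checkerboard parity. A toy model of the pick/set round trip exercised above, with plain arrays standing in for Grid fields:

```cpp
// Red-black (even/odd) checkerboarding on a 1-d array: "pick" gathers one
// parity, "set" scatters it back; recombining both parities must reproduce
// the original field exactly.
#include <cassert>
#include <vector>

int main() {
  std::vector<double> full(16), even, odd;
  for (int i = 0; i < 16; i++) full[i] = i;
  for (int i = 0; i < 16; i++) (i % 2 ? odd : even).push_back(full[i]);  // pick
  std::vector<double> rebuilt(16);
  for (int i = 0; i < 8; i++) { rebuilt[2*i] = even[i]; rebuilt[2*i+1] = odd[i]; } // set
  for (int i = 0; i < 16; i++) assert(rebuilt[i] == full[i]);  // round trip exact
  return 0;
}
```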
@@ -324,26 +298,22 @@ int main (int argc, char ** argv)
 
       pickCheckerboard(Even,ssrc_e,sresult);
       pickCheckerboard(Odd ,ssrc_o,sresult);
-
       ssrc_e = ssrc_e - sr_e;
       RealD error = norm2(ssrc_e);
-      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
-
+      std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
       ssrc_o = ssrc_o - sr_o;
-
       error+= norm2(ssrc_o);
       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
-      if(( error>1.0e-4) ) { 
+      if(error>1.0e-4) { 
 	setCheckerboard(ssrc,ssrc_o);
 	setCheckerboard(ssrc,ssrc_e);
-	std::cout<< "DIFF\n " <<ssrc << std::endl;
-	setCheckerboard(ssrc,sr_o);
-	setCheckerboard(ssrc,sr_e);
-	std::cout<< "CBRESULT\n " <<ssrc << std::endl;
-	std::cout<< "RESULT\n " <<sresult<< std::endl;
+	std::cout<< ssrc << std::endl;
       }
-      assert(error<1.0e-4);
     }
 
 
   }
 
   if (1)
@@ -365,19 +335,14 @@ int main (int argc, char ** argv)
     }
     ref = -0.5*ref;
   }
-  //  dump=1;
   Dw.Dhop(src,result,1);
   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
   std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
+  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   err = ref-result; 
-  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
-  if((norm2(err)>1.0e-4)){
-	std::cout<< "DAG RESULT\n "  <<ref     << std::endl;
-	std::cout<< "DAG sRESULT\n " <<result  << std::endl;
-	std::cout<< "DAG ERR   \n "  << err    <<std::endl;
-  }
+  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-4);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e   (FrbGrid);
@@ -385,18 +350,13 @@ int main (int argc, char ** argv)
   LatticeFermion r_eo  (FGrid);
 
 
-  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
   pickCheckerboard(Even,src_e,src);
   pickCheckerboard(Odd,src_o,src);
 
   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 
-
-  // S-direction is INNERMOST and takes no part in the parity.
-  static int Opt;  // these are a temporary hack
-  static int Comms;  // these are a temporary hack
-
   std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl;
   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
@@ -409,7 +369,6 @@ int main (int argc, char ** argv)
   {
     Dw.ZeroCounters();
     FGrid->Barrier();
-    Dw.DhopEO(src_o,r_e,DaggerNo);
     double t0=usecond();
     for(int i=0;i<ncall;i++){
       Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -437,19 +396,14 @@ int main (int argc, char ** argv)
 
   err = r_eo-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  if((norm2(err)>1.0e-4)){
-	std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
-	std::cout<< "Deo REF\n " <<result  << std::endl;
-	std::cout<< "Deo ERR   \n " << err <<std::endl;
-  }
+  assert(norm2(err)<1.0e-4);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-
-  //assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
+  assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
 
   Grid_finalize();
 }

@@ -66,8 +66,7 @@ int main (int argc, char ** argv)
 
     Vec tsum; tsum = zero;
 
-    GridParallelRNG          pRNG(&Grid);      
-    pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
+    GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
     std::vector<double> stop(threads);
     Vector<Vec> sum(threads);
@@ -78,7 +77,8 @@ int main (int argc, char ** argv)
     }
 
     double start=usecond();
-    parallel_for(int t=0;t<threads;t++){
+PARALLEL_FOR_LOOP
+    for(int t=0;t<threads;t++){
 
       sum[t] = x[t]._odata[0];
      for(int i=0;i<Nloop;i++){
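These hunks, and the ones below, toggle between reproducible fixed-integer seeding and fresh entropy per run. A sketch of the two strategies using std:: generators; GridParallelRNG's actual interface differs:

```cpp
// Fixed seeds give bit-reproducible streams; random_device does not.
#include <random>
#include <vector>

std::mt19937 seed_fixed(const std::vector<int> &seeds) {
  std::seed_seq seq(seeds.begin(), seeds.end());
  return std::mt19937(seq);     // same seeds -> identical stream every run
}

std::mt19937 seed_random_device() {
  std::random_device rd;
  return std::mt19937(rd());    // fresh entropy -> different stream every run
}

int main() {
  auto a = seed_fixed({56, 17, 89, 101});
  auto b = seed_fixed({56, 17, 89, 101});
  return (a() == b()) ? 0 : 1;  // reproducible: returns 0
}
```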

@@ -65,7 +65,7 @@ int main (int argc, char ** argv)
 
       uint64_t Nloop=NLOOP;
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -100,7 +100,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -138,7 +138,7 @@ int main (int argc, char ** argv)
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
@@ -173,7 +173,7 @@ int main (int argc, char ** argv)
       uint64_t Nloop=NLOOP;
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
       LatticeVec z(&Grid); //random(pRNG,z);
       LatticeVec x(&Grid); //random(pRNG,x);
       LatticeVec y(&Grid); //random(pRNG,y);

benchmarks/Benchmark_staggered.cc (deleted)
@@ -1,134 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./benchmarks/Benchmark_staggered.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  std::vector<int> latt_size   = GridDefaultLatt();
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  std::vector<int> mpi_layout  = GridDefaultMpi();
-  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
-  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-  std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
-  std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
-  std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
-
-  std::vector<int> seeds({1,2,3,4});
-  GridParallelRNG          pRNG(&Grid);
-  pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
-
-  typedef typename ImprovedStaggeredFermionR::FermionField FermionField; 
-  typename ImprovedStaggeredFermionR::ImplParams params; 
-
-  FermionField src   (&Grid); random(pRNG,src);
-  FermionField result(&Grid); result=zero;
-  FermionField    ref(&Grid);    ref=zero;
-  FermionField    tmp(&Grid);    tmp=zero;
-  FermionField    err(&Grid);    tmp=zero;
-  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
-  std::vector<LatticeColourMatrix> U(4,&Grid);
-
-  double volume=1;
-  for(int mu=0;mu<Nd;mu++){
-    volume=volume*latt_size[mu];
-  }  
-
-  // Only one non-zero (y)
-#if 0
-  Umu=zero;
-  Complex cone(1.0,0.0);
-  for(int nn=0;nn<Nd;nn++){
-    random(pRNG,U[nn]);
-    if(1) {
-      if (nn!=2) { U[nn]=zero; std::cout<<GridLogMessage << "zeroing gauge field in dir "<<nn<<std::endl; }
-      //      else       { U[nn]= cone;std::cout<<GridLogMessage << "unit gauge field in dir "<<nn<<std::endl; }
-      else       { std::cout<<GridLogMessage << "random gauge field in dir "<<nn<<std::endl; }
-    }
-    PokeIndex<LorentzIndex>(Umu,U[nn],nn);
-  }
-#endif
-
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-  }
-  ref = zero;
-  /*  
-  { // Naive wilson implementation
-    ref = zero;
-    for(int mu=0;mu<Nd;mu++){
-      //    ref =  src + Gamma(Gamma::GammaX)* src ; // 1-gamma_x
-      tmp = U[mu]*Cshift(src,mu,1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] - Gamma(Gmu[mu])*tmp._odata[i]; ;
-      }
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu,-1);
-      for(int i=0;i<ref._odata.size();i++){
-	ref._odata[i]+= tmp._odata[i] + Gamma(Gmu[mu])*tmp._odata[i]; ;
-      }
-    }
-  }
-  ref = -0.5*ref;
-  */
-
-  RealD mass=0.1;
-  RealD c1=9.0/8.0;
-  RealD c2=-1.0/24.0;
-  RealD u0=1.0;
-  ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0,params);
-  
-  std::cout<<GridLogMessage << "Calling Ds"<<std::endl;
-  int ncall=1000;
-  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Ds.Dhop(src,result,0);
-  }
-  double t1=usecond();
-  double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + 90 == 1146
-  
-  std::cout<<GridLogMessage << "Called Ds"<<std::endl;
-  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
-  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-  err = ref-result; 
-  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-
-  Grid_finalize();
-}
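The flop comment in the deleted benchmark encodes: 16 SU(3) matrix-vector products per site at 3*(6+8+8)=66 flops each, plus 15*3*2=90 accumulation flops, i.e. 1146 flops per site. A one-line check of that arithmetic:

```cpp
// Verify the deleted file's per-site staggered flop count.
#include <cassert>
int main() {
  int matvec   = 3 * (6 + 8 + 8);            // one SU(3) mat-vec = 66 flops
  int per_site = 16 * matvec + 15 * 3 * 2;   // 1056 + 90
  assert(per_site == 1146);
  return 0;
}
```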
@@ -55,7 +55,7 @@ int main (int argc, char ** argv)
       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeColourMatrix z(&Grid);// random(pRNG,z);
       LatticeColourMatrix x(&Grid);// random(pRNG,x);
@@ -88,7 +88,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 
       GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
-      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 
       LatticeColourMatrix z(&Grid); //random(pRNG,z);
       LatticeColourMatrix x(&Grid); //random(pRNG,x);

@@ -69,7 +69,7 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   GridParallelRNG          pRNG(&Grid);
   pRNG.SeedFixedIntegers(seeds);
-  //  pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
+  //  pRNG.SeedRandomDevice();
 
   LatticeFermion src   (&Grid); random(pRNG,src);
   LatticeFermion result(&Grid); result=zero;

configure.ac
@@ -321,7 +321,7 @@ AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])
 ############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937|sitmo],\
 	            [Select Random Number Generator to be used])],\
-	            [ac_RNG=${enable_rng}],[ac_RNG=sitmo])
+	            [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
 
 case ${ac_RNG} in
      ranlux48)
@@ -401,7 +401,6 @@ AC_CONFIG_FILES(tests/hadrons/Makefile)
 AC_CONFIG_FILES(tests/hmc/Makefile)
 AC_CONFIG_FILES(tests/solver/Makefile)
 AC_CONFIG_FILES(tests/qdpxx/Makefile)
-AC_CONFIG_FILES(tests/testu01/Makefile)
 AC_CONFIG_FILES(benchmarks/Makefile)
 AC_CONFIG_FILES(extras/Makefile)
 AC_CONFIG_FILES(extras/Hadrons/Makefile)

lib/algorithms/Algorithms.h
@@ -39,17 +39,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/algorithms/approx/MultiShiftFunction.h>
 
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
+#include <Grid/algorithms/iterative/ConjugateGradientShifted.h>
 #include <Grid/algorithms/iterative/ConjugateResidual.h>
 #include <Grid/algorithms/iterative/NormalEquations.h>
 #include <Grid/algorithms/iterative/SchurRedBlack.h>
+
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 
 // Lanczos support
 #include <Grid/algorithms/iterative/MatrixUtils.h>
 #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
+
 #include <Grid/algorithms/CoarsenedMatrix.h>
-#include <Grid/algorithms/FFT.h>
 
 // Eigen/lanczos
 // EigCg

@@ -1,7 +1,7 @@
 
 
 
-#include <Grid/GridCore.h>
+#include <Grid/Grid.h>
 
 namespace Grid {
 
@@ -13,10 +13,9 @@ void *PointerCache::Insert(void *ptr,size_t bytes) {
 
   if (bytes < 4096 ) return NULL;
 
-#ifdef GRID_OMP
+#ifdef _OPENMP
   assert(omp_in_parallel()==0);
 #endif 
-
   void * ret = NULL;
   int v = -1;
 
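The guard logic above caches only allocations of at least 4096 bytes and asserts it is never entered from inside an OpenMP parallel region; the hunk itself only switches the guard macro from GRID_OMP to _OPENMP. A compact sketch of the same pattern, with a stand-in function rather than Grid's PointerCache:

```cpp
// Guarding a pointer cache against reentrancy from threaded regions.
#include <cassert>
#include <cstddef>
#ifdef _OPENMP
#include <omp.h>
#endif

void *cache_insert(void *ptr, std::size_t bytes) {
  if (bytes < 4096) return nullptr;   // small blocks are not worth caching
#ifdef _OPENMP
  assert(omp_in_parallel() == 0);     // the cache itself is not thread-safe
#endif
  // ... choose a victim slot, stash ptr, return any evicted pointer ...
  (void)ptr;
  return nullptr;
}

int main() { return cache_insert(nullptr, 100) == nullptr ? 0 : 1; }
```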
							
								
								
									
lib/Grid.h
@@ -38,10 +38,52 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_H
 #define GRID_H
 
-#include <Grid/GridCore.h>
-#include <Grid/GridQCDcore.h>
-#include <Grid/qcd/action/Action.h>
-#include <Grid/qcd/smearing/Smearing.h>
-#include <Grid/qcd/hmc/HMC_aggregate.h>
+///////////////////
+// Std C++ dependencies
+///////////////////
+#include <cassert>
+#include <complex>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+#include <random>
+#include <functional>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <ctime>
+#include <sys/time.h>
+#include <chrono>
+
+///////////////////
+// Grid headers
+///////////////////
+#include "Config.h"
+#include <Grid/Timer.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Log.h>
+#include <Grid/AlignedAllocator.h>
+#include <Grid/Simd.h>
+#include <Grid/serialisation/Serialisation.h>
+#include <Grid/Threads.h>
+#include <Grid/Lexicographic.h>
+#include <Grid/Init.h>
+#include <Grid/Communicator.h>
+#include <Grid/Cartesian.h>
+#include <Grid/Tensors.h>
+#include <Grid/Lattice.h>
+#include <Grid/Cshift.h>
+#include <Grid/Stencil.h>
+#include <Grid/Algorithms.h>
+#include <Grid/parallelIO/BinaryIO.h>
+#include <Grid/FFT.h>
+
+#include <Grid/qcd/QCD.h>
+#include <Grid/parallelIO/NerscIO.h>
+#include <Grid/qcd/hmc/NerscCheckpointer.h>
+#include <Grid/qcd/hmc/HmcRunner.h>
+
+
+
 #endif
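With the monolithic lib/Grid.h restored, user code needs a single include. A minimal application skeleton in the style of the benchmarks in this compare, assuming a configured Grid build:

```cpp
// Minimal Grid application: one umbrella include, init/finalize bracket.
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);                 // MPI / threading / SIMD setup
  std::vector<int> latt = GridDefaultLatt();
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      latt, GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  LatticeFermion src(UGrid);               // fields live on a grid
  Grid_finalize();
  return 0;
}
```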

lib/GridCore.h (deleted; its internal banner still reads ./lib/Grid.h)
@@ -1,81 +0,0 @@
-    /*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid 
-
-    Source file: ./lib/Grid.h
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: azusayamaguchi <ayamaguc@YAMAKAZE.local>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-    *************************************************************************************/
-    /*  END LEGAL */
-//
-//  Grid.h
-//  simd
-//
-//  Created by Peter Boyle on 09/05/2014.
-//  Copyright (c) 2014 University of Edinburgh. All rights reserved.
-//
-
-#ifndef GRID_BASE_H
-#define GRID_BASE_H
-
-///////////////////
-// Std C++ dependencies
-///////////////////
-#include <cassert>
-#include <complex>
-#include <vector>
-#include <iostream>
-#include <iomanip>
-#include <random>
-#include <functional>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <signal.h>
-#include <ctime>
-#include <sys/time.h>
-#include <chrono>
-
-///////////////////
-// Grid headers
-///////////////////
-#include "Config.h"
-
-#include <Grid/perfmon/Timer.h>
-#include <Grid/perfmon/PerfCount.h>
-#include <Grid/log/Log.h>
-#include <Grid/allocator/AlignedAllocator.h>
-#include <Grid/simd/Simd.h>
-#include <Grid/serialisation/Serialisation.h>
-#include <Grid/threads/Threads.h>
-#include <Grid/util/Util.h>
-#include <Grid/communicator/Communicator.h>
-#include <Grid/cartesian/Cartesian.h>
-#include <Grid/tensors/Tensors.h>
-#include <Grid/lattice/Lattice.h>
-#include <Grid/cshift/Cshift.h>
-#include <Grid/stencil/Stencil.h>
-#include <Grid/parallelIO/BinaryIO.h>
-#include <Grid/algorithms/Algorithms.h>
-
-#endif
| @@ -1,4 +1,4 @@ | |||||||
| /*************************************************************************************
 |     /*************************************************************************************
 | ||||||
| 
 | 
 | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| 
 | 
 | ||||||
@@ -41,13 +41,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <signal.h>
 #include <iostream>
 #include <iterator>
+#include <Grid/Grid.h>
 #include <algorithm>
 #include <iterator>
 #include <cstdlib>
 #include <memory>
 
-#include <Grid/Grid.h>
-
 
 #include <fenv.h>
 #ifdef __APPLE__
@@ -220,57 +219,8 @@ void Grid_init(int *argc,char ***argv)
     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
   }
 
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
-    Grid_debug_handler_init();
-  }
-
   CartesianCommunicator::Init(argc,argv);
 
-  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
-    Grid_quiesce_nodes();
-  } else { 
-    std::ostringstream fname;
-    fname<<"Grid.stdout.";
-    fname<<CartesianCommunicator::RankWorld();
-    freopen(fname.str().c_str(),"w",stdout);
-  }
-
-  ////////////////////////////////////
-  // Banner
-  ////////////////////////////////////
-  if ( CartesianCommunicator::RankWorld() == 0 ) { 
-    std::cout <<std::endl;
-    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
-    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
-    std::cout  << "__|_ |  |  |  |  |  |  |  |  |  |  |  | _|__"<<std::endl; 
-    std::cout  << "__|_                                    _|__"<<std::endl; 
-    std::cout  << "__|_   GGGG    RRRR    III    DDDD      _|__"<<std::endl;
-    std::cout  << "__|_  G        R   R    I     D   D     _|__"<<std::endl;
-    std::cout  << "__|_  G        R   R    I     D    D    _|__"<<std::endl;
-    std::cout  << "__|_  G  GG    RRRR     I     D    D    _|__"<<std::endl;
-    std::cout  << "__|_  G   G    R  R     I     D   D     _|__"<<std::endl;
-    std::cout  << "__|_   GGGG    R   R   III    DDDD      _|__"<<std::endl;
-    std::cout  << "__|_                                    _|__"<<std::endl; 
-    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
-    std::cout  << "__|__|__|__|__|__|__|__|__|__|__|__|__|__|__"<<std::endl; 
-    std::cout  << "  |  |  |  |  |  |  |  |  |  |  |  |  |  |  "<<std::endl; 
-    std::cout << std::endl;
-    std::cout << std::endl;
-    std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
-    std::cout << std::endl;
-    std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
-    std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
-    std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
-    std::cout << "(at your option) any later version."<<std::endl;
-    std::cout << std::endl;
-    std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
-    std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
-    std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
-    std::cout << "GNU General Public License for more details."<<std::endl;
-    std::cout << std::endl;
-  }
-
-
   ////////////////////////////////////
   // Logging
   ////////////////////////////////////
@@ -280,6 +230,9 @@ void Grid_init(int *argc,char ***argv)
   GridCmdOptionCSL(defaultLog,logstreams);
   GridLogConfigure(logstreams);
 
+  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
+    Grid_quiesce_nodes();
+  }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
     arg = GridCmdOptionPayload(*argv,*argv+*argc,"--log");
@@ -295,67 +248,94 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<"  --help : this message"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --mpi n.n.n.n   : default MPI decomposition"<<std::endl;
     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<"  --log list      : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+    std::cout<<GridLogMessage<<"  --log list      : comma separated list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
     std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
     std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
     std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;
     std::cout<<GridLogMessage<<"  --notimestamp   : suppress millisecond resolution stamps"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Performance:"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-isend   : Asynchronous MPI calls; several dirs at a time "<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-sendrecv: Synchronous MPI calls; one dir at a time "<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;
-    std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
     std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     exit(EXIT_SUCCESS);
   }
 
+  ////////////////////////////////////
+  // Banner
+  ////////////////////////////////////
+
+  std::string COL_RED    = GridLogColours.colour["RED"];
+  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
+  std::string COL_BLACK  = GridLogColours.colour["BLACK"];
+  std::string COL_GREEN  = GridLogColours.colour["GREEN"];
+  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
+  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
+  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
+
+  std::cout <<std::endl;
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_ |  |  |  "<<             "|  |  | "<<COL_PURPLE<<" |  |  |"<<                "  |  |  | _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_          "<<             "        "<<COL_PURPLE<<"        "<<                "          _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_RED  << "__|_  "<<COL_GREEN<<"G       "<<COL_RED<<" R   R  "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_PURPLE<<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G  GG   "<<COL_RED<<" RRRR   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D    D"<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<"G   G   "<<COL_RED<<" R  R   "<<COL_BLUE  <<"  I     "<<COL_PURPLE<<"D   D "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_  "<<COL_GREEN<<" GGGG   "<<COL_RED<<" R   R  "<<COL_BLUE  <<" III    "<<COL_PURPLE<<"DDDD  "<<COL_GREEN <<"    _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|_          "<<             "        "<<COL_GREEN <<"        "<<                "          _|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_BLUE << "__|__|__|__|__"<<             "|__|__|_"<<COL_GREEN <<"_|__|__|"<<                "__|__|__|__|__"<<std::endl;
+  std::cout <<COL_BLUE << "  |  |  |  |  "<<             "|  |  | "<<COL_GREEN <<" |  |  |"<<                "  |  |  |  |  "<<std::endl;
+  std::cout << std::endl;
+  std::cout << std::endl;
+  std::cout <<COL_YELLOW<< std::endl;
+  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
+  std::cout << std::endl;
+  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
+  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
+  std::cout << "the Free Software Foundation; either version 2 of the License, or"<<std::endl;
+  std::cout << "(at your option) any later version."<<std::endl;
+  std::cout << std::endl;
+  std::cout << "This program is distributed in the hope that it will be useful,"<<std::endl;
+  std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
+  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
+  std::cout << "GNU General Public License for more details."<<std::endl;
+  std::cout << COL_BACKGROUND <<std::endl;
+  std::cout << std::endl;
+
   ////////////////////////////////////
   // Debug and performance options
   ////////////////////////////////////
 
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
+    Grid_debug_handler_init();
+  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
-    QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
-    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
-  } else {
-    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
-    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
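
Putting the documented options together, a typical launch line might read as follows (the binary name is hypothetical):

    ./my_grid_app --mpi 2.1.1.1 --threads 4 --dslash-asm --log Error,Warning,Message
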
@@ -393,25 +373,23 @@ void Grid_finalize(void)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
-#if defined (GRID_COMMS_SHMEM)
-  shmem_finalize();
-#endif
 }
 
 void * Grid_backtrace_buffer[_NBACKTRACE];
 
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr,"         code %d\n",si->si_code);
+  printf("Caught signal %d\n",si->si_signo);
+  printf("  mem address %llx\n",(unsigned long long)si->si_addr);
+  printf("         code %d\n",si->si_code);
+
   // Linux/Posix
 #ifdef __linux__
   // And x86 64bit
 #ifdef __x86_64__
   ucontext_t * uc= (ucontext_t *)ptr;
   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
-  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
+  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 #define REG(A)  printf("  %s %lx\n",#A,sc-> A);
   REG(rdi);
   REG(rsi);
@@ -434,11 +412,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
   REG(r15);
 #endif
 #endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+  BACKTRACE();
   exit(0);
   return;
 };
@@ -451,11 +425,9 @@ void Grid_debug_handler_init(void)
   sa.sa_flags    = SA_SIGINFO;
   sigaction(SIGSEGV,&sa,NULL);
   sigaction(SIGTRAP,&sa,NULL);
-  sigaction(SIGBUS,&sa,NULL);
 
   feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
 
   sigaction(SIGFPE,&sa,NULL);
-  sigaction(SIGKILL,&sa,NULL);
 }
 }
@@ -29,10 +29,9 @@ See the full license in the file "LICENSE" in the top level distribution
 directory
 *************************************************************************************/
 /*  END LEGAL */
-#include <Grid/GridCore.h>
+#include <Grid/Grid.h>
 
 #include <cxxabi.h>
-#include <memory>
 
 namespace Grid {
 
BIN  lib/Old/Endeavour.tgz  Normal file (binary file not shown)

154  lib/Old/Tensor_peek.h  Normal file
@@ -0,0 +1,154 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/Old/Tensor_peek.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#ifndef GRID_MATH_PEEK_H
#define GRID_MATH_PEEK_H
namespace Grid {

//////////////////////////////////////////////////////////////////////////////
// Peek on a specific index; returns a scalar in that index, tensor inherits rest
//////////////////////////////////////////////////////////////////////////////
// If we hit the right index, return scalar with no further recursion

//template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;}
//template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
//template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
//template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}
#if 0
// Scalar peek, no indices
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  auto peekIndex(const iScalar<vtype> &arg) ->  iScalar<vtype> 
{
  return arg;
}
// Vector peek, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  auto peekIndex(const iVector<vtype,N> &arg,int i) -> iScalar<vtype> // Index matches
{
  iScalar<vtype> ret;                              // return scalar
  ret._internal = arg._internal[i];
  return ret;
}
// Matrix peek, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->  iScalar<vtype>
{
  iScalar<vtype> ret;                              // return scalar
  ret._internal = arg._internal[i][j];
  return ret;
}

/////////////
// No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iScalar<vtype> &arg) -> iScalar<decltype(peekIndex<Level>(arg._internal))>
{
  iScalar<decltype(peekIndex<Level>(arg._internal))> ret;
  ret._internal= peekIndex<Level>(arg._internal);
  return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iScalar<vtype> &arg,int i) ->  iScalar<decltype(peekIndex<Level>(arg._internal,i))> 
{
  iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret;
  ret._internal=peekIndex<Level>(arg._internal,i);
  return ret;
}
template<int Level,class vtype, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iScalar<vtype> &arg,int i,int j) ->  iScalar<decltype(peekIndex<Level>(arg._internal,i,j))>
{
  iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret;
  ret._internal=peekIndex<Level>(arg._internal,i,j);
  return ret;
}
// vector
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
auto peekIndex(const iVector<vtype,N> &arg) ->   iVector<decltype(peekIndex<Level>(arg._internal[0])),N>
{
  iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii]=peekIndex<Level>(arg._internal[ii]);
  }
  return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iVector<vtype,N> &arg,int i) ->  iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N>
{
  iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i);
  }
  return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iVector<vtype,N> &arg,int i,int j) ->  iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> 
{
  iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii]=peekIndex<Level>(arg._internal[ii],i,j);
  }
  return ret;
}

// matrix
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
auto peekIndex(const iMatrix<vtype,N> &arg) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> 
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj]);// Could avoid this because peeking a scalar is dumb
  }}
  return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iMatrix<vtype,N> &arg,int i) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N>
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i)),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i);
  }}
  return ret;
}
template<int Level,class vtype,int N, typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->   iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N>
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj]=peekIndex<Level>(arg._internal[ii][jj],i,j);
  }}
  return ret;
}
#endif


}
#endif
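
A hedged sketch of the intended call pattern for the routines above (they are compiled out under #if 0 here; this assumes the live peekIndex implementation elsewhere in lib/tensors keeps the same signatures):

  // Level selects which nesting level of the tensor the integer indices
  // address; every other level is passed through untouched.
  iMatrix<ComplexD,3> m;
  const int L = iScalar<ComplexD>::TensorLevel;  // the level the matrix indices live at
  iScalar<ComplexD> s = peekIndex<L>(m,1,2);     // element (1,2), wrapped as a scalar tensor
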
							
								
								
									
127  lib/Old/Tensor_poke.h  Normal file
@@ -0,0 +1,127 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./lib/Old/Tensor_poke.h

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#ifndef GRID_MATH_POKE_H
#define GRID_MATH_POKE_H
namespace Grid {

//////////////////////////////////////////////////////////////////////////////
// Poke a specific index; 
//////////////////////////////////////////////////////////////////////////////
#if 0
// Scalar poke
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  void pokeIndex(iScalar<vtype> &ret, const iScalar<vtype> &arg)
{
  ret._internal = arg._internal;
}
// Vector poke, one index
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i)
{
  ret._internal[i] = arg._internal;
}
//Matrix poke, two indices
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel == Level >::type * =nullptr> inline 
  void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
{
  ret._internal[i][j] = arg._internal;
}

/////////////
// No match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal))>  &arg)
{
  pokeIndex<Level>(ret._internal,arg._internal);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0))> &arg, int i)
{
  pokeIndex<Level>(ret._internal,arg._internal,i);
}
template<int Level,class vtype,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> &arg,int i,int j)
{
  pokeIndex<Level>(ret._internal,arg._internal,i,j);
}

// Vector
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iVector<vtype,N> &ret, iVector<decltype(peekIndex<Level>(ret._internal)),N>  &arg)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii]);
  }
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i);
  }
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg,int i,int j)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i,j);
  }
}

// Matrix
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal)),N> &arg)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj]);
  }}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0)),N> &arg,int i)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i);
  }}
}
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
  void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(peekIndex<Level>(ret._internal,0,0)),N> &arg, int i,int j)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
  }}
}
#endif

}
#endif
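
And the matching write path, under the same assumptions as the peek sketch above (signatures taken from this file, which is itself disabled by #if 0):

  iMatrix<ComplexD,3> m;
  const int L = iScalar<ComplexD>::TensorLevel;
  iScalar<ComplexD> s = peekIndex<L>(m,1,2);  // read one element out
  pokeIndex<L>(m,s,2,1);                      // deposit it at the transposed slot (2,1)
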
@@ -26,8 +26,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
     *************************************************************************************/
     /*  END LEGAL */
 
-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>
 
 namespace Grid {
 
@@ -172,7 +172,7 @@ public:
     const char * name = PerformanceCounterConfigs[PCT].name;
     fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
     if (fd == -1) {
-      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
       perror("Error is");
     }
     int norm = PerformanceCounterConfigs[PCT].normalisation;
@@ -181,7 +181,7 @@ public:
     name = PerformanceCounterConfigs[norm].name;
     cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
     if (cyclefd == -1) {
-      fprintf(stderr, "Error opening leader %llx for event %s\n",(long long) pe.config,name);
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
       perror("Error is");
     }
 #endif
@@ -172,8 +172,8 @@ namespace Grid {
 
 };
 
-#include <Grid/simd/Grid_vector_types.h>
-#include <Grid/simd/Grid_vector_unops.h>
+#include "simd/Grid_vector_types.h"
+#include "simd/Grid_vector_unops.h"
 
 namespace Grid {
   // Default precision
@@ -1,9 +1,11 @@
-#include <Grid/GridCore.h>
-#include <Grid/perfmon/PerfCount.h>
-#include <Grid/perfmon/Stat.h>
+#include <Grid/Grid.h>
+#include <Grid/PerfCount.h>
+#include <Grid/Stat.h>
+
 
 namespace Grid { 
 
+
 bool PmuStat::pmu_initialized=false;
 
 
@@ -1,4 +1,4 @@
-/*************************************************************************************
+   /*************************************************************************************
 
     Grid physics library, www.github.com/paboyle/Grid 
 
@@ -25,11 +25,13 @@
     See the full license in the file "LICENSE" in the top level distribution directory
     *************************************************************************************/
     /*  END LEGAL */
-#ifndef GRID_STENCIL_H
-#define GRID_STENCIL_H
+ #ifndef GRID_STENCIL_H
+ #define GRID_STENCIL_H
+
+ #include <thread>
+
+ #include <Grid/stencil/Lebesgue.h>   // subdir aggregate
 
-#include <Grid/stencil/Lebesgue.h>   // subdir aggregate
-#define NEW_XYZT_GATHER
 //////////////////////////////////////////////////////////////////////////////////////////
 // Must not lose sight that goal is to be able to construct really efficient
 // gather to a point stencil code. CSHIFT is not the best way, so need
@@ -68,48 +70,50 @@
 
 namespace Grid {
 
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
-					int off,std::vector<std::pair<int,int> > & table);
-
-template<class vobj,class cobj,class compressor> 
-void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline));
-
-template<class vobj,class cobj,class compressor> 
-void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
-{
-  int num=table.size();
-  parallel_for(int i=0;i<num;i++){
-    vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
-  }
-}
-
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-				 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
-
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-				 std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-				 compressor &compress,int type)
-{
-  assert( (table.size()&0x1)==0);
-  int num=table.size()/2;
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  parallel_for(int j=0;j<num;j++){
-    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
-    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
-    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
-    cobj temp3;
-    cobj temp4;
-    exchange(temp3,temp4,temp1,temp2,type);
-    vstream(pointers[0][j],temp3);
-    vstream(pointers[1][j],temp4);
+inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+					       int off,std::vector<std::pair<int,int> > & table)
+{
+  table.resize(0);
+  int rd = grid->_rdimensions[dimension];
+
+  if ( !grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+  int so= plane*grid->_ostride[dimension]; // base offset for start of plane 
+  int e1=grid->_slice_nblock[dimension];
+  int e2=grid->_slice_block[dimension];
+
+  int stride=grid->_slice_stride[dimension];
+  if ( cbmask == 0x3 ) { 
+    table.resize(e1*e2);
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o  = n*stride;
+	int bo = n*e2;
+	table[bo+b]=std::pair<int,int>(bo+b,o+b);
+      }
+    }
+  } else { 
+     int bo=0;
+     table.resize(e1*e2/2);
+     for(int n=0;n<e1;n++){
+       for(int b=0;b<e2;b++){
+	 int o  = n*stride;
+	 int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
+	 if ( ocb &cbmask ) {
+	   table[bo]=std::pair<int,int>(bo,o+b); bo++;
+	 }
+       }
+     }
+  }
+}
+
+template<class vobj,class cobj,class compressor> void 
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
+{
+PARALLEL_FOR_LOOP     
+     for(int i=0;i<table.size();i++){
+       vstream(buffer[off+table[i].first],compress(rhs._odata[so+table[i].second]));
      }
 }
 
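
The table-driven gather on both sides of this hunk reduces to one pattern; the following self-contained sketch (plain std:: types, no Grid machinery, written for illustration only) shows why it is fast: the (destination, source) index pairs are computed once per plane, so the hot loop is a bare indexed copy that parallelises and vectorises trivially.

#include <vector>
#include <utility>

// Copy the lattice sites listed in 'table' into a contiguous comms buffer.
void gather_with_table(const std::vector<double> &lattice,
                       std::vector<double>       &buffer,
                       const std::vector<std::pair<int,int> > &table,
                       int off, int so)
{
  for (std::size_t i = 0; i < table.size(); i++)
    buffer[off + table[i].first] = lattice[so + table[i].second];
}
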
@@ -121,8 +125,6 @@ struct StencilEntry {
   uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
 };
 
-//extern int dump;
-
 template<class vobj,class cobj>
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:
@@ -157,6 +159,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
+    comms_bytes+=2.0*bytes;
     Packets.push_back(p);
   }
 
@@ -165,45 +168,36 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
-      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
+	_grid->StencilSendToRecvFromBegin(reqs[i],
+					  Packets[i].send_buf,
+					  Packets[i].to_rank,
+					  Packets[i].recv_buf,
+					  Packets[i].from_rank,
+					  Packets[i].bytes);
+	/*
+      }else{
+	_grid->SendToRecvFromBegin(reqs[i],
 				   Packets[i].send_buf,
 				   Packets[i].to_rank,
 				   Packets[i].recv_buf,
 				   Packets[i].from_rank,
 				   Packets[i].bytes);
+      }
+	*/
     }
     commtime+=usecond();
   }
   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
     commtime-=usecond();
+
     for(int i=0;i<Packets.size();i++){
+      //      if( ShmDirectCopy ) 
 	_grid->StencilSendToRecvFromComplete(reqs[i]);
+	//      else 
+	//	_grid->SendToRecvFromComplete(reqs[i]);
     }
-    _grid->StencilBarrier();// Synch shared memory on a single nodes
     commtime+=usecond();
-    /*
-    int dump=1;
-    if(dump){
-      for(int i=0;i<Packets.size();i++){
-	cobj * ptr  = (cobj *) Packets[i].recv_buf;
-	uint64_t num=Packets[i].bytes/sizeof(cobj);
-	  std::cout << " CommunicateComplete " << i<< " / " << Packets.size()<< " num " << num <<std::endl;
-	  std::stringstream ss;
-	  ss<<"recvbuf";
-	  for(int d=0;d<_grid->_ndimension;d++){
-	    ss<<"."<<_grid->_processor_coor[d];
-	  }
-	  ss<<"_mu_"<<i;
-	  std::string fname(ss.str());
-	  std::ofstream fout(fname);
-	  for(int k=0;k<num;k++) { 
-	    fout << i<<" "<<k<<" "<<ptr[k]<<std::endl;
-	  }
-      }
-    }
-    dump =0;
-*/
   }
 
   ///////////////////////////////////////////
@@ -212,18 +206,14 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   struct Merge {
     cobj * mpointer;
     std::vector<scalar_object *> rpointers;
-    std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer packet_id;
-    Integer exchange;
-    Integer type;
   };
   
   std::vector<Merge> Mergers;
 
   void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
     Merge m;
-    m.exchange = 0;
     m.mpointer = merge_p;
     m.rpointers= rpointers;
     m.buffer_size = buffer_size;
@@ -231,48 +221,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     Mergers.push_back(m);
   }
 
-  void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
-    Merge m;
-    m.exchange = 1;
-    m.type     = type;
-    m.mpointer = merge_p;
-    m.vpointers= rpointers;
-    m.buffer_size = buffer_size;
-    m.packet_id   = packet_id;
-    Mergers.push_back(m);
-  }
-
   void CommsMerge(void ) { 
 
     for(int i=0;i<Mergers.size();i++){	
+      
       mergetime-=usecond();
-
-      //      std::cout << "Merge " <<i << std::endl;
-      //      std::stringstream ss;
-      //      ss<<"mergebuf";
-      //      for(int d=0;d<_grid->_ndimension;d++){
-      //	ss<<"."<<_grid->_processor_coor[d];
-      //      }
-      //      ss<<"_m_"<<i;
-      //      std::string fname(ss.str());
-      //      std::ofstream fout(fname);
-
-      if ( Mergers[i].exchange == 0 ) { 
-	parallel_for(int o=0;o<Mergers[i].buffer_size;o++){
+PARALLEL_FOR_LOOP
+      for(int o=0;o<Mergers[i].buffer_size;o++){
 	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
-	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
-	}
-      } else { 
-	parallel_for(int o=0;o<Mergers[i].buffer_size/2;o++){
-	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
-		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
-	  //	  cobj temp1,temp2;
-	  //	  exchange(temp1,temp2,Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
-	  //	  vstream(Mergers[i].mpointer[2*o],temp1);
-	  //	  vstream(Mergers[i].mpointer[2*o+1],temp2);
-	}
-      }
       }
       mergetime+=usecond();
+
     }
   }
 
@@ -336,8 +295,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   // depending on comms target
   cobj* u_recv_buf_p;
   cobj* u_send_buf_p;
-  std::vector<cobj *> new_simd_send_buf;
-  std::vector<cobj *> new_simd_recv_buf;
   std::vector<scalar_object *> u_simd_send_buf;
   std::vector<scalar_object *> u_simd_recv_buf;
 
@@ -349,8 +306,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   /////////////////////////////////////////
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
-#define TIMING_HACK
-#ifdef TIMING_HACK
+ #define TIMING_HACK
+ #ifdef TIMING_HACK
   double jointime;
   double gathertime;
   double commtime;
|   void Report(void) { |   void Report(void) { | ||||||
| #define PRINTIT(A)	\ | #define PRINTIT(A)	\ | ||||||
|  std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; |  std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl; | ||||||
| 
 |  | ||||||
|     RealD NP = _grid->_Nprocessors; |  | ||||||
|     RealD NN = _grid->NodeCount(); |  | ||||||
| 
 |  | ||||||
|     _grid->GlobalSum(commtime);    commtime/=NP; |  | ||||||
|     if ( calls > 0. ) { |     if ( calls > 0. ) { | ||||||
|       std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl; |       std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl; | ||||||
|       PRINTIT(halogtime); |       PRINTIT(halogtime); | ||||||
| @@ -398,8 +350,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|       if(comms_bytes>1.0){ |       if(comms_bytes>1.0){ | ||||||
| 	PRINTIT(comms_bytes); | 	PRINTIT(comms_bytes); | ||||||
| 	PRINTIT(commtime); | 	PRINTIT(commtime); | ||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl; | 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl; | ||||||
| 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl; |  | ||||||
|       } |       } | ||||||
|       PRINTIT(jointime); |       PRINTIT(jointime); | ||||||
|       PRINTIT(spintime); |       PRINTIT(spintime); | ||||||
| @@ -443,9 +394,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|        |        | ||||||
|       _checkerboard = checkerboard; |       _checkerboard = checkerboard; | ||||||
|        |        | ||||||
|       //////////////////////////
 |  | ||||||
|       // the permute type
 |       // the permute type
 | ||||||
|       //////////////////////////
 |  | ||||||
|       int simd_layout     = _grid->_simd_layout[dimension]; |       int simd_layout     = _grid->_simd_layout[dimension]; | ||||||
|       int comm_dim        = _grid->_processors[dimension] >1 ; |       int comm_dim        = _grid->_processors[dimension] >1 ; | ||||||
|       int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim); |       int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim); | ||||||
| @@ -455,11 +404,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|        |        | ||||||
|       int sshift[2]; |       int sshift[2]; | ||||||
|        |        | ||||||
|       //////////////////////////
 |  | ||||||
|       // Underlying approach. For each local site build
 |       // Underlying approach. For each local site build
 | ||||||
|       // up a table containing the npoint "neighbours" and whether they 
 |       // up a table containing the npoint "neighbours" and whether they 
 | ||||||
|       // live in lattice or a comms buffer.
 |       // live in lattice or a comms buffer.
 | ||||||
|       //////////////////////////
 |  | ||||||
|       if ( !comm_dim ) { |       if ( !comm_dim ) { | ||||||
| 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | ||||||
| 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | ||||||
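Editorial note: a hedged reading of the "underlying approach" comment above is that, for each stencil point, the constructor precomputes per site where the neighbour value will be found once communications complete, so the application kernel only ever does a table lookup. A toy entry type capturing the information content (illustrative; Grid's actual StencilEntry packs these fields differently):

    // Illustrative neighbour-table entry, one per (stencil point, site):
    struct ToyStencilEntry {
      uint64_t offset;        // index into the local lattice, or into the
                              // unified comms buffer when the neighbour is remote
      bool     is_local;      // neighbour lives in this node's lattice
      bool     permute;       // SIMD lanes must be permuted on load
      int      permute_type;  // which lane permutation applies
    };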
| @@ -470,11 +417,11 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	  Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 	  Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | ||||||
| 	  Local(point,dimension,shift,0x2);// both with block stride loop iteration
 | 	  Local(point,dimension,shift,0x2);// both with block stride loop iteration
 | ||||||
| 	} | 	} | ||||||
|       } else {  |       } else { // All permute extract done in comms phase prior to Stencil application
 | ||||||
| 	// All permute extract done in comms phase prior to Stencil application
 |  | ||||||
| 	//        So tables are the same whether comm_dim or splice_dim
 | 	//        So tables are the same whether comm_dim or splice_dim
 | ||||||
| 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | 	sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even); | ||||||
| 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | 	sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); | ||||||
|  | 	 | ||||||
| 	if ( sshift[0] == sshift[1] ) { | 	if ( sshift[0] == sshift[1] ) { | ||||||
| 	  Comms(point,dimension,shift,0x3); | 	  Comms(point,dimension,shift,0x3); | ||||||
| 	} else { | 	} else { | ||||||
| @@ -493,21 +440,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 
 | 
 | ||||||
|     u_simd_send_buf.resize(Nsimd); |     u_simd_send_buf.resize(Nsimd); | ||||||
|     u_simd_recv_buf.resize(Nsimd); |     u_simd_recv_buf.resize(Nsimd); | ||||||
|     new_simd_send_buf.resize(Nsimd); | 
 | ||||||
|     new_simd_recv_buf.resize(Nsimd); |  | ||||||
|     u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |     u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
|     u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |     u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
| #ifdef NEW_XYZT_GATHER |  | ||||||
|     for(int l=0;l<2;l++){ |  | ||||||
|       new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |  | ||||||
|       new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     for(int l=0;l<Nsimd;l++){ |     for(int l=0;l<Nsimd;l++){ | ||||||
|       u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); |       u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); | ||||||
|       u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); |       u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object)); | ||||||
|     } |     } | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
|     PrecomputeByteOffsets(); |     PrecomputeByteOffsets(); | ||||||
|   } |   } | ||||||
| @@ -574,11 +513,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     assert(shift>=0); |     assert(shift>=0); | ||||||
|     assert(shift<fd); |     assert(shift<fd); | ||||||
|      |      | ||||||
|     // done in reduced dims, so SIMD factored
 |     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored
 | ||||||
|     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];  |  | ||||||
|      |      | ||||||
|     _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 |     _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 | ||||||
| 
 |  | ||||||
|     // send to one or more remote nodes.
 |     // send to one or more remote nodes.
 | ||||||
|      |      | ||||||
|     int cb= (cbmask==0x2)? Odd : Even; |     int cb= (cbmask==0x2)? Odd : Even; | ||||||
| @@ -741,10 +678,13 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     calls++; |     calls++; | ||||||
|     Mergers.resize(0); |     Mergers.resize(0); | ||||||
|     Packets.resize(0); |     Packets.resize(0); | ||||||
|  |     _grid->StencilBarrier(); | ||||||
|     HaloGather(source,compress); |     HaloGather(source,compress); | ||||||
|     this->CommunicateBegin(reqs); |     this->CommunicateBegin(reqs); | ||||||
|  |     _grid->StencilBarrier(); | ||||||
|     this->CommunicateComplete(reqs); |     this->CommunicateComplete(reqs); | ||||||
|     CommsMerge();  |     _grid->StencilBarrier(); | ||||||
|  |     CommsMerge(); // spins
 | ||||||
|   } |   } | ||||||
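Editorial note: this change brackets every phase of HaloExchange with an intra-node barrier (the synchronisation previously lived inside HaloGather and is removed further down). The resulting sequence, with the intent of each fence spelled out in editorial comments, is:

    _grid->StencilBarrier();          // on-node peers must be done reading our
                                      // shared buffers from the previous exchange
    HaloGather(source, compress);     // fill send / shm-visible face buffers
    this->CommunicateBegin(reqs);     // post MPI transfers for off-node faces
    _grid->StencilBarrier();          // peers' shm send buffers now safe to read
    this->CommunicateComplete(reqs);  // wait on MPI
    _grid->StencilBarrier();          // every face has landed on every local rank
    CommsMerge();                     // spins; reassemble SIMD lanes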
|    |    | ||||||
|   template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx) |   template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx) | ||||||
| @@ -775,13 +715,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|       if ( sshift[0] == sshift[1] ) { |       if ( sshift[0] == sshift[1] ) { | ||||||
| 	if (splice_dim) { | 	if (splice_dim) { | ||||||
| 	  splicetime-=usecond(); | 	  splicetime-=usecond(); | ||||||
| 	  //	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 |  | ||||||
| 	  //	  std::cout << "GatherSimdNew"<<std::endl;
 |  | ||||||
| #ifdef NEW_XYZT_GATHER |  | ||||||
| 	  GatherSimdNew(source,dimension,shift,0x3,compress,face_idx); |  | ||||||
| #else  |  | ||||||
| 	  GatherSimd(source,dimension,shift,0x3,compress,face_idx); | 	  GatherSimd(source,dimension,shift,0x3,compress,face_idx); | ||||||
| #endif |  | ||||||
| 	  splicetime+=usecond(); | 	  splicetime+=usecond(); | ||||||
| 	} else {  | 	} else {  | ||||||
| 	  nosplicetime-=usecond(); | 	  nosplicetime-=usecond(); | ||||||
| @@ -791,14 +725,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|       } else { |       } else { | ||||||
| 	if(splice_dim){ | 	if(splice_dim){ | ||||||
| 	  splicetime-=usecond(); | 	  splicetime-=usecond(); | ||||||
| 	  //	  std::cout << "GatherSimdNew2calls"<<std::endl;
 |  | ||||||
| #ifdef NEW_XYZT_GATHER |  | ||||||
| 	  GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 |  | ||||||
| 	  GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 |  | ||||||
| #else  |  | ||||||
| 	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | 	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | ||||||
| 	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | 	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | ||||||
| #endif |  | ||||||
| 	  splicetime+=usecond(); | 	  splicetime+=usecond(); | ||||||
| 	} else { | 	} else { | ||||||
| 	  nosplicetime-=usecond(); | 	  nosplicetime-=usecond(); | ||||||
| @@ -813,8 +741,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes
 |  | ||||||
| 
 |  | ||||||
|     // conformable(source._grid,_grid);
 |     // conformable(source._grid,_grid);
 | ||||||
|     assert(source._grid==_grid); |     assert(source._grid==_grid); | ||||||
|     halogtime-=usecond(); |     halogtime-=usecond(); | ||||||
| @@ -875,12 +801,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	if ( !face_table_computed ) { | 	if ( !face_table_computed ) { | ||||||
| 	  t_table-=usecond(); | 	  t_table-=usecond(); | ||||||
| 	  face_table.resize(face_idx+1); | 	  face_table.resize(face_idx+1); | ||||||
| 	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); | 	  Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, | ||||||
| 	  //	  std::cout << " face table size "<<face_idx <<" " <<  face_table[face_idx].size() <<" computed buffer size "<< words <<
 | 					     face_table[face_idx]); | ||||||
| 	  //		    " bytes = " << bytes <<std::endl;
 |  | ||||||
| 	  t_table+=usecond(); | 	  t_table+=usecond(); | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
|  | 	 | ||||||
| 	int rank           = _grid->_processor; | 	int rank           = _grid->_processor; | ||||||
| 	int recv_from_rank; | 	int recv_from_rank; | ||||||
| 	int xmit_to_rank; | 	int xmit_to_rank; | ||||||
| @@ -892,14 +818,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	/////////////////////////////////////////////////////////
 | 	/////////////////////////////////////////////////////////
 | ||||||
| 	// try the direct copy if possible
 | 	// try the direct copy if possible
 | ||||||
| 	/////////////////////////////////////////////////////////
 | 	/////////////////////////////////////////////////////////
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 	cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); | 	cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); | ||||||
| 	if ( send_buf==NULL ) {  | 	if ( send_buf==NULL ) {  | ||||||
| 	  send_buf = u_send_buf_p; | 	  send_buf = u_send_buf_p; | ||||||
| 	} | 	} | ||||||
| 	 | 	//	std::cout << " send_bufs  "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl;
 | ||||||
| 	t_data-=usecond(); | 	t_data-=usecond(); | ||||||
|  | 	assert(u_send_buf_p!=NULL); | ||||||
| 	assert(send_buf!=NULL); | 	assert(send_buf!=NULL); | ||||||
| 	Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++; | 	Gather_plane_simple_table         (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++; | ||||||
| 	t_data+=usecond(); | 	t_data+=usecond(); | ||||||
| 	 | 	 | ||||||
| 	AddPacket((void *)&send_buf[u_comm_offset], | 	AddPacket((void *)&send_buf[u_comm_offset], | ||||||
| @@ -947,8 +876,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     std::vector<scalar_object *> rpointers(Nsimd); |     std::vector<scalar_object *> rpointers(Nsimd); | ||||||
|     std::vector<scalar_object *> spointers(Nsimd); |     std::vector<scalar_object *> spointers(Nsimd); | ||||||
|      |      | ||||||
|     //    std::cout << "GatherSimd " << dimension << " shift "<<shift<<std::endl;
 |  | ||||||
| 
 |  | ||||||
|     ///////////////////////////////////////////
 |     ///////////////////////////////////////////
 | ||||||
|     // Work out what to send where
 |     // Work out what to send where
 | ||||||
|     ///////////////////////////////////////////
 |     ///////////////////////////////////////////
 | ||||||
| @@ -975,10 +902,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 
 | 
 | ||||||
| 	for(int i=0;i<Nsimd;i++){ | 	for(int i=0;i<Nsimd;i++){ | ||||||
| 	   | 	   | ||||||
| 	  // FIXME -  This logic is hard coded to simd_layout==2 and not allowing >2
 | 	  // FIXME 
 | ||||||
| 	  //	  for(int w=0;w<buffer_size;w++){
 | 	  // This logic is hard coded to simd_layout ==2 and not allowing >2
 | ||||||
| 	  //	    std::cout << "GatherSimd<"<<Nsimd<<"> : lane " << i <<" elem "<<w<<" "<< u_simd_send_buf[i ][u_comm_offset+w]<<std::endl;
 | 	  //		std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
 | ||||||
| 	  //	  }
 | 	   | ||||||
| 	  int inner_bit = (Nsimd>>(permute_type+1)); | 	  int inner_bit = (Nsimd>>(permute_type+1)); | ||||||
| 	  int ic= (i&inner_bit)? 1:0; | 	  int ic= (i&inner_bit)? 1:0; | ||||||
| 	   | 	   | ||||||
| @@ -1003,9 +930,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
| 	     | 	     | ||||||
| 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|   |   | ||||||
| 	    // shm == receive pointer         if offnode
 |  | ||||||
| 	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 |  | ||||||
| 	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); | 	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); | ||||||
|  | 	    //	    if ((ShmDirectCopy==0)||(shm==NULL)) { 
 | ||||||
| 	    if (shm==NULL) {  | 	    if (shm==NULL) {  | ||||||
| 	      shm = rp; | 	      shm = rp; | ||||||
| 	    }  | 	    }  | ||||||
| @@ -1030,133 +956,6 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
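Editorial note: the on-node fast path in GatherSimd rests on a pointer-translation idiom used throughout the stencil code: ShmBufferTranslate returns this rank's view of a shared-memory peer's buffer, or NULL when the peer is off-node. Condensed, the idiom is (restating the code above, not new API):

    // shm == my view of the peer's send buffer, if the peer shares my node;
    // shm == NULL otherwise, so we fall back to our own MPI receive buffer.
    scalar_object *shm = (scalar_object *)_grid->ShmBufferTranslate(recv_from_rank, sp);
    if (shm == NULL) shm = rp;  // off-node: AddPacket will deliver into rp
    rpointers[i] = shm;         // the merge stage reads from whichever applies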
|    |    | ||||||
| 
 |  | ||||||
|   template<class compressor> |  | ||||||
|   void  GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) |  | ||||||
|   { |  | ||||||
|     const int Nsimd = _grid->Nsimd(); |  | ||||||
| 
 |  | ||||||
|     const int maxl =2;// max layout in a direction
 |  | ||||||
|     int fd = _grid->_fdimensions[dimension]; |  | ||||||
|     int rd = _grid->_rdimensions[dimension]; |  | ||||||
|     int ld = _grid->_ldimensions[dimension]; |  | ||||||
|     int pd              = _grid->_processors[dimension]; |  | ||||||
|     int simd_layout     = _grid->_simd_layout[dimension]; |  | ||||||
|     int comm_dim        = _grid->_processors[dimension] >1 ; |  | ||||||
|     assert(comm_dim==1); |  | ||||||
|     // This will not work with a rotate dim
 |  | ||||||
|     assert(simd_layout==maxl); |  | ||||||
|     assert(shift>=0); |  | ||||||
|     assert(shift<fd); |  | ||||||
| 
 |  | ||||||
|     int permute_type=_grid->PermuteType(dimension); |  | ||||||
|     //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
 |  | ||||||
| 
 |  | ||||||
|     ///////////////////////////////////////////////
 |  | ||||||
|     // Simd direction uses an extract/merge pair
 |  | ||||||
|     ///////////////////////////////////////////////
 |  | ||||||
|     int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; |  | ||||||
|     int words = sizeof(cobj)/sizeof(vector_type); |  | ||||||
|      |  | ||||||
|     assert(cbmask==0x3); // FIXME: there may be a latent bug if not true
 |  | ||||||
|      |  | ||||||
|     int reduced_buffer_size = buffer_size; |  | ||||||
|     if (cbmask != 0x3) reduced_buffer_size=buffer_size>>1; |  | ||||||
| 
 |  | ||||||
|     int bytes = (reduced_buffer_size*sizeof(cobj))/simd_layout; |  | ||||||
|     assert(bytes*simd_layout == reduced_buffer_size*sizeof(cobj)); |  | ||||||
| 
 |  | ||||||
|     std::vector<cobj *> rpointers(maxl); |  | ||||||
|     std::vector<cobj *> spointers(maxl); |  | ||||||
| 
 |  | ||||||
|     ///////////////////////////////////////////
 |  | ||||||
|     // Work out what to send where
 |  | ||||||
|     ///////////////////////////////////////////
 |  | ||||||
|      |  | ||||||
|     int cb    = (cbmask==0x2)? Odd : Even; |  | ||||||
|     int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); |  | ||||||
|      |  | ||||||
|     // loop over outer coord planes orthog to dim
 |  | ||||||
|     for(int x=0;x<rd;x++){        |  | ||||||
|        |  | ||||||
|       int any_offnode = ( ((x+sshift)%fd) >= rd ); |  | ||||||
| 
 |  | ||||||
|       if ( any_offnode ) { |  | ||||||
| 
 |  | ||||||
| 	 |  | ||||||
| 	for(int i=0;i<maxl;i++){        |  | ||||||
| 	  spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset]; |  | ||||||
| 	} |  | ||||||
| 	 |  | ||||||
| 	int sx   = (x+sshift)%rd; |  | ||||||
| 
 |  | ||||||
| 	//	if ( cbmask==0x3 ) { 
 |  | ||||||
| 	//	  std::vector<std::pair<int,int> > table;
 |  | ||||||
| 	t_table-=usecond(); |  | ||||||
| 	if ( !face_table_computed ) { |  | ||||||
| 	  face_table.resize(face_idx+1); |  | ||||||
| 	  Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); |  | ||||||
| 	  //	  std::cout << " face table size "<<face_idx <<" " <<  face_table[face_idx].size() <<" computed buffer size "<< reduced_buffer_size <<
 |  | ||||||
| 	  //		    " bytes = "<<bytes <<std::endl;
 |  | ||||||
| 	} |  | ||||||
| 	t_table+=usecond(); |  | ||||||
| 	gathermtime-=usecond(); |  | ||||||
| 	Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type);  face_idx++; |  | ||||||
| 	gathermtime+=usecond(); |  | ||||||
|        |  | ||||||
| 	//spointers[0] -- low
 |  | ||||||
| 	//spointers[1] -- high
 |  | ||||||
| 
 |  | ||||||
| 	for(int i=0;i<maxl;i++){ |  | ||||||
| 
 |  | ||||||
| 	  int my_coor  = rd*i + x;            // coordinate along dim: outer plane x, SIMD lane i
 |  | ||||||
| 	  int nbr_coor = my_coor+sshift;      // neighbour coordinate after the shift
 |  | ||||||
| 
 |  | ||||||
| 	  int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
 |  | ||||||
| 	  int nbr_lcoor= (nbr_coor%ld);       // local plane coor on neighbour node
 |  | ||||||
| 	  int nbr_ic   = (nbr_lcoor)/rd;      // inner coord of peer simd lane "i"
 |  | ||||||
| 	  int nbr_ox   = (nbr_lcoor%rd);      // outer coord of peer "x"
 |  | ||||||
| 
 |  | ||||||
| 	  int nbr_plane = nbr_ic; |  | ||||||
| 	  assert (sx == nbr_ox); |  | ||||||
| 
 |  | ||||||
| 	  auto rp = &new_simd_recv_buf[i        ][u_comm_offset]; |  | ||||||
| 	  auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset]; |  | ||||||
| 
 |  | ||||||
| 	  if(nbr_proc){ |  | ||||||
| 
 |  | ||||||
| 	    int recv_from_rank; |  | ||||||
| 	    int xmit_to_rank; |  | ||||||
| 	     |  | ||||||
| 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  |  | ||||||
|   |  | ||||||
| 	    // shm == receive pointer         if offnode
 |  | ||||||
| 	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 |  | ||||||
| 	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); |  | ||||||
| 	    if (shm==NULL) {  |  | ||||||
| 	      shm = rp; |  | ||||||
| 	    } |  | ||||||
| 
 |  | ||||||
| 	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
 |  | ||||||
| 	    // assuming above pointer flip
 |  | ||||||
| 	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); |  | ||||||
| 
 |  | ||||||
| 	    rpointers[i] = shm; |  | ||||||
| 	     |  | ||||||
| 	  } else {  |  | ||||||
| 	     |  | ||||||
| 	    rpointers[i] = sp; |  | ||||||
| 	     |  | ||||||
| 	  } |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,Packets.size()-1,permute_type); |  | ||||||
| 
 |  | ||||||
| 	u_comm_offset     +=buffer_size; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
| }; | }; | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| @@ -37,9 +37,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| 
 | 
 | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
| #include <omp.h> | #include <omp.h> | ||||||
| 
 | #ifdef GRID_NUMA | ||||||
| #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)") | #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(static)") | ||||||
| #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)") | #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)") | ||||||
|  | #else | ||||||
|  | #define PARALLEL_FOR_LOOP        _Pragma("omp parallel for schedule(runtime)") | ||||||
|  | #define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)") | ||||||
|  | #endif | ||||||
| #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") | #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") | ||||||
| #define PARALLEL_REGION       _Pragma("omp parallel") | #define PARALLEL_REGION       _Pragma("omp parallel") | ||||||
| #define PARALLEL_CRITICAL     _Pragma("omp critical") | #define PARALLEL_CRITICAL     _Pragma("omp critical") | ||||||
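Editorial note: with the parallel_for wrappers deleted a few lines below, call sites revert to placing PARALLEL_FOR_LOOP immediately before a plain for statement, as the CoarsenedMatrix hunks further down show. A minimal usage sketch (the names and loop body are illustrative):

    PARALLEL_FOR_LOOP                     // expands to the pragma chosen above
    for (int ss = 0; ss < grid->oSites(); ss++) {
      out._odata[ss] = in._odata[ss];     // any independent per-site work
    }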
| @@ -51,9 +55,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #define PARALLEL_CRITICAL | #define PARALLEL_CRITICAL | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #define parallel_for       PARALLEL_FOR_LOOP for |  | ||||||
| #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for |  | ||||||
| 
 |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| 
 | 
 | ||||||
|   // Introduce a class to gain deterministic bit reproducible reduction.
 |   // Introduce a class to gain deterministic bit reproducible reduction.
 | ||||||
| @@ -267,7 +267,8 @@ namespace Grid { | |||||||
|       SimpleCompressor<siteVector> compressor; |       SimpleCompressor<siteVector> compressor; | ||||||
|       Stencil.HaloExchange(in,compressor); |       Stencil.HaloExchange(in,compressor); | ||||||
|  |  | ||||||
|       parallel_for(int ss=0;ss<Grid()->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<Grid()->oSites();ss++){ | ||||||
|         siteVector res = zero; |         siteVector res = zero; | ||||||
| 	siteVector nbr; | 	siteVector nbr; | ||||||
| 	int ptype; | 	int ptype; | ||||||
| @@ -379,7 +380,8 @@ namespace Grid { | |||||||
| 	  Subspace.ProjectToSubspace(oProj,oblock); | 	  Subspace.ProjectToSubspace(oProj,oblock); | ||||||
| 	  //	  blockProject(iProj,iblock,Subspace.subspace); | 	  //	  blockProject(iProj,iblock,Subspace.subspace); | ||||||
| 	  //	  blockProject(oProj,oblock,Subspace.subspace); | 	  //	  blockProject(oProj,oblock,Subspace.subspace); | ||||||
| 	  parallel_for(int ss=0;ss<Grid()->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  | 	  for(int ss=0;ss<Grid()->oSites();ss++){ | ||||||
| 	    for(int j=0;j<nbasis;j++){ | 	    for(int j=0;j<nbasis;j++){ | ||||||
| 	      if( disp!= 0 ) { | 	      if( disp!= 0 ) { | ||||||
| 		A[p]._odata[ss](j,i) = oProj._odata[ss](j); | 		A[p]._odata[ss](j,i) = oProj._odata[ss](j); | ||||||
| @@ -425,7 +427,7 @@ namespace Grid { | |||||||
| 	A[p]=zero; | 	A[p]=zero; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       GridParallelRNG  RNG(Grid()); RNG.SeedFixedIntegers(std::vector<int>({55,72,19,17,34})); |       GridParallelRNG  RNG(Grid()); RNG.SeedRandomDevice(); | ||||||
|       Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val); |       Lattice<iScalar<CComplex> > val(Grid()); random(RNG,val); | ||||||
|  |  | ||||||
|       Complex one(1.0); |       Complex one(1.0); | ||||||
|   | |||||||
| @@ -25,7 +25,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| double MultiShiftFunction::approx(double x) | double MultiShiftFunction::approx(double x) | ||||||
|   | |||||||
| @@ -45,8 +45,6 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|                            // Defaults true. |                            // Defaults true. | ||||||
|   RealD Tolerance; |   RealD Tolerance; | ||||||
|   Integer MaxIterations; |   Integer MaxIterations; | ||||||
|   Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion |  | ||||||
|    |  | ||||||
|   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) |   ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true) | ||||||
|       : Tolerance(tol), |       : Tolerance(tol), | ||||||
|         MaxIterations(maxit), |         MaxIterations(maxit), | ||||||
| @@ -157,14 +155,13 @@ class ConjugateGradient : public OperatorFunction<Field> { | |||||||
|         std::cout << std::endl; |         std::cout << std::endl; | ||||||
|  |  | ||||||
|         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); |         if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); | ||||||
| 	IterationsToComplete = k;	 |  | ||||||
|         return; |         return; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     std::cout << GridLogMessage << "ConjugateGradient did NOT converge" |     std::cout << GridLogMessage << "ConjugateGradient did NOT converge" | ||||||
|               << std::endl; |               << std::endl; | ||||||
|     if (ErrorOnNoConverge) assert(0); |     if (ErrorOnNoConverge) assert(0); | ||||||
|     IterationsToComplete = k; |  | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -35,7 +35,6 @@ namespace Grid { | |||||||
|   class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { |   class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { | ||||||
|   public:                                                 |   public:                                                 | ||||||
|     RealD   Tolerance; |     RealD   Tolerance; | ||||||
|     RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed |  | ||||||
|     Integer MaxInnerIterations; |     Integer MaxInnerIterations; | ||||||
|     Integer MaxOuterIterations; |     Integer MaxOuterIterations; | ||||||
|     GridBase* SinglePrecGrid; //Grid for single-precision fields |     GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||||
| @@ -43,16 +42,12 @@ namespace Grid { | |||||||
|     LinearOperatorBase<FieldF> &Linop_f; |     LinearOperatorBase<FieldF> &Linop_f; | ||||||
|     LinearOperatorBase<FieldD> &Linop_d; |     LinearOperatorBase<FieldD> &Linop_d; | ||||||
|  |  | ||||||
|     Integer TotalInnerIterations; //Number of inner CG iterations |  | ||||||
|     Integer TotalOuterIterations; //Number of restarts |  | ||||||
|     Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step |  | ||||||
|  |  | ||||||
|     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess |     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||||
|     LinearFunction<FieldF> *guesser; |     LinearFunction<FieldF> *guesser; | ||||||
|      |      | ||||||
|     MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) : |     MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) : | ||||||
|       Linop_f(_Linop_f), Linop_d(_Linop_d), |       Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||||
|       Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), |       Tolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid), | ||||||
|       OuterLoopNormMult(100.), guesser(NULL){ }; |       OuterLoopNormMult(100.), guesser(NULL){ }; | ||||||
|  |  | ||||||
|     void useGuesser(LinearFunction<FieldF> &g){ |     void useGuesser(LinearFunction<FieldF> &g){ | ||||||
| @@ -60,8 +55,9 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|    |    | ||||||
|     void operator() (const FieldD &src_d_in, FieldD &sol_d){ |     void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||||
|       TotalInnerIterations = 0; | 	(*this)(src_d_in,sol_d,NULL); | ||||||
| 	 |     } | ||||||
|  |     void operator() (const FieldD &src_d_in, FieldD &sol_d, RealD *shift){ | ||||||
|       GridStopWatch TotalTimer; |       GridStopWatch TotalTimer; | ||||||
|       TotalTimer.Start(); |       TotalTimer.Start(); | ||||||
|      |      | ||||||
| @@ -81,7 +77,7 @@ namespace Grid { | |||||||
|       FieldD src_d(DoublePrecGrid); |       FieldD src_d(DoublePrecGrid); | ||||||
|       src_d = src_d_in; //source for next inner iteration, computed from residual during operation |       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||||
|      |      | ||||||
|       RealD inner_tol = InnerTolerance; |       RealD inner_tol = Tolerance; | ||||||
|      |      | ||||||
|       FieldF src_f(SinglePrecGrid); |       FieldF src_f(SinglePrecGrid); | ||||||
|       src_f.checkerboard = cb; |       src_f.checkerboard = cb; | ||||||
| @@ -89,18 +85,17 @@ namespace Grid { | |||||||
|       FieldF sol_f(SinglePrecGrid); |       FieldF sol_f(SinglePrecGrid); | ||||||
|       sol_f.checkerboard = cb; |       sol_f.checkerboard = cb; | ||||||
|      |      | ||||||
|       ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); |       ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||||
|       CG_f.ErrorOnNoConverge = false; |       CG_f.ErrorOnNoConverge = false; | ||||||
|  |  | ||||||
|       GridStopWatch InnerCGtimer; |       GridStopWatch InnerCGtimer; | ||||||
|  |  | ||||||
|       GridStopWatch PrecChangeTimer; |       GridStopWatch PrecChangeTimer; | ||||||
|      |      | ||||||
|       Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count |       for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ | ||||||
|        |  | ||||||
|       for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ |  | ||||||
| 	//Compute double precision rsd and also new RHS vector. | 	//Compute double precision rsd and also new RHS vector. | ||||||
| 	Linop_d.HermOp(sol_d, tmp_d); | 	Linop_d.HermOp(sol_d, tmp_d); | ||||||
|  | 	if(shift) axpy(tmp_d,*shift,sol_d,tmp_d); | ||||||
| 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | 	RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||||
|        |        | ||||||
| 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||||
| @@ -124,9 +119,8 @@ namespace Grid { | |||||||
| 	//Inner CG | 	//Inner CG | ||||||
| 	CG_f.Tolerance = inner_tol; | 	CG_f.Tolerance = inner_tol; | ||||||
| 	InnerCGtimer.Start(); | 	InnerCGtimer.Start(); | ||||||
| 	CG_f(Linop_f, src_f, sol_f); | 	CG_f(Linop_f, src_f, sol_f,shift); | ||||||
| 	InnerCGtimer.Stop(); | 	InnerCGtimer.Stop(); | ||||||
| 	TotalInnerIterations += CG_f.IterationsToComplete; |  | ||||||
|        |        | ||||||
| 	//Convert sol back to double and add to double prec solution | 	//Convert sol back to double and add to double prec solution | ||||||
| 	PrecChangeTimer.Start(); | 	PrecChangeTimer.Start(); | ||||||
| @@ -139,13 +133,11 @@ namespace Grid { | |||||||
|       //Final trial CG |       //Final trial CG | ||||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||||
|      |      | ||||||
|       ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); |       ConjugateGradientShifted<FieldD> CG_d(Tolerance, MaxInnerIterations); | ||||||
|       CG_d(Linop_d, src_d_in, sol_d); |       CG_d(Linop_d, src_d_in, sol_d,shift); | ||||||
|       TotalFinalStepIterations = CG_d.IterationsToComplete; |  | ||||||
|  |  | ||||||
|       TotalTimer.Stop(); |       TotalTimer.Stop(); | ||||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; |  | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
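Editorial note: the effect of the new shift argument is that wherever the Hermitian operator is applied, shift*x is added, so the defect-correction loop now targets (M^dag M + sigma) x = b rather than M^dag M x = b. Spelled out for the outer residual (sigma stands for *shift; this restates the axpy chain in the hunk above):

    // Outer-loop residual with the optional shift (sigma = *shift):
    //   tmp_d = M^dag M sol_d               -- Linop_d.HermOp(sol_d, tmp_d)
    //   tmp_d = tmp_d + sigma * sol_d       -- axpy(tmp_d, sigma, sol_d, tmp_d)
    //   src_d = src_d_in - tmp_d            -- r = b - (M^dag M + sigma) x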
|   | |||||||
| @@ -45,6 +45,7 @@ public: | |||||||
|     Integer MaxIterations; |     Integer MaxIterations; | ||||||
|     int verbose; |     int verbose; | ||||||
|     MultiShiftFunction shifts; |     MultiShiftFunction shifts; | ||||||
|  |     int iter; | ||||||
|  |  | ||||||
|     ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :  |     ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :  | ||||||
| 	MaxIterations(maxit), | 	MaxIterations(maxit), | ||||||
| @@ -60,6 +61,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) | |||||||
|   std::vector<Field> results(nshift,grid); |   std::vector<Field> results(nshift,grid); | ||||||
|   (*this)(Linop,src,results,psi); |   (*this)(Linop,src,results,psi); | ||||||
| } | } | ||||||
|  |  | ||||||
| void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi) | void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi) | ||||||
| { | { | ||||||
|   int nshift = shifts.order; |   int nshift = shifts.order; | ||||||
| @@ -105,11 +107,12 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|   RealD a,b,c,d; |   RealD a,b,c,d; | ||||||
|   RealD cp,bp,qq; //prev |   RealD cp,bp,qq; //prev | ||||||
|    |    | ||||||
|  |   int cb=src.checkerboard; | ||||||
|   // Matrix mult fields |   // Matrix mult fields | ||||||
|   Field r(grid); |   Field r(grid); | ||||||
|   Field p(grid); |   Field p(grid); p.checkerboard = src.checkerboard; | ||||||
|   Field tmp(grid); |   Field tmp(grid); | ||||||
|   Field mmp(grid); |   Field mmp(grid);mmp.checkerboard = src.checkerboard; | ||||||
|    |    | ||||||
|   // Check lightest mass |   // Check lightest mass | ||||||
|   for(int s=0;s<nshift;s++){ |   for(int s=0;s<nshift;s++){ | ||||||
| @@ -132,6 +135,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
|   p=src; |   p=src; | ||||||
|    |    | ||||||
|   //MdagM+m[0] |   //MdagM+m[0] | ||||||
|  |   std::cout << "p.checkerboard " << p.checkerboard | ||||||
|  |   << " mmp.checkerboard " << mmp.checkerboard << std::endl; | ||||||
|  |  | ||||||
|   Linop.HermOpAndNorm(p,mmp,d,qq); |   Linop.HermOpAndNorm(p,mmp,d,qq); | ||||||
|   axpy(mmp,mass[0],p,mmp); |   axpy(mmp,mass[0],p,mmp); | ||||||
|   RealD rn = norm2(p); |   RealD rn = norm2(p); | ||||||
| @@ -269,6 +275,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector | |||||||
| 	RealD cn = norm2(src); | 	RealD cn = norm2(src); | ||||||
| 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||||
|       } |       } | ||||||
|  |       iter = k; | ||||||
|       return; |       return; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   | |||||||
lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h (new file, 404 lines)
							| @@ -0,0 +1,404 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Chulwoo Jung <chulwoo@quark.phy.bnl.gov> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #ifndef GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H | ||||||
|  | #define GRID_CONJUGATE_GRADIENT_MULTI_MIXED_PREC_H | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |   //Mixed precision restarted defect correction CG | ||||||
|  |   template<class FieldD,class FieldF | ||||||
|  | //, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0 | ||||||
|  | //, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0 | ||||||
|  | >  | ||||||
|  |   class MixedPrecisionConjugateGradientMultiShift : public LinearFunction<FieldD> { | ||||||
|  |   public:                                                 | ||||||
|  | //    RealD   Tolerance; | ||||||
|  |     Integer MaxInnerIterations; | ||||||
|  |     Integer MaxOuterIterations; | ||||||
|  |     GridBase* SinglePrecGrid; //Grid for single-precision fields | ||||||
|  |     RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||||
|  |     LinearOperatorBase<FieldF> &Linop_f; | ||||||
|  |     LinearOperatorBase<FieldD> &Linop_d; | ||||||
|  |     MultiShiftFunction shifts; | ||||||
|  |     Integer iter; | ||||||
|  |  | ||||||
|  |     //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess | ||||||
|  | //    LinearFunction<FieldF> *guesser; | ||||||
|  |      | ||||||
|  |     MixedPrecisionConjugateGradientMultiShift(GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d,  | ||||||
|  | Integer maxinnerit,	MultiShiftFunction &_shifts ) : | ||||||
|  |       Linop_f(_Linop_f), Linop_d(_Linop_d), | ||||||
|  |       MaxInnerIterations(maxinnerit), SinglePrecGrid(_sp_grid), | ||||||
|  |       OuterLoopNormMult(100.), shifts(_shifts) {}; | ||||||
|  |  | ||||||
|  |    | ||||||
|  |     void operator() (const FieldD &src_d_in, FieldD &sol_d){ | ||||||
|  | 	assert(0); // not yet implemented | ||||||
|  |     } | ||||||
|  |     void operator() (const FieldD &src_d_in, std::vector<FieldD> &sol_d){ | ||||||
|  |       GridStopWatch TotalTimer; | ||||||
|  |       TotalTimer.Start(); | ||||||
|  |      | ||||||
|  |       int cb = src_d_in.checkerboard; | ||||||
|  |  | ||||||
|  |       int nshift = shifts.order; | ||||||
|  |       assert(nshift == sol_d.size()); | ||||||
|  |       for(int i=0;i<nshift;i++) sol_d[i].checkerboard = cb; | ||||||
|  |      | ||||||
|  |       RealD src_norm = norm2(src_d_in); | ||||||
|  | //      RealD stop = src_norm * Tolerance*Tolerance; | ||||||
|  |  | ||||||
|  |       GridBase* DoublePrecGrid = src_d_in._grid; | ||||||
|  |       FieldD tmp_d(DoublePrecGrid); tmp_d.checkerboard = cb; | ||||||
|  |      | ||||||
|  |       FieldD tmp2_d(DoublePrecGrid); tmp2_d.checkerboard = cb; | ||||||
|  |      | ||||||
|  |       FieldD src_d(DoublePrecGrid); | ||||||
|  |       src_d = src_d_in; //source for next inner iteration, computed from residual during operation | ||||||
|  |      | ||||||
|  | //      RealD inner_tol = Tolerance; | ||||||
|  |   	FieldD psi_d(DoublePrecGrid);psi_d.checkerboard = cb; | ||||||
|  |      | ||||||
|  |       FieldF src_f(SinglePrecGrid); | ||||||
|  |       src_f.checkerboard = cb; | ||||||
|  |      | ||||||
|  |       std::vector<FieldF> sol_f(nshift,SinglePrecGrid); | ||||||
|  |       for(int i=0;i<nshift;i++) sol_f[i].checkerboard = cb; | ||||||
|  |      | ||||||
|  | //      ConjugateGradientShifted<FieldF> CG_f(inner_tol, MaxInnerIterations); | ||||||
|  |       ConjugateGradientMultiShift<FieldF> MSCG(MaxInnerIterations,shifts); | ||||||
|  | //      CG_f.ErrorOnNoConverge = false; | ||||||
|  |  | ||||||
|  |       GridStopWatch InnerCGtimer; | ||||||
|  |  | ||||||
|  |       GridStopWatch PrecChangeTimer; | ||||||
|  |      | ||||||
|  | { | ||||||
|  | //	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||||
|  |  | ||||||
|  | //	if(norm < OuterLoopNormMult * stop){ | ||||||
|  | //	  std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | ||||||
|  | //	  break; | ||||||
|  | //	} | ||||||
|  | //	while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||||
|  |  | ||||||
|  | 	PrecChangeTimer.Start(); | ||||||
|  | 	precisionChange(src_f, src_d); | ||||||
|  | 	PrecChangeTimer.Stop(); | ||||||
|  |        | ||||||
|  | //	zeroit(sol_f); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | 	//Inner CG | ||||||
|  | 	InnerCGtimer.Start(); | ||||||
|  |   int if_relup = 0; | ||||||
|  | #if 0 | ||||||
|  |         MSCG(Linop_f,src_f,sol_f); | ||||||
|  | #else | ||||||
|  | { | ||||||
|  |    | ||||||
|  |   GridBase *grid = SinglePrecGrid; | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Convenience references to the info stored in "MultiShiftFunction" | ||||||
|  |   //////////////////////////////////////////////////////////////////////// | ||||||
|  |   int nshift = shifts.order; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts" | ||||||
|  |   std::vector<RealD> &mresidual(shifts.tolerances); | ||||||
|  |   std::vector<RealD> alpha(nshift,1.); | ||||||
|  |   std::vector<FieldF>   ps(nshift,grid);// Search directions | ||||||
|  |  | ||||||
|  |   assert(sol_f.size()==nshift); | ||||||
|  |   assert(mass.size()==nshift); | ||||||
|  |   assert(mresidual.size()==nshift); | ||||||
|  |    | ||||||
|  |   // dynamic sized arrays on stack; 2d is a pain with vector | ||||||
|  |   RealD  bs[nshift]; | ||||||
|  |   RealD  rsq[nshift]; | ||||||
|  |   RealD  z[nshift][2]; | ||||||
|  |   int     converged[nshift]; | ||||||
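    // Editorial note: variable-length arrays like bs[nshift] above are a
    // compiler extension, not standard C++. A portable equivalent, at the
    // cost of heap allocation, would be:
    //   std::vector<RealD> bs(nshift), rsq(nshift);
    //   std::vector<std::array<RealD,2>> z(nshift);
    //   std::vector<int>  converged(nshift, 0);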
|  |    | ||||||
|  |   const int       primary =0; | ||||||
|  |    | ||||||
|  |   //Primary shift fields CG iteration | ||||||
|  |   RealD a,b,c,d; | ||||||
|  |   RealD cp,bp,qq; //prev | ||||||
|  |    | ||||||
|  |   int cb=src_f.checkerboard; | ||||||
|  |   // Matrix mult fields | ||||||
|  |   FieldF r(grid); r.checkerboard = src_f.checkerboard; | ||||||
|  |   FieldF p(grid); p.checkerboard = src_f.checkerboard; | ||||||
|  |   FieldF tmp(grid); tmp.checkerboard = src_f.checkerboard; | ||||||
|  |   FieldF mmp(grid);mmp.checkerboard = src_f.checkerboard; | ||||||
|  |   FieldF psi(grid);psi.checkerboard = src_f.checkerboard; | ||||||
|  |     std::cout.precision(12); | ||||||
|  |     std::cout<<GridLogMessage<<"norm2(psi_d)= "<<norm2(psi_d)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage<<"norm2(psi)= "<<norm2(psi)<<std::endl; | ||||||
|  |    | ||||||
|  |    | ||||||
|  |   // Check lightest mass | ||||||
|  |   for(int s=0;s<nshift;s++){ | ||||||
|  |     assert( mass[s]>= mass[primary] ); | ||||||
|  |     converged[s]=0; | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   // Wire guess to zero | ||||||
|  |   // Residuals "r" are src | ||||||
|  |   // First search direction "p" is also src | ||||||
|  |   cp = norm2(src_f); | ||||||
|  |   Real c_relup = cp; | ||||||
|  |   for(int s=0;s<nshift;s++){ | ||||||
|  |     rsq[s] = cp * mresidual[s] * mresidual[s]; | ||||||
|  |     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientMultiShift: shift "<<s | ||||||
|  | 	     <<" target resid "<<rsq[s]<<std::endl; | ||||||
|  |     ps[s] = src_f; | ||||||
|  |   } | ||||||
|  |   // r and p for primary | ||||||
|  |   r=src_f; | ||||||
|  |   p=src_f; | ||||||
|  |    | ||||||
|  |   //MdagM+m[0] | ||||||
|  |   std::cout << "p.checkerboard " << p.checkerboard | ||||||
|  |   << " mmp.checkerboard " << mmp.checkerboard << std::endl; | ||||||
|  |  | ||||||
|  |   Linop_f.HermOpAndNorm(p,mmp,d,qq); | ||||||
|  |   axpy(mmp,mass[0],p,mmp); | ||||||
|  |   RealD rn = norm2(p); | ||||||
|  |   d += rn*mass[0]; | ||||||
|  |    | ||||||
|  |   // have verified that inner product of  | ||||||
|  |   // p and mmp is equal to d after this since | ||||||
|  |   // the d computation is tricky | ||||||
|  |   //  qq = real(innerProduct(p,mmp)); | ||||||
|  |   //  std::cout<<GridLogMessage << "debug equal ?  qq "<<qq<<" d "<< d<<std::endl; | ||||||
|  |    | ||||||
|  |   b = -cp /d; | ||||||
|  |    | ||||||
|  |   // Set up the various shift variables | ||||||
|  |   int       iz=0; | ||||||
|  |   z[0][1-iz] = 1.0; | ||||||
|  |   z[0][iz]   = 1.0; | ||||||
|  |   bs[0]      = b; | ||||||
|  |   for(int s=1;s<nshift;s++){ | ||||||
|  |     z[s][1-iz] = 1.0; | ||||||
|  |     z[s][iz]   = 1.0/( 1.0 - b*(mass[s]-mass[0])); | ||||||
|  |     bs[s]      = b*z[s][iz];  | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   // r += b[0] A.p[0] | ||||||
|  |   // c= norm(r) | ||||||
|  |   c=axpy_norm(r,b,mmp,r); | ||||||
|  |    | ||||||
|  |  axpby(psi,0.,-bs[0],src_f,src_f); | ||||||
|  |   for(int s=0;s<nshift;s++) { | ||||||
|  |     axpby(sol_f[s],0.,-bs[s]*alpha[s],src_f,src_f); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |    | ||||||
|  |   // Iteration loop | ||||||
|  |   int k; | ||||||
|  |  // inefficient zeroing, please replace! | ||||||
|  | //  RealD sol_norm = axpy_norm(sol_d[0],-1.,sol_d[0],sol_d[0]); | ||||||
|  |   zeroit(sol_d[0]); | ||||||
|  |   std::cout<<GridLogMessage<<"norm(sol_d[0])= "<<norm2(sol_d[0])<<std::endl; | ||||||
|  |    | ||||||
|  |  | ||||||
|  |   int all_converged = 1; | ||||||
|  | 	RealD tmp1,tmp2; | ||||||
|  |   for (k=1;k<=MaxOuterIterations;k++){ | ||||||
|  |      | ||||||
|  |     a = c /cp; | ||||||
|  |     axpy(p,a,p,r); | ||||||
|  |      | ||||||
|  |     // Note to self - direction ps is iterated separately | ||||||
|  |     // for each shift. Does not appear to have any scope | ||||||
|  |     // for avoiding linear algebra in "single" case. | ||||||
|  |     //  | ||||||
|  |     // However SAME r is used. Could load "r" and update | ||||||
|  |     // ALL ps[s]. 2/3 Bandwidth saving | ||||||
|  |     // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||||
|  |     for(int s=0;s<nshift;s++){ | ||||||
|  |       if ( ! converged[s] ) {  | ||||||
|  | 	if (s==0){ | ||||||
|  | 	  axpy(ps[s],a,ps[s],r); | ||||||
|  | 	} else{ | ||||||
|  | 	  RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b); | ||||||
|  | 	  axpby(ps[s],z[s][iz],as,r,ps[s]); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
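    // Editorial sketch of the fused kernel the note above envisages: load r
    // once and stream every unconverged ps[s] against it with per-shift
    // coefficients, e.g.
    //   axpby_fused(ps, z_coeffs, as_coeffs, r);  // hypothetical helper, not in Grid
    // recovering the ~2/3 of bandwidth currently spent re-reading r per shift.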
|  |      | ||||||
|  |     cp=c; | ||||||
|  |      | ||||||
|  |     Linop_f.HermOpAndNorm(p,mmp,d,qq); | ||||||
|  |     axpy(mmp,mass[0],p,mmp); | ||||||
|  |     RealD rn = norm2(p); | ||||||
|  |     d += rn*mass[0]; | ||||||
|  |      | ||||||
|  |     bp=b; | ||||||
|  |     b=-cp/d; | ||||||
|  |      | ||||||
|  |     c=axpy_norm(r,b,mmp,r); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     // Toggle the recurrence history | ||||||
|  |     bs[0] = b; | ||||||
|  |     iz = 1-iz; | ||||||
|  |     for(int s=1;s<nshift;s++){ | ||||||
|  |       if((!converged[s])){ | ||||||
|  | 	RealD z0 = z[s][1-iz]; | ||||||
|  | 	RealD z1 = z[s][iz]; | ||||||
|  | 	z[s][iz] = z0*z1*bp | ||||||
|  | 	  / (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));  | ||||||
|  | 	bs[s] = b*z[s][iz]/z0; // NB sign  rel to Mike | ||||||
|  |       } | ||||||
|  |     } | ||||||
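    // Editorial note: this is the standard shifted-CG recurrence. Writing
    // sigma_s = mass[s]-mass[0], beta_k = b, beta_{k-1} = bp, alpha_k = a
    // (in this code's sign convention for b), the shifted residual scale obeys
    //   zeta_{k+1}^s = zeta_k^s zeta_{k-1}^s beta_{k-1}
    //                  / ( beta_k alpha_k (zeta_{k-1}^s - zeta_k^s)
    //                      + zeta_{k-1}^s beta_{k-1} (1 - sigma_s beta_k) )
    // with bs[s] = beta_k zeta_{k+1}^s / zeta_k^s, matching the z[s][iz] and
    // bs[s] updates above; one Krylov space thus serves every shift.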
|  |      | ||||||
|  |     axpy(psi,-bs[0],ps[0],psi); | ||||||
|  |     for(int s=0;s<nshift;s++){ | ||||||
|  |       int ss = s; | ||||||
|  |       // Scope for optimisation here in case of "single". | ||||||
|  |       // Could load sol_f[0] and pull all ps[s] in. | ||||||
|  |       //      if ( single ) ss=primary; | ||||||
|  |       // Bandwidth saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving | ||||||
|  |       // Pipelined CG gain: | ||||||
|  |       // | ||||||
|  |       // New Kernel: Load r, vector of coeffs, vector of pointers ps | ||||||
|  |       // New Kernel: Load sol_f[0], vector of coeffs, vector of pointers ps | ||||||
|  |       // If we can predict the coefficient bs then we can fuse these and avoid the write/re-read cycle | ||||||
|  |       //  on ps[s]. | ||||||
|  |       // Before:  3 x npole  + 3 x npole | ||||||
|  |       // After :  2 x npole (ps[s])        => 3x speed up of multishift CG. | ||||||
|  |        | ||||||
|  |       if( (!converged[s]) ) {  | ||||||
|  | 	axpy(sol_f[ss],-bs[s]*alpha[s],ps[s],sol_f[ss]); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     if (k%MaxInnerIterations==0){ | ||||||
|  | //    if (c < 1e-4*c_relup){ | ||||||
|  |        RealD c_f=c; | ||||||
|  |        precisionChange(tmp_d,psi); | ||||||
|  |        RealD sol_norm =axpy_norm (psi_d,1.,tmp_d,psi_d); | ||||||
|  |        tmp1 = norm2(psi); | ||||||
|  |        zeroit(psi); | ||||||
|  |        tmp2 = norm2(psi); | ||||||
|  |        std::cout<<GridLogMessage<<"k= "<<k<<" norm2(sol)= "<<sol_norm<<" "<<tmp1<<" "<<tmp2<<std::endl; | ||||||
|  | //       precisionChange(sol_d[0],sol_f[0]); | ||||||
|  |        Linop_d.HermOpAndNorm(psi_d,tmp_d,tmp1,tmp2); | ||||||
|  |        axpy(tmp2_d,mass[0],psi_d,tmp_d); | ||||||
|  |        axpy(tmp_d,-1.,tmp2_d,src_d); | ||||||
|  |        precisionChange(r,tmp_d); | ||||||
|  | 	c_relup = norm2(r); | ||||||
|  |        std::cout<<GridLogMessage<<"k= "<<k<<" norm2(r)= "<<c<<" "<<c_relup<<" "<<c_f<<std::endl; | ||||||
|  | 	if_relup=1; | ||||||
|  |     } | ||||||
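    // Editorial note: the block above is a reliable-update step. Every
    // MaxInnerIterations iterations the single-precision correction psi is
    // promoted and folded into the double-precision solution psi_d, the true
    // residual r = src_d - (M^dag M + mass[0]) psi_d is recomputed in double
    // precision, and the single-precision recursion restarts from it; this
    // bounds the drift between the recursed residual c and the true residual
    // c_relup that a pure single-precision multishift solve would accumulate.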
|  |      | ||||||
|  |     // Convergence checks | ||||||
|  |   all_converged=1; | ||||||
|  |     for(int s=0;s<nshift;s++){ | ||||||
|  |        | ||||||
|  |       if ( (!converged[s]) ){ | ||||||
|  | 	 | ||||||
|  | 	RealD css  = c * z[s][iz]* z[s][iz]; | ||||||
|  | 	 | ||||||
|  | 	if(css<rsq[s]){ | ||||||
|  | 	  std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl; | ||||||
|  | 	  converged[s]=1; | ||||||
|  | 	} else { | ||||||
|  | 	  if (k%MaxInnerIterations==0) | ||||||
|  | 	    std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has not converged "<<css<<" >= "<<rsq[s]<<std::endl; | ||||||
|  | 	  all_converged=0; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |      | ||||||
|  | #if 0 | ||||||
|  |     if ( all_converged ){ | ||||||
|  |       std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged at iteration "<<k<<std::endl; | ||||||
|  | #else | ||||||
|  |     if ( converged[0] ){ | ||||||
|  |       std::cout<<GridLogMessage<< "CGMultiShift: Shift 0 have converged iteration, terminating  "<<k<<std::endl; | ||||||
|  | #endif | ||||||
|  |        | ||||||
|  | #if 1 | ||||||
|  |       for(int s=1; s < nshift; s++) {  | ||||||
|  | 	Linop_f.HermOpAndNorm(sol_f[s],mmp,d,qq); | ||||||
|  | 	axpy(tmp,mass[s],sol_f[s],mmp); | ||||||
|  | 	axpy(r,-alpha[s],src_f,tmp); | ||||||
|  | 	RealD rn = norm2(r); | ||||||
|  | 	RealD cn = norm2(src_f); | ||||||
|  | 	std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl; | ||||||
|  |       } | ||||||
|  | #endif | ||||||
|  |       iter = k; | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   // ugly hack | ||||||
|  |   if ( !all_converged ) | ||||||
|  |     std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl; | ||||||
|  | //  assert(0); | ||||||
|  | } | ||||||
|  | 	 | ||||||
|  | #endif | ||||||
|  | 	InnerCGtimer.Stop(); | ||||||
|  |        | ||||||
|  | 	//Convert sol back to double and add to double prec solution | ||||||
|  | 	PrecChangeTimer.Start(); | ||||||
|  | 	sol_d[0]=psi_d; | ||||||
|  | 	for(int i=1;i<nshift;i++) precisionChange(sol_d[i], sol_f[i]); | ||||||
|  | 	PrecChangeTimer.Stop(); | ||||||
|  |  | ||||||
|  | 	std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl; | ||||||
|  | 	// Check answers | ||||||
|  | 	for(int s=0; s < nshift; s++) { | ||||||
|  | 	  RealD tmp1,tmp2; | ||||||
|  | 	  Linop_d.HermOpAndNorm(sol_d[s],tmp_d,tmp1,tmp2); | ||||||
|  | 	  axpy(tmp2_d,shifts.poles[s],sol_d[s],tmp_d); | ||||||
|  | 	  axpy(tmp_d,-1.,src_d,tmp2_d); | ||||||
|  | 	  std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(norm2(tmp_d)/norm2(src_d))<<std::endl; | ||||||
|  | 	} | ||||||
|  |        | ||||||
|  | } | ||||||
|  |      | ||||||
|  |       //Final trial CG | ||||||
|  |  //     std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl; | ||||||
|  |      | ||||||
|  |       TotalTimer.Stop(); | ||||||
|  |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl; | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
168	lib/algorithms/iterative/ConjugateGradientShifted.h	Normal file
							| @@ -0,0 +1,168 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/algorithms/iterative/ConjugateGradientShifted.h | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #ifndef GRID_CONJUGATE_GRADIENT_SHIFTED_H | ||||||
|  | #define GRID_CONJUGATE_GRADIENT_SHIFTED_H | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |     ///////////////////////////////////////////////////////////// | ||||||
|  |     // Base classes for iterative processes based on operators | ||||||
|  |     // single input vec, single output vec. | ||||||
|  |     ///////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |   template<class Field>  | ||||||
|  |     class ConjugateGradientShifted : public OperatorFunction<Field> { | ||||||
|  | public:                                                 | ||||||
|  |     bool ErrorOnNoConverge; // assert when the CG fails to converge. Defaults to true. | ||||||
|  |     RealD   Tolerance; | ||||||
|  |     Integer MaxIterations; | ||||||
|  |     ConjugateGradientShifted(RealD tol,Integer maxit, bool err_on_no_conv = true) : Tolerance(tol), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv) {  | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi ){ | ||||||
|  | 	(*this)(Linop,src,psi,NULL); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi, RealD *shift){ | ||||||
|  |  | ||||||
|  |       psi.checkerboard = src.checkerboard; | ||||||
|  |       conformable(psi,src); | ||||||
|  |  | ||||||
|  |       RealD cp,c,a,d,b,ssq,qq,b_pred; | ||||||
|  |        | ||||||
|  |       Field   p(src); | ||||||
|  |       Field mmp(src); | ||||||
|  |       Field   r(src); | ||||||
|  |        | ||||||
|  |       //Initial residual computation & set up | ||||||
|  |       RealD guess = norm2(psi); | ||||||
|  |       assert(std::isnan(guess)==0); | ||||||
|  |  | ||||||
|  |       Linop.HermOpAndNorm(psi,mmp,d,b); | ||||||
|  | 	if(shift) axpy(mmp,*shift,psi,mmp); | ||||||
|  | 	RealD rn = norm2(psi); | ||||||
|  | 	if(shift) d += rn*(*shift); | ||||||
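|  | 	// with a shift s the operator is (A+s): mmp = (A+s) psi, and d picks up the correction s*||psi||^2 | ||||||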
|  | 	RealD d2 = real(innerProduct(psi,mmp)); | ||||||
|  | 	b= norm2(mmp); | ||||||
|  |       RealD src_norm=norm2(src); | ||||||
|  |       r= src-mmp; | ||||||
|  |       p= r; | ||||||
|  |        | ||||||
|  |       a  =norm2(p); | ||||||
|  |       cp =a; | ||||||
|  |       ssq=norm2(src); | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl; | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl; | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl; | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl; | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl; | ||||||
|  |       std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl; | ||||||
|  |  | ||||||
|  |       RealD rsq =  Tolerance* Tolerance*ssq; | ||||||
|  |        | ||||||
|  |       //Check if guess is really REALLY good :) | ||||||
|  |       if ( cp <= rsq ) { | ||||||
|  | 	return; | ||||||
|  |       } | ||||||
|  |        | ||||||
|  |       std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl; | ||||||
|  |  | ||||||
|  |       GridStopWatch LinalgTimer; | ||||||
|  |       GridStopWatch MatrixTimer; | ||||||
|  |       GridStopWatch SolverTimer; | ||||||
|  |  | ||||||
|  |       SolverTimer.Start(); | ||||||
|  |       int k; | ||||||
|  |       for (k=1;k<=MaxIterations;k++){ | ||||||
|  | 	 | ||||||
|  | 	c=cp; | ||||||
|  |  | ||||||
|  | 	MatrixTimer.Start(); | ||||||
|  | 	Linop.HermOpAndNorm(p,mmp,d,qq); | ||||||
|  | 	MatrixTimer.Stop(); | ||||||
|  | 	LinalgTimer.Start(); | ||||||
|  | 	if(shift) axpy(mmp,*shift,p,mmp); | ||||||
|  | 	RealD rn = norm2(p); | ||||||
|  | 	if(shift) d += rn*(*shift); | ||||||
|  | 	RealD d2 = real(innerProduct(p,mmp)); | ||||||
|  | 	qq = norm2(mmp); | ||||||
|  |       if (k%10==1) std::cout<< std::setprecision(4)<< "d:  "<<d<<" d2= "<<d2<<std::endl; | ||||||
|  |  | ||||||
|  | 	//	RealD    qqck = norm2(mmp); | ||||||
|  | 	//	ComplexD dck  = innerProduct(p,mmp); | ||||||
|  |        | ||||||
|  | 	a      = c/d; | ||||||
|  | 	b_pred = a*(a*qq-d)/c; | ||||||
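|  | 	// in exact arithmetic a*d = c gives cp = a*a*qq - c, so b_pred should match b = cp/c computed below, up to rounding | ||||||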
|  |  | ||||||
|  | 	cp = axpy_norm(r,-a,mmp,r); | ||||||
|  | 	b = cp/c; | ||||||
|  | 	if (k%10==1) std::cout<< std::setprecision(4)<<"k= "<<k<<" src:  "<<src_norm<<" r= "<<cp<<std::endl; | ||||||
|  | 	 | ||||||
|  | 	// Fuse these loops ; should be really easy | ||||||
|  | 	psi= a*p+psi; | ||||||
|  | 	p  = p*b+r; | ||||||
|  | 	   | ||||||
|  | 	LinalgTimer.Stop(); | ||||||
|  | 	std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl; | ||||||
|  | 	 | ||||||
|  | 	// Stopping condition | ||||||
|  | 	if ( cp <= rsq ) {  | ||||||
|  | 	   | ||||||
|  | 	  SolverTimer.Stop(); | ||||||
|  | 	  Linop.HermOpAndNorm(psi,mmp,d,qq); | ||||||
|  | 	  if(shift) mmp = mmp + (*shift) * psi; | ||||||
|  | 	  p=mmp-src; | ||||||
|  | 	   | ||||||
|  | 	  RealD mmpnorm = sqrt(norm2(mmp)); | ||||||
|  | 	  RealD psinorm = sqrt(norm2(psi)); | ||||||
|  | 	  RealD srcnorm = sqrt(norm2(src)); | ||||||
|  | 	  RealD resnorm = sqrt(norm2(p)); | ||||||
|  | 	  RealD true_residual = resnorm/srcnorm; | ||||||
|  |  | ||||||
|  | 	  std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k | ||||||
|  | 		   <<" computed residual "<<sqrt(cp/ssq) | ||||||
|  | 		   <<" true residual "    <<true_residual | ||||||
|  | 		   <<" target "<<Tolerance<<std::endl; | ||||||
|  | 	  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix  "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed(); | ||||||
|  | 	  std::cout<<std::endl; | ||||||
|  | 	   | ||||||
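|  | 	  // tolerate up to three orders of magnitude between the true and the target residual before asserting | ||||||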
|  | 	  if(ErrorOnNoConverge) | ||||||
|  | 	    assert(true_residual/Tolerance < 1000.0); | ||||||
|  |  | ||||||
|  | 	  return; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl; | ||||||
|  | //      assert(0); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  | } | ||||||
|  | #endif | ||||||
| @@ -31,11 +31,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| #include <string.h> //memset | #include <string.h> //memset | ||||||
| #ifdef USE_LAPACK | #ifdef USE_LAPACK | ||||||
|  | #ifdef USE_MKL | ||||||
|  | #include<mkl_lapack.h> | ||||||
|  | #else | ||||||
| void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e, | ||||||
|                    double *vl, double *vu, int *il, int *iu, double *abstol, |                    double *vl, double *vu, int *il, int *iu, double *abstol, | ||||||
|                    int *m, double *w, double *z, int *ldz, int *isuppz, |                    int *m, double *w, double *z, int *ldz, int *isuppz, | ||||||
|                    double *work, int *lwork, int *iwork, int *liwork, |                    double *work, int *lwork, int *iwork, int *liwork, | ||||||
|                    int *info); |                    int *info); | ||||||
|  | //#include <lapacke/lapacke.h> | ||||||
|  | #endif | ||||||
| #endif | #endif | ||||||
| #include "DenseMatrix.h" | #include "DenseMatrix.h" | ||||||
| #include "EigenSort.h" | #include "EigenSort.h" | ||||||
| @@ -62,12 +67,13 @@ public: | |||||||
|     int Np;      // Np -- Number of spare vecs in Krylov space |     int Np;      // Np -- Number of spare vecs in Krylov space | ||||||
|     int Nm;      // Nm -- total number of vectors |     int Nm;      // Nm -- total number of vectors | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     RealD OrthoTime; | ||||||
|  |  | ||||||
|     RealD eresid; |     RealD eresid; | ||||||
|  |  | ||||||
|     SortEigen<Field> _sort; |     SortEigen<Field> _sort; | ||||||
|  |  | ||||||
| //    GridCartesian &_fgrid; |  | ||||||
|  |  | ||||||
|     LinearOperatorBase<Field> &_Linop; |     LinearOperatorBase<Field> &_Linop; | ||||||
|  |  | ||||||
|     OperatorFunction<Field>   &_poly; |     OperatorFunction<Field>   &_poly; | ||||||
| @@ -124,23 +130,23 @@ public: | |||||||
|  |  | ||||||
|       GridBase *grid = evec[0]._grid; |       GridBase *grid = evec[0]._grid; | ||||||
|       Field w(grid); |       Field w(grid); | ||||||
|       std::cout << "RitzMatrix "<<std::endl; |       std::cout<<GridLogMessage << "RitzMatrix "<<std::endl; | ||||||
|       for(int i=0;i<k;i++){ |       for(int i=0;i<k;i++){ | ||||||
| 	_poly(_Linop,evec[i],w); | 	_poly(_Linop,evec[i],w); | ||||||
| 	std::cout << "["<<i<<"] "; | 	std::cout<<GridLogMessage << "["<<i<<"] "; | ||||||
| 	for(int j=0;j<k;j++){ | 	for(int j=0;j<k;j++){ | ||||||
| 	  ComplexD in = innerProduct(evec[j],w); | 	  ComplexD in = innerProduct(evec[j],w); | ||||||
| 	  if ( fabs((double)i-j)>1 ) {  | 	  if ( fabs((double)i-j)>1 ) {  | ||||||
| 	    if (abs(in) >1.0e-9 )  {  | 	    if (abs(in) >1.0e-9 )  {  | ||||||
| 	      std::cout<<"oops"<<std::endl; | 	      std::cout<<GridLogMessage<<"oops"<<std::endl; | ||||||
| 	      abort(); | 	      abort(); | ||||||
| 	    } else  | 	    } else  | ||||||
| 	      std::cout << " 0 "; | 	      std::cout<<GridLogMessage << " 0 "; | ||||||
| 	  } else {  | 	  } else {  | ||||||
| 	    std::cout << " "<<in<<" "; | 	    std::cout<<GridLogMessage << " "<<in<<" "; | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	std::cout << std::endl; | 	std::cout<<GridLogMessage << std::endl; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -174,10 +180,10 @@ public: | |||||||
|       RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop |       RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop | ||||||
|                                  // 7. vk+1 := wk/βk+1 |                                  // 7. vk+1 := wk/βk+1 | ||||||
|  |  | ||||||
| //	std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl; | 	std::cout<<GridLogMessage << "alpha = " << zalph << " beta "<<beta<<std::endl; | ||||||
|       const RealD tiny = 1.0e-20; |       const RealD tiny = 1.0e-20; | ||||||
|       if ( beta < tiny ) {  |       if ( beta < tiny ) {  | ||||||
| 	std::cout << " beta is tiny "<<beta<<std::endl; | 	std::cout<<GridLogMessage << " beta is tiny "<<beta<<std::endl; | ||||||
|      } |      } | ||||||
|       lmd[k] = alph; |       lmd[k] = alph; | ||||||
|       lme[k]  = beta; |       lme[k]  = beta; | ||||||
| @@ -253,6 +259,7 @@ public: | |||||||
|     } |     } | ||||||
|  |  | ||||||
| #ifdef USE_LAPACK | #ifdef USE_LAPACK | ||||||
|  | #define LAPACK_INT long long | ||||||
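|  | // 64-bit integers for the LAPACK interface; assumes an ILP64 build (e.g. MKL ILP64) -- an LP64 library would want int here | ||||||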
|     void diagonalize_lapack(DenseVector<RealD>& lmd, |     void diagonalize_lapack(DenseVector<RealD>& lmd, | ||||||
| 		     DenseVector<RealD>& lme,  | 		     DenseVector<RealD>& lme,  | ||||||
| 		     int N1, | 		     int N1, | ||||||
| @@ -262,7 +269,7 @@ public: | |||||||
|   const int size = Nm; |   const int size = Nm; | ||||||
| //  tevals.resize(size); | //  tevals.resize(size); | ||||||
| //  tevecs.resize(size); | //  tevecs.resize(size); | ||||||
|   int NN = N1; |   LAPACK_INT NN = N1; | ||||||
|   double evals_tmp[NN]; |   double evals_tmp[NN]; | ||||||
|   double evec_tmp[NN][NN]; |   double evec_tmp[NN][NN]; | ||||||
|   memset(evec_tmp[0],0,sizeof(double)*NN*NN); |   memset(evec_tmp[0],0,sizeof(double)*NN*NN); | ||||||
| @@ -276,19 +283,19 @@ public: | |||||||
|         if (i==j) evals_tmp[i] = lmd[i]; |         if (i==j) evals_tmp[i] = lmd[i]; | ||||||
|         if (j==(i-1)) EE[j] = lme[j]; |         if (j==(i-1)) EE[j] = lme[j]; | ||||||
|       } |       } | ||||||
|   int evals_found; |   LAPACK_INT evals_found; | ||||||
|   int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; |   LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ; | ||||||
|   int liwork =  3+NN*10 ; |   LAPACK_INT liwork =  3+NN*10 ; | ||||||
|   int iwork[liwork]; |   LAPACK_INT iwork[liwork]; | ||||||
|   double work[lwork]; |   double work[lwork]; | ||||||
|   int isuppz[2*NN]; |   LAPACK_INT isuppz[2*NN]; | ||||||
|   char jobz = 'V'; // calculate evals & evecs |   char jobz = 'V'; // calculate evals & evecs | ||||||
|   char range = 'I'; // calculate evals in the index interval [il,iu] |   char range = 'I'; // calculate evals in the index interval [il,iu] | ||||||
|   //    char range = 'A'; // calculate all evals |   //    char range = 'A'; // calculate all evals | ||||||
|   char uplo = 'U'; // refer to upper half of original matrix |   char uplo = 'U'; // refer to upper half of original matrix | ||||||
|   char compz = 'I'; // Compute eigenvectors of tridiagonal matrix |   char compz = 'I'; // Compute eigenvectors of tridiagonal matrix | ||||||
|   int ifail[NN]; |   int ifail[NN]; | ||||||
|   int info; |   long long info; | ||||||
| //  int total = QMP_get_number_of_nodes(); | //  int total = QMP_get_number_of_nodes(); | ||||||
| //  int node = QMP_get_node_number(); | //  int node = QMP_get_node_number(); | ||||||
| //  GridBase *grid = evec[0]._grid; | //  GridBase *grid = evec[0]._grid; | ||||||
| @@ -296,14 +303,18 @@ public: | |||||||
|   int node = grid->_processor; |   int node = grid->_processor; | ||||||
|   int interval = (NN/total)+1; |   int interval = (NN/total)+1; | ||||||
|   double vl = 0.0, vu = 0.0; |   double vl = 0.0, vu = 0.0; | ||||||
|   int il = interval*node+1 , iu = interval*(node+1); |   LAPACK_INT il = interval*node+1 , iu = interval*(node+1); | ||||||
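|  |   // partition the eigenvalue index range over MPI ranks: this rank computes indices [il,iu] | ||||||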
|   if (iu > NN)  iu=NN; |   if (iu > NN)  iu=NN; | ||||||
|   double tol = 0.0; |   double tol = 0.0; | ||||||
|     if (1) { |     if (1) { | ||||||
|       memset(evals_tmp,0,sizeof(double)*NN); |       memset(evals_tmp,0,sizeof(double)*NN); | ||||||
|       if ( il <= NN){ |       if ( il <= NN){ | ||||||
|         printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu); |         printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu); | ||||||
|  | #ifdef USE_MKL | ||||||
|  |         dstegr(&jobz, &range, &NN, | ||||||
|  | #else | ||||||
|         LAPACK_dstegr(&jobz, &range, &NN, |         LAPACK_dstegr(&jobz, &range, &NN, | ||||||
|  | #endif | ||||||
|             (double*)DD, (double*)EE, |             (double*)DD, (double*)EE, | ||||||
|             &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A' |             &vl, &vu, &il, &iu, // these four are ignored if second parameter is 'A' | ||||||
|             &tol, // tolerance |             &tol, // tolerance | ||||||
| @@ -335,6 +346,7 @@ public: | |||||||
|       lmd [NN-1-i]=evals_tmp[i]; |       lmd [NN-1-i]=evals_tmp[i]; | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | #undef LAPACK_INT  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -365,12 +377,14 @@ public: | |||||||
| //	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid); | //	diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|       int Niter = 100*N1; |       int Niter = 10000*N1; | ||||||
|       int kmin = 1; |       int kmin = 1; | ||||||
|       int kmax = N2; |       int kmax = N2; | ||||||
|       // (this should be more sophisticated) |       // (this should be more sophisticated) | ||||||
|  |  | ||||||
|       for(int iter=0; iter<Niter; ++iter){ |       for(int iter=0; ; ++iter){ | ||||||
|  |       if ( (iter+1)%(100*N1)==0) | ||||||
|  | 	std::cout<<GridLogMessage << "[QL method] Not converged - iteration "<<iter+1<<"\n"; | ||||||
|  |  | ||||||
| 	// determination of 2x2 leading submatrix | 	// determination of 2x2 leading submatrix | ||||||
| 	RealD dsub = lmd[kmax-1]-lmd[kmax-2]; | 	RealD dsub = lmd[kmax-1]-lmd[kmax-2]; | ||||||
| @@ -399,11 +413,11 @@ public: | |||||||
|         _sort.push(lmd3,N2); |         _sort.push(lmd3,N2); | ||||||
|         _sort.push(lmd2,N2); |         _sort.push(lmd2,N2); | ||||||
|          for(int k=0; k<N2; ++k){ |          for(int k=0; k<N2; ++k){ | ||||||
| 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl; | 	    if (fabs(lmd2[k] - lmd3[k]) >SMALL)  std::cout<<GridLogMessage <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl; | ||||||
| //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl; | //	    if (fabs(lme2[k] - lme[k]) >SMALL)  std::cout<<GridLogMessage <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl; | ||||||
| 	  } | 	  } | ||||||
|          for(int k=0; k<N1*N1; ++k){ |          for(int k=0; k<N1*N1; ++k){ | ||||||
| //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl; | //	    if (fabs(Qt2[k] - Qt[k]) >SMALL)  std::cout<<GridLogMessage <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl; | ||||||
| 	} | 	} | ||||||
|     } |     } | ||||||
| #endif | #endif | ||||||
| @@ -418,7 +432,7 @@ public: | |||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|       std::cout << "[QL method] Error - Too many iteration: "<<Niter<<"\n"; |       std::cout<<GridLogMessage << "[QL method] Error - Too many iteration: "<<Niter<<"\n"; | ||||||
|       abort(); |       abort(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -435,6 +449,7 @@ public: | |||||||
| 		       DenseVector<Field>& evec, | 		       DenseVector<Field>& evec, | ||||||
| 		       int k) | 		       int k) | ||||||
|     { |     { | ||||||
|  |       double t0=-usecond()/1e6; | ||||||
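|  |       // t0 starts negative so that t0 += usecond()/1e6 on exit yields the elapsed time in seconds | ||||||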
|       typedef typename Field::scalar_type MyComplex; |       typedef typename Field::scalar_type MyComplex; | ||||||
|       MyComplex ip; |       MyComplex ip; | ||||||
|  |  | ||||||
| @@ -453,6 +468,8 @@ public: | |||||||
| 	w = w - ip * evec[j]; | 	w = w - ip * evec[j]; | ||||||
|       } |       } | ||||||
|       normalise(w); |       normalise(w); | ||||||
|  |       t0+=usecond()/1e6; | ||||||
|  |       OrthoTime +=t0; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) { |     void setUnit_Qt(int Nm, DenseVector<RealD> &Qt) { | ||||||
| @@ -486,10 +503,10 @@ until convergence | |||||||
| 	GridBase *grid = evec[0]._grid; | 	GridBase *grid = evec[0]._grid; | ||||||
| 	assert(grid == src._grid); | 	assert(grid == src._grid); | ||||||
|  |  | ||||||
| 	std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl; | 	std::cout<<GridLogMessage << " -- Nk = " << Nk << " Np = "<< Np << std::endl; | ||||||
| 	std::cout << " -- Nm = " << Nm << std::endl; | 	std::cout<<GridLogMessage << " -- Nm = " << Nm << std::endl; | ||||||
| 	std::cout << " -- size of eval   = " << eval.size() << std::endl; | 	std::cout<<GridLogMessage << " -- size of eval   = " << eval.size() << std::endl; | ||||||
| 	std::cout << " -- size of evec  = " << evec.size() << std::endl; | 	std::cout<<GridLogMessage << " -- size of evec  = " << evec.size() << std::endl; | ||||||
| 	 | 	 | ||||||
| 	assert(Nm == evec.size() && Nm == eval.size()); | 	assert(Nm == evec.size() && Nm == eval.size()); | ||||||
| 	 | 	 | ||||||
| @@ -500,6 +517,7 @@ until convergence | |||||||
| 	DenseVector<int>   Iconv(Nm); | 	DenseVector<int>   Iconv(Nm); | ||||||
|  |  | ||||||
| 	DenseVector<Field>  B(Nm,grid); // waste of space replicating | 	DenseVector<Field>  B(Nm,grid); // waste of space replicating | ||||||
|  | //	DenseVector<Field>  Btemp(Nm,grid); // waste of space replicating | ||||||
| 	 | 	 | ||||||
| 	Field f(grid); | 	Field f(grid); | ||||||
| 	Field v(grid); | 	Field v(grid); | ||||||
| @@ -515,35 +533,48 @@ until convergence | |||||||
| 	// (uniform vector) Why not src?? | 	// (uniform vector) Why not src?? | ||||||
| 	//	evec[0] = 1.0; | 	//	evec[0] = 1.0; | ||||||
| 	evec[0] = src; | 	evec[0] = src; | ||||||
| 	std:: cout <<"norm2(src)= " << norm2(src)<<std::endl; | 	std:: cout<<GridLogMessage <<"norm2(src)= " << norm2(src)<<std::endl; | ||||||
| // << src._grid  << std::endl; | // << src._grid  << std::endl; | ||||||
| 	normalise(evec[0]); | 	normalise(evec[0]); | ||||||
| 	std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl; | 	std:: cout<<GridLogMessage <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl; | ||||||
| // << evec[0]._grid << std::endl; | // << evec[0]._grid << std::endl; | ||||||
| 	 | 	 | ||||||
| 	// Initial Nk steps | 	// Initial Nk steps | ||||||
|  | 	OrthoTime=0.; | ||||||
|  | 	double t0=usecond()/1e6; | ||||||
| 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k); | 	for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k); | ||||||
| //	std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl; | 	double t1=usecond()/1e6; | ||||||
| //	std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl; | 	std::cout<<GridLogMessage <<"IRL::Initial steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl; | ||||||
|  | //	std:: cout<<GridLogMessage <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl; | ||||||
|  | //	std:: cout<<GridLogMessage <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl; | ||||||
| 	RitzMatrix(evec,Nk); | 	RitzMatrix(evec,Nk); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
| 	for(int k=0; k<Nk; ++k){ | 	for(int k=0; k<Nk; ++k){ | ||||||
| //	std:: cout <<"eval " << k << " " <<eval[k] << std::endl; | //	std:: cout<<GridLogMessage <<"eval " << k << " " <<eval[k] << std::endl; | ||||||
| //	std:: cout <<"lme " << k << " " << lme[k] << std::endl; | //	std:: cout<<GridLogMessage <<"lme " << k << " " << lme[k] << std::endl; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Restarting loop begins | 	// Restarting loop begins | ||||||
| 	for(int iter = 0; iter<Niter; ++iter){ | 	for(int iter = 0; iter<Niter; ++iter){ | ||||||
|  |  | ||||||
| 	  std::cout<<"\n Restart iteration = "<< iter << std::endl; | 	  std::cout<<GridLogMessage<<"\n Restart iteration = "<< iter << std::endl; | ||||||
|  |  | ||||||
| 	  //  | 	  //  | ||||||
| 	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs. | 	  // Rudy does a sort first which looks very different. Getting fed up with sorting out the algo defs. | ||||||
| 	  // We loop over  | 	  // We loop over  | ||||||
| 	  // | 	  // | ||||||
|  | 	OrthoTime=0.; | ||||||
| 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k); | 	  for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL:: "<<Np <<" steps: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl; | ||||||
| 	  f *= lme[Nm-1]; | 	  f *= lme[Nm-1]; | ||||||
|  |  | ||||||
| 	  RitzMatrix(evec,k2); | 	  RitzMatrix(evec,k2); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL:: RitzMatrix: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
| 	   | 	   | ||||||
| 	  // getting eigenvalues | 	  // getting eigenvalues | ||||||
| 	  for(int k=0; k<Nm; ++k){ | 	  for(int k=0; k<Nm; ++k){ | ||||||
| @@ -552,18 +583,27 @@ until convergence | |||||||
| 	  } | 	  } | ||||||
| 	  setUnit_Qt(Nm,Qt); | 	  setUnit_Qt(Nm,Qt); | ||||||
| 	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid); | 	  diagonalize(eval2,lme2,Nm,Nm,Qt,grid); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL:: diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  |  | ||||||
| 	  // sorting | 	  // sorting | ||||||
| 	  _sort.push(eval2,Nm); | 	  _sort.push(eval2,Nm); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL:: eval sorting: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
| 	   | 	   | ||||||
| 	  // Implicitly shifted QR transformations | 	  // Implicitly shifted QR transformations | ||||||
| 	  setUnit_Qt(Nm,Qt); | 	  setUnit_Qt(Nm,Qt); | ||||||
|  | 	  for(int ip=0; ip<k2; ++ip){ | ||||||
|  | 	    std::cout<<GridLogMessage << "eval "<< ip << " "<< eval2[ip] << std::endl; | ||||||
|  | 	  } | ||||||
| 	  for(int ip=k2; ip<Nm; ++ip){  | 	  for(int ip=k2; ip<Nm; ++ip){  | ||||||
| 	std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl; | 	std::cout<<GridLogMessage << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl; | ||||||
| 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm); | 	    qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm); | ||||||
| 		 | 		 | ||||||
| 	} | 	} | ||||||
|      | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::qr_decomp: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  | if (0) {   | ||||||
| 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0; | 	  for(int i=0; i<(Nk+1); ++i) B[i] = 0.0; | ||||||
| 	   | 	   | ||||||
| 	  for(int j=k1-1; j<k2+1; ++j){ | 	  for(int j=k1-1; j<k2+1; ++j){ | ||||||
| @@ -572,6 +612,30 @@ until convergence | |||||||
| 	      B[j] += Qt[k+Nm*j] * evec[k]; | 	      B[j] += Qt[k+Nm*j] * evec[k]; | ||||||
| 	    } | 	    } | ||||||
| 	  } | 	  } | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::QR Rotate: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | if (1) { | ||||||
|  | 	for(int i=0; i<(Nk+1); ++i) { | ||||||
|  | 	  B[i] = 0.0; | ||||||
|  | 	  B[i].checkerboard = evec[0].checkerboard; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	int j_block = 24; int k_block=24; | ||||||
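|  | 	// blocked basis rotation B[j] += Qt[k+Nm*j]*evec[k]: 24x24 (j,k) tiles keep the Qt panel hot in cache while the outer site loop runs in parallel | ||||||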
|  | PARALLEL_FOR_LOOP | ||||||
|  | 	for(int ss=0; ss<grid->oSites(); ss++){ | ||||||
|  | 	  for(int jj=k1-1; jj<k2+1; jj+=j_block) | ||||||
|  | 	  for(int kk=0; kk<Nm; kk+=k_block) | ||||||
|  | 	  for(int j=jj; (j<(k2+1)) && j<(jj+j_block); ++j){ | ||||||
|  | 	    for(int k=kk; (k<Nm) && k<(kk+k_block); ++k){ | ||||||
|  | 	      B[j]._odata[ss] += Qt[k+Nm*j] * evec[k]._odata[ss]; | ||||||
|  | 	    } | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::QR rotation: "<<t1-t0<<" seconds"<<std::endl; t0=t1; | ||||||
|  | } | ||||||
| 	for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j]; | 	for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j]; | ||||||
|  |  | ||||||
| 	  // Compressed vector f and beta(k2) | 	  // Compressed vector f and beta(k2) | ||||||
| @@ -579,7 +643,7 @@ until convergence | |||||||
| 	  f += lme[k2-1] * evec[k2]; | 	  f += lme[k2-1] * evec[k2]; | ||||||
| 	  beta_k = norm2(f); | 	  beta_k = norm2(f); | ||||||
| 	  beta_k = sqrt(beta_k); | 	  beta_k = sqrt(beta_k); | ||||||
| 	  std::cout<<" beta(k) = "<<beta_k<<std::endl; | 	  std::cout<<GridLogMessage<<" beta(k) = "<<beta_k<<std::endl; | ||||||
|  |  | ||||||
| 	  RealD betar = 1.0/beta_k; | 	  RealD betar = 1.0/beta_k; | ||||||
| 	  evec[k2] = betar * f; | 	  evec[k2] = betar * f; | ||||||
| @@ -592,7 +656,10 @@ until convergence | |||||||
| 	  } | 	  } | ||||||
| 	  setUnit_Qt(Nm,Qt); | 	  setUnit_Qt(Nm,Qt); | ||||||
| 	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid); | 	  diagonalize(eval2,lme2,Nk,Nm,Qt,grid); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::diagonalize: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
| 	   | 	   | ||||||
|  | if (0) { | ||||||
| 	  for(int k = 0; k<Nk; ++k) B[k]=0.0; | 	  for(int k = 0; k<Nk; ++k) B[k]=0.0; | ||||||
| 	   | 	   | ||||||
| 	  for(int j = 0; j<Nk; ++j){ | 	  for(int j = 0; j<Nk; ++j){ | ||||||
| @@ -600,12 +667,34 @@ until convergence | |||||||
| 	    B[j].checkerboard = evec[k].checkerboard; | 	    B[j].checkerboard = evec[k].checkerboard; | ||||||
| 	      B[j] += Qt[k+j*Nm] * evec[k]; | 	      B[j] += Qt[k+j*Nm] * evec[k]; | ||||||
| 	    } | 	    } | ||||||
| //	    std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl; | 	    std::cout<<GridLogMessage << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl; | ||||||
| 	  } | 	  } | ||||||
| //	_sort.push(eval2,B,Nk); | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::Convergence rotation: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  | } | ||||||
|  | if (1) { | ||||||
|  | 	for(int i=0; i<(Nk+1); ++i) { | ||||||
|  | 	  B[i] = 0.0; | ||||||
|  | 	  B[i].checkerboard = evec[0].checkerboard; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	int j_block = 24; int k_block=24; | ||||||
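|  | 	// same blocked rotation as the QR step above, here over the Nk vectors entering the convergence test | ||||||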
|  | PARALLEL_FOR_LOOP | ||||||
|  | 	for(int ss=0; ss<grid->oSites(); ss++){ | ||||||
|  | 	  for(int jj=0; jj<Nk; jj+=j_block) | ||||||
|  | 	  for(int kk=0; kk<Nk; kk+=k_block) | ||||||
|  | 	  for(int j=jj; (j<Nk) && j<(jj+j_block); ++j){ | ||||||
|  | 	    for(int k=kk; (k<Nk) && k<(kk+k_block); ++k){ | ||||||
|  | 	      B[j]._odata[ss] += Qt[k+Nm*j] * evec[k]._odata[ss]; | ||||||
|  | 	    } | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::convergence rotation: "<<t1-t0<<" seconds"<<std::endl; t0=t1; | ||||||
|  | } | ||||||
|  |  | ||||||
| 	  Nconv = 0; | 	  Nconv = 0; | ||||||
| 	  //	  std::cout << std::setiosflags(std::ios_base::scientific); | 	  //	  std::cout<<GridLogMessage << std::setiosflags(std::ios_base::scientific); | ||||||
| 	  for(int i=0; i<Nk; ++i){ | 	  for(int i=0; i<Nk; ++i){ | ||||||
|  |  | ||||||
| //	    _poly(_Linop,B[i],v); | //	    _poly(_Linop,B[i],v); | ||||||
| @@ -613,14 +702,16 @@ until convergence | |||||||
| 	     | 	     | ||||||
| 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp. | 	    RealD vnum = real(innerProduct(B[i],v)); // HermOp. | ||||||
| 	    RealD vden = norm2(B[i]); | 	    RealD vden = norm2(B[i]); | ||||||
|  | 	    RealD vv0 = norm2(v); | ||||||
| 	    eval2[i] = vnum/vden; | 	    eval2[i] = vnum/vden; | ||||||
| 	    v -= eval2[i]*B[i]; | 	    v -= eval2[i]*B[i]; | ||||||
| 	    RealD vv = norm2(v); | 	    RealD vv = norm2(v); | ||||||
| 	     | 	     | ||||||
| 	    std::cout.precision(13); | 	    std::cout.precision(13); | ||||||
| 	    std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | 	    std::cout<<GridLogMessage << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] "; | ||||||
| 	    std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i]; | 	    std::cout<<"eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i]; | ||||||
| 	    std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl; | 	    std::cout<<"|H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv; | ||||||
|  | 	    std::cout<<" "<< vnum/(sqrt(vden)*sqrt(vv0)) << std::endl; | ||||||
| 	     | 	     | ||||||
| 	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged | 	// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged | ||||||
| 	    if((vv<eresid*eresid) && (i == Nconv) ){ | 	    if((vv<eresid*eresid) && (i == Nconv) ){ | ||||||
| @@ -629,17 +720,19 @@ until convergence | |||||||
| 	    } | 	    } | ||||||
|  |  | ||||||
| 	  }  // i-loop end | 	  }  // i-loop end | ||||||
| 	  //	  std::cout << std::resetiosflags(std::ios_base::scientific); | 	  //	  std::cout<<GridLogMessage << std::resetiosflags(std::ios_base::scientific); | ||||||
|  | 	t1=usecond()/1e6; | ||||||
|  | 	std::cout<<GridLogMessage <<"IRL::convergence testing: "<<t1-t0<< "seconds"<<std::endl; t0=t1; | ||||||
|  |  | ||||||
|  |  | ||||||
| 	  std::cout<<" #modes converged: "<<Nconv<<std::endl; | 	  std::cout<<GridLogMessage<<" #modes converged: "<<Nconv<<std::endl; | ||||||
|  |  | ||||||
| 	  if( Nconv>=Nstop ){ | 	  if( Nconv>=Nstop ){ | ||||||
| 	    goto converged; | 	    goto converged; | ||||||
| 	  } | 	  } | ||||||
| 	} // end of iter loop | 	} // end of iter loop | ||||||
| 	 | 	 | ||||||
| 	std::cout<<"\n NOT converged.\n"; | 	std::cout<<GridLogMessage<<"\n NOT converged.\n"; | ||||||
| 	abort(); | 	abort(); | ||||||
| 	 | 	 | ||||||
|       converged: |       converged: | ||||||
| @@ -652,10 +745,10 @@ until convergence | |||||||
|        } |        } | ||||||
|       _sort.push(eval,evec,Nconv); |       _sort.push(eval,evec,Nconv); | ||||||
|  |  | ||||||
|       std::cout << "\n Converged\n Summary :\n"; |       std::cout<<GridLogMessage << "\n Converged\n Summary :\n"; | ||||||
|       std::cout << " -- Iterations  = "<< Nconv  << "\n"; |       std::cout<<GridLogMessage << " -- Iterations  = "<< Nconv  << "\n"; | ||||||
|       std::cout << " -- beta(k)     = "<< beta_k << "\n"; |       std::cout<<GridLogMessage << " -- beta(k)     = "<< beta_k << "\n"; | ||||||
|       std::cout << " -- Nconv       = "<< Nconv  << "\n"; |       std::cout<<GridLogMessage << " -- Nconv       = "<< Nconv  << "\n"; | ||||||
|      } |      } | ||||||
|  |  | ||||||
|     ///////////////////////////////////////////////// |     ///////////////////////////////////////////////// | ||||||
| @@ -678,25 +771,25 @@ until convergence | |||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       std::cout<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl; |       std::cout<<GridLogMessage<<"Lanczos_Factor start/end " <<start <<"/"<<end<<std::endl; | ||||||
|  |  | ||||||
|       // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1 |       // Starting from scratch, bq[0] contains a random vector and |bq[0]| = 1 | ||||||
|       int first; |       int first; | ||||||
|       if(start == 0){ |       if(start == 0){ | ||||||
|  |  | ||||||
| 	std::cout << "start == 0\n"; //TESTING | 	std::cout<<GridLogMessage << "start == 0\n"; //TESTING | ||||||
|  |  | ||||||
| 	_poly(_Linop,bq[0],bf); | 	_poly(_Linop,bq[0],bf); | ||||||
|  |  | ||||||
| 	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0] | 	alpha = real(innerProduct(bq[0],bf));//alpha =  bq[0]^dag A bq[0] | ||||||
|  |  | ||||||
| 	std::cout << "alpha = " << alpha << std::endl; | 	std::cout<<GridLogMessage << "alpha = " << alpha << std::endl; | ||||||
| 	 | 	 | ||||||
| 	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0] | 	bf = bf - alpha * bq[0];  //bf =  A bq[0] - alpha bq[0] | ||||||
|  |  | ||||||
| 	H[0][0]=alpha; | 	H[0][0]=alpha; | ||||||
|  |  | ||||||
| 	std::cout << "Set H(0,0) to " << H[0][0] << std::endl; | 	std::cout<<GridLogMessage << "Set H(0,0) to " << H[0][0] << std::endl; | ||||||
|  |  | ||||||
| 	first = 1; | 	first = 1; | ||||||
|  |  | ||||||
| @@ -716,19 +809,19 @@ until convergence | |||||||
|  |  | ||||||
| 	beta = 0;sqbt = 0; | 	beta = 0;sqbt = 0; | ||||||
|  |  | ||||||
| 	std::cout << "cont is true so setting beta to zero\n"; | 	std::cout<<GridLogMessage << "cont is true so setting beta to zero\n"; | ||||||
|  |  | ||||||
|       }	else { |       }	else { | ||||||
|  |  | ||||||
| 	beta = norm2(bf); | 	beta = norm2(bf); | ||||||
| 	sqbt = sqrt(beta); | 	sqbt = sqrt(beta); | ||||||
|  |  | ||||||
| 	std::cout << "beta = " << beta << std::endl; | 	std::cout<<GridLogMessage << "beta = " << beta << std::endl; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       for(int j=first;j<end;j++){ |       for(int j=first;j<end;j++){ | ||||||
|  |  | ||||||
| 	std::cout << "Factor j " << j <<std::endl; | 	std::cout<<GridLogMessage << "Factor j " << j <<std::endl; | ||||||
|  |  | ||||||
| 	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right. | 	if(cont){ // switches to factoring; understand start!=0 and initial bf value is right. | ||||||
| 	  bq[j] = bf; cont = false; | 	  bq[j] = bf; cont = false; | ||||||
| @@ -751,7 +844,7 @@ until convergence | |||||||
|  |  | ||||||
| 	beta = fnorm; | 	beta = fnorm; | ||||||
| 	sqbt = sqrt(beta); | 	sqbt = sqrt(beta); | ||||||
| 	std::cout << "alpha = " << alpha << " fnorm = " << fnorm << '\n'; | 	std::cout<<GridLogMessage << "alpha = " << alpha << " fnorm = " << fnorm << '\n'; | ||||||
|  |  | ||||||
| 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ] | 	///Iterative refinement of orthogonality V = [ bq[0]  bq[1]  ...  bq[M] ] | ||||||
| 	int re = 0; | 	int re = 0; | ||||||
| @@ -786,8 +879,8 @@ until convergence | |||||||
| 	  bck = sqrt( nmbex ); | 	  bck = sqrt( nmbex ); | ||||||
| 	  re++; | 	  re++; | ||||||
| 	} | 	} | ||||||
| 	std::cout << "Iteratively refined orthogonality, changes alpha\n"; | 	std::cout<<GridLogMessage << "Iteratively refined orthogonality, changes alpha\n"; | ||||||
| 	if(re > 1) std::cout << "orthogonality refined " << re << " times" <<std::endl; | 	if(re > 1) std::cout<<GridLogMessage << "orthogonality refined " << re << " times" <<std::endl; | ||||||
| 	H[j][j]=alpha; | 	H[j][j]=alpha; | ||||||
|       } |       } | ||||||
|  |  | ||||||
| @@ -802,11 +895,13 @@ until convergence | |||||||
|  |  | ||||||
|     void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont) |     void ImplicitRestart(int TM, DenseVector<RealD> &evals,  DenseVector<DenseVector<RealD> > &evecs, DenseVector<Field> &bq, Field &bf, int cont) | ||||||
|     { |     { | ||||||
|       std::cout << "ImplicitRestart begin. Eigensort starting\n"; |       std::cout<<GridLogMessage << "ImplicitRestart begin. Eigensort starting\n"; | ||||||
|  |  | ||||||
|       DenseMatrix<RealD> H; Resize(H,Nm,Nm); |       DenseMatrix<RealD> H; Resize(H,Nm,Nm); | ||||||
|  |  | ||||||
|  | #ifndef USE_LAPACK | ||||||
|       EigenSort(evals, evecs); |       EigenSort(evals, evecs); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|       ///Assign shifts |       ///Assign shifts | ||||||
|       int K=Nk; |       int K=Nk; | ||||||
| @@ -829,15 +924,15 @@ until convergence | |||||||
|       /// Shifted H defines a new K step Arnoldi factorization |       /// Shifted H defines a new K step Arnoldi factorization | ||||||
|       RealD  beta = H[ff][ff-1];  |       RealD  beta = H[ff][ff-1];  | ||||||
|       RealD  sig  = Q[TM - 1][ff - 1]; |       RealD  sig  = Q[TM - 1][ff - 1]; | ||||||
|       std::cout << "beta = " << beta << " sig = " << real(sig) <<std::endl; |       std::cout<<GridLogMessage << "beta = " << beta << " sig = " << real(sig) <<std::endl; | ||||||
|  |  | ||||||
|       std::cout << "TM = " << TM << " "; |       std::cout<<GridLogMessage << "TM = " << TM << " "; | ||||||
|       std::cout << norm2(bq[0]) << " -- before" <<std::endl; |       std::cout<<GridLogMessage << norm2(bq[0]) << " -- before" <<std::endl; | ||||||
|  |  | ||||||
|       /// q -> q Q |       /// q -> q Q | ||||||
|       times_real(bq, Q, TM); |       times_real(bq, Q, TM); | ||||||
|  |  | ||||||
|       std::cout << norm2(bq[0]) << " -- after " << ff <<std::endl; |       std::cout<<GridLogMessage << norm2(bq[0]) << " -- after " << ff <<std::endl; | ||||||
|       bf =  beta* bq[ff] + sig* bf; |       bf =  beta* bq[ff] + sig* bf; | ||||||
|  |  | ||||||
|       /// Do the rest of the factorization |       /// Do the rest of the factorization | ||||||
| @@ -861,7 +956,7 @@ until convergence | |||||||
|       int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with |       int ff = Lanczos_Factor(0, M, cont, bq,bf,H); // 0--M to begin with | ||||||
|  |  | ||||||
|       if(ff < M) { |       if(ff < M) { | ||||||
| 	std::cout << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl; | 	std::cout<<GridLogMessage << "Krylov: aborting ff "<<ff <<" "<<M<<std::endl; | ||||||
| 	abort(); // Why would this happen? | 	abort(); // Why would this happen? | ||||||
|       } |       } | ||||||
|  |  | ||||||
| @@ -870,7 +965,7 @@ until convergence | |||||||
|  |  | ||||||
|       for(int it = 0; it < Niter && (converged < Nk); ++it) { |       for(int it = 0; it < Niter && (converged < Nk); ++it) { | ||||||
|  |  | ||||||
| 	std::cout << "Krylov: Iteration --> " << it << std::endl; | 	std::cout<<GridLogMessage << "Krylov: Iteration --> " << it << std::endl; | ||||||
| 	int lock_num = lock ? converged : 0; | 	int lock_num = lock ? converged : 0; | ||||||
| 	DenseVector<RealD> tevals(M - lock_num ); | 	DenseVector<RealD> tevals(M - lock_num ); | ||||||
| 	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num); | 	DenseMatrix<RealD> tevecs; Resize(tevecs,M - lock_num,M - lock_num); | ||||||
| @@ -886,7 +981,7 @@ until convergence | |||||||
|       Wilkinson<RealD>(H, evals, evecs, small);  |       Wilkinson<RealD>(H, evals, evecs, small);  | ||||||
|       //      Check(); |       //      Check(); | ||||||
|  |  | ||||||
|       std::cout << "Done  "<<std::endl; |       std::cout<<GridLogMessage << "Done  "<<std::endl; | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -951,7 +1046,7 @@ until convergence | |||||||
| 		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs,  | 		  DenseVector<RealD> &tevals, DenseVector<DenseVector<RealD> > &tevecs,  | ||||||
| 		  int lock, int converged) | 		  int lock, int converged) | ||||||
|     { |     { | ||||||
|       std::cout << "Converged " << converged << " so far." << std::endl; |       std::cout<<GridLogMessage << "Converged " << converged << " so far." << std::endl; | ||||||
|       int lock_num = lock ? converged : 0; |       int lock_num = lock ? converged : 0; | ||||||
|       int M = Nm; |       int M = Nm; | ||||||
|  |  | ||||||
| @@ -966,7 +1061,9 @@ until convergence | |||||||
|       RealD small=1.0e-16; |       RealD small=1.0e-16; | ||||||
|       Wilkinson<RealD>(AH, tevals, tevecs, small); |       Wilkinson<RealD>(AH, tevals, tevecs, small); | ||||||
|  |  | ||||||
|  | #ifndef USE_LAPACK | ||||||
|       EigenSort(tevals, tevecs); |       EigenSort(tevals, tevecs); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|       RealD resid_nrm=  norm2(bf); |       RealD resid_nrm=  norm2(bf); | ||||||
|  |  | ||||||
| @@ -977,7 +1074,7 @@ until convergence | |||||||
| 	RealD diff = 0; | 	RealD diff = 0; | ||||||
| 	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm; | 	diff = abs( tevecs[i][Nm - 1 - lock_num] ) * resid_nrm; | ||||||
|  |  | ||||||
| 	std::cout << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl; | 	std::cout<<GridLogMessage << "residual estimate " << SS-1-i << " " << diff << " of (" << tevals[i] << ")" << std::endl; | ||||||
|  |  | ||||||
| 	if(diff < converged) { | 	if(diff < converged) { | ||||||
|  |  | ||||||
| @@ -993,13 +1090,13 @@ until convergence | |||||||
| 	    lock_num++; | 	    lock_num++; | ||||||
| 	  } | 	  } | ||||||
| 	  converged++; | 	  converged++; | ||||||
| 	  std::cout << " converged on eval " << converged << " of " << Nk << std::endl; | 	  std::cout<<GridLogMessage << " converged on eval " << converged << " of " << Nk << std::endl; | ||||||
| 	} else { | 	} else { | ||||||
| 	  break; | 	  break; | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
| #endif | #endif | ||||||
|       std::cout << "Got " << converged << " so far " <<std::endl;	 |       std::cout<<GridLogMessage << "Got " << converged << " so far " <<std::endl;	 | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     ///Check |     ///Check | ||||||
| @@ -1008,7 +1105,9 @@ until convergence | |||||||
|  |  | ||||||
|       DenseVector<RealD> goodval(this->get); |       DenseVector<RealD> goodval(this->get); | ||||||
|  |  | ||||||
|  | #ifndef USE_LAPACK | ||||||
|       EigenSort(evals,evecs); |       EigenSort(evals,evecs); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|       int NM = Nm; |       int NM = Nm; | ||||||
|  |  | ||||||
| @@ -1080,10 +1179,10 @@ say con = 2 | |||||||
| **/ | **/ | ||||||
|  |  | ||||||
| template<class T> | template<class T> | ||||||
| static void Lock(DenseMatrix<T> &H, 	// Hess mtx	 | static void Lock(DenseMatrix<T> &H, 	///Hess mtx	 | ||||||
| 		 DenseMatrix<T> &Q, 	// Lock Transform | 		 DenseMatrix<T> &Q, 	///Lock Transform | ||||||
| 		 T val, 		// value to be locked | 		 T val, 		///value to be locked | ||||||
| 		 int con, 	// number already locked | 		 int con, 	///number already locked | ||||||
| 		 RealD small, | 		 RealD small, | ||||||
| 		 int dfg, | 		 int dfg, | ||||||
| 		 bool herm) | 		 bool herm) | ||||||
|   | |||||||
| @@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <iomanip> | #include <iomanip> | ||||||
| #include <complex> | #include <complex> | ||||||
| #include <typeinfo> | #include <typeinfo> | ||||||
| #include <Grid/Grid.h> | #include <Grid.h> | ||||||
|  |  | ||||||
|  |  | ||||||
| /** Sign function **/ | /** Sign function **/ | ||||||
|   | |||||||
| @@ -52,7 +52,7 @@ public: | |||||||
|  |  | ||||||
|     // Physics Grid information. |     // Physics Grid information. | ||||||
|     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. |     std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes. | ||||||
|     std::vector<int> _fdimensions;// (full) Global dimensions of array prior to cb removal |     std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal | ||||||
|     std::vector<int> _gdimensions;// Global dimensions of array after cb removal |     std::vector<int> _gdimensions;// Global dimensions of array after cb removal | ||||||
|     std::vector<int> _ldimensions;// local dimensions of array with processor images removed |     std::vector<int> _ldimensions;// local dimensions of array with processor images removed | ||||||
|     std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed  |     std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed  | ||||||
| @@ -121,6 +121,7 @@ public: | |||||||
|       Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); |       Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|     ////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////// | ||||||
|     // SIMD lane addressing |     // SIMD lane addressing | ||||||
|     ////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////// | ||||||
| @@ -177,11 +178,9 @@ public: | |||||||
|     // Global addressing |     // Global addressing | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ |     void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){ | ||||||
|       assert(gidx< gSites()); |  | ||||||
|       Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); |       Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions); | ||||||
|     } |     } | ||||||
|     void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ |     void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){ | ||||||
|       assert(lidx<lSites()); |  | ||||||
|       Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); |       Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions); | ||||||
|     } |     } | ||||||
|     void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ |     void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){ | ||||||
| @@ -208,16 +207,16 @@ public: | |||||||
|       std::vector<int> lcoor; |       std::vector<int> lcoor; | ||||||
|       GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); |       GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor); | ||||||
|       rank = RankFromProcessorCoor(pcoor); |       rank = RankFromProcessorCoor(pcoor); | ||||||
|       /* |  | ||||||
|       std::vector<int> cblcoor(lcoor); |       std::vector<int> cblcoor(lcoor); | ||||||
|       for(int d=0;d<cblcoor.size();d++){ |       for(int d=0;d<cblcoor.size();d++){ | ||||||
| 	if( this->CheckerBoarded(d) ) { | 	if( this->CheckerBoarded(d) ) { | ||||||
| 	  cblcoor[d] = lcoor[d]/2; | 	  cblcoor[d] = lcoor[d]/2; | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|       */ |  | ||||||
|       i_idx= iIndex(lcoor); |       i_idx= iIndex(cblcoor);// iIndex does not halve the checkerboard dim internally, so pass the halved cblcoor | ||||||
|       o_idx= oIndex(lcoor); |       o_idx= oIndex(lcoor);  // oIndex halves the checkerboard dim internally, so pass the full lcoor | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) |     void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor) | ||||||
|   | |||||||
| @@ -25,8 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| @@ -34,7 +33,6 @@ namespace Grid { | |||||||
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| void *              CartesianCommunicator::ShmCommBuf; | void *              CartesianCommunicator::ShmCommBuf; | ||||||
| uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||||
| CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; |  | ||||||
|  |  | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| @@ -90,9 +88,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | |||||||
|  |  | ||||||
| #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | ||||||
|  |  | ||||||
| int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |  | ||||||
| 						       void *xmit, | 						       void *xmit, | ||||||
| 						       int xmit_to_rank, | 						       int xmit_to_rank, | ||||||
| 						       void *recv, | 						       void *recv, | ||||||
| @@ -100,7 +96,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
| 						       int bytes) | 						       int bytes) | ||||||
| { | { | ||||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|   return 2.0*bytes; |  | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -116,12 +116,6 @@ class CartesianCommunicator { | |||||||
|   // Implemented in Communicator_base.C |   // Implemented in Communicator_base.C | ||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|   static void * ShmCommBuf; |   static void * ShmCommBuf; | ||||||
|  |  | ||||||
|   // Isend/Irecv/Wait, or Sendrecv blocking |  | ||||||
|   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; |  | ||||||
|   static CommunicatorPolicy_t CommunicatorPolicy; |  | ||||||
|   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } |  | ||||||
|  |  | ||||||
|   size_t heap_top; |   size_t heap_top; | ||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
|  |  | ||||||
| @@ -154,7 +148,6 @@ class CartesianCommunicator { | |||||||
|   const std::vector<int> & ThisProcessorCoor(void) ; |   const std::vector<int> & ThisProcessorCoor(void) ; | ||||||
|   const std::vector<int> & ProcessorGrid(void)     ; |   const std::vector<int> & ProcessorGrid(void)     ; | ||||||
|   int                      ProcessorCount(void)    ; |   int                      ProcessorCount(void)    ; | ||||||
|   int                      NodeCount(void)    ; |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////// | ||||||
|   // very VERY rarely (Log, serial RNG) we need world without a grid |   // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
| @@ -207,7 +200,7 @@ class CartesianCommunicator { | |||||||
|    |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 				  void *xmit, | 				  void *xmit, | ||||||
| 				  int xmit_to_rank, | 				  int xmit_to_rank, | ||||||
| 				  void *recv, | 				  void *recv, | ||||||
|   | |||||||
| @@ -25,9 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/GridQCDcore.h> |  | ||||||
| #include <Grid/qcd/action/ActionCore.h> |  | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| @@ -41,13 +39,9 @@ MPI_Comm CartesianCommunicator::communicator_world; | |||||||
| // Should error check all MPI calls. | // Should error check all MPI calls. | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|   int flag; |   int flag; | ||||||
|   int provided; |  | ||||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|   if ( !flag ) { |   if ( !flag ) { | ||||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); |     MPI_Init(argc,argv); | ||||||
|     if ( provided != MPI_THREAD_MULTIPLE ) { |  | ||||||
|       QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|   ShmInitGeneric(); |   ShmInitGeneric(); | ||||||
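The left-hand Init requests MPI_THREAD_MULTIPLE and, when the library cannot grant it, falls back to a comms-then-compute kernel policy; the right-hand column simply calls MPI_Init. A standalone sketch of the negotiation (the fallback here is just a printout, standing in for the WilsonKernelsStatic flag):

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv) {
      int flag, provided;
      MPI_Initialized(&flag);          // coexist with other libraries
      if (!flag) {
        MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
        if (provided != MPI_THREAD_MULTIPLE) {
          // cannot drive MPI from compute threads; serialise comms instead
          printf("thread level %d < MPI_THREAD_MULTIPLE, disabling overlap\n",
                 provided);
        }
      }
      MPI_Finalize();
      return 0;
    }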
| @@ -158,34 +152,24 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
| 						int from, | 						int from, | ||||||
| 						int bytes) | 						int bytes) | ||||||
| { | { | ||||||
|   int myrank = _processor; |  | ||||||
|   int ierr; |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  |  | ||||||
|   MPI_Request xrq; |   MPI_Request xrq; | ||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|  |   int rank = _processor; | ||||||
|     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |   int ierr; | ||||||
|     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|    |    | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|  |  | ||||||
|   list.push_back(xrq); |   list.push_back(xrq); | ||||||
|   list.push_back(rrq); |   list.push_back(rrq); | ||||||
|   } else {  |  | ||||||
|     // Give the CPU to MPI immediately; can use threads to overlap optionally |  | ||||||
|     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, |  | ||||||
| 		      recv,bytes,MPI_CHAR,from, from, |  | ||||||
| 		      communicator,MPI_STATUS_IGNORE); |  | ||||||
|     assert(ierr==0); |  | ||||||
|   } |  | ||||||
| } | } | ||||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  |  | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|   } |  | ||||||
| } | } | ||||||
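The CommunicatorPolicy switch removed on the right chooses between posting a nonblocking Isend/Irecv pair, completed later by MPI_Waitall, and a single blocking MPI_Sendrecv that hands the CPU to MPI immediately. A minimal sketch of the two policies on a ring exchange, with the same tag convention as the hunk above:

    #include <mpi.h>
    #include <vector>
    #include <cassert>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int rank, size;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &size);
      int dest = (rank + 1) % size, from = (rank - 1 + size) % size;
      int xmit = rank, recv = -1;

      bool concurrent = true;          // CommunicatorPolicyConcurrent
      if (concurrent) {
        // post both requests now, complete them together later
        std::vector<MPI_Request> list(2);
        MPI_Irecv(&recv, 1, MPI_INT, from, from, MPI_COMM_WORLD, &list[0]);
        MPI_Isend(&xmit, 1, MPI_INT, dest, rank, MPI_COMM_WORLD, &list[1]);
        MPI_Waitall(2, &list[0], MPI_STATUSES_IGNORE);
      } else {
        // sequential policy: block inside MPI straight away
        MPI_Sendrecv(&xmit, 1, MPI_INT, dest, rank,
                     &recv, 1, MPI_INT, from, from,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      }
      assert(recv == from);
      MPI_Finalize();
      return 0;
    }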
|  |  | ||||||
| void CartesianCommunicator::Barrier(void) | void CartesianCommunicator::Barrier(void) | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| /************************************************************************************* |     /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
| @@ -25,23 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
|  |  | ||||||
| #include <semaphore.h> |  | ||||||
| #include <fcntl.h> |  | ||||||
| #include <unistd.h> |  | ||||||
| #include <limits.h> |  | ||||||
| #include <sys/types.h> |  | ||||||
| #include <sys/ipc.h> |  | ||||||
| #include <sys/shm.h> |  | ||||||
| #include <sys/mman.h> |  | ||||||
| //#include <zlib.h> |  | ||||||
| #ifndef SHM_HUGETLB |  | ||||||
| #define SHM_HUGETLB 04000 |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -64,10 +50,6 @@ std::vector<int> CartesianCommunicator::GroupRanks; | |||||||
| std::vector<int> CartesianCommunicator::MyGroup; | std::vector<int> CartesianCommunicator::MyGroup; | ||||||
| std::vector<void *> CartesianCommunicator::ShmCommBufs; | std::vector<void *> CartesianCommunicator::ShmCommBufs; | ||||||
|  |  | ||||||
| int CartesianCommunicator::NodeCount(void)    { return GroupSize;}; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #undef FORCE_COMMS |  | ||||||
| void *CartesianCommunicator::ShmBufferSelf(void) | void *CartesianCommunicator::ShmBufferSelf(void) | ||||||
| { | { | ||||||
|   return ShmCommBufs[ShmRank]; |   return ShmCommBufs[ShmRank]; | ||||||
| @@ -75,9 +57,6 @@ void *CartesianCommunicator::ShmBufferSelf(void) | |||||||
| void *CartesianCommunicator::ShmBuffer(int rank) | void *CartesianCommunicator::ShmBuffer(int rank) | ||||||
| { | { | ||||||
|   int gpeer = GroupRanks[rank]; |   int gpeer = GroupRanks[rank]; | ||||||
| #ifdef FORCE_COMMS |  | ||||||
|   return NULL; |  | ||||||
| #endif |  | ||||||
|   if (gpeer == MPI_UNDEFINED){ |   if (gpeer == MPI_UNDEFINED){ | ||||||
|     return NULL; |     return NULL; | ||||||
|   } else {  |   } else {  | ||||||
| @@ -86,13 +65,7 @@ void *CartesianCommunicator::ShmBuffer(int rank) | |||||||
| } | } | ||||||
| void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | ||||||
| { | { | ||||||
|   static int count =0; |  | ||||||
|   int gpeer = GroupRanks[rank]; |   int gpeer = GroupRanks[rank]; | ||||||
|   assert(gpeer!=ShmRank); // never send to self |  | ||||||
|   assert(rank!=WorldRank);// never send to self |  | ||||||
| #ifdef FORCE_COMMS |  | ||||||
|   return NULL; |  | ||||||
| #endif |  | ||||||
|   if (gpeer == MPI_UNDEFINED){ |   if (gpeer == MPI_UNDEFINED){ | ||||||
|     return NULL; |     return NULL; | ||||||
|   } else {  |   } else {  | ||||||
| @@ -103,27 +76,16 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | |||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |  | ||||||
|   int flag; |   int flag; | ||||||
|   int provided; |  | ||||||
|   //  mtrace(); |  | ||||||
|  |  | ||||||
|   MPI_Initialized(&flag); // needed to coexist with other libs apparently |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|   if ( !flag ) { |   if ( !flag ) { | ||||||
|     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); |     MPI_Init(argc,argv); | ||||||
|     assert (provided == MPI_THREAD_MULTIPLE); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   Grid_quiesce_nodes(); |  | ||||||
|  |  | ||||||
|   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|   MPI_Comm_rank(communicator_world,&WorldRank); |   MPI_Comm_rank(communicator_world,&WorldRank); | ||||||
|   MPI_Comm_size(communicator_world,&WorldSize); |   MPI_Comm_size(communicator_world,&WorldSize); | ||||||
|  |  | ||||||
|   if ( WorldRank == 0 ) { |  | ||||||
|     std::cout << GridLogMessage<< "Initialising MPI "<< WorldRank <<"/"<<WorldSize <<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   // Split into groups that can share memory |   // Split into groups that can share memory | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
| @@ -169,6 +131,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|  |    | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
|   // find the group leaders world rank |   // find the group leaders world rank | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
| @@ -178,6 +141,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       leaders_group[group++] = l; |       leaders_group[group++] = l; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |    | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
|   // Identify the rank of the group in which I (and my leader) live |   // Identify the rank of the group in which I (and my leader) live | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
| @@ -188,113 +152,38 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   assert(GroupRank!=-1); |   assert(GroupRank!=-1); | ||||||
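The group-leader table built across these hunks uses a one-hot idiom: each shared-memory group's rank-0 sets its own slot in leaders_1hot, a summing Allreduce merges all the vectors, and every rank then scans the merged vector to recover the leaders' world ranks. A self-contained simulation of the idiom, with the reduce replaced by a loop; group sizes here are made up:

    #include <cassert>
    #include <vector>

    int main() {
      // Pretend: 8 world ranks, shared-memory groups of 4; rank%4==0 leads.
      const int WorldSize = 8, ShmSize = 4;
      std::vector<int> leaders_1hot(WorldSize, 0);

      // Each leader contributes a one-hot vector; Allreduce(SUM) merges them.
      for (int rank = 0; rank < WorldSize; rank++)  // stands in for the reduce
        if (rank % ShmSize == 0) leaders_1hot[rank] = 1;

      // Every rank can now scan the merged vector for the leaders.
      std::vector<int> leaders_group;
      for (int l = 0; l < WorldSize; l++)
        if (leaders_1hot[l]) leaders_group.push_back(l);

      assert((int)leaders_group.size() == WorldSize / ShmSize);
      assert(leaders_group[1] == 4);
      return 0;
    }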
|  |    | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // allocate the shared window for our group |   // allocate the shared window for our group | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|    |    | ||||||
|   ShmCommBuf = 0; |   ShmCommBuf = 0; | ||||||
|   ShmCommBufs.resize(ShmSize); |   ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); | ||||||
|  |   assert(ierr==0); | ||||||
| #if 1 |   // KNL hack -- force to numa-domain 1 in flat | ||||||
|   char shm_name [NAME_MAX]; | #if 0 | ||||||
|   if ( ShmRank == 0 ) { |   //#include <numaif.h> | ||||||
|     for(int r=0;r<ShmSize;r++){ |   for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){ | ||||||
|  |     void *pages = (void *) ( page + ShmCommBuf ); | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; |     int status; | ||||||
|  |     int flags=MPOL_MF_MOVE_ALL; | ||||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r); |     int nodes=1; // numa domain == MCDRAM | ||||||
|  |     unsigned long count=1; | ||||||
|       shm_unlink(shm_name); |     ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||||
|       int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666); |     if (ierr && (page==0)) perror("numa relocate command failed"); | ||||||
|       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } |  | ||||||
|       ftruncate(fd, size); |  | ||||||
|  |  | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |  | ||||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } |  | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |  | ||||||
|       ShmCommBufs[r] =ptr; |  | ||||||
|        |  | ||||||
|   } |   } | ||||||
|   } |  | ||||||
|  |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|  |  | ||||||
|   if ( ShmRank != 0 ) {  |  | ||||||
|     for(int r=0;r<ShmSize;r++){ |  | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ; |  | ||||||
|      |  | ||||||
|       sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",GroupRank,r); |  | ||||||
|  |  | ||||||
|       int fd=shm_open(shm_name,O_RDWR,0666); |  | ||||||
|       if ( fd<0 ) {	perror("failed shm_open");	assert(0);      } |  | ||||||
|  |  | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |  | ||||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } |  | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |  | ||||||
|       ShmCommBufs[r] =ptr; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| #else |  | ||||||
|   std::vector<int> shmids(ShmSize); |  | ||||||
|  |  | ||||||
|   if ( ShmRank == 0 ) { |  | ||||||
|     for(int r=0;r<ShmSize;r++){ |  | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; |  | ||||||
|       key_t key   = 0x4545 + r; |  | ||||||
|       if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { |  | ||||||
| 	int errsv = errno; |  | ||||||
| 	printf("Errno %d\n",errsv); |  | ||||||
| 	perror("shmget"); |  | ||||||
| 	exit(1); |  | ||||||
|       } |  | ||||||
|       printf("shmid: 0x%x\n", shmids[r]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|   MPI_Bcast(&shmids[0],ShmSize*sizeof(int),MPI_BYTE,0,ShmComm); |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|  |  | ||||||
|   for(int r=0;r<ShmSize;r++){ |  | ||||||
|     ShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0); |  | ||||||
|     if (ShmCommBufs[r] == (uint64_t *)-1) { |  | ||||||
|       perror("Shared memory attach failure"); |  | ||||||
|       shmctl(shmids[r], IPC_RMID, NULL); |  | ||||||
|       exit(2); |  | ||||||
|     } |  | ||||||
|     printf("shmaddr: %p\n", ShmCommBufs[r]); |  | ||||||
|   } |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|   // Mark for clean up |  | ||||||
|   for(int r=0;r<ShmSize;r++){ |  | ||||||
|     shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL); |  | ||||||
|   } |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   ShmCommBuf         = ShmCommBufs[ShmRank]; |   MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); | ||||||
|    |    | ||||||
|   MPI_Barrier(ShmComm); |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   if ( ShmRank == 0 ) { |   // Plan: allocate a fixed SHM region, reused as scratch during stencil comms, with no allocate/free churn. | ||||||
|  |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   ShmCommBufs.resize(ShmSize); | ||||||
|   for(int r=0;r<ShmSize;r++){ |   for(int r=0;r<ShmSize;r++){ | ||||||
|       uint64_t * check = (uint64_t *) ShmCommBufs[r]; |     MPI_Aint sz; | ||||||
|       check[0] = GroupRank; |     int dsp_unit; | ||||||
|       check[1] = r; |     MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]); | ||||||
|       check[2] = 0x5A5A5A; |  | ||||||
|   } |   } | ||||||
|   } |  | ||||||
|  |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|   for(int r=0;r<ShmSize;r++){ |  | ||||||
|     uint64_t * check = (uint64_t *) ShmCommBufs[r]; |  | ||||||
|      |  | ||||||
|     assert(check[0]==GroupRank); |  | ||||||
|     assert(check[1]==r); |  | ||||||
|     assert(check[2]==0x5A5A5A); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|   MPI_Barrier(ShmComm); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Verbose for now |   // Verbose for now | ||||||
| @@ -303,7 +192,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected "; |     std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected "; | ||||||
|     std::cout<< WorldSize << " Ranks " ; |     std::cout<< WorldSize << " Ranks " ; | ||||||
|     std::cout<< GroupSize << " Nodes " ; |     std::cout<< GroupSize << " Nodes " ; | ||||||
|     std::cout<< " with "<< ShmSize  << " ranks-per-node "<<std::endl; |     std::cout<<  ShmSize  << " with ranks-per-node "<<std::endl; | ||||||
|      |      | ||||||
|     std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size "; |     std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size "; | ||||||
|     std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl; |     std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl; | ||||||
| @@ -318,6 +207,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       if(g!=ShmSize-1) std::cout<<","; |       if(g!=ShmSize-1) std::cout<<","; | ||||||
|       else std::cout<<"}"<<std::endl; |       else std::cout<<"}"<<std::endl; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   for(int g=0;g<GroupSize;g++){ |   for(int g=0;g<GroupSize;g++){ | ||||||
| @@ -326,7 +216,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       if ( (ShmRank == 0) && (GroupRank==g) ) { |       if ( (ShmRank == 0) && (GroupRank==g) ) { | ||||||
| 	std::cout<<MyGroup[r]; | 	std::cout<<MyGroup[r]; | ||||||
| 	if(r<ShmSize-1) std::cout<<","; | 	if(r<ShmSize-1) std::cout<<","; | ||||||
| 	else std::cout<<"}"<<std::endl<<std::flush; | 	else std::cout<<"}"<<std::endl; | ||||||
|       } |       } | ||||||
|       MPI_Barrier(communicator_world); |       MPI_Barrier(communicator_world); | ||||||
|     } |     } | ||||||
| @@ -335,12 +225,14 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|   assert(ShmSetup==0);  ShmSetup=1; |   assert(ShmSetup==0);  ShmSetup=1; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Want to implement some magic ... Group sub-cubes into those on same node | // Want to implement some magic ... Group sub-cubes into those on same node | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source) | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
| { | { | ||||||
|   std::vector<int> coor = _processor_coor; // my coord |   std::vector<int> coor = _processor_coor; | ||||||
|  |  | ||||||
|   assert(std::abs(shift) <_processors[dim]); |   assert(std::abs(shift) <_processors[dim]); | ||||||
|  |  | ||||||
|   coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim]; |   coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim]; | ||||||
| @@ -350,30 +242,26 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &dest,int &source | |||||||
|   coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim]; |   coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim]; | ||||||
|   Lexicographic::IndexFromCoor(coor,dest,_processors); |   Lexicographic::IndexFromCoor(coor,dest,_processors); | ||||||
|   dest = LexicographicToWorldRank[dest]; |   dest = LexicographicToWorldRank[dest]; | ||||||
|  | } | ||||||
| }// rank is world rank. |  | ||||||
|  |  | ||||||
| int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
| { | { | ||||||
|   int rank; |   int rank; | ||||||
|   Lexicographic::IndexFromCoor(coor,rank,_processors); |   Lexicographic::IndexFromCoor(coor,rank,_processors); | ||||||
|   rank = LexicographicToWorldRank[rank]; |   rank = LexicographicToWorldRank[rank]; | ||||||
|   return rank; |   return rank; | ||||||
| }// rank is world rank | } | ||||||
|  |  | ||||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
| { | { | ||||||
|   int lr=-1; |   Lexicographic::CoorFromIndex(coor,rank,_processors); | ||||||
|   for(int r=0;r<WorldSize;r++){// map world Rank to lexico and then to coor |   rank = LexicographicToWorldRank[rank]; | ||||||
|     if( LexicographicToWorldRank[r]==rank) lr = r; |  | ||||||
|   } |  | ||||||
|   assert(lr!=-1); |  | ||||||
|   Lexicographic::CoorFromIndex(coor,lr,_processors); |  | ||||||
| } | } | ||||||
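Both rank-mapping helpers lean on Lexicographic::IndexFromCoor and CoorFromIndex. A self-contained sketch of that row-major pairing; the class below is a stand-in written for this note, not Grid's source:

    #include <cassert>
    #include <vector>

    struct LexicographicSketch {
      // index = c[0] + dims[0]*(c[1] + dims[1]*(c[2] + ...))
      static void IndexFromCoor(const std::vector<int> &c, int &idx,
                                const std::vector<int> &dims) {
        idx = 0;
        for (int d = (int)dims.size() - 1; d >= 0; d--)
          idx = idx * dims[d] + c[d];
      }
      static void CoorFromIndex(std::vector<int> &c, int idx,
                                const std::vector<int> &dims) {
        c.resize(dims.size());
        for (size_t d = 0; d < dims.size(); d++) {
          c[d] = idx % dims[d];
          idx /= dims[d];
        }
      }
    };

    int main() {
      std::vector<int> dims = {2, 2, 2, 2}, coor = {1, 0, 1, 0}, back;
      int idx;
      LexicographicSketch::IndexFromCoor(coor, idx, dims);
      LexicographicSketch::CoorFromIndex(back, idx, dims);
      assert(back == coor); // the two maps are inverses
      return 0;
    }

Note that the left-hand ProcessorCoorFromRank inverts LexicographicToWorldRank by searching for the matching world rank before calling CoorFromIndex, which is why it differs from the plain right-hand version.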
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| {  | {  | ||||||
|   int ierr; |   int ierr; | ||||||
|  |  | ||||||
|   communicator=communicator_world; |   communicator=communicator_world; | ||||||
|  |  | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
|    |    | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -392,17 +280,19 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|   // Identify subblock of ranks on node spreading across dims |   // Identify subblock of ranks on node spreading across dims | ||||||
|   // in a maximally symmetrical way |   // in a maximally symmetrical way | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int dim = 0; | ||||||
|  |    | ||||||
|   std::vector<int> WorldDims = processors; |   std::vector<int> WorldDims = processors; | ||||||
|  |  | ||||||
|   ShmDims.resize  (_ndimension,1); |   ShmDims.resize(_ndimension,1); | ||||||
|   GroupDims.resize(_ndimension); |   GroupDims.resize(_ndimension); | ||||||
|   ShmCoor.resize  (_ndimension); |      | ||||||
|  |   ShmCoor.resize(_ndimension); | ||||||
|   GroupCoor.resize(_ndimension); |   GroupCoor.resize(_ndimension); | ||||||
|   WorldCoor.resize(_ndimension); |   WorldCoor.resize(_ndimension); | ||||||
|  |  | ||||||
|   int dim = 0; |  | ||||||
|   for(int l2=0;l2<log2size;l2++){ |   for(int l2=0;l2<log2size;l2++){ | ||||||
|     while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%_ndimension; |     while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension; | ||||||
|     ShmDims[dim]*=2; |     ShmDims[dim]*=2; | ||||||
|     dim=(dim+1)%_ndimension; |     dim=(dim+1)%_ndimension; | ||||||
|   } |   } | ||||||
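The loop above spreads the node's rank count (a power of two, log2size factors) across the lattice dimensions as evenly as possible: each iteration doubles ShmDims in the next dimension that can still be split. A standalone sketch with made-up extents:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> WorldDims = {4, 4, 8, 8}; // 1024 ranks total
      int log2size = 3;                          // 8 ranks per node
      int nd = (int)WorldDims.size();

      std::vector<int> ShmDims(nd, 1);
      int dim = 0;
      for (int l2 = 0; l2 < log2size; l2++) {
        // skip dimensions already split down to extent 1
        while (WorldDims[dim] / ShmDims[dim] <= 1) dim = (dim + 1) % nd;
        ShmDims[dim] *= 2;        // take one factor of 2 from this dimension
        dim = (dim + 1) % nd;     // round-robin keeps the split symmetric
      }

      // GroupDims = the inter-node grid left over after the intra-node split.
      std::vector<int> GroupDims(nd);
      for (int d = 0; d < nd; d++) GroupDims[d] = WorldDims[d] / ShmDims[d];

      assert(ShmDims[0] * ShmDims[1] * ShmDims[2] * ShmDims[3] == 8);
      return 0;
    }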
| @@ -414,29 +304,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|     GroupDims[d] = WorldDims[d]/ShmDims[d]; |     GroupDims[d] = WorldDims[d]/ShmDims[d]; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Verbose |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
| #if 0 |  | ||||||
|   std::cout<< GridLogMessage << "MPI-3 usage "<<std::endl; |  | ||||||
|   std::cout<< GridLogMessage << "SHM   "; |  | ||||||
|   for(int d=0;d<_ndimension;d++){ |  | ||||||
|     std::cout<< ShmDims[d] <<" "; |  | ||||||
|   } |  | ||||||
|   std::cout<< std::endl; |  | ||||||
|  |  | ||||||
|   std::cout<< GridLogMessage << "Group "; |  | ||||||
|   for(int d=0;d<_ndimension;d++){ |  | ||||||
|     std::cout<< GroupDims[d] <<" "; |  | ||||||
|   } |  | ||||||
|   std::cout<< std::endl; |  | ||||||
|  |  | ||||||
|   std::cout<< GridLogMessage<<"World "; |  | ||||||
|   for(int d=0;d<_ndimension;d++){ |  | ||||||
|     std::cout<< WorldDims[d] <<" "; |  | ||||||
|   } |  | ||||||
|   std::cout<< std::endl; |  | ||||||
| #endif |  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   // Check processor counts match |   // Check processor counts match | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
| @@ -450,57 +317,29 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|        |        | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   // Establish mapping between lexico physics coord and WorldRank |   // Establish mapping between lexico physics coord and WorldRank | ||||||
|  |   //  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|  |   LexicographicToWorldRank.resize(WorldSize,0); | ||||||
|   Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims); |   Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims); | ||||||
|   Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims); |   Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims); | ||||||
|   for(int d=0;d<_ndimension;d++){ |   for(int d=0;d<_ndimension;d++){ | ||||||
|     WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d]; |     WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d]; | ||||||
|   } |   } | ||||||
|   _processor_coor = WorldCoor; |   _processor_coor = WorldCoor; | ||||||
|   _processor      = WorldRank; |  | ||||||
|  |   int lexico; | ||||||
|  |   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); | ||||||
|  |   LexicographicToWorldRank[lexico]=WorldRank; | ||||||
|  |   _processor = lexico; | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
|   // global sum Lexico to World mapping |   // global sum Lexico to World mapping | ||||||
|   /////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////// | ||||||
|   int lexico; |  | ||||||
|   LexicographicToWorldRank.resize(WorldSize,0); |  | ||||||
|   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); |  | ||||||
|   LexicographicToWorldRank[lexico] = WorldRank; |  | ||||||
|   ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator); |   ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|    |    | ||||||
|   for(int i=0;i<WorldSize;i++){ |  | ||||||
|  |  | ||||||
|     int wr = LexicographicToWorldRank[i]; |  | ||||||
|     //    int wr = i; |  | ||||||
|  |  | ||||||
|     std::vector<int> coor(_ndimension); |  | ||||||
|     ProcessorCoorFromRank(wr,coor); // from world rank |  | ||||||
|     int ck = RankFromProcessorCoor(coor); |  | ||||||
|     assert(ck==wr); |  | ||||||
|  |  | ||||||
|     if ( wr == WorldRank ) {  |  | ||||||
|       for(int j=0;j<coor.size();j++) { |  | ||||||
| 	assert(coor[j] == _processor_coor[j]); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     /* |  | ||||||
|     std::cout << GridLogMessage<< " Lexicographic "<<i; |  | ||||||
|     std::cout << " MPI rank      "<<wr; |  | ||||||
|     std::cout << " Coor          "; |  | ||||||
|     for(int j=0;j<coor.size();j++) std::cout << coor[j]; |  | ||||||
|     std::cout<< std::endl; |  | ||||||
|     */ |  | ||||||
|     ///////////////////////////////////////////////////// |  | ||||||
|     // Check everyone agrees on everyone elses coords |  | ||||||
|     ///////////////////////////////////////////////////// |  | ||||||
|     std::vector<int> mcoor = coor; |  | ||||||
|     this->Broadcast(0,(void *)&mcoor[0],mcoor.size()*sizeof(int)); |  | ||||||
|     for(int d = 0 ; d< _ndimension; d++) { |  | ||||||
|       assert(coor[d] == mcoor[d]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| }; | }; | ||||||
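The LexicographicToWorldRank table is assembled collectively: every rank zero-initialises the table, writes its own WorldRank into the slot addressed by its lexicographic coordinate, and a summing Allreduce merges the single-entry contributions (the left-hand column then cross-checks the map on every rank). A non-MPI simulation of the idiom, with an invented rank assignment:

    #include <cassert>
    #include <vector>

    int main() {
      const int WorldSize = 4;
      // Pretend world ranks were assigned in reverse lexicographic order.
      std::vector<int> lexico_of_rank = {3, 2, 1, 0};

      // Each rank's local table: zeros except its own entry.
      std::vector<std::vector<int>> local(WorldSize,
                                          std::vector<int>(WorldSize, 0));
      for (int wr = 0; wr < WorldSize; wr++)
        local[wr][lexico_of_rank[wr]] = wr;

      // MPI_Allreduce(MPI_IN_PLACE, ..., MPI_SUM) is an element-wise sum;
      // each slot is touched by exactly one rank, so the sum is the entry.
      std::vector<int> LexicographicToWorldRank(WorldSize, 0);
      for (int wr = 0; wr < WorldSize; wr++)
        for (int l = 0; l < WorldSize; l++)
          LexicographicToWorldRank[l] += local[wr][l];

      assert(LexicographicToWorldRank[0] == 3); // lexico 0 is world rank 3
      return 0;
    }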
|  |  | ||||||
| void CartesianCommunicator::GlobalSum(uint32_t &u){ | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| @@ -528,6 +367,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N) | |||||||
|   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| // Basic Halo comms primitive | // Basic Halo comms primitive | ||||||
| void CartesianCommunicator::SendToRecvFrom(void *xmit, | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
| 					   int dest, | 					   int dest, | ||||||
| @@ -536,14 +377,10 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, | |||||||
| 					   int bytes) | 					   int bytes) | ||||||
| { | { | ||||||
|   std::vector<CommsRequest_t> reqs(0); |   std::vector<CommsRequest_t> reqs(0); | ||||||
|   //    unsigned long  xcrc = crc32(0L, Z_NULL, 0); |  | ||||||
|   //    unsigned long  rcrc = crc32(0L, Z_NULL, 0); |  | ||||||
|   //    xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); |  | ||||||
|   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|   SendToRecvFromComplete(reqs); |   SendToRecvFromComplete(reqs); | ||||||
|   //    rcrc = crc32(rcrc,(unsigned char *)recv,bytes); |  | ||||||
|   //    printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::SendRecvPacket(void *xmit, | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
| 					   void *recv, | 					   void *recv, | ||||||
| 					   int sender, | 					   int sender, | ||||||
| @@ -560,6 +397,7 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | |||||||
|     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| // Basic Halo comms primitive | // Basic Halo comms primitive | ||||||
| void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 						void *xmit, | 						void *xmit, | ||||||
| @@ -568,29 +406,95 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
| 						int from, | 						int from, | ||||||
| 						int bytes) | 						int bytes) | ||||||
| { | { | ||||||
|   int myrank = _processor; | #if 0 | ||||||
|   int ierr; |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  |  | ||||||
|   MPI_Request xrq; |   MPI_Request xrq; | ||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|    |    | ||||||
|     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |   static int sequence; | ||||||
|     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |  | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |   int tag; | ||||||
|  |   int check; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|  |   int gdest = GroupRanks[dest]; | ||||||
|  |   int gfrom = GroupRanks[from]; | ||||||
|  |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|  |   sequence++; | ||||||
|  |    | ||||||
|  |   char *from_ptr = (char *)ShmCommBufs[ShmRank]; | ||||||
|  |  | ||||||
|  |   int small = (bytes<MAX_MPI_SHM_BYTES); | ||||||
|  |  | ||||||
|  |   typedef uint64_t T; | ||||||
|  |   int words = bytes/sizeof(T); | ||||||
|  |  | ||||||
|  |   assert(((size_t)bytes &(sizeof(T)-1))==0); | ||||||
|  |   assert(gme == ShmRank); | ||||||
|  |  | ||||||
|  |   if ( small && (gdest !=MPI_UNDEFINED) ) { | ||||||
|  |  | ||||||
|  |     char *to_ptr   = (char *)ShmCommBufs[gdest]; | ||||||
|  |  | ||||||
|  |     assert(gme != gdest); | ||||||
|  |  | ||||||
|  |     T *ip = (T *)xmit; | ||||||
|  |     T *op = (T *)to_ptr; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     bcopy(&_processor,&to_ptr[bytes],sizeof(_processor)); | ||||||
|  |     bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence)); | ||||||
|  |   } else {  | ||||||
|  |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(xrq); |     list.push_back(xrq); | ||||||
|     list.push_back(rrq); |  | ||||||
|   } else {  |  | ||||||
|     // Give the CPU to MPI immediately; can use threads to overlap optionally |  | ||||||
|     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, |  | ||||||
| 		      recv,bytes,MPI_CHAR,from, from, |  | ||||||
| 		      communicator,MPI_STATUS_IGNORE); |  | ||||||
|     assert(ierr==0); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |    | ||||||
|  |   if (small && (gfrom !=MPI_UNDEFINED) ) { | ||||||
|  |     T *ip = (T *)from_ptr; | ||||||
|  |     T *op = (T *)recv; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |     bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag)); | ||||||
|  |     bcopy(&from_ptr[bytes+4],&check,sizeof(check)); | ||||||
|  |     assert(check==sequence); | ||||||
|  |     assert(tag==from); | ||||||
|  |   } else {  | ||||||
|  |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |   int rank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |    | ||||||
|  |   assert(ierr==0); | ||||||
|  |  | ||||||
|  |   list.push_back(xrq); | ||||||
|  |   list.push_back(rrq); | ||||||
|  | #endif | ||||||
| } | } | ||||||
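Under #if 0 the left column keeps an experimental fast path: when the peer lives in the same shared-memory group and the message fits the shm region, the payload is copied word-by-word straight into the destination rank's buffer, with the sender's rank and a sequence number appended as a trailer that the receiver validates after a barrier. A compact single-process sketch of that trailer handshake; buffer names and sizes are illustrative:

    #include <cassert>
    #include <cstring>
    #include <vector>

    int main() {
      const int bytes = 64, my_rank = 3;
      int sequence = 0;
      std::vector<char> xmit(bytes, 'x'), shm(bytes + 8), recv(bytes);

      // sender side: copy payload, then append a (rank, sequence) trailer
      sequence++;
      memcpy(&shm[0], &xmit[0], bytes);
      memcpy(&shm[bytes],     &my_rank,  sizeof(int));
      memcpy(&shm[bytes + 4], &sequence, sizeof(int));

      // ... StencilBarrier() would order these stores against peer loads ...

      // receiver side: copy payload out, then check the trailer
      int tag, check;
      memcpy(&recv[0], &shm[0], bytes);
      memcpy(&tag,   &shm[bytes],     sizeof(int));
      memcpy(&check, &shm[bytes + 4], sizeof(int));
      assert(tag == my_rank);     // really came from the expected peer
      assert(check == sequence);  // from this exchange, not a stale one
      return 0;
    }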
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 						       void *xmit, | 						       void *xmit, | ||||||
| 						       int dest, | 						       int dest, | ||||||
| 						       void *recv, | 						       void *recv, | ||||||
| @@ -601,63 +505,57 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|  |  | ||||||
|   int ierr; |   int ierr; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|   int gdest = GroupRanks[dest]; |   int gdest = GroupRanks[dest]; | ||||||
|   int gfrom = GroupRanks[from]; |   int gfrom = GroupRanks[from]; | ||||||
|   int gme   = GroupRanks[_processor]; |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|   assert(dest != _processor); |  | ||||||
|   assert(from != _processor); |  | ||||||
|   assert(gme == ShmRank); |   assert(gme == ShmRank); | ||||||
|   double off_node_bytes=0.0; |  | ||||||
|  |  | ||||||
| #ifdef FORCE_COMMS |  | ||||||
|   gdest = MPI_UNDEFINED; |  | ||||||
|   gfrom = MPI_UNDEFINED; |  | ||||||
| #endif |  | ||||||
|   if ( gfrom ==MPI_UNDEFINED) { |  | ||||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |  | ||||||
|     assert(ierr==0); |  | ||||||
|     list.push_back(rrq); |  | ||||||
|     off_node_bytes+=bytes; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( gdest == MPI_UNDEFINED ) { |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(xrq); |     list.push_back(xrq); | ||||||
|     off_node_bytes+=bytes; |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|     this->StencilSendToRecvFromComplete(list); |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return off_node_bytes; |  | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) |  | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(list); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void) | void CartesianCommunicator::StencilBarrier(void) | ||||||
| { | { | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
|   MPI_Barrier  (ShmComm); |   MPI_Barrier  (ShmComm); | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
| } | } | ||||||
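The right-hand StencilBarrier brackets the node barrier with MPI_Win_sync: for a passive-target shared window the first sync publishes this rank's stores, the barrier orders all ranks, and the second sync makes the peers' stores visible before any subsequent load. A hedged sketch of one write/barrier/read round on such a window; the neighbour-write pattern is invented for the example:

    #include <mpi.h>
    #include <cassert>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      MPI_Comm shm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                          MPI_INFO_NULL, &shm);
      int rank, size;
      MPI_Comm_rank(shm, &rank);
      MPI_Comm_size(shm, &size);

      int *base;
      MPI_Win win;
      MPI_Win_allocate_shared(sizeof(int), sizeof(int), MPI_INFO_NULL, shm,
                              &base, &win);
      MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

      // store my rank into the next rank's segment
      MPI_Aint sz; int du; int *peer;
      MPI_Win_shared_query(win, (rank + 1) % size, &sz, &du, &peer);
      *peer = rank;

      MPI_Win_sync(win);     // publish my store
      MPI_Barrier(shm);      // everyone has written
      MPI_Win_sync(win);     // see everyone else's stores

      assert(*base == (rank - 1 + size) % size);

      MPI_Win_unlock_all(win);
      MPI_Win_free(&win);
      MPI_Finalize();
      return 0;
    }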
|  |  | ||||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |  | ||||||
|  |  | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|   list.resize(0); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::Barrier(void) | void CartesianCommunicator::Barrier(void) | ||||||
| { | { | ||||||
|   int ierr = MPI_Barrier(communicator); |   int ierr = MPI_Barrier(communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
| { | { | ||||||
|   int ierr=MPI_Bcast(data, |   int ierr=MPI_Bcast(data, | ||||||
| @@ -667,11 +565,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | |||||||
| 		     communicator); | 		     communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
| int CartesianCommunicator::RankWorld(void){  |  | ||||||
|   int r;  |  | ||||||
|   MPI_Comm_rank(communicator_world,&r); |  | ||||||
|   return r; |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
| { | { | ||||||
|   int ierr= MPI_Bcast(data, |   int ierr= MPI_Bcast(data, | ||||||
|   | |||||||
| @@ -27,7 +27,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include "Grid.h" | #include "Grid.h" | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| //#include <numaif.h> |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| /// Workarounds: | /// Workarounds: | ||||||
| @@ -43,27 +42,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <fcntl.h> | #include <fcntl.h> | ||||||
| #include <unistd.h> | #include <unistd.h> | ||||||
| #include <limits.h> | #include <limits.h> | ||||||
|  |  | ||||||
| typedef sem_t *Grid_semaphore; | typedef sem_t *Grid_semaphore; | ||||||
|  |  | ||||||
|  |  | ||||||
| #error  /*This is deprecated*/ |  | ||||||
|  |  | ||||||
| #if 0  |  | ||||||
| #define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED ); | #define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED ); | ||||||
| #define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED ); | #define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED ); | ||||||
| #define SEM_POST(S) assert ( sem_post(S) == 0 );  | #define SEM_POST(S) assert ( sem_post(S) == 0 );  | ||||||
| #define SEM_WAIT(S) assert ( sem_wait(S) == 0 ); | #define SEM_WAIT(S) assert ( sem_wait(S) == 0 ); | ||||||
| #else |  | ||||||
| #define SEM_INIT(S)      ; |  | ||||||
| #define SEM_INIT_EXCL(S) ; |  | ||||||
| #define SEM_POST(S) ; |  | ||||||
| #define SEM_WAIT(S) ; |  | ||||||
| #endif |  | ||||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL, COMMAND_SENDRECV }; | enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL }; | ||||||
|  |  | ||||||
| struct Descriptor { | struct Descriptor { | ||||||
|   uint64_t buf; |   uint64_t buf; | ||||||
| @@ -71,12 +62,6 @@ struct Descriptor { | |||||||
|   int rank; |   int rank; | ||||||
|   int tag; |   int tag; | ||||||
|   int command; |   int command; | ||||||
|   uint64_t xbuf; |  | ||||||
|   uint64_t rbuf; |  | ||||||
|   int xtag; |  | ||||||
|   int rtag; |  | ||||||
|   int src; |  | ||||||
|   int dest; |  | ||||||
|   MPI_Request request; |   MPI_Request request; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -109,14 +94,18 @@ public: | |||||||
|  |  | ||||||
|   void SemInit(void) { |   void SemInit(void) { | ||||||
|     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); |     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); | ||||||
|  |     //    printf("SEM_NAME: %s \n",sem_name); | ||||||
|     SEM_INIT(sem_head); |     SEM_INIT(sem_head); | ||||||
|     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); |     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); | ||||||
|  |     //    printf("SEM_NAME: %s \n",sem_name); | ||||||
|     SEM_INIT(sem_tail); |     SEM_INIT(sem_tail); | ||||||
|   }   |   }   | ||||||
|   void SemInitExcl(void) { |   void SemInitExcl(void) { | ||||||
|     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); |     sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank); | ||||||
|  |     //    printf("SEM_INIT_EXCL: %s \n",sem_name); | ||||||
|     SEM_INIT_EXCL(sem_head); |     SEM_INIT_EXCL(sem_head); | ||||||
|     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); |     sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank); | ||||||
|  |     //    printf("SEM_INIT_EXCL: %s \n",sem_name); | ||||||
|     SEM_INIT_EXCL(sem_tail); |     SEM_INIT_EXCL(sem_tail); | ||||||
|   }   |   }   | ||||||
|   void WakeUpDMA(void) {  |   void WakeUpDMA(void) {  | ||||||
| @@ -136,13 +125,6 @@ public: | |||||||
|     while(1){ |     while(1){ | ||||||
|       WaitForCommand(); |       WaitForCommand(); | ||||||
|       //      std::cout << "Getting command "<<std::endl; |       //      std::cout << "Getting command "<<std::endl; | ||||||
| #if 0 |  | ||||||
|       _mm_monitor((void *)&state->head,0,0); |  | ||||||
|       int s=state->start; |  | ||||||
|       if ( s != state->head ) { |  | ||||||
| 	_mm_mwait(0,0); |  | ||||||
|       } |  | ||||||
| #endif |  | ||||||
|       Event(); |       Event(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -150,7 +132,6 @@ public: | |||||||
|   int Event (void) ; |   int Event (void) ; | ||||||
|  |  | ||||||
|   uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ; |   uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ; | ||||||
|   void QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) ; |  | ||||||
|  |  | ||||||
|   void WaitAll() { |   void WaitAll() { | ||||||
|     //    std::cout << "Queueing WAIT command  "<<std::endl; |     //    std::cout << "Queueing WAIT command  "<<std::endl; | ||||||
| @@ -160,7 +141,7 @@ public: | |||||||
|     //    std::cout << "Waiting from semaphore "<<std::endl; |     //    std::cout << "Waiting from semaphore "<<std::endl; | ||||||
|     WaitForComplete(); |     WaitForComplete(); | ||||||
|     //    std::cout << "Checking FIFO is empty "<<std::endl; |     //    std::cout << "Checking FIFO is empty "<<std::endl; | ||||||
|     while ( state->tail != state->head ); |     assert ( state->tail == state->head ); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
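The Slave FIFO is a single-producer/single-consumer ring: the host advances head as it queues descriptors, the slave advances start as it posts them to MPI and tail as they complete, and WaitAll drains until tail catches head (the left column spins on that condition, the right asserts it after the semaphore handshake). A pared-down sketch of the ring indices, without the MPI calls and semaphores:

    #include <cassert>

    const int Ndesc = 8;
    #define PERI_PLUS(n) (((n) + 1) % Ndesc)

    struct State {
      int head  = 0;  // producer: next free slot
      int start = 0;  // consumer: next descriptor to issue
      int tail  = 0;  // consumer: next descriptor to complete
      int Descrs[Ndesc];
    };

    int main() {
      State s;
      // producer queues three commands
      for (int i = 0; i < 3; i++) {
        s.Descrs[s.head] = i;
        s.head = PERI_PLUS(s.head);
      }
      // consumer issues them ...
      while (s.start != s.head) s.start = PERI_PLUS(s.start);
      // ... and completes them; WaitAll needs tail to catch up with head
      while (s.tail != s.start) s.tail = PERI_PLUS(s.tail);
      assert(s.tail == s.head); // the invariant WaitAll checks
      return 0;
    }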
|  |  | ||||||
| @@ -215,12 +196,6 @@ public: | |||||||
|     //    std::cout << "Waking up DMA "<< slave<<std::endl; |     //    std::cout << "Waking up DMA "<< slave<<std::endl; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   static void QueueSendRecv(int slave,void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)  |  | ||||||
|   { |  | ||||||
|     Slaves[slave].QueueSendRecv(xbuf,rbuf,bytes,xtag,rtag,comm,dest,src); |  | ||||||
|     Slaves[slave].WakeUpDMA(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) { |   static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) { | ||||||
|     //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl; |     //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl; | ||||||
|     Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank); |     Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank); | ||||||
| @@ -251,28 +226,6 @@ public: | |||||||
|     return; |     return; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   static void QueueRoundRobinSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) { |  | ||||||
|     uint8_t * cxbuf = (uint8_t *) xbuf; |  | ||||||
|     uint8_t * crbuf = (uint8_t *) rbuf; |  | ||||||
|     static int rrp=0; |  | ||||||
|     int procs = VerticalSize-1; |  | ||||||
|     int myoff=0; |  | ||||||
|     int mywork=bytes; |  | ||||||
|     QueueSendRecv(rrp+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src); |  | ||||||
|     rrp = rrp+1; |  | ||||||
|     if ( rrp == (VerticalSize-1) ) rrp = 0; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   static void QueueMultiplexedSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src) { |  | ||||||
|     uint8_t * cxbuf = (uint8_t *) xbuf; |  | ||||||
|     uint8_t * crbuf = (uint8_t *) rbuf; |  | ||||||
|     int mywork, myoff, procs; |  | ||||||
|     procs = VerticalSize-1; |  | ||||||
|     for(int s=0;s<procs;s++) { |  | ||||||
|       GetWork(bytes,s,mywork,myoff,procs); |  | ||||||
|       QueueSendRecv(s+1,&cxbuf[myoff],&crbuf[myoff],mywork,xtag,rtag,comm,dest,src); |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|   static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) { |   static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) { | ||||||
|     uint8_t * cbuf = (uint8_t *) buf; |     uint8_t * cbuf = (uint8_t *) buf; | ||||||
|     int mywork, myoff, procs; |     int mywork, myoff, procs; | ||||||
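QueueMultiplexedSend slices one message evenly across the slave processes; GetWork is the usual block-partition helper returning each slave's byte count and offset. A stand-in implementation of that split, with names mirroring the call site; the body is an assumption for illustration, not Grid's source:

    #include <cassert>

    // Even block partition of nwork items over nproc workers; worker `me`
    // gets `mywork` items starting at `myoff`, remainders on the low ranks.
    void GetWorkSketch(int nwork, int me, int &mywork, int &myoff, int nproc) {
      int basework = nwork / nproc;
      int backfill = nwork % nproc;
      mywork = basework + (me < backfill ? 1 : 0);
      myoff  = basework * me + (me < backfill ? me : backfill);
    }

    int main() {
      int total = 0, mywork, myoff, procs = 3;
      for (int s = 0; s < procs; s++) {
        GetWorkSketch(10, s, mywork, myoff, procs);
        assert(myoff == total); // slices are contiguous, non-overlapping
        total += mywork;
      }
      assert(total == 10);
      return 0;
    }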
| @@ -322,7 +275,6 @@ std::vector<void *>            MPIoffloadEngine::VerticalShmBufs; | |||||||
| std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks; | std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks; | ||||||
| std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks;  | std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks;  | ||||||
|  |  | ||||||
| int CartesianCommunicator::NodeCount(void)    { return HorizontalSize;}; |  | ||||||
| int MPIoffloadEngine::ShmSetup = 0; | int MPIoffloadEngine::ShmSetup = 0; | ||||||
|  |  | ||||||
| void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | ||||||
| @@ -418,22 +370,12 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | |||||||
|       ftruncate(fd, size); |       ftruncate(fd, size); | ||||||
|  |  | ||||||
|       VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |       VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); | ||||||
|  |  | ||||||
|       if ( VerticalShmBufs[r] == MAP_FAILED ) {  |       if ( VerticalShmBufs[r] == MAP_FAILED ) {  | ||||||
| 	perror("failed mmap"); | 	perror("failed mmap"); | ||||||
| 	assert(0); | 	assert(0); | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       /* |  | ||||||
|       for(uint64_t page=0;page<size;page+=4096){ |  | ||||||
| 	void *pages = (void *) ( page + (uint64_t)VerticalShmBufs[r] ); |  | ||||||
| 	int status; |  | ||||||
| 	int flags=MPOL_MF_MOVE_ALL; |  | ||||||
| 	int nodes=1; // numa domain == MCDRAM |  | ||||||
| 	unsigned long count=1; |  | ||||||
| 	ierr= move_pages(0,count, &pages,&nodes,&status,flags); |  | ||||||
| 	if (ierr && (page==0)) perror("numa relocate command failed"); |  | ||||||
|       } |  | ||||||
|       */ |  | ||||||
|       uint64_t * check = (uint64_t *) VerticalShmBufs[r]; |       uint64_t * check = (uint64_t *) VerticalShmBufs[r]; | ||||||
|       check[0] = WorldRank; |       check[0] = WorldRank; | ||||||
|       check[1] = r; |       check[1] = r; | ||||||
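The vertical buffers use the classic POSIX split: rank 0 creates and sizes each region with shm_open(O_CREAT) and ftruncate, every rank then mmaps it, and the first words are stamped and cross-checked as the surrounding hunks show. A minimal one-process sketch of create, map, and stamp; the segment name is invented:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main() {
      const char *shm_name = "/Grid_sketch_shm_0";
      size_t size = 1 << 20;

      shm_unlink(shm_name);                       // clear stale segments
      int fd = shm_open(shm_name, O_RDWR | O_CREAT, 0666);
      if (fd < 0) { perror("shm_open"); return 1; }
      if (ftruncate(fd, size) != 0) { perror("ftruncate"); return 1; }

      void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if (ptr == MAP_FAILED) { perror("mmap"); return 1; }

      uint64_t *check = (uint64_t *)ptr;          // stamp, then verify
      check[0] = 42;
      check[1] = 0;
      assert(check[0] == 42);

      munmap(ptr, size);
      close(fd);
      shm_unlink(shm_name);
      return 0;
    }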
| @@ -462,7 +404,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world, | |||||||
|     uint64_t * check = (uint64_t *) VerticalShmBufs[r]; |     uint64_t * check = (uint64_t *) VerticalShmBufs[r]; | ||||||
|     assert(check[0]== WorldRank); |     assert(check[0]== WorldRank); | ||||||
|     assert(check[1]== r); |     assert(check[1]== r); | ||||||
|     //    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl; |     std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl; | ||||||
|   } |   } | ||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
| @@ -600,8 +542,6 @@ int Slave::Event (void) { | |||||||
|   static int head_last; |   static int head_last; | ||||||
|   static int start_last; |   static int start_last; | ||||||
|   int ierr; |   int ierr; | ||||||
|   MPI_Status stat; |  | ||||||
|   static int i=0; |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////// |   //////////////////////////////////////////////////// | ||||||
|   // Try to advance the start pointers |   // Try to advance the start pointers | ||||||
| @@ -610,6 +550,11 @@ int Slave::Event (void) { | |||||||
|   if ( s != state->head ) { |   if ( s != state->head ) { | ||||||
|     switch ( state->Descrs[s].command ) { |     switch ( state->Descrs[s].command ) { | ||||||
|     case COMMAND_ISEND: |     case COMMAND_ISEND: | ||||||
|  |       /* | ||||||
|  |             std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]" | ||||||
|  |       	       << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag | ||||||
|  |        << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl; | ||||||
|  |       */ | ||||||
|       ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),  |       ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),  | ||||||
| 		       state->Descrs[s].bytes,  | 		       state->Descrs[s].bytes,  | ||||||
| 		       MPI_CHAR, | 		       MPI_CHAR, | ||||||
| @@ -623,6 +568,11 @@ int Slave::Event (void) { | |||||||
|       break; |       break; | ||||||
|  |  | ||||||
|     case COMMAND_IRECV: |     case COMMAND_IRECV: | ||||||
|  |       /* | ||||||
|  |       std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]" | ||||||
|  | 	       << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag | ||||||
|  | 	       << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl; | ||||||
|  |       */ | ||||||
|       ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),  |       ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),  | ||||||
| 		     state->Descrs[s].bytes,  | 		     state->Descrs[s].bytes,  | ||||||
| 		     MPI_CHAR, | 		     MPI_CHAR, | ||||||
| @@ -638,32 +588,10 @@ int Slave::Event (void) { | |||||||
|       return 1; |       return 1; | ||||||
|       break; |       break; | ||||||
|  |  | ||||||
|     case COMMAND_SENDRECV: |  | ||||||
|  |  | ||||||
|       //      fprintf(stderr,"Sendrecv ->%d %d : <-%d %d \n",state->Descrs[s].dest, state->Descrs[s].xtag+i*10,state->Descrs[s].src, state->Descrs[s].rtag+i*10); |  | ||||||
|  |  | ||||||
|       ierr=MPI_Sendrecv((void *)(state->Descrs[s].xbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].dest, state->Descrs[s].xtag+i*10, |  | ||||||
| 			(void *)(state->Descrs[s].rbuf+base), state->Descrs[s].bytes, MPI_CHAR, state->Descrs[s].src , state->Descrs[s].rtag+i*10, |  | ||||||
| 			MPIoffloadEngine::communicator_universe,MPI_STATUS_IGNORE); |  | ||||||
|  |  | ||||||
|       assert(ierr==0); |  | ||||||
|  |  | ||||||
|       //      fprintf(stderr,"Sendrecv done %d %d\n",ierr,i); |  | ||||||
|       //      MPI_Barrier(MPIoffloadEngine::HorizontalComm); |  | ||||||
|       //      fprintf(stderr,"Barrier\n"); |  | ||||||
|       i++; |  | ||||||
|  |  | ||||||
|       state->start = PERI_PLUS(s); |  | ||||||
|  |  | ||||||
|       return 1; |  | ||||||
|       break; |  | ||||||
|  |  | ||||||
|     case COMMAND_WAITALL: |     case COMMAND_WAITALL: | ||||||
|  |  | ||||||
|       for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){ |       for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){ | ||||||
| 	if ( state->Descrs[t].command != COMMAND_SENDRECV ) { |  | ||||||
| 	MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE); | 	MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE); | ||||||
| 	} |  | ||||||
|       }; |       }; | ||||||
|       s=PERI_PLUS(s); |       s=PERI_PLUS(s); | ||||||
|       state->start = s; |       state->start = s; | ||||||
| @@ -685,45 +613,6 @@ int Slave::Event (void) { | |||||||
|   // External interaction with the queue |   // External interaction with the queue | ||||||
|   ////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////// | ||||||
|    |    | ||||||
| void Slave::QueueSendRecv(void *xbuf, void *rbuf, int bytes, int xtag, int rtag, MPI_Comm comm,int dest,int src)  |  | ||||||
| { |  | ||||||
|   int head =state->head; |  | ||||||
|   int next = PERI_PLUS(head); |  | ||||||
|    |  | ||||||
|   // Set up descriptor |  | ||||||
|   int worldrank; |  | ||||||
|   int hashtag; |  | ||||||
|   MPI_Comm    communicator; |  | ||||||
|   MPI_Request request; |  | ||||||
|   uint64_t relative; |  | ||||||
|    |  | ||||||
|   relative = (uint64_t)xbuf - base; |  | ||||||
|   state->Descrs[head].xbuf    = relative; |  | ||||||
|    |  | ||||||
|   relative= (uint64_t)rbuf - base; |  | ||||||
|   state->Descrs[head].rbuf    = relative; |  | ||||||
|    |  | ||||||
|   state->Descrs[head].bytes  = bytes; |  | ||||||
|    |  | ||||||
|   MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,xtag,comm,dest); |  | ||||||
|   state->Descrs[head].dest   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]; |  | ||||||
|   state->Descrs[head].xtag    = hashtag; |  | ||||||
|    |  | ||||||
|   MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,rtag,comm,src); |  | ||||||
|   state->Descrs[head].src    = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]; |  | ||||||
|   state->Descrs[head].rtag    = hashtag; |  | ||||||
|    |  | ||||||
|   state->Descrs[head].command= COMMAND_SENDRECV; |  | ||||||
|    |  | ||||||
|   // Block until FIFO has space |  | ||||||
|   while( state->tail==next ); |  | ||||||
|    |  | ||||||
|   // Msync on weak order architectures |  | ||||||
|    |  | ||||||
|   // Advance pointer |  | ||||||
|   state->head = next; |  | ||||||
|    |  | ||||||
| }; |  | ||||||
| uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)  | uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)  | ||||||
| { | { | ||||||
|   ///////////////////////////////////////// |   ///////////////////////////////////////// | ||||||
| @@ -923,22 +812,19 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_ | |||||||
|   assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) ); |   assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) ); | ||||||
|   assert(from!=_processor); |   assert(from!=_processor); | ||||||
|   assert(dest!=_processor); |   assert(dest!=_processor); | ||||||
|  |   MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest); | ||||||
|   MPIoffloadEngine::QueueMultiplexedSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from); |   MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from); | ||||||
|  |  | ||||||
|   //MPIoffloadEngine::QueueRoundRobinSendRecv(xmit,recv,bytes,_processor,from,communicator,dest,from); |  | ||||||
|  |  | ||||||
|   //MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest); |  | ||||||
|   //MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   MPIoffloadEngine::WaitAll(); |   MPIoffloadEngine::WaitAll(); | ||||||
|   //this->Barrier(); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void) { } | void CartesianCommunicator::StencilBarrier(void) | ||||||
|  | { | ||||||
|  | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   | |||||||
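The queue that `Slave::Event` drains above is a single-producer, single-consumer ring held in shared memory: the compute rank advances `head` as it enqueues descriptors, the slave advances `start` as it issues the MPI calls, and `tail` moves when requests complete, with `PERI_PLUS` as the circular increment. A minimal sketch of the producer side, under assumed names (`RingState`, `peri_plus`, `RING_SIZE` are illustrative, not the Grid types):

    #include <atomic>
    #include <cstdint>

    const int RING_SIZE = 64;                       // assumed capacity

    struct Descriptor { uint64_t buf; uint32_t bytes; int command; };

    struct RingState {                              // lives in shared memory
      volatile int head;                            // producer: next free slot
      volatile int start;                           // consumer: next command to issue
      volatile int tail;                            // consumer: oldest incomplete slot
      Descriptor Descrs[RING_SIZE];
    };

    inline int peri_plus(int i) { return (i + 1) % RING_SIZE; }

    void enqueue(RingState *state, const Descriptor &d) {
      int head = state->head;
      int next = peri_plus(head);
      while (state->tail == next) ;                 // block until the FIFO has space
      state->Descrs[head] = d;                      // fill the descriptor in place
      std::atomic_thread_fence(std::memory_order_release); // the "msync" the comment above asks for
      state->head = next;                           // publish; the slave may now issue it
    }

The same structure explains the `COMMAND_WAITALL` case above: the slave walks every descriptor from `tail` up to the wait marker and calls `MPI_Wait` on each stored request.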
| @@ -25,8 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -88,7 +87,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| @@ -99,7 +97,7 @@ void CartesianCommunicator::Barrier(void){} | |||||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | ||||||
| int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | ||||||
| void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  coor = _processor_coor; } | void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor ;} | ||||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
| { | { | ||||||
|   source =0; |   source =0; | ||||||
|   | |||||||
| @@ -27,7 +27,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/Grid.h> | #include <Grid/Grid.h> | ||||||
| #include <mpp/shmem.h> | #include <mpp/shmem.h> | ||||||
| #include <array> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -52,7 +51,7 @@ typedef struct HandShake_t { | |||||||
| } HandShake; | } HandShake; | ||||||
|  |  | ||||||
| std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) { | std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) { | ||||||
|   std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret; |   array<long,_SHMEM_REDUCE_SYNC_SIZE> ret; | ||||||
|   ret.fill(SHMEM_SYNC_VALUE); |   ret.fill(SHMEM_SYNC_VALUE); | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| @@ -110,7 +109,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){ | |||||||
|  |  | ||||||
|   source = u; |   source = u; | ||||||
|   dest   = 0; |   dest   = 0; | ||||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|   shmem_barrier_all(); // necessary? |   shmem_barrier_all(); // necessary? | ||||||
|   u = dest; |   u = dest; | ||||||
| } | } | ||||||
| @@ -126,7 +125,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){ | |||||||
|  |  | ||||||
|   source = u; |   source = u; | ||||||
|   dest   = 0; |   dest   = 0; | ||||||
|   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |   shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|   shmem_barrier_all(); // necessary? |   shmem_barrier_all(); // necessary? | ||||||
|   u = dest; |   u = dest; | ||||||
| } | } | ||||||
| @@ -138,8 +137,7 @@ void CartesianCommunicator::GlobalSum(float &f){ | |||||||
|  |  | ||||||
|   source = f; |   source = f; | ||||||
|   dest   =0.0; |   dest   =0.0; | ||||||
|   shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |   shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|   shmem_barrier_all(); |  | ||||||
|   f = dest; |   f = dest; | ||||||
| } | } | ||||||
| void CartesianCommunicator::GlobalSumVector(float *f,int N) | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
| @@ -150,16 +148,14 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N) | |||||||
|   static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init; |   static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init; | ||||||
|  |  | ||||||
|   if ( shmem_addr_accessible(f,_processor)  ){ |   if ( shmem_addr_accessible(f,_processor)  ){ | ||||||
|     shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync.data()); |     shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync); | ||||||
|     shmem_barrier_all(); |  | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for(int i=0;i<N;i++){ |   for(int i=0;i<N;i++){ | ||||||
|     dest   =0.0; |     dest   =0.0; | ||||||
|     source = f[i]; |     source = f[i]; | ||||||
|     shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |     shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|     shmem_barrier_all(); |  | ||||||
|     f[i] = dest; |     f[i] = dest; | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -172,8 +168,7 @@ void CartesianCommunicator::GlobalSum(double &d) | |||||||
|  |  | ||||||
|   source = d; |   source = d; | ||||||
|   dest   = 0; |   dest   = 0; | ||||||
|   shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |   shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|   shmem_barrier_all(); |  | ||||||
|   d = dest; |   d = dest; | ||||||
| } | } | ||||||
| void CartesianCommunicator::GlobalSumVector(double *d,int N) | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
| @@ -185,16 +180,14 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N) | |||||||
|  |  | ||||||
|  |  | ||||||
|   if ( shmem_addr_accessible(d,_processor)  ){ |   if ( shmem_addr_accessible(d,_processor)  ){ | ||||||
|     shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync.data()); |     shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync); | ||||||
|     shmem_barrier_all(); |  | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for(int i=0;i<N;i++){ |   for(int i=0;i<N;i++){ | ||||||
|     source = d[i]; |     source = d[i]; | ||||||
|     dest   =0.0; |     dest   =0.0; | ||||||
|     shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync.data()); |     shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync); | ||||||
|     shmem_barrier_all(); |  | ||||||
|     d[i] = dest; |     d[i] = dest; | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -289,13 +282,11 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
|   SHMEM_VET(recv); |   SHMEM_VET(recv); | ||||||
|   //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL); |   //  shmem_putmem_nb(recv,xmit,bytes,dest,NULL); | ||||||
|   shmem_putmem(recv,xmit,bytes,dest); |   shmem_putmem(recv,xmit,bytes,dest); | ||||||
|  |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) shmem_barrier_all();  |  | ||||||
| } | } | ||||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
| { | { | ||||||
|   //  shmem_quiet();      // I'm done |   //  shmem_quiet();      // I'm done | ||||||
|   if( CommunicatorPolicy == CommunicatorPolicyConcurrent ) shmem_barrier_all();// He's done too |   shmem_barrier_all();// He's done too | ||||||
| } | } | ||||||
| void CartesianCommunicator::Barrier(void) | void CartesianCommunicator::Barrier(void) | ||||||
| { | { | ||||||
| @@ -310,13 +301,13 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | |||||||
|   int words = bytes/4; |   int words = bytes/4; | ||||||
|  |  | ||||||
|   if ( shmem_addr_accessible(data,_processor)  ){ |   if ( shmem_addr_accessible(data,_processor)  ){ | ||||||
|     shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync.data()); |     shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync); | ||||||
|     return; |     return; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   for(int w=0;w<words;w++){ |   for(int w=0;w<words;w++){ | ||||||
|     word = array[w]; |     word = array[w]; | ||||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data()); |     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync); | ||||||
|     if ( shmem_my_pe() != root ) { |     if ( shmem_my_pe() != root ) { | ||||||
|       array[w] = word; |       array[w] = word; | ||||||
|     } |     } | ||||||
| @@ -334,7 +325,7 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | |||||||
|  |  | ||||||
|   for(int w=0;w<words;w++){ |   for(int w=0;w<words;w++){ | ||||||
|     word = array[w]; |     word = array[w]; | ||||||
|     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync.data()); |     shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync); | ||||||
|     if ( shmem_my_pe() != root ) { |     if ( shmem_my_pe() != root ) { | ||||||
|       array[w]= word; |       array[w]= word; | ||||||
|     } |     } | ||||||
| @@ -342,9 +333,5 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| int CartesianCommunicator::RankWorld(void){  |  | ||||||
|   return shmem_my_pe(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
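For context on the `psync`/`llwrk` boilerplate in the file above: SHMEM collectives require symmetric work and synchronisation arrays, sized by the library constants and initialised to `SHMEM_SYNC_VALUE` before first use, and `shmem_*_to_all` takes a raw `long*`, which is what `.data()` supplies on one side of this diff. A stand-alone sketch of the convention (illustrative only; `sum_over_pes` is not a Grid function):

    #include <mpp/shmem.h>

    static long      pSync[_SHMEM_REDUCE_SYNC_SIZE];        // symmetric sync array
    static long long pWrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];  // symmetric work array

    long long sum_over_pes(long long x) {
      static int ready = 0;
      if (!ready) {
        for (int i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE;
        shmem_barrier_all();          // every PE must see the initialised pSync
        ready = 1;
      }
      static long long src, dst;      // source and target must also be symmetric
      src = x; dst = 0;
      shmem_longlong_sum_to_all(&dst, &src, 1, 0, 0, shmem_n_pes(), pWrk, pSync);
      return dst;
    }

This is also why the code above falls back to an element-by-element reduction when `shmem_addr_accessible` reports that a buffer is not symmetric.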
| @@ -1,4 +1,5 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
| @@ -52,13 +53,15 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen | |||||||
|     cbmask = 0x3; |     cbmask = 0x3; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  |   int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane  | ||||||
|  |    | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|   int stride=rhs._grid->_slice_stride[dimension]; |   int stride=rhs._grid->_slice_stride[dimension]; | ||||||
|   if ( cbmask == 0x3 ) {  |   if ( cbmask == 0x3 ) {  | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o  = n*stride; | 	int o  = n*stride; | ||||||
| 	int bo = n*e2; | 	int bo = n*e2; | ||||||
| @@ -71,13 +74,14 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimen | |||||||
|      for(int n=0;n<e1;n++){ |      for(int n=0;n<e1;n++){ | ||||||
|        for(int b=0;b<e2;b++){ |        for(int b=0;b<e2;b++){ | ||||||
| 	 int o  = n*stride; | 	 int o  = n*stride; | ||||||
| 	 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | 	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b); | ||||||
| 	 if ( ocb &cbmask ) { | 	 if ( ocb &cbmask ) { | ||||||
| 	   table.push_back(std::pair<int,int> (bo++,o+b)); | 	   table.push_back(std::pair<int,int> (bo++,o+b)); | ||||||
| 	 } | 	 } | ||||||
|        } |        } | ||||||
|      } |      } | ||||||
|      parallel_for(int i=0;i<table.size();i++){ | PARALLEL_FOR_LOOP      | ||||||
|  |      for(int i=0;i<table.size();i++){ | ||||||
|        buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]); |        buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]); | ||||||
|      } |      } | ||||||
|   } |   } | ||||||
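The table-based branch in this hunk is a pattern worth noting: the checkerboard test is evaluated once, serially, to build a dense list of (buffer slot, field index) pairs, and the copy then parallelises trivially because the table entries are independent. A sketch over assumed flat arrays (not the real Grid lattice and compressor types):

    #include <utility>
    #include <vector>

    template <class obj>
    void gather_masked(std::vector<obj> &buffer, const std::vector<obj> &field,
                       const std::vector<int> &site_cb, int cbmask) {
      std::vector<std::pair<int,int> > table;   // (buffer slot, field index)
      int bo = 0;
      for (int s = 0; s < (int)field.size(); s++) {
        int ocb = 1 << site_cb[s];              // stands in for CheckerBoardFromOindex
        if (ocb & cbmask) table.push_back(std::pair<int,int>(bo++, s));
      }
      #pragma omp parallel for                  // entries are independent
      for (int i = 0; i < (int)table.size(); i++)
        buffer[table[i].first] = field[table[i].second];
    }

The dense `cbmask == 0x3` case skips the table entirely, since every site is copied and the target offset is computable in closed form.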
| @@ -101,30 +105,29 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_ | |||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int n1=rhs._grid->_slice_stride[dimension]; |   int n1=rhs._grid->_slice_stride[dimension]; | ||||||
|  |   int n2=rhs._grid->_slice_block[dimension]; | ||||||
|   if ( cbmask ==0x3){ |   if ( cbmask ==0x3){ | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|  |  | ||||||
| 	int o      =   n*n1; | 	int o      =   n*n1; | ||||||
| 	int offset = b+n*e2; | 	int offset = b+n*n2; | ||||||
| 	 |  | ||||||
| 	cobj temp =compress(rhs._odata[so+o+b]); | 	cobj temp =compress(rhs._odata[so+o+b]); | ||||||
|  |  | ||||||
| 	extract<cobj>(temp,pointers,offset); | 	extract<cobj>(temp,pointers,offset); | ||||||
|  |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|  |  | ||||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  |     assert(0); //Fixme think this is buggy | ||||||
|     // Test_cshift_red_black code. |  | ||||||
|     std::cout << " Dense packed buffer WARNING " <<std::endl; |  | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ |  | ||||||
|       for(int b=0;b<e2;b++){ |  | ||||||
|  |  | ||||||
| 	int o=n*n1; |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b); | ||||||
| 	int offset = b+n*e2; | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
| 	if ( ocb & cbmask ) { | 	if ( ocb & cbmask ) { | ||||||
| 	  cobj temp =compress(rhs._odata[so+o+b]); | 	  cobj temp =compress(rhs._odata[so+o+b]); | ||||||
| @@ -168,10 +171,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | |||||||
|      |      | ||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int stride=rhs._grid->_slice_stride[dimension]; |  | ||||||
|    |    | ||||||
|   if ( cbmask ==0x3 ) { |   if ( cbmask ==0x3 ) { | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
| 	int bo  =n*rhs._grid->_slice_block[dimension]; | 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||||
| @@ -179,21 +182,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|     std::vector<std::pair<int,int> > table; |  | ||||||
|     int bo=0; |     int bo=0; | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o   =n*rhs._grid->_slice_stride[dimension]; | 	int o   =n*rhs._grid->_slice_stride[dimension]; | ||||||
|  | 	int bo  =n*rhs._grid->_slice_block[dimension]; | ||||||
| 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
| 	if ( ocb & cbmask ) { | 	if ( ocb & cbmask ) { | ||||||
| 	  table.push_back(std::pair<int,int> (so+o+b,bo++)); | 	  rhs._odata[so+o+b]=buffer[bo++]; | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     parallel_for(int i=0;i<table.size();i++){ |  | ||||||
|        //       std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl; |  | ||||||
|        rhs._odata[table[i].first]=buffer[table[i].second]; |  | ||||||
|      } |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -214,7 +213,8 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | |||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|  |  | ||||||
|   if(cbmask ==0x3 ) { |   if(cbmask ==0x3 ) { | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||||
| 	int offset = b+n*rhs._grid->_slice_block[dimension]; | 	int offset = b+n*rhs._grid->_slice_block[dimension]; | ||||||
| @@ -222,11 +222,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|  |     assert(0); // think this is buggy FIXME | ||||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  |  | ||||||
|     // Test_cshift_red_black code. |  | ||||||
|     //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME |  | ||||||
|     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; |  | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o      = n*rhs._grid->_slice_stride[dimension]; | 	int o      = n*rhs._grid->_slice_stride[dimension]; | ||||||
| @@ -258,7 +254,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|   int e2=rhs._grid->_slice_block[dimension]; |   int e2=rhs._grid->_slice_block[dimension]; | ||||||
|   int stride = rhs._grid->_slice_stride[dimension]; |   int stride = rhs._grid->_slice_stride[dimension]; | ||||||
|   if(cbmask == 0x3 ){ |   if(cbmask == 0x3 ){ | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|   |   | ||||||
|         int o =n*stride+b; |         int o =n*stride+b; | ||||||
| @@ -267,7 +264,8 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|     parallel_for_nest2(int n=0;n<e1;n++){ | PARALLEL_NESTED_LOOP2 | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|   |   | ||||||
|         int o =n*stride+b; |         int o =n*stride+b; | ||||||
| @@ -297,8 +295,8 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|   int e1=rhs._grid->_slice_nblock[dimension]; |   int e1=rhs._grid->_slice_nblock[dimension]; | ||||||
|   int e2=rhs._grid->_slice_block [dimension]; |   int e2=rhs._grid->_slice_block [dimension]; | ||||||
|   int stride = rhs._grid->_slice_stride[dimension]; |   int stride = rhs._grid->_slice_stride[dimension]; | ||||||
|  | PARALLEL_NESTED_LOOP2 | ||||||
|   parallel_for_nest2(int n=0;n<e1;n++){ |   for(int n=0;n<e1;n++){ | ||||||
|   for(int b=0;b<e2;b++){ |   for(int b=0;b<e2;b++){ | ||||||
|  |  | ||||||
|       int o  =n*stride; |       int o  =n*stride; | ||||||
| @@ -340,8 +338,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|   // Map to always positive shift modulo global full dimension. |   // Map to always positive shift modulo global full dimension. | ||||||
|   shift = (shift+fd)%fd; |   shift = (shift+fd)%fd; | ||||||
|  |  | ||||||
|   // the permute type |  | ||||||
|   ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); |   ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension); | ||||||
|  |   // the permute type | ||||||
|   int permute_dim =grid->PermuteDim(dimension); |   int permute_dim =grid->PermuteDim(dimension); | ||||||
|   int permute_type=grid->PermuteType(dimension); |   int permute_type=grid->PermuteType(dimension); | ||||||
|   int permute_type_dist; |   int permute_type_dist; | ||||||
| @@ -350,6 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|  |  | ||||||
|     int o   = 0; |     int o   = 0; | ||||||
|     int bo  = x * grid->_ostride[dimension]; |     int bo  = x * grid->_ostride[dimension]; | ||||||
|  |      | ||||||
|     int cb= (cbmask==0x2)? Odd : Even; |     int cb= (cbmask==0x2)? Odd : Even; | ||||||
|  |  | ||||||
|     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); |     int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||||
| @@ -362,23 +361,9 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|     // wrap is whether sshift > rd. |     // wrap is whether sshift > rd. | ||||||
|     //  num is sshift mod rd. |     //  num is sshift mod rd. | ||||||
|     //  |     //  | ||||||
|     //  shift 7 |  | ||||||
|     // |  | ||||||
|     //  XoXo YcYc  |  | ||||||
|     //  oXoX cYcY |  | ||||||
|     //  XoXo YcYc |  | ||||||
|     //  oXoX cYcY |  | ||||||
|     // |  | ||||||
|     //  sshift --  |  | ||||||
|     // |  | ||||||
|     //  XX YY ; 3 |  | ||||||
|     //  XX YY ; 0 |  | ||||||
|     //  XX YY ; 3 |  | ||||||
|     //  XX YY ; 0 |  | ||||||
|     // |  | ||||||
|     int permute_slice=0; |     int permute_slice=0; | ||||||
|     if(permute_dim){ |     if(permute_dim){ | ||||||
|       int wrap = sshift/rd; wrap=wrap % ly; |       int wrap = sshift/rd; | ||||||
|       int  num = sshift%rd; |       int  num = sshift%rd; | ||||||
|  |  | ||||||
|       if ( x< rd-num ) permute_slice=wrap; |       if ( x< rd-num ) permute_slice=wrap; | ||||||
| @@ -390,6 +375,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice | |||||||
|       } else { |       } else { | ||||||
| 	permute_type_dist = permute_type; | 	permute_type_dist = permute_type; | ||||||
|       } |       } | ||||||
|  |        | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); |     if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist); | ||||||
|   | |||||||
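The `permute_slice` decision in `Cshift_local` above reduces to short arithmetic: the shift `sshift` splits into whole wraps of the reduced local dimension `rd` plus a remainder, and slices the remainder has already passed permute one extra time. A sketch, following the left-hand column (which takes the wrap count modulo the lane count `ly`) and assuming the else-branch elided by the hunk is `(wrap+1)%ly`:

    // How many lane permutations the slice at coordinate x needs for a
    // shift of sshift (illustrative helper, not the Grid API).
    int permute_slice_for(int x, int sshift, int rd, int ly) {
      int wrap = (sshift / rd) % ly;   // whole wraps around the reduced dimension
      int num  = sshift % rd;          // leftover shift within one wrap
      return (x < rd - num) ? wrap : (wrap + 1) % ly;
    }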
| @@ -74,6 +74,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r | |||||||
|   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); |   sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd); | ||||||
|  |  | ||||||
|   //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; |   //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||||
|  |  | ||||||
|   if ( sshift[0] == sshift[1] ) { |   if ( sshift[0] == sshift[1] ) { | ||||||
|     //    std::cout << "Single pass Cshift_comms" <<std::endl; |     //    std::cout << "Single pass Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms(ret,rhs,dimension,shift,0x3); |     Cshift_comms(ret,rhs,dimension,shift,0x3); | ||||||
| @@ -153,14 +154,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|       grid->Barrier(); |  | ||||||
|       /* |       //      for(int i=0;i<words;i++){ | ||||||
|       for(int i=0;i<send_buf.size();i++){ |       //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl; | ||||||
| 	assert(recv_buf.size()==buffer_size); |       //      } | ||||||
| 	assert(send_buf.size()==buffer_size); |  | ||||||
| 	std::cout << "SendRecv_Cshift_comms ["<<i<<" "<< dimension<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl; |  | ||||||
|       } |  | ||||||
|       */ |  | ||||||
|       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); |       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -246,14 +243,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 			     (void *)&recv_buf_extract[i][0], | 			     (void *)&recv_buf_extract[i][0], | ||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
| 	/* |  | ||||||
| 	for(int w=0;w<recv_buf_extract[i].size();w++){ |  | ||||||
| 	  assert(recv_buf_extract[i].size()==buffer_size); |  | ||||||
| 	  assert(send_buf_extract[i].size()==buffer_size); |  | ||||||
| 	  std::cout << "SendRecv_Cshift_comms ["<<w<<" "<< dimension<<"] recv "<<recv_buf_extract[i][w]<<" send " << send_buf_extract[nbr_lane][w]  << cbmask<<std::endl; |  | ||||||
| 	} |  | ||||||
| 	*/	 |  | ||||||
| 	grid->Barrier(); |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|       } else {  |       } else {  | ||||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||||
|   | |||||||
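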
| @@ -39,7 +39,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     conformable(lhs,rhs); |     conformable(lhs,rhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); |       mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||||
| @@ -55,7 +56,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     conformable(lhs,rhs); |     conformable(lhs,rhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); |       mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||||
| @@ -71,7 +73,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     conformable(lhs,rhs); |     conformable(lhs,rhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); |       sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||||
| @@ -86,7 +89,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     conformable(lhs,rhs); |     conformable(lhs,rhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); |       add(&tmp,&lhs._odata[ss],&rhs._odata[ss]); | ||||||
| @@ -104,7 +108,8 @@ namespace Grid { | |||||||
|     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ |     void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(lhs,ret); |     conformable(lhs,ret); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mult(&tmp,&lhs._odata[ss],&rhs); |       mult(&tmp,&lhs._odata[ss],&rhs); | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
| @@ -115,7 +120,8 @@ namespace Grid { | |||||||
|     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ |     void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,lhs); |     conformable(ret,lhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mac(&tmp,&lhs._odata[ss],&rhs); |       mac(&tmp,&lhs._odata[ss],&rhs); | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
| @@ -126,7 +132,8 @@ namespace Grid { | |||||||
|     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ |     void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(ret,lhs); |     conformable(ret,lhs); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       sub(&tmp,&lhs._odata[ss],&rhs); |       sub(&tmp,&lhs._odata[ss],&rhs); | ||||||
| @@ -140,7 +147,8 @@ namespace Grid { | |||||||
|     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ |     void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||||
|     ret.checkerboard = lhs.checkerboard; |     ret.checkerboard = lhs.checkerboard; | ||||||
|     conformable(lhs,ret); |     conformable(lhs,ret); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       add(&tmp,&lhs._odata[ss],&rhs); |       add(&tmp,&lhs._odata[ss],&rhs); | ||||||
| @@ -158,7 +166,8 @@ namespace Grid { | |||||||
|     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ |     void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mult(&tmp,&lhs,&rhs._odata[ss]); |       mult(&tmp,&lhs,&rhs._odata[ss]); | ||||||
| @@ -173,7 +182,8 @@ namespace Grid { | |||||||
|     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ |     void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       mac(&tmp,&lhs,&rhs._odata[ss]); |       mac(&tmp,&lhs,&rhs._odata[ss]); | ||||||
| @@ -188,7 +198,8 @@ namespace Grid { | |||||||
|     void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ |     void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       sub(&tmp,&lhs,&rhs._odata[ss]); |       sub(&tmp,&lhs,&rhs._odata[ss]); | ||||||
| @@ -202,7 +213,8 @@ namespace Grid { | |||||||
|     void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ |     void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       obj1 tmp; |       obj1 tmp; | ||||||
|       add(&tmp,&lhs,&rhs._odata[ss]); |       add(&tmp,&lhs,&rhs._odata[ss]); | ||||||
| @@ -218,7 +230,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = x.checkerboard; |     ret.checkerboard = x.checkerboard; | ||||||
|     conformable(ret,x); |     conformable(ret,x); | ||||||
|     conformable(x,y); |     conformable(x,y); | ||||||
|     parallel_for(int ss=0;ss<x._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<x._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = a*x._odata[ss]+y._odata[ss]; |       vobj tmp = a*x._odata[ss]+y._odata[ss]; | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
| @@ -232,7 +245,8 @@ namespace Grid { | |||||||
|     ret.checkerboard = x.checkerboard; |     ret.checkerboard = x.checkerboard; | ||||||
|     conformable(ret,x); |     conformable(ret,x); | ||||||
|     conformable(x,y); |     conformable(x,y); | ||||||
|     parallel_for(int ss=0;ss<x._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<x._grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = a*x._odata[ss]+b*y._odata[ss]; |       vobj tmp = a*x._odata[ss]+b*y._odata[ss]; | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
|   | |||||||
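Nearly every hunk in this file swaps `parallel_for(...)` for `PARALLEL_FOR_LOOP` followed by a plain `for`; the two spellings name the same construct. A sketch of the kind of macros involved, assuming OpenMP (the real definitions live in Grid's Threads.h and may differ in detail):

    #ifdef GRID_OMP
    #include <omp.h>
    #define PARALLEL_FOR_LOOP     _Pragma("omp parallel for")
    #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
    #else
    #define PARALLEL_FOR_LOOP
    #define PARALLEL_NESTED_LOOP2
    #endif

    // The newer spelling folds the pragma and the loop keyword together,
    #define parallel_for       PARALLEL_FOR_LOOP     for
    #define parallel_for_nest2 PARALLEL_NESTED_LOOP2 for
    // so parallel_for(int ss=0;ss<N;ss++){...} expands to the older form.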
| @@ -121,7 +121,8 @@ public: | |||||||
|     assert( (cb==Odd) || (cb==Even)); |     assert( (cb==Odd) || (cb==Even)); | ||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = eval(ss,expr); |       vobj tmp = eval(ss,expr); | ||||||
|       vstream(_odata[ss] ,tmp); |       vstream(_odata[ss] ,tmp); | ||||||
| @@ -143,7 +144,8 @@ public: | |||||||
|     assert( (cb==Odd) || (cb==Even)); |     assert( (cb==Odd) || (cb==Even)); | ||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = eval(ss,expr); |       vobj tmp = eval(ss,expr); | ||||||
|       vstream(_odata[ss] ,tmp); |       vstream(_odata[ss] ,tmp); | ||||||
| @@ -165,7 +167,8 @@ public: | |||||||
|     assert( (cb==Odd) || (cb==Even)); |     assert( (cb==Odd) || (cb==Even)); | ||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       //vobj tmp = eval(ss,expr); |       //vobj tmp = eval(ss,expr); | ||||||
|       vstream(_odata[ss] ,eval(ss,expr)); |       vstream(_odata[ss] ,eval(ss,expr)); | ||||||
| @@ -188,7 +191,8 @@ public: | |||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     _odata.resize(_grid->oSites()); |     _odata.resize(_grid->oSites()); | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = eval(ss,expr); |       vobj tmp = eval(ss,expr); | ||||||
|       vstream(_odata[ss] ,tmp); |       vstream(_odata[ss] ,tmp); | ||||||
| @@ -209,7 +213,8 @@ public: | |||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     _odata.resize(_grid->oSites()); |     _odata.resize(_grid->oSites()); | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
| #ifdef STREAMING_STORES | #ifdef STREAMING_STORES | ||||||
|       vobj tmp = eval(ss,expr); |       vobj tmp = eval(ss,expr); | ||||||
|       vstream(_odata[ss] ,tmp); |       vstream(_odata[ss] ,tmp); | ||||||
| @@ -230,7 +235,8 @@ public: | |||||||
|     checkerboard=cb; |     checkerboard=cb; | ||||||
|  |  | ||||||
|     _odata.resize(_grid->oSites()); |     _odata.resize(_grid->oSites()); | ||||||
|     parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|       vstream(_odata[ss] ,eval(ss,expr)); |       vstream(_odata[ss] ,eval(ss,expr)); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
| @@ -252,7 +258,8 @@ public: | |||||||
|     	_grid = r._grid; |     	_grid = r._grid; | ||||||
|     	checkerboard = r.checkerboard; |     	checkerboard = r.checkerboard; | ||||||
|     	_odata.resize(_grid->oSites());// essential |     	_odata.resize(_grid->oSites());// essential | ||||||
| 	parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|             _odata[ss]=r._odata[ss]; |             _odata[ss]=r._odata[ss]; | ||||||
|         }  	 |         }  	 | ||||||
|     } |     } | ||||||
| @@ -262,7 +269,8 @@ public: | |||||||
|     virtual ~Lattice(void) = default; |     virtual ~Lattice(void) = default; | ||||||
|      |      | ||||||
|     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ |     template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ | ||||||
|       parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|             this->_odata[ss]=r; |             this->_odata[ss]=r; | ||||||
|         } |         } | ||||||
|         return *this; |         return *this; | ||||||
| @@ -271,7 +279,8 @@ public: | |||||||
|       this->checkerboard = r.checkerboard; |       this->checkerboard = r.checkerboard; | ||||||
|       conformable(*this,r); |       conformable(*this,r); | ||||||
|        |        | ||||||
|       parallel_for(int ss=0;ss<_grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<_grid->oSites();ss++){ | ||||||
|             this->_odata[ss]=r._odata[ss]; |             this->_odata[ss]=r._odata[ss]; | ||||||
|         } |         } | ||||||
|         return *this; |         return *this; | ||||||
|   | |||||||
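The `STREAMING_STORES` branches in the assignment operators above all follow one pattern: evaluate the expression into a register-resident temporary, then write it out with `vstream`, a non-temporal store that avoids reading the destination cache line before overwriting it, which pays off in large write-only lattice sweeps. An illustrative sketch with AVX intrinsics assumed (Grid's real `vstream` dispatches on the configured SIMD type):

    #include <immintrin.h>

    inline void vstream_pd(double *dest, __m256d tmp) {
    #ifdef STREAMING_STORES
      _mm256_stream_pd(dest, tmp);   // non-temporal: bypass the cache on the write
    #else
      _mm256_store_pd(dest, tmp);    // ordinary aligned store
    #endif
    }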
| @@ -47,7 +47,8 @@ namespace Grid { | |||||||
|     inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) |     inline Lattice<vInteger> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs) | ||||||
|     { |     { | ||||||
|       Lattice<vInteger> ret(rhs._grid); |       Lattice<vInteger> ret(rhs._grid); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
| 	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); | 	  ret._odata[ss]=op(lhs._odata[ss],rhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -59,7 +60,8 @@ namespace Grid { | |||||||
|     inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) |     inline Lattice<vInteger> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs) | ||||||
|     { |     { | ||||||
|       Lattice<vInteger> ret(lhs._grid); |       Lattice<vInteger> ret(lhs._grid); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||||
| 	  ret._odata[ss]=op(lhs._odata[ss],rhs); | 	  ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -71,7 +73,8 @@ namespace Grid { | |||||||
|     inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) |     inline Lattice<vInteger> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) | ||||||
|     { |     { | ||||||
|       Lattice<vInteger> ret(rhs._grid); |       Lattice<vInteger> ret(rhs._grid); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
| 	  ret._odata[ss]=op(lhs._odata[ss],rhs); | 	  ret._odata[ss]=op(lhs._odata[ss],rhs); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -165,5 +168,6 @@ namespace Grid { | |||||||
|    inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { |    inline Lattice<vInteger> operator != (const lobj & lhs, const Lattice<robj> & rhs) { | ||||||
|      return SLComparison(vne<lobj,robj>(),lhs,rhs); |      return SLComparison(vne<lobj,robj>(),lhs,rhs); | ||||||
|    } |    } | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -43,7 +43,8 @@ namespace Grid { | |||||||
|     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> |     inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> | ||||||
|     { |     { | ||||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); |       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
| 	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); | 	  ret._odata[ss]=innerProduct(rhs._odata[ss],rhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -54,7 +55,8 @@ namespace Grid { | |||||||
|     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> |     inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> | ||||||
|     { |     { | ||||||
|       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); |       Lattice<typename vobj::tensor_reduced> ret(rhs._grid); | ||||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
| 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); | 	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||||
|       } |       } | ||||||
|       return ret; |       return ret; | ||||||
| @@ -66,10 +68,13 @@ namespace Grid { | |||||||
|     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> |     inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> | ||||||
|     { |     { | ||||||
|         Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); |         Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
|             ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); |             ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
|      } |      } | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -37,7 +37,8 @@ namespace Grid { | |||||||
|   inline Lattice<vobj> operator -(const Lattice<vobj> &r) |   inline Lattice<vobj> operator -(const Lattice<vobj> &r) | ||||||
|   { |   { | ||||||
|     Lattice<vobj> ret(r._grid); |     Lattice<vobj> ret(r._grid); | ||||||
|     parallel_for(int ss=0;ss<r._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<r._grid->oSites();ss++){ | ||||||
|       vstream(ret._odata[ss], -r._odata[ss]); |       vstream(ret._odata[ss], -r._odata[ss]); | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
| @@ -73,7 +74,8 @@ namespace Grid { | |||||||
|   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])> |   inline auto operator * (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs*rhs._odata[0])> | ||||||
|   { |   { | ||||||
|     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid); |     Lattice<decltype(lhs*rhs._odata[0])> ret(rhs._grid); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
|       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];  |       decltype(lhs*rhs._odata[0]) tmp=lhs*rhs._odata[ss];  | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
| 	   //      ret._odata[ss]=lhs*rhs._odata[ss]; | 	   //      ret._odata[ss]=lhs*rhs._odata[ss]; | ||||||
| @@ -84,7 +86,8 @@ namespace Grid { | |||||||
|     inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])> |     inline auto operator + (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs+rhs._odata[0])> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid); |       Lattice<decltype(lhs+rhs._odata[0])> ret(rhs._grid); | ||||||
|       parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
| 	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];   | 	decltype(lhs+rhs._odata[0]) tmp =lhs+rhs._odata[ss];   | ||||||
| 	vstream(ret._odata[ss],tmp); | 	vstream(ret._odata[ss],tmp); | ||||||
| 	//	ret._odata[ss]=lhs+rhs._odata[ss]; | 	//	ret._odata[ss]=lhs+rhs._odata[ss]; | ||||||
| @@ -95,9 +98,11 @@ namespace Grid { | |||||||
|     inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])> |     inline auto operator - (const left &lhs,const Lattice<right> &rhs) -> Lattice<decltype(lhs-rhs._odata[0])> | ||||||
|   { |   { | ||||||
|     Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid); |     Lattice<decltype(lhs-rhs._odata[0])> ret(rhs._grid); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites(); ss++){ | ||||||
|       decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];   |       decltype(lhs-rhs._odata[0]) tmp=lhs-rhs._odata[ss];   | ||||||
|       vstream(ret._odata[ss],tmp); |       vstream(ret._odata[ss],tmp); | ||||||
|  |       //      ret._odata[ss]=lhs-rhs._odata[ss]; | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
|   } |   } | ||||||
| @@ -105,7 +110,8 @@ namespace Grid { | |||||||
|       inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)> |       inline auto operator * (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]*rhs)> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid); |       Lattice<decltype(lhs._odata[0]*rhs)> ret(lhs._grid); | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||||
| 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs; | 	decltype(lhs._odata[0]*rhs) tmp =lhs._odata[ss]*rhs; | ||||||
| 	vstream(ret._odata[ss],tmp); | 	vstream(ret._odata[ss],tmp); | ||||||
| 	//            ret._odata[ss]=lhs._odata[ss]*rhs; | 	//            ret._odata[ss]=lhs._odata[ss]*rhs; | ||||||
| @@ -116,7 +122,8 @@ namespace Grid { | |||||||
|       inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)> |       inline auto operator + (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]+rhs)> | ||||||
|     { |     { | ||||||
|         Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid); |         Lattice<decltype(lhs._odata[0]+rhs)> ret(lhs._grid); | ||||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||||
| 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;  | 	  decltype(lhs._odata[0]+rhs) tmp=lhs._odata[ss]+rhs;  | ||||||
| 	  vstream(ret._odata[ss],tmp); | 	  vstream(ret._odata[ss],tmp); | ||||||
| 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs; | 	  //	  ret._odata[ss]=lhs._odata[ss]+rhs; | ||||||
| @@ -127,12 +134,15 @@ namespace Grid { | |||||||
|       inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)> |       inline auto operator - (const Lattice<left> &lhs,const right &rhs) -> Lattice<decltype(lhs._odata[0]-rhs)> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid); |       Lattice<decltype(lhs._odata[0]-rhs)> ret(lhs._grid); | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<lhs._grid->oSites(); ss++){ | ||||||
| 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs; | 	  decltype(lhs._odata[0]-rhs) tmp=lhs._odata[ss]-rhs; | ||||||
| 	  vstream(ret._odata[ss],tmp); | 	  vstream(ret._odata[ss],tmp); | ||||||
| 	  //	ret._odata[ss]=lhs._odata[ss]-rhs; | 	  //	ret._odata[ss]=lhs._odata[ss]-rhs; | ||||||
|       } |       } | ||||||
|       return ret; |       return ret; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
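Each operator above stages the result in a temporary and writes it with vstream() instead of assigning directly. The presumed motivation is a streaming (non-temporal) store, so that freshly written lattices do not evict cache lines still in use; a sketch for one AVX vector type (the intrinsic-based implementation is an assumption):

    #include <immintrin.h>

    // Hypothetical vstream for AVX doubles: a non-temporal store that writes
    // 'in' to 'out' while bypassing the cache hierarchy.
    inline void vstream(__m256d &out, const __m256d &in) {
      _mm256_stream_pd(reinterpret_cast<double *>(&out), in);
    }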
|   | |||||||
| @@ -44,7 +44,8 @@ namespace Grid { | |||||||
|     { |     { | ||||||
|       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid); |       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i))> ret(lhs._grid); | ||||||
|       ret.checkerboard=lhs.checkerboard; |       ret.checkerboard=lhs.checkerboard; | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i); | 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -54,7 +55,8 @@ namespace Grid { | |||||||
|     { |     { | ||||||
|       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid); |       Lattice<decltype(peekIndex<Index>(lhs._odata[0],i,j))> ret(lhs._grid); | ||||||
|       ret.checkerboard=lhs.checkerboard; |       ret.checkerboard=lhs.checkerboard; | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j); | 	  ret._odata[ss] = peekIndex<Index>(lhs._odata[ss],i,j); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -66,14 +68,16 @@ namespace Grid { | |||||||
|     template<int Index,class vobj>  |     template<int Index,class vobj>  | ||||||
|     void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i) |     void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0))> & rhs,int i) | ||||||
|     { |     { | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i); | 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i); | ||||||
| 	}       | 	}       | ||||||
|     } |     } | ||||||
|     template<int Index,class vobj> |     template<int Index,class vobj> | ||||||
|       void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j) |       void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(lhs._odata[0],0,0))> & rhs,int i,int j) | ||||||
|     { |     { | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j); | 	  pokeIndex<Index>(lhs._odata[ss],rhs._odata[ss],i,j); | ||||||
| 	}       | 	}       | ||||||
|     } |     } | ||||||
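A hypothetical use of the peek/poke pair defined above; the colour-matrix type and index values are illustrative only:

    // Pull the (0,1) colour component out of a gauge-like field and deposit
    // it back at (1,0); 'Grid' is an assumed GridCartesian object.
    LatticeColourMatrix U(&Grid);
    auto U01 = PeekIndex<ColourIndex>(U, 0, 1); // lattice of the (0,1) entries
    PokeIndex<ColourIndex>(U, U01, 1, 0);       // written on every site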
| @@ -127,6 +131,9 @@ namespace Grid { | |||||||
|  |  | ||||||
|       assert( l.checkerboard == l._grid->CheckerBoard(site)); |       assert( l.checkerboard == l._grid->CheckerBoard(site)); | ||||||
|  |  | ||||||
|  |       // FIXME | ||||||
|  |       //      assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||||
|  |  | ||||||
|       int rank,odx,idx; |       int rank,odx,idx; | ||||||
|       grid->GlobalCoorToRankIndex(rank,odx,idx,site); |       grid->GlobalCoorToRankIndex(rank,odx,idx,site); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -40,7 +40,8 @@ namespace Grid { | |||||||
|  |  | ||||||
|     template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ |     template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ | ||||||
|         Lattice<vobj> ret(lhs._grid); |         Lattice<vobj> ret(lhs._grid); | ||||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|             ret._odata[ss] = adj(lhs._odata[ss]); |             ret._odata[ss] = adj(lhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -48,10 +49,13 @@ namespace Grid { | |||||||
|  |  | ||||||
|     template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ |     template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ | ||||||
|         Lattice<vobj> ret(lhs._grid); |         Lattice<vobj> ret(lhs._grid); | ||||||
| 	parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|             ret._odata[ss] = conjugate(lhs._odata[ss]); |             ret._odata[ss] = conjugate(lhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
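For reference, a scalar analogue of the two per-site operations above on an illustrative 2x2 complex matrix (Grid applies them recursively through the tensor structure): adj is the conjugate transpose, conjugate is elementwise.

    #include <array>
    #include <complex>

    using Mat2 = std::array<std::array<std::complex<double>, 2>, 2>;

    Mat2 adjOf(const Mat2 &m) {          // conjugate transpose
      Mat2 r;
      for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++) r[i][j] = std::conj(m[j][i]);
      return r;
    }
    Mat2 conjOf(const Mat2 &m) {         // elementwise conjugate
      Mat2 r;
      for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++) r[i][j] = std::conj(m[i][j]);
      return r;
    }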
|   | |||||||
| @@ -57,7 +57,8 @@ namespace Grid { | |||||||
| 	sumarray[i]=zero; | 	sumarray[i]=zero; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||||
| 	int nwork, mywork, myoff; | 	int nwork, mywork, myoff; | ||||||
| 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | 	GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff); | ||||||
| 	 | 	 | ||||||
| @@ -113,7 +114,8 @@ namespace Grid { | |||||||
| 	sumarray[i]=zero; | 	sumarray[i]=zero; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       parallel_for(int thr=0;thr<grid->SumArraySize();thr++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int thr=0;thr<grid->SumArraySize();thr++){ | ||||||
| 	int nwork, mywork, myoff; | 	int nwork, mywork, myoff; | ||||||
| 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff); | 	GridThread::GetWork(grid->oSites(),thr,mywork,myoff); | ||||||
|  |  | ||||||
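The reductions above hand each thread a contiguous slice of the site loop via GridThread::GetWork. A sketch of the assumed partitioning (the real helper lives in Grid's threading support and may differ in detail):

    #include <algorithm>

    // Split nwork items over nthreads threads: thread 'me' receives 'mywork'
    // consecutive items starting at 'myoff'; the first nwork%nthreads threads
    // absorb the remainder, one item each.
    inline void GetWork(int nwork, int me, int &mywork, int &myoff, int nthreads) {
      int chunk = nwork / nthreads;
      int rem   = nwork % nthreads;
      mywork = chunk + (me < rem ? 1 : 0);
      myoff  = me * chunk + std::min(me, rem);
    }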
|   | |||||||
| @@ -30,19 +30,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #define GRID_LATTICE_RNG_H | #define GRID_LATTICE_RNG_H | ||||||
|  |  | ||||||
| #include <random> | #include <random> | ||||||
|  |  | ||||||
| #ifdef RNG_SITMO |  | ||||||
| #include <Grid/sitmo_rng/sitmo_prng_engine.hpp> | #include <Grid/sitmo_rng/sitmo_prng_engine.hpp> | ||||||
| #endif  |  | ||||||
|  |  | ||||||
| #if defined(RNG_SITMO) |  | ||||||
| #define RNG_FAST_DISCARD |  | ||||||
| #else  |  | ||||||
| #undef  RNG_FAST_DISCARD |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////// | ||||||
|   // Allow the RNG state to be less dense than the fine grid |   // Allow the RNG state to be less dense than the fine grid | ||||||
|   ////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////// | ||||||
| @@ -72,139 +64,115 @@ namespace Grid { | |||||||
|  |  | ||||||
|       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];  |       multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     return multiplicity; |     return multiplicity; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   // Wrap seed_seq to give a common interface with std::random_device | ||||||
|  |   class fixedSeed { | ||||||
|  |   public: | ||||||
|  |  | ||||||
|  |     typedef std::seed_seq::result_type result_type; | ||||||
|  |  | ||||||
|  |     std::seed_seq src; | ||||||
|  |      | ||||||
|  |     fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {}; | ||||||
|  |  | ||||||
|  |     result_type operator () (void){ | ||||||
|  |  | ||||||
|  |       std::vector<result_type> list(1); | ||||||
|  |  | ||||||
|  |       src.generate(list.begin(),list.end()); | ||||||
|  |  | ||||||
|  |       return list[0]; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   }; | ||||||
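The point of the wrapper: a deterministic seed list and std::random_device can then feed the same Seed(source&) template through an identical call shape. Illustrative usage:

    #include <random>
    #include <vector>

    std::vector<int> seeds = {1, 2, 3, 4};
    fixedSeed fs(seeds);
    fixedSeed::result_type a = fs(); // word expanded from the fixed seed list
    std::random_device rd;
    auto b = rd();                   // same interface, hardware entropy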
|  |  | ||||||
|   // real scalars are one component |   // real scalars are one component | ||||||
|   template<class scalar,class distribution,class generator>  |   template<class scalar,class distribution,class generator> void fillScalar(scalar &s,distribution &dist,generator & gen) | ||||||
|   void fillScalar(scalar &s,distribution &dist,generator & gen) |  | ||||||
|   { |   { | ||||||
|     s=dist(gen); |     s=dist(gen); | ||||||
|   } |   } | ||||||
|   template<class distribution,class generator>  |   template<class distribution,class generator> void fillScalar(ComplexF &s,distribution &dist, generator &gen) | ||||||
|   void fillScalar(ComplexF &s,distribution &dist, generator &gen) |  | ||||||
|   { |   { | ||||||
|     s=ComplexF(dist(gen),dist(gen)); |     s=ComplexF(dist(gen),dist(gen)); | ||||||
|   } |   } | ||||||
|   template<class distribution,class generator>  |   template<class distribution,class generator> void fillScalar(ComplexD &s,distribution &dist,generator &gen) | ||||||
|   void fillScalar(ComplexD &s,distribution &dist,generator &gen) |  | ||||||
|   { |   { | ||||||
|     s=ComplexD(dist(gen),dist(gen)); |     s=ComplexD(dist(gen),dist(gen)); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   class GridRNGbase { |   class GridRNGbase { | ||||||
|  |  | ||||||
|   public: |   public: | ||||||
|  |  | ||||||
|  |     int _seeded; | ||||||
|     // One generator per site. |     // One generator per site. | ||||||
|     // Uniform and Gaussian distributions from these generators. |     // Uniform and Gaussian distributions from these generators. | ||||||
| #ifdef RNG_RANLUX | #ifdef RNG_RANLUX | ||||||
|     typedef std::ranlux48 RngEngine; |  | ||||||
|     typedef uint64_t      RngStateType; |     typedef uint64_t      RngStateType; | ||||||
|  |     typedef std::ranlux48 RngEngine; | ||||||
|     static const int RngStateCount = 15; |     static const int RngStateCount = 15; | ||||||
| #endif  | #elif defined(RNG_MT19937) | ||||||
| #ifdef RNG_MT19937  |  | ||||||
|     typedef std::mt19937 RngEngine; |     typedef std::mt19937 RngEngine; | ||||||
|     typedef uint32_t     RngStateType; |     typedef uint32_t     RngStateType; | ||||||
|     static const int     RngStateCount = std::mt19937::state_size; |     static const int     RngStateCount = std::mt19937::state_size; | ||||||
| #endif | #elif defined(RNG_SITMO) | ||||||
| #ifdef RNG_SITMO |  | ||||||
|     typedef sitmo::prng_engine 	RngEngine; |     typedef sitmo::prng_engine 	RngEngine; | ||||||
|     typedef uint64_t    	RngStateType; |     typedef uint64_t    	RngStateType; | ||||||
|     static const int    	RngStateCount = 4; |     static const int    	RngStateCount = 4; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|     std::vector<RngEngine>                             _generators; |     std::vector<RngEngine>                             _generators; | ||||||
|     std::vector<std::uniform_real_distribution<RealD> > _uniform; |     std::vector<std::uniform_real_distribution<RealD>> _uniform; | ||||||
|     std::vector<std::normal_distribution<RealD> >       _gaussian; |     std::vector<std::normal_distribution<RealD>>       _gaussian; | ||||||
|     std::vector<std::discrete_distribution<int32_t> >   _bernoulli; |     std::vector<std::discrete_distribution<int32_t>>     _bernoulli; | ||||||
|     std::vector<std::uniform_int_distribution<uint32_t> > _uid; |  | ||||||
|  |  | ||||||
|     /////////////////////// |     void GetState(std::vector<RngStateType> & saved,int gen) { | ||||||
|     // support for parallel init |  | ||||||
|     /////////////////////// |  | ||||||
| #ifdef RNG_FAST_DISCARD |  | ||||||
|     static void Skip(RngEngine &eng) |  | ||||||
|     { |  | ||||||
|       ///////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|       // Skip by 2^40 elements between successive lattice sites |  | ||||||
|       // That is roughly 10^12 draws per site. |  | ||||||
|       // Consider quenched updating; the rate is unlikely ever to exceed 1000 sweeps |  | ||||||
|       // per second on any machine. That gives of order 10^9 seconds, i.e. roughly |  | ||||||
|       // 30 years of skip ahead. |  | ||||||
|       // For HMC we are unlikely to go faster than one solve per second, with |  | ||||||
|       // tens of seconds per trajectory, so this is clean in all reasonable cases |  | ||||||
|       // and the margin of safety is orders of magnitude. |  | ||||||
|       // We could hack Sitmo to skip in the higher order words of state if necessary |  | ||||||
|       ///////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|       uint64_t skip = 0x1; skip = skip<<40; |  | ||||||
|       eng.discard(skip); |  | ||||||
|     }  |  | ||||||
| #endif |  | ||||||
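A sketch of the fast-discard scheme the comment above describes, assuming an engine with sub-linear discard (Sitmo provides one; std::mt19937 does not): each global site owns a disjoint 2^40-draw window of a single master stream.

    // gidx and seed_word are illustrative; the loop in SeedFixedIntegers below
    // reaches the same offsets by calling Skip() once per site in sequence.
    sitmo::prng_engine eng(seed_word);
    const uint64_t stride = uint64_t(1) << 40;
    eng.discard(stride * gidx);   // jump straight to site gidx's window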
|     static RngEngine Reseed(RngEngine &eng) |  | ||||||
|     { |  | ||||||
|       std::vector<uint32_t> newseed; |  | ||||||
|       std::uniform_int_distribution<uint32_t> uid; |  | ||||||
|       return Reseed(eng,newseed,uid); |  | ||||||
|     } |  | ||||||
|     static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed, |  | ||||||
| 			    std::uniform_int_distribution<uint32_t> &uid) |  | ||||||
|     { |  | ||||||
|       const int reseeds=4; |  | ||||||
|        |  | ||||||
|       newseed.resize(reseeds); |  | ||||||
|       for(int i=0;i<reseeds;i++){ |  | ||||||
| 	newseed[i] = uid(eng); |  | ||||||
|       } |  | ||||||
|       std::seed_seq sseq(newseed.begin(),newseed.end()); |  | ||||||
|       return RngEngine(sseq); |  | ||||||
|     }     |  | ||||||
|  |  | ||||||
|     void GetState(std::vector<RngStateType> & saved,RngEngine &eng) { |  | ||||||
|       saved.resize(RngStateCount); |       saved.resize(RngStateCount); | ||||||
|       std::stringstream ss; |       std::stringstream ss; | ||||||
|       ss<<eng; |       ss<<_generators[gen]; | ||||||
|       ss.seekg(0,ss.beg); |       ss.seekg(0,ss.beg); | ||||||
|       for(int i=0;i<RngStateCount;i++){ |       for(int i=0;i<RngStateCount;i++){ | ||||||
| 	ss>>saved[i]; | 	ss>>saved[i]; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     void GetState(std::vector<RngStateType> & saved,int gen) { |     void SetState(std::vector<RngStateType> & saved,int gen){ | ||||||
|       GetState(saved,_generators[gen]); |  | ||||||
|     } |  | ||||||
|     void SetState(std::vector<RngStateType> & saved,RngEngine &eng){ |  | ||||||
|       assert(saved.size()==RngStateCount); |       assert(saved.size()==RngStateCount); | ||||||
|       std::stringstream ss; |       std::stringstream ss; | ||||||
|       for(int i=0;i<RngStateCount;i++){ |       for(int i=0;i<RngStateCount;i++){ | ||||||
| 	ss<< saved[i]<<" "; | 	ss<< saved[i]<<" "; | ||||||
|       } |       } | ||||||
|       ss.seekg(0,ss.beg); |       ss.seekg(0,ss.beg); | ||||||
|       ss>>eng; |       ss>>_generators[gen]; | ||||||
|     } |  | ||||||
|     void SetState(std::vector<RngStateType> & saved,int gen){ |  | ||||||
|       SetState(saved,_generators[gen]); |  | ||||||
|     } |  | ||||||
|     void SetEngine(RngEngine &Eng, int gen){ |  | ||||||
|       _generators[gen]=Eng; |  | ||||||
|     } |  | ||||||
|     void GetEngine(RngEngine &Eng, int gen){ |  | ||||||
|       Eng=_generators[gen]; |  | ||||||
|     } |  | ||||||
|     template<class source> void Seed(source &src, int gen) |  | ||||||
|     { |  | ||||||
|       _generators[gen] = RngEngine(src); |  | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
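An illustrative round trip through the stream-based state save/restore above (pRNG is an assumed, already-seeded GridParallelRNG): the state is serialised as RngStateCount integers, so a draw sequence can be replayed exactly.

    std::vector<GridRNGbase::RngStateType> state;
    pRNG.GetState(state, 0);                          // snapshot generator 0
    RealD x1 = pRNG._uniform[0](pRNG._generators[0]); // draw once
    pRNG.SetState(state, 0);                          // rewind
    RealD x2 = pRNG._uniform[0](pRNG._generators[0]); // x2 == x1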
|  |  | ||||||
|   class GridSerialRNG : public GridRNGbase { |   class GridSerialRNG : public GridRNGbase { | ||||||
|   public: |   public: | ||||||
|  |  | ||||||
|  |     // FIXME ... do we require lockstep draws of randoms | ||||||
|  |     // from all nodes, keeping seeds consistent? | ||||||
|  |     // If so, place a barrier/broadcast in the fill routine. | ||||||
|  |     template<class source> void Seed(source &src) | ||||||
|  |     { | ||||||
|  |       typename source::result_type init = src(); | ||||||
|  |       CartesianCommunicator::BroadcastWorld(0,(void *)&init,sizeof(init)); | ||||||
|  |       _generators[0] = RngEngine(init); | ||||||
|  |       _seeded=1; | ||||||
|  |     }     | ||||||
|  |  | ||||||
|     GridSerialRNG() : GridRNGbase() { |     GridSerialRNG() : GridRNGbase() { | ||||||
|       _generators.resize(1); |       _generators.resize(1); | ||||||
|       _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); |       _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); | ||||||
|       _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); |       _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) ); | ||||||
|       _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); |       _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); | ||||||
|       _uid.resize(1,std::uniform_int_distribution<uint32_t>() ); |       _seeded=0; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ |     template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){ | ||||||
|  |  | ||||||
|       typedef typename sobj::scalar_type scalar_type; |       typedef typename sobj::scalar_type scalar_type; | ||||||
| @@ -276,18 +244,23 @@ namespace Grid { | |||||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); |       CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ |  | ||||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |     void SeedRandomDevice(void){ | ||||||
|       std::seed_seq src(seeds.begin(),seeds.end()); |       std::random_device rd; | ||||||
|       Seed(src,0); |       Seed(rd); | ||||||
|     } |     } | ||||||
|  |     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||||
|  |       fixedSeed src(seeds); | ||||||
|  |       Seed(src); | ||||||
|  |     } | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   class GridParallelRNG : public GridRNGbase { |   class GridParallelRNG : public GridRNGbase { | ||||||
|   public: |   public: | ||||||
|  |  | ||||||
|     GridBase *_grid; |     GridBase *_grid; | ||||||
|     int _vol; |     int _vol; | ||||||
|   public: |  | ||||||
|  |  | ||||||
|     int generator_idx(int os,int is){ |     int generator_idx(int os,int is){ | ||||||
|       return is*_grid->oSites()+os; |       return is*_grid->oSites()+os; | ||||||
| @@ -301,9 +274,55 @@ namespace Grid { | |||||||
|       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); |       _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); | ||||||
|       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); |       _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) ); | ||||||
|       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); |       _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); | ||||||
|       _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); |       _seeded=0; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     // This loop could be made faster, avoiding the Amdahl bottleneck, by | ||||||
|  |     // i)   seeding generators on each timeslice, for x=y=z=0; | ||||||
|  |     // ii)  seeding generators on each z for x=y=0; | ||||||
|  |     // iii) seeding generators on each y,z for x=0; | ||||||
|  |     // iv)  seeding generators on each y,z,x; | ||||||
|  |     // all made possible by physical indexing. | ||||||
|  |     template<class source> void Seed(source &src) | ||||||
|  |     { | ||||||
|  |       std::vector<int> gcoor; | ||||||
|  |  | ||||||
|  |       int gsites = _grid->_gsites; | ||||||
|  |  | ||||||
|  |       typename source::result_type init = src(); | ||||||
|  |       RngEngine pseeder(init); | ||||||
|  |       std::uniform_int_distribution<uint64_t> ui; | ||||||
|  |  | ||||||
|  |       for(int gidx=0;gidx<gsites;gidx++){ | ||||||
|  |  | ||||||
|  | 	int rank,o_idx,i_idx; | ||||||
|  | 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); | ||||||
|  | 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||||
|  |          | ||||||
|  | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
|  |  | ||||||
|  | 	const int num_rand_seed=16; | ||||||
|  | 	std::vector<int> site_seeds(num_rand_seed); | ||||||
|  | 	for(int i=0;i<site_seeds.size();i++){ | ||||||
|  | 	  site_seeds[i]= ui(pseeder); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	_grid->Broadcast(0,(void *)&site_seeds[0],sizeof(int)*site_seeds.size()); | ||||||
|  |  | ||||||
|  | 	if( rank == _grid->ThisRank() ){ | ||||||
|  | 	  fixedSeed ssrc(site_seeds); | ||||||
|  | 	  typename source::result_type sinit = ssrc(); | ||||||
|  | 	  _generators[l_idx] = RngEngine(sinit); | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       _seeded=1; | ||||||
|  |     }     | ||||||
|  |  | ||||||
|  |     //FIXME implement generic IO and create state save/restore | ||||||
|  |     //void SaveState(const std::string &file); | ||||||
|  |     //void LoadState(const std::string &file); | ||||||
|  |  | ||||||
|     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ |     template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){ | ||||||
|  |  | ||||||
|       typedef typename vobj::scalar_object scalar_object; |       typedef typename vobj::scalar_object scalar_object; | ||||||
| @@ -316,7 +335,9 @@ namespace Grid { | |||||||
|       int     osites=_grid->oSites(); |       int     osites=_grid->oSites(); | ||||||
|       int words=sizeof(scalar_object)/sizeof(scalar_type); |       int words=sizeof(scalar_object)/sizeof(scalar_type); | ||||||
|  |  | ||||||
|       parallel_for(int ss=0;ss<osites;ss++){ |  | ||||||
|  | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<osites;ss++){ | ||||||
|  |  | ||||||
| 	std::vector<scalar_object> buf(Nsimd); | 	std::vector<scalar_object> buf(Nsimd); | ||||||
| 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times | 	for(int m=0;m<multiplicity;m++) {// Draw from same generator multiplicity times | ||||||
| @@ -338,114 +359,40 @@ namespace Grid { | |||||||
|       } |       } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |     void SeedRandomDevice(void){ | ||||||
|  |       std::random_device rd; | ||||||
|  |       Seed(rd); | ||||||
|  |     } | ||||||
|     void SeedFixedIntegers(const std::vector<int> &seeds){ |     void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||||
|  |       fixedSeed src(seeds); | ||||||
|       // Everyone generates the same seed_seq based on input seeds |       Seed(src); | ||||||
|       CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |  | ||||||
|  |  | ||||||
|       std::seed_seq source(seeds.begin(),seeds.end()); |  | ||||||
|  |  | ||||||
|       RngEngine master_engine(source); |  | ||||||
|  |  | ||||||
| #ifdef RNG_FAST_DISCARD |  | ||||||
|       //////////////////////////////////////////////// |  | ||||||
|       // Skip ahead through a single stream. |  | ||||||
|       // Applicable to SITMO and other hash-based/crypto RNGs |  | ||||||
|       // Should be applicable to Mersenne Twister, but the C++11 |  | ||||||
|       // MT implementation does not implement fast discard even though |  | ||||||
|       // in principle this is possible |  | ||||||
|       //////////////////////////////////////////////// |  | ||||||
|       std::vector<int> gcoor; |  | ||||||
|       int rank,o_idx,i_idx; |  | ||||||
|  |  | ||||||
|       // Everybody loops over global volume. |  | ||||||
|       for(int gidx=0;gidx<_grid->_gsites;gidx++){ |  | ||||||
|  |  | ||||||
| 	Skip(master_engine); // Skip to next RNG sequence |  | ||||||
|  |  | ||||||
| 	// Where is it? |  | ||||||
| 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
| 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
|  |  | ||||||
| 	// If this is one of mine we take it |  | ||||||
| 	if( rank == _grid->ThisRank() ){ |  | ||||||
| 	  int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
| 	  _generators[l_idx] = master_engine; |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
|       } |  | ||||||
| #else  |  | ||||||
|       //////////////////////////////////////////////////////////////// |  | ||||||
|       // Machine- and thread-decomposition-dependent seeding is efficient |  | ||||||
|       // and maximally parallel, but NOT reproducible from machine to machine. |  | ||||||
|       // Not ideal, but it is the fastest way to reseed all nodes. |  | ||||||
|       //////////////////////////////////////////////////////////////// |  | ||||||
|       { |  | ||||||
| 	// Obtain one Reseed per processor |  | ||||||
| 	int Nproc = _grid->ProcessorCount(); |  | ||||||
| 	std::vector<RngEngine> seeders(Nproc); |  | ||||||
| 	int me= _grid->ThisRank(); |  | ||||||
| 	for(int p=0;p<Nproc;p++){ |  | ||||||
| 	  seeders[p] = Reseed(master_engine); |  | ||||||
| 	} |  | ||||||
| 	master_engine = seeders[me]; |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       { |  | ||||||
| 	// Obtain one reseeded generator per thread |  | ||||||
| 	int Nthread = GridThread::GetThreads(); |  | ||||||
| 	std::vector<RngEngine> seeders(Nthread); |  | ||||||
| 	for(int t=0;t<Nthread;t++){ |  | ||||||
| 	  seeders[t] = Reseed(master_engine); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	parallel_for(int t=0;t<Nthread;t++) { |  | ||||||
| 	  // set up one per local site in threaded fashion |  | ||||||
| 	  std::vector<uint32_t> newseeds; |  | ||||||
| 	  std::uniform_int_distribution<uint32_t> uid;	 |  | ||||||
| 	  for(int l=0;l<_grid->lSites();l++) { |  | ||||||
| 	    if ( (l%Nthread)==t ) { |  | ||||||
| 	      _generators[l] = Reseed(seeders[t],newseeds,uid); |  | ||||||
| 	    } |  | ||||||
| 	  } |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
| #endif |  | ||||||
|     } |  | ||||||
|     //////////////////////////////////////////////////////////////////////// |  | ||||||
|     // Support for rigorous test of RNG's |  | ||||||
|     // Return uniform random uint32_t from requested site generator |  | ||||||
|     //////////////////////////////////////////////////////////////////////// |  | ||||||
|     uint32_t GlobalU01(int gsite){ |  | ||||||
|  |  | ||||||
|       uint32_t the_number; |  | ||||||
|  |  | ||||||
|       // who |  | ||||||
|       std::vector<int> gcoor; |  | ||||||
|       int rank,o_idx,i_idx; |  | ||||||
|       _grid->GlobalIndexToGlobalCoor(gsite,gcoor); |  | ||||||
|       _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
|  |  | ||||||
|       // draw |  | ||||||
|       int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
|       if( rank == _grid->ThisRank() ){ |  | ||||||
| 	the_number = _uid[l_idx](_generators[l_idx]); |  | ||||||
|       } |  | ||||||
|        |  | ||||||
|       // share & return |  | ||||||
|       _grid->Broadcast(rank,(void *)&the_number,sizeof(the_number)); |  | ||||||
|       return the_number; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
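The kind of reproducibility check GlobalU01 (left column) enables: after identical fixed-integer seeding, every processor decomposition must produce the same value at a given global site. A sketch, with UGrid an assumed pointer to a 4d grid:

    GridParallelRNG pRNG(UGrid);
    pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
    uint32_t w = pRNG.GlobalU01(0);   // identical on every rank and layout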
|  |  | ||||||
|   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l)   { rng.fill(l,rng._uniform);  } |   template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l){ | ||||||
|   template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } |     rng.fill(l,rng._uniform); | ||||||
|   template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} |   } | ||||||
|  |  | ||||||
|   template <class sobj> inline void random(GridSerialRNG &rng,sobj &l)   { rng.fill(l,rng._uniform  ); } |   template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l){ | ||||||
|   template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } |     rng.fill(l,rng._gaussian); | ||||||
|   template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } |   } | ||||||
|  |    | ||||||
|  |   template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ | ||||||
|  |     rng.fill(l,rng._bernoulli); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){ | ||||||
|  |     rng.fill(l,rng._uniform); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){ | ||||||
|  |     rng.fill(l,rng._gaussian); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ | ||||||
|  |     rng.fill(l,rng._bernoulli); | ||||||
|  |   } | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
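End-to-end, the free fill helpers above are used like this (UGrid and LatticeFermion are assumed from the rest of Grid):

    GridParallelRNG pRNG(UGrid);
    pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
    LatticeFermion src(UGrid);
    gaussian(pRNG, src);   // per-site normal(0,1) components
    random(pRNG, src);     // per-site uniform(0,1) components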
|   | |||||||
| @@ -42,7 +42,8 @@ namespace Grid { | |||||||
|       -> Lattice<decltype(trace(lhs._odata[0]))> |       -> Lattice<decltype(trace(lhs._odata[0]))> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); |       Lattice<decltype(trace(lhs._odata[0]))> ret(lhs._grid); | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|             ret._odata[ss] = trace(lhs._odata[ss]); |             ret._odata[ss] = trace(lhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -55,7 +56,8 @@ namespace Grid { | |||||||
|     inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> |     inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); |       Lattice<decltype(traceIndex<Index>(lhs._odata[0]))> ret(lhs._grid); | ||||||
|       parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |       for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
| 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); | 	ret._odata[ss] = traceIndex<Index>(lhs._odata[ss]); | ||||||
|       } |       } | ||||||
|       return ret; |       return ret; | ||||||
|   | |||||||
| @@ -51,7 +51,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | |||||||
|   template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ |   template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){ | ||||||
|     half.checkerboard = cb; |     half.checkerboard = cb; | ||||||
|     int ssh=0; |     int ssh=0; | ||||||
|     //parallel_for |     //PARALLEL_FOR_LOOP | ||||||
|     for(int ss=0;ss<full._grid->oSites();ss++){ |     for(int ss=0;ss<full._grid->oSites();ss++){ | ||||||
|       std::vector<int> coor; |       std::vector<int> coor; | ||||||
|       int cbos; |       int cbos; | ||||||
| @@ -68,7 +68,7 @@ inline void subdivides(GridBase *coarse,GridBase *fine) | |||||||
|   template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ |   template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){ | ||||||
|     int cb = half.checkerboard; |     int cb = half.checkerboard; | ||||||
|     int ssh=0; |     int ssh=0; | ||||||
|     //parallel_for |     //PARALLEL_FOR_LOOP | ||||||
|     for(int ss=0;ss<full._grid->oSites();ss++){ |     for(int ss=0;ss<full._grid->oSites();ss++){ | ||||||
|       std::vector<int> coor; |       std::vector<int> coor; | ||||||
|       int cbos; |       int cbos; | ||||||
| @@ -153,7 +153,8 @@ inline void blockZAXPY(Lattice<vobj> &fineZ, | |||||||
|     assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); |     assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   parallel_for(int sf=0;sf<fine->oSites();sf++){ | PARALLEL_FOR_LOOP | ||||||
|  |   for(int sf=0;sf<fine->oSites();sf++){ | ||||||
|      |      | ||||||
|     int sc; |     int sc; | ||||||
|     std::vector<int> coor_c(_ndimension); |     std::vector<int> coor_c(_ndimension); | ||||||
| @@ -185,7 +186,8 @@ template<class vobj,class CComplex> | |||||||
|  |  | ||||||
|   fine_inner = localInnerProduct(fineX,fineY); |   fine_inner = localInnerProduct(fineX,fineY); | ||||||
|   blockSum(coarse_inner,fine_inner); |   blockSum(coarse_inner,fine_inner); | ||||||
|   parallel_for(int ss=0;ss<coarse->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<coarse->oSites();ss++){ | ||||||
|     CoarseInner._odata[ss] = coarse_inner._odata[ss]; |     CoarseInner._odata[ss] = coarse_inner._odata[ss]; | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -331,6 +333,9 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | |||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|   typedef typename vvobj::scalar_object ssobj; |   typedef typename vvobj::scalar_object ssobj; | ||||||
|  |  | ||||||
|  |   sobj s; | ||||||
|  |   ssobj ss; | ||||||
|  |  | ||||||
|   GridBase *ig = in._grid; |   GridBase *ig = in._grid; | ||||||
|   GridBase *og = out._grid; |   GridBase *og = out._grid; | ||||||
|  |  | ||||||
| @@ -342,13 +347,10 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out) | |||||||
|   for(int d=0;d<no;d++){ |   for(int d=0;d<no;d++){ | ||||||
|     assert(ig->_processors[d]  == og->_processors[d]); |     assert(ig->_processors[d]  == og->_processors[d]); | ||||||
|     assert(ig->_ldimensions[d] == og->_ldimensions[d]); |     assert(ig->_ldimensions[d] == og->_ldimensions[d]); | ||||||
|     assert(ig->lSites() == og->lSites()); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   parallel_for(int idx=0;idx<ig->lSites();idx++){ |   //PARALLEL_FOR_LOOP | ||||||
|     sobj s; |   for(int idx=0;idx<ig->lSites();idx++){ | ||||||
|     ssobj ss; |  | ||||||
|  |  | ||||||
|     std::vector<int> lcoor(ni); |     std::vector<int> lcoor(ni); | ||||||
|     ig->LocalIndexToLocalCoor(idx,lcoor); |     ig->LocalIndexToLocalCoor(idx,lcoor); | ||||||
|     peekLocalSite(s,in,lcoor); |     peekLocalSite(s,in,lcoor); | ||||||
| @@ -362,6 +364,7 @@ template<class vobj> | |||||||
| void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   sobj s; | ||||||
|  |  | ||||||
|   GridBase *lg = lowDim._grid; |   GridBase *lg = lowDim._grid; | ||||||
|   GridBase *hg = higherDim._grid; |   GridBase *hg = higherDim._grid; | ||||||
| @@ -383,16 +386,17 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ |   // Guido: check the threading here | ||||||
|     sobj s; |   //PARALLEL_FOR_LOOP | ||||||
|  |   for(int idx=0;idx<lg->lSites();idx++){ | ||||||
|     std::vector<int> lcoor(nl); |     std::vector<int> lcoor(nl); | ||||||
|     std::vector<int> hcoor(nh); |     std::vector<int> hcoor(nh); | ||||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); |     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||||
|     int ddl=0; |     dl=0; | ||||||
|     hcoor[orthog] = slice; |     hcoor[orthog] = slice; | ||||||
|     for(int d=0;d<nh;d++){ |     for(int d=0;d<nh;d++){ | ||||||
|       if ( d!=orthog ) {  |       if ( d!=orthog ) {  | ||||||
| 	hcoor[d]=lcoor[ddl++]; | 	hcoor[d]=lcoor[dl++]; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     peekLocalSite(s,lowDim,lcoor); |     peekLocalSite(s,lowDim,lcoor); | ||||||
| @@ -404,6 +408,7 @@ template<class vobj> | |||||||
| void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog) | void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   sobj s; | ||||||
|  |  | ||||||
|   GridBase *lg = lowDim._grid; |   GridBase *lg = lowDim._grid; | ||||||
|   GridBase *hg = higherDim._grid; |   GridBase *hg = higherDim._grid; | ||||||
| @@ -424,16 +429,16 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ |   //PARALLEL_FOR_LOOP | ||||||
|     sobj s; |   for(int idx=0;idx<lg->lSites();idx++){ | ||||||
|     std::vector<int> lcoor(nl); |     std::vector<int> lcoor(nl); | ||||||
|     std::vector<int> hcoor(nh); |     std::vector<int> hcoor(nh); | ||||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); |     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||||
|     int ddl=0; |     dl=0; | ||||||
|     hcoor[orthog] = slice; |     hcoor[orthog] = slice; | ||||||
|     for(int d=0;d<nh;d++){ |     for(int d=0;d<nh;d++){ | ||||||
|       if ( d!=orthog ) {  |       if ( d!=orthog ) {  | ||||||
| 	hcoor[d]=lcoor[ddl++]; | 	hcoor[d]=lcoor[dl++]; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     peekLocalSite(s,higherDim,hcoor); |     peekLocalSite(s,higherDim,hcoor); | ||||||
| @@ -447,6 +452,7 @@ template<class vobj> | |||||||
| void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   sobj s; | ||||||
|  |  | ||||||
|   GridBase *lg = lowDim._grid; |   GridBase *lg = lowDim._grid; | ||||||
|   GridBase *hg = higherDim._grid; |   GridBase *hg = higherDim._grid; | ||||||
| @@ -463,8 +469,8 @@ void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ |   //PARALLEL_FOR_LOOP | ||||||
|     sobj s; |   for(int idx=0;idx<lg->lSites();idx++){ | ||||||
|     std::vector<int> lcoor(nl); |     std::vector<int> lcoor(nl); | ||||||
|     std::vector<int> hcoor(nh); |     std::vector<int> hcoor(nh); | ||||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); |     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||||
| @@ -482,6 +488,7 @@ template<class vobj> | |||||||
| void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|  |   sobj s; | ||||||
|  |  | ||||||
|   GridBase *lg = lowDim._grid; |   GridBase *lg = lowDim._grid; | ||||||
|   GridBase *hg = higherDim._grid; |   GridBase *hg = higherDim._grid; | ||||||
| @@ -498,8 +505,8 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slic | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   // the above should guarantee that the operations are local |   // the above should guarantee that the operations are local | ||||||
|   parallel_for(int idx=0;idx<lg->lSites();idx++){ |   //PARALLEL_FOR_LOOP | ||||||
|     sobj s; |   for(int idx=0;idx<lg->lSites();idx++){ | ||||||
|     std::vector<int> lcoor(nl); |     std::vector<int> lcoor(nl); | ||||||
|     std::vector<int> hcoor(nh); |     std::vector<int> hcoor(nh); | ||||||
|     lg->LocalIndexToLocalCoor(idx,lcoor); |     lg->LocalIndexToLocalCoor(idx,lcoor); | ||||||
| @@ -567,7 +574,8 @@ typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj> | |||||||
|     in_grid->iCoorFromIindex(in_icoor[lane], lane); |     in_grid->iCoorFromIindex(in_icoor[lane], lane); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index | PARALLEL_FOR_LOOP | ||||||
|  |   for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index | ||||||
|     //Assemble vector of pointers to output elements |     //Assemble vector of pointers to output elements | ||||||
|     std::vector<sobj*> out_ptrs(in_nsimd); |     std::vector<sobj*> out_ptrs(in_nsimd); | ||||||
|  |  | ||||||
| @@ -615,7 +623,8 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | |||||||
|   std::vector<SobjOut> in_slex_conv(in_grid->lSites()); |   std::vector<SobjOut> in_slex_conv(in_grid->lSites()); | ||||||
|   unvectorizeToLexOrdArray(in_slex_conv, in); |   unvectorizeToLexOrdArray(in_slex_conv, in); | ||||||
|      |      | ||||||
|   parallel_for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){ |   PARALLEL_FOR_LOOP | ||||||
|  |   for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){ | ||||||
|     std::vector<int> out_ocoor(ndim); |     std::vector<int> out_ocoor(ndim); | ||||||
|     out_grid->oCoorFromOindex(out_ocoor, out_oidx); |     out_grid->oCoorFromOindex(out_ocoor, out_oidx); | ||||||
|  |  | ||||||
| @@ -634,5 +643,9 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |    | ||||||
|  |  | ||||||
|  |   | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
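Illustrative use of the slice routines patched above, assuming a 5d grid FGrid whose trailing dimensions match a 4d grid UGrid, with the extra dimension first (orthog = 0):

    LatticeFermion F5(FGrid);   // 5d field
    LatticeFermion F4(UGrid);   // one 4d slice
    ExtractSlice(F4, F5, 0, 0); // pull out the s=0 slice along dimension 0
    InsertSlice(F4, F5, 0, 0);  // and write it back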
|   | |||||||
| @@ -41,7 +41,8 @@ namespace Grid { | |||||||
|   template<class vobj> |   template<class vobj> | ||||||
|     inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ |     inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){ | ||||||
|         Lattice<vobj> ret(lhs._grid); |         Lattice<vobj> ret(lhs._grid); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|             ret._odata[ss] = transpose(lhs._odata[ss]); |             ret._odata[ss] = transpose(lhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
| @@ -54,10 +55,12 @@ namespace Grid { | |||||||
|     inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> |     inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> | ||||||
|     { |     { | ||||||
|       Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid); |       Lattice<decltype(transposeIndex<Index>(lhs._odata[0]))> ret(lhs._grid); | ||||||
|     parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |         for(int ss=0;ss<lhs._grid->oSites();ss++){ | ||||||
|             ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]); |             ret._odata[ss] = transposeIndex<Index>(lhs._odata[ss]); | ||||||
|         } |         } | ||||||
|         return ret; |         return ret; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -37,7 +37,8 @@ namespace Grid { | |||||||
|     Lattice<obj> ret(rhs._grid); |     Lattice<obj> ret(rhs._grid); | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
|       ret._odata[ss]=pow(rhs._odata[ss],y); |       ret._odata[ss]=pow(rhs._odata[ss],y); | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
| @@ -46,7 +47,8 @@ namespace Grid { | |||||||
|     Lattice<obj> ret(rhs._grid); |     Lattice<obj> ret(rhs._grid); | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
|       ret._odata[ss]=mod(rhs._odata[ss],y); |       ret._odata[ss]=mod(rhs._odata[ss],y); | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
| @@ -56,7 +58,8 @@ namespace Grid { | |||||||
|     Lattice<obj> ret(rhs._grid); |     Lattice<obj> ret(rhs._grid); | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
|       ret._odata[ss]=div(rhs._odata[ss],y); |       ret._odata[ss]=div(rhs._odata[ss],y); | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
| @@ -66,7 +69,8 @@ namespace Grid { | |||||||
|     Lattice<obj> ret(rhs._grid); |     Lattice<obj> ret(rhs._grid); | ||||||
|     ret.checkerboard = rhs.checkerboard; |     ret.checkerboard = rhs.checkerboard; | ||||||
|     conformable(ret,rhs); |     conformable(ret,rhs); | ||||||
|     parallel_for(int ss=0;ss<rhs._grid->oSites();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |     for(int ss=0;ss<rhs._grid->oSites();ss++){ | ||||||
|       ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp); |       ret._odata[ss]=Exponentiate(rhs._odata[ss],alpha, Nexp); | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
|   | |||||||
| @@ -56,7 +56,8 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice< | |||||||
|   std::vector<scalar_object> truevals (Nsimd); |   std::vector<scalar_object> truevals (Nsimd); | ||||||
|   std::vector<scalar_object> falsevals(Nsimd); |   std::vector<scalar_object> falsevals(Nsimd); | ||||||
|  |  | ||||||
|   parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<iftrue._grid->oSites(); ss++){ | ||||||
|  |  | ||||||
|     extract(iftrue._odata[ss]   ,truevals); |     extract(iftrue._odata[ss]   ,truevals); | ||||||
|     extract(iffalse._odata[ss]  ,falsevals); |     extract(iffalse._odata[ss]  ,falsevals); | ||||||
|   | |||||||
| @@ -35,27 +35,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #endif | #endif | ||||||
| #include <arpa/inet.h> | #include <arpa/inet.h> | ||||||
| #include <algorithm> | #include <algorithm> | ||||||
|  | // 64bit endian swap is a portability pain | ||||||
|  | #ifndef __has_builtin         // Optional of course. | ||||||
|  | #define __has_builtin(x) 0  // Compatibility with non-clang compilers. | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if HAVE_DECL_BE64TOH  | ||||||
|  | #undef Grid_ntohll | ||||||
|  | #define Grid_ntohll be64toh | ||||||
|  | #endif | ||||||
|  |  | ||||||
| inline uint32_t byte_reverse32(uint32_t f) {  | #if HAVE_DECL_NTOHLL | ||||||
|       f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;  | #undef  Grid_ntohll | ||||||
|       return f; | #define Grid_ntohll ntohll | ||||||
| } | #endif | ||||||
| inline uint64_t byte_reverse64(uint64_t f) {  |  | ||||||
|   uint64_t g; | #ifndef Grid_ntohll | ||||||
|   g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;  |  | ||||||
|   g = g << 32; |  | ||||||
|   f = f >> 32; |  | ||||||
|   g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;  |  | ||||||
|   return g; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #if BYTE_ORDER == BIG_ENDIAN  | #if BYTE_ORDER == BIG_ENDIAN  | ||||||
| inline uint64_t Grid_ntohll(uint64_t A) { return A; } |  | ||||||
|  | #define Grid_ntohll(A) (A) | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
| inline uint64_t Grid_ntohll(uint64_t A) {  |  | ||||||
|   return byte_reverse64(A); | #if __has_builtin(__builtin_bswap64) | ||||||
| } | #define Grid_ntohll(A) __builtin_bswap64(A) | ||||||
|  | #else | ||||||
|  | #error | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
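A quick sanity check of the portable swaps on the left-hand side: reversing each 32-bit half and then exchanging the halves is the same as a full 8-byte reversal.

    #include <cassert>
    #include <cstdint>

    int main() {
      assert(byte_reverse32(0x01020304U) == 0x04030201U);
      assert(byte_reverse64(0x0102030405060708ULL) == 0x0807060504030201ULL);
      return 0;
    }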
|  |  | ||||||
| namespace Grid {  | namespace Grid {  | ||||||
| @@ -185,7 +195,7 @@ class BinaryIO { | |||||||
|       std::vector<int> site({x,y,z,t}); |       std::vector<int> site({x,y,z,t}); | ||||||
|  |  | ||||||
|       if (grid->IsBoss()) { |       if (grid->IsBoss()) { | ||||||
|         fin.read((char *)&file_object, sizeof(file_object));assert( fin.fail()==0); |         fin.read((char *)&file_object, sizeof(file_object)); | ||||||
|         bytes += sizeof(file_object); |         bytes += sizeof(file_object); | ||||||
|         if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object)); |         if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object)); | ||||||
|         if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object)); |         if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object)); | ||||||
| @@ -201,13 +211,11 @@ class BinaryIO { | |||||||
|     std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" " |     std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" " | ||||||
|        << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl; |        << (double)bytes/ (double)timer.useconds() <<" MB/s "  <<std::endl; | ||||||
|  |  | ||||||
|     grid->Broadcast(0,(void *)&csum,sizeof(csum)); |  | ||||||
|     return csum; |     return csum; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class vobj,class fobj,class munger>  |   template<class vobj,class fobj,class munger>  | ||||||
|   static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset, |   static inline uint32_t writeObjectSerial(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format) | ||||||
| 					   const std::string & format) |  | ||||||
|   { |   { | ||||||
|     typedef typename vobj::scalar_object sobj; |     typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
| @@ -253,7 +261,7 @@ class BinaryIO { | |||||||
|   if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object)); |   if(ieee64)    htole64_v((void *)&file_object,sizeof(file_object)); | ||||||
|  |  | ||||||
|   // NB could gather an xstrip as an optimisation. |   // NB could gather an xstrip as an optimisation. | ||||||
| 	fout.write((char *)&file_object,sizeof(file_object));assert( fout.fail()==0); |   fout.write((char *)&file_object,sizeof(file_object)); | ||||||
|   bytes+=sizeof(file_object); |   bytes+=sizeof(file_object); | ||||||
|       } |       } | ||||||
|     }}}} |     }}}} | ||||||
| @@ -261,7 +269,6 @@ class BinaryIO { | |||||||
|     std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" " |     std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" " | ||||||
|        << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl; |        << (double)bytes/timer.useconds() <<" MB/s "  <<std::endl; | ||||||
|  |  | ||||||
|     grid->Broadcast(0,(void *)&csum,sizeof(csum)); |  | ||||||
|     return csum; |     return csum; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -306,7 +313,7 @@ class BinaryIO { | |||||||
|  |  | ||||||
|       if ( grid->IsBoss() ) { |       if ( grid->IsBoss() ) { | ||||||
|   Uint32Checksum((uint32_t *)&saved[0],bytes,csum); |   Uint32Checksum((uint32_t *)&saved[0],bytes,csum); | ||||||
| 	fout.write((char *)&saved[0],bytes);assert( fout.fail()==0); |   fout.write((char *)&saved[0],bytes); | ||||||
|       } |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
| @@ -314,7 +321,7 @@ class BinaryIO { | |||||||
|     if ( grid->IsBoss() ) { |     if ( grid->IsBoss() ) { | ||||||
|       serial.GetState(saved,0); |       serial.GetState(saved,0); | ||||||
|       Uint32Checksum((uint32_t *)&saved[0],bytes,csum); |       Uint32Checksum((uint32_t *)&saved[0],bytes,csum); | ||||||
|       fout.write((char *)&saved[0],bytes);assert( fout.fail()==0); |       fout.write((char *)&saved[0],bytes); | ||||||
|     } |     } | ||||||
|     grid->Broadcast(0,(void *)&csum,sizeof(csum)); |     grid->Broadcast(0,(void *)&csum,sizeof(csum)); | ||||||
|     return csum; |     return csum; | ||||||
| @@ -348,7 +355,7 @@ class BinaryIO { | |||||||
|       int l_idx=parallel.generator_idx(o_idx,i_idx); |       int l_idx=parallel.generator_idx(o_idx,i_idx); | ||||||
|  |  | ||||||
|       if ( grid->IsBoss() ) { |       if ( grid->IsBoss() ) { | ||||||
| 	fin.read((char *)&saved[0],bytes);assert( fin.fail()==0); |   fin.read((char *)&saved[0],bytes); | ||||||
|   Uint32Checksum((uint32_t *)&saved[0],bytes,csum); |   Uint32Checksum((uint32_t *)&saved[0],bytes,csum); | ||||||
|       } |       } | ||||||
|  |  | ||||||
| @@ -361,7 +368,7 @@ class BinaryIO { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     if ( grid->IsBoss() ) { |     if ( grid->IsBoss() ) { | ||||||
|       fin.read((char *)&saved[0],bytes);assert( fin.fail()==0); |       fin.read((char *)&saved[0],bytes); | ||||||
|       serial.SetState(saved,0); |       serial.SetState(saved,0); | ||||||
|       Uint32Checksum((uint32_t *)&saved[0],bytes,csum); |       Uint32Checksum((uint32_t *)&saved[0],bytes,csum); | ||||||
|     } |     } | ||||||
| @@ -373,8 +380,7 @@ class BinaryIO { | |||||||
|  |  | ||||||
|  |  | ||||||
|   template<class vobj,class fobj,class munger> |   template<class vobj,class fobj,class munger> | ||||||
|   static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset, |   static inline uint32_t readObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string &format) | ||||||
| 					    const std::string &format) |  | ||||||
|   { |   { | ||||||
|     typedef typename vobj::scalar_object sobj; |     typedef typename vobj::scalar_object sobj; | ||||||
|  |  | ||||||
| @@ -483,7 +489,7 @@ class BinaryIO { | |||||||
|       if (myrank == iorank) { |       if (myrank == iorank) { | ||||||
|    |    | ||||||
|   fin.seekg(offset+g_idx*sizeof(fileObj)); |   fin.seekg(offset+g_idx*sizeof(fileObj)); | ||||||
| 	fin.read((char *)&fileObj,sizeof(fileObj));assert( fin.fail()==0); |   fin.read((char *)&fileObj,sizeof(fileObj)); | ||||||
|   bytes+=sizeof(fileObj); |   bytes+=sizeof(fileObj); | ||||||
|    |    | ||||||
|   if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj)); |   if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj)); | ||||||
| @@ -523,8 +529,7 @@ class BinaryIO { | |||||||
|   // Parallel writer |   // Parallel writer | ||||||
|   ////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////// | ||||||
|   template<class vobj,class fobj,class munger> |   template<class vobj,class fobj,class munger> | ||||||
|   static inline uint32_t writeObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset, |   static inline uint32_t writeObjectParallel(Lattice<vobj> &Umu,std::string file,munger munge,int offset,const std::string & format) | ||||||
| 					     const std::string & format) |  | ||||||
|   { |   { | ||||||
|     typedef typename vobj::scalar_object sobj; |     typedef typename vobj::scalar_object sobj; | ||||||
|     GridBase *grid = Umu._grid; |     GridBase *grid = Umu._grid; | ||||||
| @@ -653,7 +658,7 @@ class BinaryIO { | |||||||
|   if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj)); |   if(ieee64)    htole64_v((void *)&fileObj,sizeof(fileObj)); | ||||||
|    |    | ||||||
|   fout.seekp(offset+g_idx*sizeof(fileObj)); |   fout.seekp(offset+g_idx*sizeof(fileObj)); | ||||||
| 	fout.write((char *)&fileObj,sizeof(fileObj));assert( fout.fail()==0); |   fout.write((char *)&fileObj,sizeof(fileObj)); | ||||||
|   bytes+=sizeof(fileObj); |   bytes+=sizeof(fileObj); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   | |||||||
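The BinaryIO hunks above drop the per-write assert(fout.fail()==0) guards and the boss-rank Broadcast of the checksum after serial I/O. The checksum being passed around is the simple 32-bit word sum used throughout this file; a minimal standalone sketch of that shape (the exact accumulation Grid's Uint32Checksum performs may differ):

    #include <cstdint>
    #include <cstddef>

    // Word-sum checksum sketch: add every 32-bit word of the buffer into csum,
    // letting unsigned overflow wrap. Every rank must end up with the same
    // value, which is what the removed boss-rank Broadcast() used to ensure.
    static void uint32Checksum(const uint32_t *buf, size_t bytes, uint32_t &csum) {
      for (size_t i = 0; i < bytes / sizeof(uint32_t); i++) csum += buf[i];
    }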
| @@ -242,6 +242,7 @@ class NerscIO : public BinaryIO { | |||||||
|   static inline unsigned int writeHeader(NerscField &field,std::string file) |   static inline unsigned int writeHeader(NerscField &field,std::string file) | ||||||
|   { |   { | ||||||
|     std::ofstream fout(file,std::ios::out|std::ios::in); |     std::ofstream fout(file,std::ios::out|std::ios::in); | ||||||
|  |    | ||||||
|     fout.seekp(0,std::ios::beg); |     fout.seekp(0,std::ios::beg); | ||||||
|     dump_nersc_header(field, fout); |     dump_nersc_header(field, fout); | ||||||
|     field.data_start = fout.tellp(); |     field.data_start = fout.tellp(); | ||||||
| @@ -263,13 +264,10 @@ static inline int readHeader(std::string file,GridBase *grid,  NerscField &field | |||||||
|   getline(fin,line); // read one line and insist is  |   getline(fin,line); // read one line and insist is  | ||||||
|  |  | ||||||
|   removeWhitespace(line); |   removeWhitespace(line); | ||||||
|   std::cout << GridLogMessage << "* " << line << std::endl; |  | ||||||
|  |  | ||||||
|   assert(line==std::string("BEGIN_HEADER")); |   assert(line==std::string("BEGIN_HEADER")); | ||||||
|  |  | ||||||
|   do { |   do { | ||||||
|     getline(fin,line); // read one line |     getline(fin,line); // read one line | ||||||
|     std::cout << GridLogMessage << "* "<<line<< std::endl; |  | ||||||
|     int eq = line.find("="); |     int eq = line.find("="); | ||||||
|     if(eq >0) { |     if(eq >0) { | ||||||
|       std::string key=line.substr(0,eq); |       std::string key=line.substr(0,eq); | ||||||
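With the logging lines removed, the readHeader loop above reduces to a plain key=value scanner between BEGIN_HEADER and END_HEADER. A self-contained sketch of the same parsing pattern (the map type and the whitespace stripping are assumptions, not Grid's exact helpers):

    #include <algorithm>
    #include <cctype>
    #include <fstream>
    #include <map>
    #include <string>

    // Strip all whitespace, as removeWhitespace() does above (assumed behaviour).
    static void removeWhitespace(std::string &s) {
      s.erase(std::remove_if(s.begin(), s.end(), ::isspace), s.end());
    }

    static std::map<std::string, std::string> parseHeader(std::ifstream &fin) {
      std::map<std::string, std::string> header;
      std::string line;
      while (std::getline(fin, line)) {
        removeWhitespace(line);
        if (line == "END_HEADER") break;          // terminator line
        size_t eq = line.find('=');
        if (eq != std::string::npos && eq > 0)    // split key=value pairs
          header[line.substr(0, eq)] = line.substr(eq + 1);
      }
      return header;
    }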
| @@ -324,8 +322,6 @@ static inline int readHeader(std::string file,GridBase *grid,  NerscField &field | |||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Now the meat: the object readers | // Now the meat: the object readers | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #define PARALLEL_READ |  | ||||||
| #define PARALLEL_WRITE |  | ||||||
|  |  | ||||||
| template<class vsimd> | template<class vsimd> | ||||||
| static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file) | static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,NerscField& header,std::string file) | ||||||
| @@ -349,41 +345,25 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, | |||||||
|   // munger is a function of <floating point, Real, data_type> |   // munger is a function of <floating point, Real, data_type> | ||||||
|   if ( header.data_type == std::string("4D_SU3_GAUGE") ) { |   if ( header.data_type == std::string("4D_SU3_GAUGE") ) { | ||||||
|     if ( ieee32 || ieee32big ) { |     if ( ieee32 || ieee32big ) { | ||||||
| #ifdef PARALLEL_READ |       //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>  | ||||||
| 	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>  | 	csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>  | ||||||
| 	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format); | 	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format); | ||||||
| #else |  | ||||||
|       csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>  |  | ||||||
| 	(Umu,file,Nersc3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format); |  | ||||||
| #endif |  | ||||||
|     } |     } | ||||||
|     if ( ieee64 || ieee64big ) { |     if ( ieee64 || ieee64big ) { | ||||||
| #ifdef PARALLEL_READ |       //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>  | ||||||
|       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>  |       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>  | ||||||
|       	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format); |       	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format); | ||||||
| #else  |  | ||||||
|       csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>  |  | ||||||
|       	(Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format); |  | ||||||
| #endif |  | ||||||
|     } |     } | ||||||
|   } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { |   } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { | ||||||
|     if ( ieee32 || ieee32big ) { |     if ( ieee32 || ieee32big ) { | ||||||
| #ifdef PARALLEL_READ |       //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> | ||||||
|       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> |       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> | ||||||
| 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format); | 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format); | ||||||
| #else |  | ||||||
|       csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF> |  | ||||||
| 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format); |  | ||||||
| #endif |  | ||||||
|     } |     } | ||||||
|     if ( ieee64 || ieee64big ) { |     if ( ieee64 || ieee64big ) { | ||||||
| #ifdef PARALLEL_READ |       //      csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> | ||||||
|       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> |       csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> | ||||||
| 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format); | 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format); | ||||||
| #else |  | ||||||
|       csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD> |  | ||||||
| 	(Umu,file,NerscSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format); |  | ||||||
| #endif |  | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|     assert(0); |     assert(0); | ||||||
| @@ -391,17 +371,12 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, | |||||||
|  |  | ||||||
|   NerscStatistics<GaugeField>(Umu,clone); |   NerscStatistics<GaugeField>(Umu,clone); | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<            csum<< std::dec |  | ||||||
| 	                                                  <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette |  | ||||||
| 	                                                  <<" header    "<<header.plaquette<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace |  | ||||||
| 	                                                  <<" header    "<<header.link_trace<<std::endl; |  | ||||||
|   assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); |   assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); | ||||||
|   assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); |   assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); | ||||||
|  |  | ||||||
|   assert(csum == header.checksum ); |   assert(csum == header.checksum ); | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; |   std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vsimd> | template<class vsimd> | ||||||
| @@ -441,11 +416,19 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu | |||||||
|     Nersc3x2unmunger<fobj2D,sobj> munge; |     Nersc3x2unmunger<fobj2D,sobj> munge; | ||||||
|     BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum); |     BinaryIO::Uint32Checksum<vobj,fobj2D>(Umu, munge,header.checksum); | ||||||
|     offset = writeHeader(header,file); |     offset = writeHeader(header,file); | ||||||
| #ifdef PARALLEL_WRITE |  | ||||||
|     csum=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point); |  | ||||||
| #else |  | ||||||
|     csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point); |     csum=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file,munge,offset,header.floating_point); | ||||||
| #endif |  | ||||||
|  |     std::string file1 = file+"para"; | ||||||
|  |     int offset1 = writeHeader(header,file1); | ||||||
|  |     int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point); | ||||||
|  |     //int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point); | ||||||
|  |  | ||||||
|  |      | ||||||
|  |     std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl; | ||||||
|  |     std::cout << GridLogMessage << " TESTING PARALLEL WRITE csums   " << csum1 << " "<<std::hex<< csum << std::dec<< std::endl; | ||||||
|  |  | ||||||
|  |     assert(offset1==offset);   | ||||||
|  |     assert(csum1==csum);   | ||||||
|  |  | ||||||
|   } else {  |   } else {  | ||||||
|     header.floating_point = std::string("IEEE64BIG"); |     header.floating_point = std::string("IEEE64BIG"); | ||||||
| @@ -453,11 +436,8 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu | |||||||
|     NerscSimpleUnmunger<fobj3D,sobj> munge; |     NerscSimpleUnmunger<fobj3D,sobj> munge; | ||||||
|     BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum); |     BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum); | ||||||
|     offset = writeHeader(header,file); |     offset = writeHeader(header,file); | ||||||
| #ifdef PARALLEL_WRITE |     //    csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point); | ||||||
|     csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point); |     csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point); | ||||||
| #else |  | ||||||
|     csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl; |   std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl; | ||||||
| @@ -491,15 +471,10 @@ static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG ¶llel | |||||||
| #ifdef RNG_RANLUX | #ifdef RNG_RANLUX | ||||||
|     header.floating_point = std::string("UINT64"); |     header.floating_point = std::string("UINT64"); | ||||||
|     header.data_type      = std::string("RANLUX48"); |     header.data_type      = std::string("RANLUX48"); | ||||||
| #endif | #else | ||||||
| #ifdef RNG_MT19937 |  | ||||||
|     header.floating_point = std::string("UINT32"); |     header.floating_point = std::string("UINT32"); | ||||||
|     header.data_type      = std::string("MT19937"); |     header.data_type      = std::string("MT19937"); | ||||||
| #endif | #endif | ||||||
| #ifdef RNG_SITMO |  | ||||||
|     header.floating_point = std::string("UINT64"); |  | ||||||
|     header.data_type      = std::string("SITMO"); |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|   truncate(file); |   truncate(file); | ||||||
|   offset = writeHeader(header,file); |   offset = writeHeader(header,file); | ||||||
| @@ -527,20 +502,17 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel | |||||||
| #ifdef RNG_RANLUX | #ifdef RNG_RANLUX | ||||||
|   assert(format == std::string("UINT64")); |   assert(format == std::string("UINT64")); | ||||||
|   assert(data_type == std::string("RANLUX48")); |   assert(data_type == std::string("RANLUX48")); | ||||||
| #endif | #else | ||||||
| #ifdef RNG_MT19937 |  | ||||||
|   assert(format == std::string("UINT32")); |   assert(format == std::string("UINT32")); | ||||||
|   assert(data_type == std::string("MT19937")); |   assert(data_type == std::string("MT19937")); | ||||||
| #endif | #endif | ||||||
| #ifdef RNG_SITMO |  | ||||||
|   assert(format == std::string("UINT64")); |  | ||||||
|   assert(data_type == std::string("SITMO")); |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|   // depending on datatype, set up munger; |   // depending on datatype, set up munger; | ||||||
|   // munger is a function of <floating point, Real, data_type> |   // munger is a function of <floating point, Real, data_type> | ||||||
|   uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset); |   uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset); | ||||||
|  |  | ||||||
|  |   std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl; | ||||||
|  |  | ||||||
|   assert(csum == header.checksum ); |   assert(csum == header.checksum ); | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl; |   std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl; | ||||||
|   | |||||||
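The writeConfiguration change above is a belt-and-braces migration step: the 2x3-compressed branch now writes the lattice twice, once serially and once through the parallel writer into file+"para", and asserts that the offsets and checksums agree before the parallel path is trusted. A generic sketch of that A/B pattern (names hypothetical):

    #include <cassert>
    #include <cstdint>
    #include <functional>

    // Run a reference writer and a candidate writer over the same data and
    // insist on bit-identical checksums before returning.
    static uint32_t checkedWrite(std::function<uint32_t()> reference,
                                 std::function<uint32_t()> candidate) {
      uint32_t csumRef = reference();   // e.g. the serial writer
      uint32_t csumNew = candidate();   // e.g. the parallel writer to file+"para"
      assert(csumRef == csumNew);       // fail loudly if the new path diverges
      return csumRef;
    }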
| @@ -29,8 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #ifndef GRID_QCD_BASE_H | #ifndef GRID_QCD_H | ||||||
| #define GRID_QCD_BASE_H | #define GRID_QCD_H | ||||||
| namespace Grid{ | namespace Grid{ | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
| @@ -62,6 +62,7 @@ namespace QCD { | |||||||
|     #define SpinIndex    1 |     #define SpinIndex    1 | ||||||
|     #define LorentzIndex 0 |     #define LorentzIndex 0 | ||||||
|  |  | ||||||
|  |    | ||||||
|     // Also should make these a named enum type |     // Also should make these a named enum type | ||||||
|     static const int DaggerNo=0; |     static const int DaggerNo=0; | ||||||
|     static const int DaggerYes=1; |     static const int DaggerYes=1; | ||||||
| @@ -493,5 +494,26 @@ namespace QCD { | |||||||
| } // Grid | } // Grid | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/utils/SpaceTimeGrid.h> | ||||||
|  | #include <Grid/qcd/spin/Dirac.h> | ||||||
|  | #include <Grid/qcd/spin/TwoSpinor.h> | ||||||
|  | #include <Grid/qcd/utils/LinalgUtils.h> | ||||||
|  | #include <Grid/qcd/utils/CovariantCshift.h> | ||||||
|  |  | ||||||
|  | // Include representations 	 | ||||||
|  | #include <Grid/qcd/utils/SUn.h> | ||||||
|  | #include <Grid/qcd/utils/SUnAdjoint.h> | ||||||
|  | #include <Grid/qcd/utils/SUnTwoIndex.h> | ||||||
|  | #include <Grid/qcd/representations/hmc_types.h> | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/action/Actions.h> | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/smearing/Smearing.h> | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/hmc/integrators/Integrator.h> | ||||||
|  | #include <Grid/qcd/hmc/integrators/Integrator_algorithm.h> | ||||||
|  | #include <Grid/qcd/hmc/HMC.h> | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
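This hunk folds the split headers (SpaceTimeGrid, Dirac, SUn, Actions, smearing, HMC integrators) back into the single GRID_QCD_H aggregate, so downstream code only needs the top-level include. A minimal consumer sketch, assuming the usual Grid entry points and the gauge-action typedefs defined later in this diff (the beta value is illustrative):

    #include <Grid/Grid.h>   // pulls in the whole aggregate shown above

    using namespace Grid;
    using namespace Grid::QCD;

    int main(int argc, char **argv) {
      Grid_init(&argc, &argv);
      WilsonGaugeActionR Waction(5.6);  // typedef exposed via the aggregate
      Grid_finalize();
      return 0;
    }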
| @@ -1,50 +0,0 @@ | |||||||
|     /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/qcd/action/Actions.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <pabobyle@ph.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> |  | ||||||
| Author: neo <cossu@post.kek.jp> |  | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #ifndef GRID_QCD_ACTION_H |  | ||||||
| #define GRID_QCD_ACTION_H |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| // Abstract base interface |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/ActionCore.h> |  | ||||||
| //////////////////////////////////////////////////////////////////////// |  | ||||||
| // Fermion actions; prevent coupling fermion.cc files to other headers |  | ||||||
| //////////////////////////////////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> |  | ||||||
| #include <Grid/qcd/action/fermion/Fermion.h> |  | ||||||
| //////////////////////////////////////// |  | ||||||
| // Pseudo fermion combinations for HMC |  | ||||||
| //////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/pseudofermion/PseudoFermion.h> |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
| @@ -150,5 +150,4 @@ using ActionSet = std::vector<ActionLevel<GaugeField, R> >; | |||||||
|  |  | ||||||
| } | } | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -1,45 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./lib/qcd/action/ActionCore.h |  | ||||||
|  |  | ||||||
| Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: neo <cossu@post.kek.jp> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #ifndef QCD_ACTION_CORE |  | ||||||
| #define QCD_ACTION_CORE |  | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/ActionBase.h> |  | ||||||
| #include <Grid/qcd/action/ActionParams.h> |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| // Gauge Actions |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/gauge/Gauge.h> |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| // Fermion prereqs |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
| @@ -45,10 +45,6 @@ namespace QCD { | |||||||
|       WilsonImplParams() : overlapCommsCompute(false) {}; |       WilsonImplParams() : overlapCommsCompute(false) {}; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     struct StaggeredImplParams { |  | ||||||
|       StaggeredImplParams()  {}; |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     struct OneFlavourRationalParams {  |     struct OneFlavourRationalParams {  | ||||||
|       RealD  lo; |       RealD  lo; | ||||||
|       RealD  hi; |       RealD  hi; | ||||||
|   | |||||||
| @@ -2,11 +2,16 @@ | |||||||
| 
 | 
 | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| 
 | 
 | ||||||
|     Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h |     Source file: ./lib/qcd/action/Actions.h | ||||||
| 
 | 
 | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2015 | ||||||
| 
 | 
 | ||||||
|  | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
| Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local> | ||||||
|  | Author: neo <cossu@post.kek.jp> | ||||||
|  | Author: paboyle <paboyle@ph.ed.ac.uk> | ||||||
| 
 | 
 | ||||||
|     This program is free software; you can redistribute it and/or modify |     This program is free software; you can redistribute it and/or modify | ||||||
|     it under the terms of the GNU General Public License as published by |     it under the terms of the GNU General Public License as published by | ||||||
| @@ -25,8 +30,67 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #ifndef  GRID_QCD_FERMION_H | #ifndef GRID_QCD_ACTIONS_H | ||||||
| #define  GRID_QCD_FERMION_H | #define GRID_QCD_ACTIONS_H | ||||||
|  | 
 | ||||||
|  | // * Linear operators             (Hermitian and non-hermitian)  .. my LinearOperator
 | ||||||
|  | // * System solvers               (Hermitian and non-hermitian)  .. my OperatorFunction
 | ||||||
|  | // * MultiShift System solvers    (Hermitian and non-hermitian)  .. my OperatorFunction
 | ||||||
|  | 
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | // Abstract base interface
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | #include <Grid/qcd/action/ActionBase.h> | ||||||
|  | #include <Grid/qcd/action/ActionParams.h> | ||||||
|  | 
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | // Utility functions
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | #include <Grid/qcd/action/gauge/GaugeImpl.h> | ||||||
|  | #include <Grid/qcd/utils/WilsonLoops.h> | ||||||
|  | 
 | ||||||
|  | #include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions | ||||||
|  | #include <Grid/qcd/action/fermion/FermionOperatorImpl.h> | ||||||
|  | #include <Grid/qcd/action/fermion/FermionOperator.h> | ||||||
|  | #include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions | ||||||
|  | 
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | // Gauge Actions
 | ||||||
|  | ////////////////////////////////////////////
 | ||||||
|  | #include <Grid/qcd/action/gauge/WilsonGaugeAction.h> | ||||||
|  | #include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h> | ||||||
|  | 
 | ||||||
|  | namespace Grid { | ||||||
|  | namespace QCD { | ||||||
|  | 
 | ||||||
|  | typedef WilsonGaugeAction<PeriodicGimplR>          WilsonGaugeActionR; | ||||||
|  | typedef WilsonGaugeAction<PeriodicGimplF>          WilsonGaugeActionF; | ||||||
|  | typedef WilsonGaugeAction<PeriodicGimplD>          WilsonGaugeActionD; | ||||||
|  | typedef PlaqPlusRectangleAction<PeriodicGimplR>    PlaqPlusRectangleActionR; | ||||||
|  | typedef PlaqPlusRectangleAction<PeriodicGimplF>    PlaqPlusRectangleActionF; | ||||||
|  | typedef PlaqPlusRectangleAction<PeriodicGimplD>    PlaqPlusRectangleActionD; | ||||||
|  | typedef IwasakiGaugeAction<PeriodicGimplR>         IwasakiGaugeActionR; | ||||||
|  | typedef IwasakiGaugeAction<PeriodicGimplF>         IwasakiGaugeActionF; | ||||||
|  | typedef IwasakiGaugeAction<PeriodicGimplD>         IwasakiGaugeActionD; | ||||||
|  | typedef SymanzikGaugeAction<PeriodicGimplR>        SymanzikGaugeActionR; | ||||||
|  | typedef SymanzikGaugeAction<PeriodicGimplF>        SymanzikGaugeActionF; | ||||||
|  | typedef SymanzikGaugeAction<PeriodicGimplD>        SymanzikGaugeActionD; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | typedef WilsonGaugeAction<ConjugateGimplR>          ConjugateWilsonGaugeActionR; | ||||||
|  | typedef WilsonGaugeAction<ConjugateGimplF>          ConjugateWilsonGaugeActionF; | ||||||
|  | typedef WilsonGaugeAction<ConjugateGimplD>          ConjugateWilsonGaugeActionD; | ||||||
|  | typedef PlaqPlusRectangleAction<ConjugateGimplR>    ConjugatePlaqPlusRectangleActionR; | ||||||
|  | typedef PlaqPlusRectangleAction<ConjugateGimplF>    ConjugatePlaqPlusRectangleActionF; | ||||||
|  | typedef PlaqPlusRectangleAction<ConjugateGimplD>    ConjugatePlaqPlusRectangleActionD; | ||||||
|  | typedef IwasakiGaugeAction<ConjugateGimplR>         ConjugateIwasakiGaugeActionR; | ||||||
|  | typedef IwasakiGaugeAction<ConjugateGimplF>         ConjugateIwasakiGaugeActionF; | ||||||
|  | typedef IwasakiGaugeAction<ConjugateGimplD>         ConjugateIwasakiGaugeActionD; | ||||||
|  | typedef SymanzikGaugeAction<ConjugateGimplR>        ConjugateSymanzikGaugeActionR; | ||||||
|  | typedef SymanzikGaugeAction<ConjugateGimplF>        ConjugateSymanzikGaugeActionF; | ||||||
|  | typedef SymanzikGaugeAction<ConjugateGimplD>        ConjugateSymanzikGaugeActionD; | ||||||
|  | 
 | ||||||
|  | }} | ||||||
| 
 | 
 | ||||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ||||||
| // Explicit explicit template instantiation is still required in the .cc files
 | // Explicit explicit template instantiation is still required in the .cc files
 | ||||||
| @@ -43,6 +107,36 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | |||||||
| // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 | // for EVERY .cc file. This define centralises the list and restores global push of impl cases
 | ||||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | #define FermOp4dVecTemplateInstantiate(A) \ | ||||||
|  |   template class A<WilsonImplF>;		\ | ||||||
|  |   template class A<WilsonImplD>;		\ | ||||||
|  |   template class A<ZWilsonImplF>;		\ | ||||||
|  |   template class A<ZWilsonImplD>;		\ | ||||||
|  |   template class A<GparityWilsonImplF>;		\ | ||||||
|  |   template class A<GparityWilsonImplD>;		 | ||||||
|  | 
 | ||||||
|  | #define AdjointFermOpTemplateInstantiate(A) \ | ||||||
|  |   template class A<WilsonAdjImplF>; \ | ||||||
|  |   template class A<WilsonAdjImplD>;  | ||||||
|  | 
 | ||||||
|  | #define TwoIndexFermOpTemplateInstantiate(A) \ | ||||||
|  |   template class A<WilsonTwoIndexSymmetricImplF>; \ | ||||||
|  |   template class A<WilsonTwoIndexSymmetricImplD>;  | ||||||
|  | 
 | ||||||
|  | #define FermOp5dVecTemplateInstantiate(A) \ | ||||||
|  |   template class A<DomainWallVec5dImplF>;	\ | ||||||
|  |   template class A<DomainWallVec5dImplD>;	\ | ||||||
|  |   template class A<ZDomainWallVec5dImplF>;	\ | ||||||
|  |   template class A<ZDomainWallVec5dImplD>;	 | ||||||
|  | 
 | ||||||
|  | #define FermOpTemplateInstantiate(A) \ | ||||||
|  |  FermOp4dVecTemplateInstantiate(A) \ | ||||||
|  |  FermOp5dVecTemplateInstantiate(A)  | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #define GparityFermOpTemplateInstantiate(A)  | ||||||
|  | 
 | ||||||
| ////////////////////////////////////////////
 | ////////////////////////////////////////////
 | ||||||
| // Fermion operators / actions
 | // Fermion operators / actions
 | ||||||
| ////////////////////////////////////////////
 | ////////////////////////////////////////////
 | ||||||
| @@ -50,9 +144,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | |||||||
| #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like | #include <Grid/qcd/action/fermion/WilsonFermion.h>       // 4d wilson like | ||||||
| #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like | #include <Grid/qcd/action/fermion/WilsonTMFermion.h>       // 4d wilson like | ||||||
| #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types | #include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types | ||||||
|  | 
 | ||||||
| //#include <Grid/qcd/action/fermion/CloverFermion.h>
 | //#include <Grid/qcd/action/fermion/CloverFermion.h>
 | ||||||
| #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h> | 
 | ||||||
| #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> |  | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types | #include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types | ||||||
| #include <Grid/qcd/action/fermion/DomainWallFermion.h> | #include <Grid/qcd/action/fermion/DomainWallFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/DomainWallFermion.h> | #include <Grid/qcd/action/fermion/DomainWallFermion.h> | ||||||
| @@ -63,16 +157,14 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk> | |||||||
| #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h> | #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h> | ||||||
|  | 
 | ||||||
| #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction | #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h>               // Continued fraction | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h> | ||||||
|  | 
 | ||||||
| #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction | #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h>                 // Partial fraction | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h> | ||||||
| #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h> | #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h> | ||||||
| /////////////////////////////////////////////////////////////////////////////// |  | ||||||
| // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code |  | ||||||
| /////////////////////////////////////////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/fermion/g5HermitianLinop.h> |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // More maintainable to maintain the following typedef list centrally, as more "impl" targets | // More maintainable to maintain the following typedef list centrally, as more "impl" targets | ||||||
| @@ -176,19 +268,27 @@ typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR; | |||||||
| typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF; | typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF; | ||||||
| typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD; | typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD; | ||||||
| 
 | 
 | ||||||
| typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR; |  | ||||||
| typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; |  | ||||||
| typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; |  | ||||||
| 
 |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR; |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; |  | ||||||
| 
 |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR; |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF; |  | ||||||
| typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD; |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|   }} |   }} | ||||||
|  | /////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code | ||||||
|  | /////////////////////////////////////////////////////////////////////////////// | ||||||
|  | #include <Grid/qcd/action/fermion/g5HermitianLinop.h> | ||||||
|  |  | ||||||
|  | //////////////////////////////////////// | ||||||
|  | // Pseudo fermion combinations for HMC | ||||||
|  | //////////////////////////////////////// | ||||||
|  | #include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h> | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/action/pseudofermion/TwoFlavour.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h> | ||||||
|  |  | ||||||
|  | #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h> | ||||||
|  | #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h> | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
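The *TemplateInstantiate macros above exist so that each fermion .cc file gets the full impl list from one line instead of repeating every explicit instantiation; this is the "centralises the list" comment in action. A sketch of the intended use in a translation unit (the Grid sources follow this pattern; the exact file is illustrative):

    // At the bottom of e.g. WilsonFermion5D.cc:
    namespace Grid {
    namespace QCD {
      FermOpTemplateInstantiate(WilsonFermion5D);  // expands to
      // template class WilsonFermion5D<WilsonImplF>; ... and the 5d-vec impls
    }}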
| @@ -30,8 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
|  |  | ||||||
| #include <Grid/Eigen/Dense> | #include <Grid/Eigen/Dense> | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD { | namespace QCD { | ||||||
| @@ -64,18 +64,6 @@ void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi) | |||||||
|     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi |     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi | ||||||
|   } |   } | ||||||
| } | } | ||||||
| template<class Impl>   |  | ||||||
| void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi) |  | ||||||
| { |  | ||||||
|   int Ls=this->Ls; |  | ||||||
|  |  | ||||||
|   FermionField tmp_f(this->FermionGrid()); |  | ||||||
|   this->DW(psi,tmp_f,DaggerYes); |  | ||||||
|  |  | ||||||
|   for(int s=0;s<Ls;s++){ |  | ||||||
|     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void) | template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void) | ||||||
| @@ -120,6 +108,18 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void) | |||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template<class Impl>   | ||||||
|  | void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi) | ||||||
|  | { | ||||||
|  |   int Ls=this->Ls; | ||||||
|  |  | ||||||
|  |   FermionField tmp_f(this->FermionGrid()); | ||||||
|  |   this->DW(psi,tmp_f,DaggerYes); | ||||||
|  |  | ||||||
|  |   for(int s=0;s<Ls;s++){ | ||||||
|  |     axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi | ||||||
|  |   } | ||||||
|  | } | ||||||
| template<class Impl>   | template<class Impl>   | ||||||
| void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
| @@ -170,6 +170,7 @@ void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField & | |||||||
|   lower[0]   =-mass*lower[0]; |   lower[0]   =-mass*lower[0]; | ||||||
|   M5D(psi,psi,chi,lower,diag,upper); |   M5D(psi,psi,chi,lower,diag,upper); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
| @@ -191,12 +192,7 @@ void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField & | |||||||
|       lower[s]=-cee[s-1]; |       lower[s]=-cee[s-1]; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   // Conjugate the terms  |  | ||||||
|   for (int s=0;s<Ls;s++){ |  | ||||||
|     diag[s] =conjugate(diag[s]); |  | ||||||
|     upper[s]=conjugate(upper[s]); |  | ||||||
|     lower[s]=conjugate(lower[s]); |  | ||||||
|   } |  | ||||||
|   M5Ddag(psi,psi,chi,lower,diag,upper); |   M5Ddag(psi,psi,chi,lower,diag,upper); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -219,22 +215,8 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField | |||||||
|   std::vector<Coeff_t> diag =bs; |   std::vector<Coeff_t> diag =bs; | ||||||
|   std::vector<Coeff_t> upper=cs; |   std::vector<Coeff_t> upper=cs; | ||||||
|   std::vector<Coeff_t> lower=cs; |   std::vector<Coeff_t> lower=cs; | ||||||
|  |   upper[Ls-1]=-mass*upper[Ls-1]; | ||||||
|   for (int s=0;s<Ls;s++){ |   lower[0]   =-mass*lower[0]; | ||||||
|     if ( s== 0 ) { |  | ||||||
|       upper[s] = cs[s+1]; |  | ||||||
|       lower[s] =-mass*cs[Ls-1]; |  | ||||||
|     } else if ( s==(Ls-1) ) {  |  | ||||||
|       upper[s] =-mass*cs[0]; |  | ||||||
|       lower[s] = cs[s-1]; |  | ||||||
|     } else {  |  | ||||||
|       upper[s] = cs[s+1]; |  | ||||||
|       lower[s] = cs[s-1]; |  | ||||||
|     } |  | ||||||
|     upper[s] = conjugate(upper[s]); |  | ||||||
|     lower[s] = conjugate(lower[s]); |  | ||||||
|     diag[s]  = conjugate(diag[s]); |  | ||||||
|   } |  | ||||||
|   M5Ddag(psi,psi,Din,lower,diag,upper); |   M5Ddag(psi,psi,Din,lower,diag,upper); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -320,7 +302,7 @@ void CayleyFermion5D<Impl>::MDeriv  (GaugeField &mat,const FermionField &U,const | |||||||
|     this->DhopDeriv(mat,U,Din,dag); |     this->DhopDeriv(mat,U,Din,dag); | ||||||
|   } else { |   } else { | ||||||
|     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call |     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call | ||||||
|     MeooeDag5D(U,Din); |     Meooe5D(U,Din); | ||||||
|     this->DhopDeriv(mat,Din,V,dag); |     this->DhopDeriv(mat,Din,V,dag); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -335,7 +317,7 @@ void CayleyFermion5D<Impl>::MoeDeriv(GaugeField &mat,const FermionField &U,const | |||||||
|     this->DhopDerivOE(mat,U,Din,dag); |     this->DhopDerivOE(mat,U,Din,dag); | ||||||
|   } else { |   } else { | ||||||
|     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call |     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call | ||||||
|       MeooeDag5D(U,Din); |       Meooe5D(U,Din); | ||||||
|       this->DhopDerivOE(mat,Din,V,dag); |       this->DhopDerivOE(mat,Din,V,dag); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -350,7 +332,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const | |||||||
|     this->DhopDerivEO(mat,U,Din,dag); |     this->DhopDerivEO(mat,U,Din,dag); | ||||||
|   } else { |   } else { | ||||||
|     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call |     //      U d/du [D_w D5]^dag V = U D5^dag d/du DW^dag Y // implicit adj on U in call | ||||||
|     MeooeDag5D(U,Din); |     Meooe5D(U,Din); | ||||||
|     this->DhopDerivEO(mat,Din,V,dag); |     this->DhopDerivEO(mat,Din,V,dag); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -29,8 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  GRID_QCD_CAYLEY_FERMION_H | #ifndef  GRID_QCD_CAYLEY_FERMION_H | ||||||
| #define  GRID_QCD_CAYLEY_FERMION_H | #define  GRID_QCD_CAYLEY_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/WilsonFermion5D.h> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   namespace QCD { |   namespace QCD { | ||||||
| @@ -194,9 +192,7 @@ template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const Fermion | |||||||
| template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \ | template void CayleyFermion5D< A >::MooeeInv    (const FermionField &psi, FermionField &chi); \ | ||||||
| template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi); | template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi); | ||||||
|  |  | ||||||
| #undef  CAYLEY_DPERP_DENSE |  | ||||||
| #define CAYLEY_DPERP_CACHE | #define CAYLEY_DPERP_CACHE | ||||||
| #undef  CAYLEY_DPERP_LINALG | #undef  CAYLEY_DPERP_LINALG | ||||||
| #define CAYLEY_DPERP_VEC |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
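The CAYLEY_DPERP_* defines at the bottom of this header select which fifth-dimension (Dperp) backend gets compiled: this branch switches the cache implementation on and drops the dense and vec variants. Each backend .cc guards its explicit instantiations with its own switch, so exactly one M5D/MooeeInv implementation is linked in; a sketch mirroring the dense file below:

    #ifdef CAYLEY_DPERP_DENSE
    INSTANTIATE_DPERP(WilsonImplF);   // only emitted when the dense backend
    INSTANTIATE_DPERP(WilsonImplD);   // is the selected Dperp implementation
    #endif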
| @@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| @@ -55,8 +54,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi, | |||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   M5Dcalls++; |   M5Dcalls++; | ||||||
|   M5Dtime-=usecond(); |   M5Dtime-=usecond(); | ||||||
|  | PARALLEL_FOR_LOOP | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls |   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | ||||||
|     for(int s=0;s<Ls;s++){ |     for(int s=0;s<Ls;s++){ | ||||||
|       auto tmp = psi._odata[0]; |       auto tmp = psi._odata[0]; | ||||||
|       if ( s==0 ) { |       if ( s==0 ) { | ||||||
| @@ -99,8 +98,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, | |||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   M5Dcalls++; |   M5Dcalls++; | ||||||
|   M5Dtime-=usecond(); |   M5Dtime-=usecond(); | ||||||
|  | PARALLEL_FOR_LOOP | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls |   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | ||||||
|     auto tmp = psi._odata[0]; |     auto tmp = psi._odata[0]; | ||||||
|     for(int s=0;s<Ls;s++){ |     for(int s=0;s<Ls;s++){ | ||||||
|       if ( s==0 ) { |       if ( s==0 ) { | ||||||
| @@ -138,7 +137,8 @@ void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField & | |||||||
|   MooeeInvCalls++; |   MooeeInvCalls++; | ||||||
|   MooeeInvTime-=usecond(); |   MooeeInvTime-=usecond(); | ||||||
|  |  | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | ||||||
|     auto tmp = psi._odata[0]; |     auto tmp = psi._odata[0]; | ||||||
|  |  | ||||||
|     // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops |     // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops | ||||||
| @@ -181,22 +181,11 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField & | |||||||
|   assert(psi.checkerboard == psi.checkerboard); |   assert(psi.checkerboard == psi.checkerboard); | ||||||
|   chi.checkerboard=psi.checkerboard; |   chi.checkerboard=psi.checkerboard; | ||||||
|  |  | ||||||
|   std::vector<Coeff_t> ueec(Ls); |  | ||||||
|   std::vector<Coeff_t> deec(Ls); |  | ||||||
|   std::vector<Coeff_t> leec(Ls); |  | ||||||
|   std::vector<Coeff_t> ueemc(Ls); |  | ||||||
|   std::vector<Coeff_t> leemc(Ls); |  | ||||||
|   for(int s=0;s<ueec.size();s++){ |  | ||||||
|     ueec[s] = conjugate(uee[s]); |  | ||||||
|     deec[s] = conjugate(dee[s]); |  | ||||||
|     leec[s] = conjugate(lee[s]); |  | ||||||
|     ueemc[s]= conjugate(ueem[s]); |  | ||||||
|     leemc[s]= conjugate(leem[s]); |  | ||||||
|   } |  | ||||||
|   MooeeInvCalls++; |   MooeeInvCalls++; | ||||||
|   MooeeInvTime-=usecond(); |   MooeeInvTime-=usecond(); | ||||||
|  |  | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls | ||||||
|  |  | ||||||
|     auto tmp = psi._odata[0]; |     auto tmp = psi._odata[0]; | ||||||
|  |  | ||||||
| @@ -204,25 +193,25 @@ void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField & | |||||||
|     chi[ss]=psi[ss]; |     chi[ss]=psi[ss]; | ||||||
|     for (int s=1;s<Ls;s++){ |     for (int s=1;s<Ls;s++){ | ||||||
|                             spProj5m(tmp,chi[ss+s-1]); |                             spProj5m(tmp,chi[ss+s-1]); | ||||||
|       chi[ss+s] = psi[ss+s]-ueec[s-1]*tmp; |       chi[ss+s] = psi[ss+s]-uee[s-1]*tmp; | ||||||
|     } |     } | ||||||
|     // U_m^{-\dagger}  |     // U_m^{-\dagger}  | ||||||
|     for (int s=0;s<Ls-1;s++){ |     for (int s=0;s<Ls-1;s++){ | ||||||
|                                    spProj5p(tmp,chi[ss+s]); |                                    spProj5p(tmp,chi[ss+s]); | ||||||
|       chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp; |       chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // L_m^{-\dagger} D^{-dagger} |     // L_m^{-\dagger} D^{-dagger} | ||||||
|     for (int s=0;s<Ls-1;s++){ |     for (int s=0;s<Ls-1;s++){ | ||||||
|       spProj5m(tmp,chi[ss+Ls-1]); |       spProj5m(tmp,chi[ss+Ls-1]); | ||||||
|       chi[ss+s] = (1.0/deec[s])*chi[ss+s]-(leemc[s]/deec[Ls-1])*tmp; |       chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp; | ||||||
|     }	 |     }	 | ||||||
|     chi[ss+Ls-1]= (1.0/deec[Ls-1])*chi[ss+Ls-1]; |     chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1]; | ||||||
|    |    | ||||||
|     // Apply L^{-dagger} |     // Apply L^{-dagger} | ||||||
|     for (int s=Ls-2;s>=0;s--){ |     for (int s=Ls-2;s>=0;s--){ | ||||||
|       spProj5p(tmp,chi[ss+s+1]); |       spProj5p(tmp,chi[ss+s+1]); | ||||||
|       chi[ss+s] = chi[ss+s] - leec[s]*tmp; |       chi[ss+s] = chi[ss+s] - lee[s]*tmp; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   | |||||||
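The cache implementation above reverts parallel_for(...) to the older PARALLEL_FOR_LOOP macro followed by a plain for, striding the outer sites in blocks of Ls. The macro definition lives in Grid's threading header; to a first approximation it is an OpenMP shorthand along these lines (sketch, assumed form):

    #include <cstdio>

    #ifdef GRID_OMP
    #define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
    #else
    #define PARALLEL_FOR_LOOP
    #endif

    int main() {
      const int sites = 64, Ls = 8;
      PARALLEL_FOR_LOOP
      for (int ss = 0; ss < sites; ss += Ls) {   // one iteration per Ls-block
        std::printf("block %d\n", ss / Ls);
      }
    }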
| @@ -30,8 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
|  |  | ||||||
| #include <Grid/Eigen/Dense> | #include <Grid/Eigen/Dense> | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| @@ -39,17 +38,20 @@ namespace QCD { | |||||||
|   /* |   /* | ||||||
|    * Dense matrix versions of routines |    * Dense matrix versions of routines | ||||||
|    */ |    */ | ||||||
|  |  | ||||||
|  |   /* | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   this->MooeeInternal(psi,chi,DaggerYes,InverseYes); |   this->MooeeInternal(psi,chi,DaggerYes,InverseYes); | ||||||
| } | } | ||||||
|  |    | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   this->MooeeInternal(psi,chi,DaggerNo,InverseYes); |   this->MooeeInternal(psi,chi,DaggerNo,InverseYes); | ||||||
| } | } | ||||||
|  |   */ | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) | void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv) | ||||||
| { | { | ||||||
| @@ -123,20 +125,9 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| #ifdef CAYLEY_DPERP_DENSE |  | ||||||
| INSTANTIATE_DPERP(GparityWilsonImplF); |  | ||||||
| INSTANTIATE_DPERP(GparityWilsonImplD); |  | ||||||
| INSTANTIATE_DPERP(WilsonImplF); |  | ||||||
| INSTANTIATE_DPERP(WilsonImplD); |  | ||||||
| INSTANTIATE_DPERP(ZWilsonImplF); |  | ||||||
| INSTANTIATE_DPERP(ZWilsonImplD); |  | ||||||
|  |  | ||||||
| template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | ||||||
| template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | ||||||
| template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | ||||||
| template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); | ||||||
| template void CayleyFermion5D<ZWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); |  | ||||||
| template void CayleyFermion5D<ZWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| }} | }} | ||||||
|   | |||||||
| @@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| @@ -48,18 +47,17 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi, | |||||||
| 				std::vector<Coeff_t> &diag, | 				std::vector<Coeff_t> &diag, | ||||||
| 				std::vector<Coeff_t> &upper) | 				std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|   Coeff_t one(1.0); |  | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   for(int s=0;s<Ls;s++){ |   for(int s=0;s<Ls;s++){ | ||||||
|     if ( s==0 ) { |     if ( s==0 ) { | ||||||
|       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1); |       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1); | ||||||
|       axpby_ssp_pplus (chi,one,chi,lower[s],psi,s,Ls-1); |       axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,Ls-1); | ||||||
|     } else if ( s==(Ls-1)) {  |     } else if ( s==(Ls-1)) {  | ||||||
|       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,0); |       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,0); | ||||||
|       axpby_ssp_pplus (chi,one,chi,lower[s],psi,s,s-1); |       axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,s-1); | ||||||
|     } else { |     } else { | ||||||
|       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1); |       axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,s+1); | ||||||
|       axpby_ssp_pplus(chi,one,chi,lower[s],psi,s,s-1); |       axpby_ssp_pplus(chi,1.0,chi,lower[s],psi,s,s-1); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
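
For reference, the loop above applies one row of the Ls x Ls Cayley matrix per slice. Assuming axpby_ssp_pminus(chi,a,phi,b,psi,s,sp) computes chi_s = a*phi_s + b*P_- psi_sp (and the pplus variant the same with P_+ = (1+\gamma_5)/2), each output slice is

    \chi_s \;=\; d_s\,\phi_s \;+\; u_s\,P_-\,\psi_{(s+1)\bmod L_s} \;+\; l_s\,P_+\,\psi_{(s-1)\bmod L_s}

so the s=0 and s=(Ls-1) branches are just the wrap-around "corner" entries of an otherwise tridiagonal matrix.
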
| @@ -71,18 +69,17 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, | |||||||
| 				   std::vector<Coeff_t> &diag, | 				   std::vector<Coeff_t> &diag, | ||||||
| 				   std::vector<Coeff_t> &upper) | 				   std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|   Coeff_t one(1.0); |  | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   for(int s=0;s<Ls;s++){ |   for(int s=0;s<Ls;s++){ | ||||||
|     if ( s==0 ) { |     if ( s==0 ) { | ||||||
|       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1); |       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1); | ||||||
|       axpby_ssp_pminus(chi,one,chi,lower[s],psi,s,Ls-1); |       axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,Ls-1); | ||||||
|     } else if ( s==(Ls-1)) {  |     } else if ( s==(Ls-1)) {  | ||||||
|       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,0); |       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,0); | ||||||
|       axpby_ssp_pminus(chi,one,chi,lower[s],psi,s,s-1); |       axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1); | ||||||
|     } else { |     } else { | ||||||
|       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1); |       axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,s+1); | ||||||
|       axpby_ssp_pminus(chi,one,chi,lower[s],psi,s,s-1); |       axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,s-1); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -90,68 +87,62 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   Coeff_t one(1.0); |  | ||||||
|   Coeff_t czero(0.0); |  | ||||||
|   chi.checkerboard=psi.checkerboard; |   chi.checkerboard=psi.checkerboard; | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   // Apply (L^{\prime})^{-1} |   // Apply (L^{\prime})^{-1} | ||||||
|   axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0] |   axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0] | ||||||
|   for (int s=1;s<Ls;s++){ |   for (int s=1;s<Ls;s++){ | ||||||
|     axpby_ssp_pplus(chi,one,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1] |     axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1] | ||||||
|   } |   } | ||||||
|   // L_m^{-1}  |   // L_m^{-1}  | ||||||
|   for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi |   for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi | ||||||
|     axpby_ssp_pminus(chi,one,chi,-leem[s],chi,Ls-1,s); |     axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s); | ||||||
|   } |   } | ||||||
|   // U_m^{-1} D^{-1} |   // U_m^{-1} D^{-1} | ||||||
|   for (int s=0;s<Ls-1;s++){ |   for (int s=0;s<Ls-1;s++){ | ||||||
|     // Chi[s] + 1/d chi[s]  |     // Chi[s] + 1/d chi[s]  | ||||||
|     axpby_ssp_pplus(chi,one/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1); |     axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1); | ||||||
|   }	 |   }	 | ||||||
|   axpby_ssp(chi,one/dee[Ls-1],chi,czero,chi,Ls-1,Ls-1); // Modest avoidable cost |   axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable cost | ||||||
|    |    | ||||||
|   // Apply U^{-1} |   // Apply U^{-1} | ||||||
|   for (int s=Ls-2;s>=0;s--){ |   for (int s=Ls-2;s>=0;s--){ | ||||||
|     axpby_ssp_pminus (chi,one,chi,-uee[s],chi,s,s+1);  // chi[Ls] |     axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1);  // chi[Ls] | ||||||
|   } |   } | ||||||
| } | } | ||||||
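
The four loops above are a forward/back substitution through the factorisation M = L' L_m D U_m U of the even-even block. A minimal scalar analogue, dropping the chiral projectors so the data flow is explicit (an illustrative sketch, not Grid code):

    #include <vector>

    // Solve M x = b for M = L' * L_m * D * U_m * U with unit-diagonal triangular
    // factors, mirroring the four loops of MooeeInv (projectors omitted).
    std::vector<double> MooeeInvScalar(const std::vector<double>& lee,
                                       const std::vector<double>& leem,
                                       const std::vector<double>& uee,
                                       const std::vector<double>& ueem,
                                       const std::vector<double>& dee,
                                       std::vector<double> b) {
      const int Ls = (int)b.size();
      for (int s = 1; s < Ls;   s++) b[s]    -= lee[s-1] * b[s-1];   // (L')^{-1}
      for (int s = 0; s < Ls-1; s++) b[Ls-1] -= leem[s]  * b[s];     // L_m^{-1}
      for (int s = 0; s < Ls-1; s++)                                 // U_m^{-1} D^{-1}
        b[s] = b[s]/dee[s] - (ueem[s]/dee[Ls-1]) * b[Ls-1];
      b[Ls-1] /= dee[Ls-1];
      for (int s = Ls-2; s >= 0; s--) b[s]   -= uee[s]  * b[s+1];    // U^{-1}
      return b;
    }
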
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   Coeff_t one(1.0); |  | ||||||
|   Coeff_t czero(0.0); |  | ||||||
|   chi.checkerboard=psi.checkerboard; |   chi.checkerboard=psi.checkerboard; | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   // Apply (U^{\prime})^{-dagger} |   // Apply (U^{\prime})^{-dagger} | ||||||
|   axpby_ssp (chi,one,psi,     czero,psi,0,0);      // chi[0]=psi[0] |   axpby_ssp (chi,1.0,psi,     0.0,psi,0,0);      // chi[0]=psi[0] | ||||||
|   for (int s=1;s<Ls;s++){ |   for (int s=1;s<Ls;s++){ | ||||||
|     axpby_ssp_pminus(chi,one,psi,-conjugate(uee[s-1]),chi,s,s-1); |     axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1); | ||||||
|   } |   } | ||||||
|   // U_m^{-\dagger}  |   // U_m^{-\dagger}  | ||||||
|   for (int s=0;s<Ls-1;s++){ |   for (int s=0;s<Ls-1;s++){ | ||||||
|     axpby_ssp_pplus(chi,one,chi,-conjugate(ueem[s]),chi,Ls-1,s); |     axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s); | ||||||
|   } |   } | ||||||
|   // L_m^{-\dagger} D^{-dagger} |   // L_m^{-\dagger} D^{-dagger} | ||||||
|   for (int s=0;s<Ls-1;s++){ |   for (int s=0;s<Ls-1;s++){ | ||||||
|     axpby_ssp_pminus(chi,one/conjugate(dee[s]),chi,-conjugate(leem[s]/dee[Ls-1]),chi,s,Ls-1); |     axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1); | ||||||
|   }	 |   }	 | ||||||
|   axpby_ssp(chi,one/conjugate(dee[Ls-1]),chi,czero,chi,Ls-1,Ls-1); // Modest avoidable cost |   axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable cost | ||||||
|    |    | ||||||
|   // Apply L^{-dagger} |   // Apply L^{-dagger} | ||||||
|   for (int s=Ls-2;s>=0;s--){ |   for (int s=Ls-2;s>=0;s--){ | ||||||
|     axpby_ssp_pplus (chi,one,chi,-conjugate(lee[s]),chi,s,s+1);  // chi[Ls] |     axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1);  // chi[Ls] | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef CAYLEY_DPERP_LINALG | #ifdef CAYLEY_DPERP_LINALG | ||||||
|   INSTANTIATE_DPERP(WilsonImplF); |   INSTANTIATE(WilsonImplF); | ||||||
|   INSTANTIATE_DPERP(WilsonImplD); |   INSTANTIATE(WilsonImplD); | ||||||
|   INSTANTIATE_DPERP(GparityWilsonImplF); |   INSTANTIATE(GparityWilsonImplF); | ||||||
|   INSTANTIATE_DPERP(GparityWilsonImplD); |   INSTANTIATE(GparityWilsonImplD); | ||||||
|   INSTANTIATE_DPERP(ZWilsonImplF); |  | ||||||
|   INSTANTIATE_DPERP(ZWilsonImplD); |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -30,13 +30,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
|  |  | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/CayleyFermion5D.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
| namespace QCD {   | namespace QCD {  /* | ||||||
|   /* |  | ||||||
|    * Dense matrix versions of routines |    * Dense matrix versions of routines | ||||||
|    */ |    */ | ||||||
| template<class Impl> | template<class Impl> | ||||||
| @@ -93,7 +91,8 @@ void CayleyFermion5D<Impl>::M5D(const FermionField &psi, | |||||||
|  |  | ||||||
|   assert(Nc==3); |   assert(Nc==3); | ||||||
|  |  | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs | ||||||
| #if 0 | #if 0 | ||||||
|       alignas(64) SiteHalfSpinor hp; |       alignas(64) SiteHalfSpinor hp; | ||||||
|       alignas(64) SiteHalfSpinor hm; |       alignas(64) SiteHalfSpinor hm; | ||||||
| @@ -233,7 +232,8 @@ void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi, | |||||||
|  |  | ||||||
|   M5Dcalls++; |   M5Dcalls++; | ||||||
|   M5Dtime-=usecond(); |   M5Dtime-=usecond(); | ||||||
|   parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs | PARALLEL_FOR_LOOP | ||||||
|  |   for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs | ||||||
| #if 0 | #if 0 | ||||||
|     alignas(64) SiteHalfSpinor hp; |     alignas(64) SiteHalfSpinor hp; | ||||||
|     alignas(64) SiteHalfSpinor hm; |     alignas(64) SiteHalfSpinor hm; | ||||||
| @@ -792,11 +792,13 @@ void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField | |||||||
|   MooeeInvTime-=usecond(); |   MooeeInvTime-=usecond(); | ||||||
|  |  | ||||||
|   if ( switcheroo<Coeff_t>::iscomplex() ) { |   if ( switcheroo<Coeff_t>::iscomplex() ) { | ||||||
|     parallel_for(auto site=0;site<vol;site++){ |   PARALLEL_FOR_LOOP | ||||||
|  |     for(auto site=0;site<vol;site++){ | ||||||
|       MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm); |       MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm); | ||||||
|     } |     } | ||||||
|   } else {  |   } else {  | ||||||
|     parallel_for(auto site=0;site<vol;site++){ |   PARALLEL_FOR_LOOP | ||||||
|  |     for(auto site=0;site<vol;site++){ | ||||||
|       MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm); |       MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
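
The branch above selects a complex or a real assembly kernel from the switcheroo<Coeff_t> trait. A minimal sketch of that trait idiom, under hypothetical names (not the Grid definition):

    #include <complex>

    template <class C> struct switcheroo_sketch {                   // real coefficients
      static bool iscomplex() { return false; }
    };
    template <class T> struct switcheroo_sketch<std::complex<T>> {  // complex specialisation
      static bool iscomplex() { return true; }
    };
    // usage mirrors the dispatch above:
    //   if (switcheroo_sketch<Coeff_t>::iscomplex()) { /* Z kernel */ } else { /* real kernel */ }
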
|   | |||||||
| @@ -26,8 +26,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
| #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|   namespace QCD { |   namespace QCD { | ||||||
|   | |||||||
| @@ -29,8 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  GRID_QCD_CONTINUED_FRACTION_H | #ifndef  GRID_QCD_CONTINUED_FRACTION_H | ||||||
| #define  GRID_QCD_CONTINUED_FRACTION_H | #define  GRID_QCD_CONTINUED_FRACTION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/WilsonFermion5D.h> |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   namespace QCD { |   namespace QCD { | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H | #ifndef  GRID_QCD_DOMAIN_WALL_FERMION_H | ||||||
| #define  GRID_QCD_DOMAIN_WALL_FERMION_H | #define  GRID_QCD_DOMAIN_WALL_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,80 +0,0 @@ | |||||||
|     /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <pabobyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #ifndef  GRID_QCD_FERMION_CORE_H |  | ||||||
| #define  GRID_QCD_FERMION_CORE_H |  | ||||||
|  |  | ||||||
| #include <Grid/GridCore.h> |  | ||||||
| #include <Grid/GridQCDcore.h> |  | ||||||
| #include <Grid/qcd/action/ActionCore.h> |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| // Fermion prereqs |  | ||||||
| //////////////////////////////////////////// |  | ||||||
| #include <Grid/qcd/action/fermion/WilsonCompressor.h>     //used by all wilson type fermions |  | ||||||
| #include <Grid/qcd/action/fermion/FermionOperatorImpl.h> |  | ||||||
| #include <Grid/qcd/action/fermion/FermionOperator.h> |  | ||||||
| #include <Grid/qcd/action/fermion/WilsonKernels.h>        //used by all wilson type fermions |  | ||||||
| #include <Grid/qcd/action/fermion/StaggeredKernels.h>        //used by all wilson type fermions |  | ||||||
|  |  | ||||||
| #define FermOpStaggeredTemplateInstantiate(A) \ |  | ||||||
|   template class A<StaggeredImplF>; \ |  | ||||||
|   template class A<StaggeredImplD>;  |  | ||||||
|  |  | ||||||
| #define FermOpStaggeredVec5dTemplateInstantiate(A) \ |  | ||||||
|   template class A<StaggeredVec5dImplF>; \ |  | ||||||
|   template class A<StaggeredVec5dImplD>;  |  | ||||||
|  |  | ||||||
| #define FermOp4dVecTemplateInstantiate(A) \ |  | ||||||
|   template class A<WilsonImplF>;		\ |  | ||||||
|   template class A<WilsonImplD>;		\ |  | ||||||
|   template class A<ZWilsonImplF>;		\ |  | ||||||
|   template class A<ZWilsonImplD>;		\ |  | ||||||
|   template class A<GparityWilsonImplF>;		\ |  | ||||||
|   template class A<GparityWilsonImplD>;		 |  | ||||||
|  |  | ||||||
| #define AdjointFermOpTemplateInstantiate(A) \ |  | ||||||
|   template class A<WilsonAdjImplF>; \ |  | ||||||
|   template class A<WilsonAdjImplD>;  |  | ||||||
|  |  | ||||||
| #define TwoIndexFermOpTemplateInstantiate(A) \ |  | ||||||
|   template class A<WilsonTwoIndexSymmetricImplF>; \ |  | ||||||
|   template class A<WilsonTwoIndexSymmetricImplD>;  |  | ||||||
|  |  | ||||||
| #define FermOp5dVecTemplateInstantiate(A) \ |  | ||||||
|   template class A<DomainWallVec5dImplF>;	\ |  | ||||||
|   template class A<DomainWallVec5dImplD>;	\ |  | ||||||
|   template class A<ZDomainWallVec5dImplF>;	\ |  | ||||||
|   template class A<ZDomainWallVec5dImplD>;	 |  | ||||||
|  |  | ||||||
| #define FermOpTemplateInstantiate(A) \ |  | ||||||
|  FermOp4dVecTemplateInstantiate(A) \ |  | ||||||
|  FermOp5dVecTemplateInstantiate(A)  |  | ||||||
|  |  | ||||||
| #define GparityFermOpTemplateInstantiate(A)  |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
| @@ -194,7 +194,8 @@ namespace QCD { | |||||||
|       GaugeLinkField tmp(mat._grid); |       GaugeLinkField tmp(mat._grid); | ||||||
|       tmp = zero; |       tmp = zero; | ||||||
|        |        | ||||||
|       parallel_for(int sss=0;sss<tmp._grid->oSites();sss++){ |       PARALLEL_FOR_LOOP | ||||||
|  |       for(int sss=0;sss<tmp._grid->oSites();sss++){ | ||||||
| 	int sU=sss; | 	int sU=sss; | ||||||
| 	for(int s=0;s<Ls;s++){ | 	for(int s=0;s<Ls;s++){ | ||||||
| 	  int sF = s+Ls*sU; | 	  int sF = s+Ls*sU; | ||||||
| @@ -234,13 +235,11 @@ class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres | |||||||
|   typedef Lattice<SiteSpinor> FermionField; |   typedef Lattice<SiteSpinor> FermionField; | ||||||
|   typedef Lattice<SitePropagator> PropagatorField; |   typedef Lattice<SitePropagator> PropagatorField; | ||||||
|    |    | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////// |  | ||||||
|   // Make the doubled gauge field a *scalar* |   // Make the doubled gauge field a *scalar* | ||||||
|   ///////////////////////////////////////////////// |  | ||||||
|   typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar |   typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar | ||||||
|   typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar |   typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar | ||||||
|   typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar |   typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar | ||||||
|  |        | ||||||
|   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; |   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; | ||||||
|        |        | ||||||
|   typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor; |   typedef WilsonCompressor<SiteHalfSpinor, SiteSpinor> Compressor; | ||||||
| @@ -446,7 +445,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|        Uconj = where(coor==neglink,-Uconj,Uconj); |        Uconj = where(coor==neglink,-Uconj,Uconj); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
|      parallel_for(auto ss=U.begin();ss<U.end();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |      for(auto ss=U.begin();ss<U.end();ss++){ | ||||||
|        Uds[ss](0)(mu) = U[ss](); |        Uds[ss](0)(mu) = U[ss](); | ||||||
|        Uds[ss](1)(mu) = Uconj[ss](); |        Uds[ss](1)(mu) = Uconj[ss](); | ||||||
|      } |      } | ||||||
| @@ -459,7 +459,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|        Utmp = where(coor==0,Uconj,Utmp); |        Utmp = where(coor==0,Uconj,Utmp); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
|      parallel_for(auto ss=U.begin();ss<U.end();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |      for(auto ss=U.begin();ss<U.end();ss++){ | ||||||
|        Uds[ss](0)(mu+4) = Utmp[ss](); |        Uds[ss](0)(mu+4) = Utmp[ss](); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
| @@ -468,7 +469,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|        Utmp = where(coor==0,U,Utmp); |        Utmp = where(coor==0,U,Utmp); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
|      parallel_for(auto ss=U.begin();ss<U.end();ss++){ | PARALLEL_FOR_LOOP | ||||||
|  |      for(auto ss=U.begin();ss<U.end();ss++){ | ||||||
|        Uds[ss](1)(mu+4) = Utmp[ss](); |        Uds[ss](1)(mu+4) = Utmp[ss](); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
| @@ -482,7 +484,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|    GaugeLinkField link(mat._grid); |    GaugeLinkField link(mat._grid); | ||||||
|    // use the Lorentz index for flavour as a hack. |    // use the Lorentz index for flavour as a hack. | ||||||
|    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); |    auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A)); | ||||||
|    parallel_for(auto ss = tmp.begin(); ss < tmp.end(); ss++) { | PARALLEL_FOR_LOOP | ||||||
|  |    for (auto ss = tmp.begin(); ss < tmp.end(); ss++) { | ||||||
|      link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); |      link[ss]() = tmp[ss](0, 0) - conjugate(tmp[ss](1, 1)); | ||||||
|    } |    } | ||||||
|    PokeIndex<LorentzIndex>(mat, link, mu); |    PokeIndex<LorentzIndex>(mat, link, mu); | ||||||
| @@ -495,7 +498,8 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
| 	 | 	 | ||||||
|    GaugeLinkField tmp(mat._grid); |    GaugeLinkField tmp(mat._grid); | ||||||
|    tmp = zero; |    tmp = zero; | ||||||
|    parallel_for(int ss = 0; ss < tmp._grid->oSites(); ss++) { | PARALLEL_FOR_LOOP | ||||||
|  |    for (int ss = 0; ss < tmp._grid->oSites(); ss++) { | ||||||
|      for (int s = 0; s < Ls; s++) { |      for (int s = 0; s < Ls; s++) { | ||||||
|        int sF = s + Ls * ss; |        int sF = s + Ls * ss; | ||||||
|        auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])); |        auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde[sF], Atilde[sF])); | ||||||
| @@ -508,323 +512,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Single flavour one component spinors with colour index |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |  | ||||||
|   template <class S, class Representation = FundamentalRepresentation > |  | ||||||
|   class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { |  | ||||||
|  |  | ||||||
|     public: |  | ||||||
|  |  | ||||||
|     typedef RealD  _Coeff_t ; |  | ||||||
|     static const int Dimension = Representation::Dimension; |  | ||||||
|     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; |  | ||||||
|        |  | ||||||
|     //Necessary? |  | ||||||
|     constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} |  | ||||||
|      |  | ||||||
|     const bool LsVectorised=false; |  | ||||||
|     typedef _Coeff_t Coeff_t; |  | ||||||
|  |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|        |  | ||||||
|     template <typename vtype> using iImplScalar            = iScalar<iScalar<iScalar<vtype> > >; |  | ||||||
|     template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >; |  | ||||||
|     template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >; |  | ||||||
|     template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>; |  | ||||||
|     template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >; |  | ||||||
|      |  | ||||||
|     typedef iImplScalar<Simd>            SiteComplex; |  | ||||||
|     typedef iImplSpinor<Simd>            SiteSpinor; |  | ||||||
|     typedef iImplHalfSpinor<Simd>        SiteHalfSpinor; |  | ||||||
|     typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField; |  | ||||||
|     typedef iImplPropagator<Simd>        SitePropagator; |  | ||||||
|      |  | ||||||
|     typedef Lattice<SiteComplex>           ComplexField; |  | ||||||
|     typedef Lattice<SiteSpinor>            FermionField; |  | ||||||
|     typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; |  | ||||||
|     typedef Lattice<SitePropagator> PropagatorField; |  | ||||||
|      |  | ||||||
|     typedef SimpleCompressor<SiteSpinor> Compressor; |  | ||||||
|     typedef StaggeredImplParams ImplParams; |  | ||||||
|     typedef CartesianStencil<SiteSpinor, SiteSpinor> StencilImpl; |  | ||||||
|      |  | ||||||
|     ImplParams Params; |  | ||||||
|      |  | ||||||
|     StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){}; |  | ||||||
|        |  | ||||||
|     inline void multLink(SiteSpinor &phi, |  | ||||||
| 			 const SiteDoubledGaugeField &U, |  | ||||||
| 			 const SiteSpinor &chi, |  | ||||||
| 			 int mu){ |  | ||||||
|       mult(&phi(), &U(mu), &chi()); |  | ||||||
|     } |  | ||||||
|     inline void multLinkAdd(SiteSpinor &phi, |  | ||||||
| 			    const SiteDoubledGaugeField &U, |  | ||||||
| 			    const SiteSpinor &chi, |  | ||||||
| 			    int mu){ |  | ||||||
|       mac(&phi(), &U(mu), &chi()); |  | ||||||
|     } |  | ||||||
|        |  | ||||||
|     template <class ref> |  | ||||||
|     inline void loadLinkElement(Simd ®, ref &memory) { |  | ||||||
|       reg = memory; |  | ||||||
|     } |  | ||||||
|        |  | ||||||
|     inline void DoubleStore(GridBase *GaugeGrid, |  | ||||||
| 			    DoubledGaugeField &UUUds, // for Naik term |  | ||||||
| 			    DoubledGaugeField &Uds, |  | ||||||
| 			    const GaugeField &Uthin, |  | ||||||
| 			    const GaugeField &Ufat) { |  | ||||||
|       conformable(Uds._grid, GaugeGrid); |  | ||||||
|       conformable(Uthin._grid, GaugeGrid); |  | ||||||
|       conformable(Ufat._grid, GaugeGrid); |  | ||||||
|       GaugeLinkField U(GaugeGrid); |  | ||||||
|       GaugeLinkField UU(GaugeGrid); |  | ||||||
|       GaugeLinkField UUU(GaugeGrid); |  | ||||||
|       GaugeLinkField Udag(GaugeGrid); |  | ||||||
|       GaugeLinkField UUUdag(GaugeGrid); |  | ||||||
|       for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
| 	// Staggered Phase. |  | ||||||
| 	Lattice<iScalar<vInteger> > coor(GaugeGrid); |  | ||||||
| 	Lattice<iScalar<vInteger> > x(GaugeGrid); LatticeCoordinate(x,0); |  | ||||||
| 	Lattice<iScalar<vInteger> > y(GaugeGrid); LatticeCoordinate(y,1); |  | ||||||
| 	Lattice<iScalar<vInteger> > z(GaugeGrid); LatticeCoordinate(z,2); |  | ||||||
| 	Lattice<iScalar<vInteger> > t(GaugeGrid); LatticeCoordinate(t,3); |  | ||||||
|  |  | ||||||
| 	Lattice<iScalar<vInteger> > lin_z(GaugeGrid); lin_z=x+y; |  | ||||||
| 	Lattice<iScalar<vInteger> > lin_t(GaugeGrid); lin_t=x+y+z; |  | ||||||
|  |  | ||||||
| 	ComplexField phases(GaugeGrid);	phases=1.0; |  | ||||||
|  |  | ||||||
| 	if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases); |  | ||||||
| 	if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases); |  | ||||||
| 	if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases); |  | ||||||
|  |  | ||||||
| 	// 1 hop based on fat links |  | ||||||
| 	U      = PeekIndex<LorentzIndex>(Ufat, mu); |  | ||||||
| 	Udag   = adj( Cshift(U, mu, -1)); |  | ||||||
|  |  | ||||||
| 	U    = U    *phases; |  | ||||||
| 	Udag = Udag *phases; |  | ||||||
|  |  | ||||||
| 	PokeIndex<LorentzIndex>(Uds, U, mu); |  | ||||||
| 	PokeIndex<LorentzIndex>(Uds, Udag, mu + 4); |  | ||||||
|  |  | ||||||
| 	// 3 hop based on thin links. Crazy huh ? |  | ||||||
| 	U  = PeekIndex<LorentzIndex>(Uthin, mu); |  | ||||||
| 	UU = Gimpl::CovShiftForward(U,mu,U); |  | ||||||
| 	UUU= Gimpl::CovShiftForward(U,mu,UU); |  | ||||||
| 	 |  | ||||||
| 	UUUdag = adj( Cshift(UUU, mu, -3)); |  | ||||||
|  |  | ||||||
| 	UUU    = UUU    *phases; |  | ||||||
| 	UUUdag = UUUdag *phases; |  | ||||||
|  |  | ||||||
| 	PokeIndex<LorentzIndex>(UUUds, UUU, mu); |  | ||||||
| 	PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4); |  | ||||||
|  |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
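
The phase fields constructed in DoubleStore above implement the standard staggered phases; in the conventions of this loop (coordinate 0 is x, 1 is y, 2 is z, 3 is t):

    \eta_0 = 1,\qquad \eta_1 = (-1)^{x},\qquad \eta_2 = (-1)^{x+y},\qquad \eta_3 = (-1)^{x+y+z}

i.e. \eta_\mu = (-1)^{\sum_{\nu<\mu} x_\nu}, multiplied into both the one-hop (fat) and three-hop (Naik) links so the kernels never need to apply phases explicitly.
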
|  |  | ||||||
|     inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ |  | ||||||
|       GaugeLinkField link(mat._grid); |  | ||||||
|       link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));  |  | ||||||
|       PokeIndex<LorentzIndex>(mat,link,mu); |  | ||||||
|     }    |  | ||||||
|        |  | ||||||
|     inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ |  | ||||||
|       assert (0);  |  | ||||||
|       // Must never hit |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Single flavour one component spinors with colour index. 5d vec |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |  | ||||||
|   template <class S, class Representation = FundamentalRepresentation > |  | ||||||
|   class StaggeredVec5dImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { |  | ||||||
|  |  | ||||||
|     public: |  | ||||||
|  |  | ||||||
|     typedef RealD  _Coeff_t ; |  | ||||||
|     static const int Dimension = Representation::Dimension; |  | ||||||
|     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; |  | ||||||
|        |  | ||||||
|     //Necessary? |  | ||||||
|     constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;} |  | ||||||
|      |  | ||||||
|     const bool LsVectorised=true; |  | ||||||
|  |  | ||||||
|     typedef _Coeff_t Coeff_t; |  | ||||||
|  |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|  |  | ||||||
|     template <typename vtype> using iImplScalar            = iScalar<iScalar<iScalar<vtype> > >; |  | ||||||
|     template <typename vtype> using iImplSpinor            = iScalar<iScalar<iVector<vtype, Dimension> > >; |  | ||||||
|     template <typename vtype> using iImplHalfSpinor        = iScalar<iScalar<iVector<vtype, Dimension> > >; |  | ||||||
|     template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>; |  | ||||||
|     template <typename vtype> using iImplGaugeField        = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>; |  | ||||||
|     template <typename vtype> using iImplGaugeLink         = iScalar<iScalar<iMatrix<vtype, Dimension> > >; |  | ||||||
|     template <typename vtype> using iImplPropagator        = iScalar<iScalar<iMatrix<vtype, Dimension> > >; |  | ||||||
|  |  | ||||||
|     // Make the doubled gauge field a *scalar* |  | ||||||
|     typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar |  | ||||||
|     typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar |  | ||||||
|     typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar |  | ||||||
|     typedef iImplPropagator<Simd>        SitePropagator; |  | ||||||
|  |  | ||||||
|     typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; |  | ||||||
|     typedef Lattice<SitePropagator> PropagatorField; |  | ||||||
|      |  | ||||||
|     typedef iImplScalar<Simd>            SiteComplex; |  | ||||||
|     typedef iImplSpinor<Simd>            SiteSpinor; |  | ||||||
|     typedef iImplHalfSpinor<Simd>        SiteHalfSpinor; |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     typedef Lattice<SiteComplex>           ComplexField; |  | ||||||
|     typedef Lattice<SiteSpinor>            FermionField; |  | ||||||
|      |  | ||||||
|     typedef SimpleCompressor<SiteSpinor> Compressor; |  | ||||||
|     typedef StaggeredImplParams ImplParams; |  | ||||||
|     typedef CartesianStencil<SiteSpinor, SiteSpinor> StencilImpl; |  | ||||||
|      |  | ||||||
|     ImplParams Params; |  | ||||||
|      |  | ||||||
|     StaggeredVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){}; |  | ||||||
|  |  | ||||||
|     template <class ref> |  | ||||||
|     inline void loadLinkElement(Simd ®, ref &memory) { |  | ||||||
|       vsplat(reg, memory); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |  | ||||||
| 			 const SiteHalfSpinor &chi, int mu) { |  | ||||||
|       SiteGaugeLink UU; |  | ||||||
|       for (int i = 0; i < Dimension; i++) { |  | ||||||
| 	for (int j = 0; j < Dimension; j++) { |  | ||||||
| 	  vsplat(UU()()(i, j), U(mu)()(i, j)); |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|       mult(&phi(), &UU(), &chi()); |  | ||||||
|     } |  | ||||||
|     inline void multLinkAdd(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |  | ||||||
| 			    const SiteHalfSpinor &chi, int mu) { |  | ||||||
|       SiteGaugeLink UU; |  | ||||||
|       for (int i = 0; i < Dimension; i++) { |  | ||||||
| 	for (int j = 0; j < Dimension; j++) { |  | ||||||
| 	  vsplat(UU()()(i, j), U(mu)()(i, j)); |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|       mac(&phi(), &UU(), &chi()); |  | ||||||
|     } |  | ||||||
|        |  | ||||||
|     inline void DoubleStore(GridBase *GaugeGrid, |  | ||||||
| 			    DoubledGaugeField &UUUds, // for Naik term |  | ||||||
| 			    DoubledGaugeField &Uds, |  | ||||||
| 			    const GaugeField &Uthin, |  | ||||||
| 			    const GaugeField &Ufat)  |  | ||||||
|     { |  | ||||||
|  |  | ||||||
|       GridBase * InputGrid = Uthin._grid; |  | ||||||
|       conformable(InputGrid,Ufat._grid); |  | ||||||
|  |  | ||||||
|       GaugeLinkField U(InputGrid); |  | ||||||
|       GaugeLinkField UU(InputGrid); |  | ||||||
|       GaugeLinkField UUU(InputGrid); |  | ||||||
|       GaugeLinkField Udag(InputGrid); |  | ||||||
|       GaugeLinkField UUUdag(InputGrid); |  | ||||||
|  |  | ||||||
|       for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
| 	// Staggered Phase. |  | ||||||
| 	Lattice<iScalar<vInteger> > coor(InputGrid); |  | ||||||
| 	Lattice<iScalar<vInteger> > x(InputGrid); LatticeCoordinate(x,0); |  | ||||||
| 	Lattice<iScalar<vInteger> > y(InputGrid); LatticeCoordinate(y,1); |  | ||||||
| 	Lattice<iScalar<vInteger> > z(InputGrid); LatticeCoordinate(z,2); |  | ||||||
| 	Lattice<iScalar<vInteger> > t(InputGrid); LatticeCoordinate(t,3); |  | ||||||
|  |  | ||||||
| 	Lattice<iScalar<vInteger> > lin_z(InputGrid); lin_z=x+y; |  | ||||||
| 	Lattice<iScalar<vInteger> > lin_t(InputGrid); lin_t=x+y+z; |  | ||||||
|  |  | ||||||
| 	ComplexField phases(InputGrid);	phases=1.0; |  | ||||||
|  |  | ||||||
| 	if ( mu == 1 ) phases = where( mod(x    ,2)==(Integer)0, phases,-phases); |  | ||||||
| 	if ( mu == 2 ) phases = where( mod(lin_z,2)==(Integer)0, phases,-phases); |  | ||||||
| 	if ( mu == 3 ) phases = where( mod(lin_t,2)==(Integer)0, phases,-phases); |  | ||||||
|  |  | ||||||
| 	// 1 hop based on fat links |  | ||||||
| 	U      = PeekIndex<LorentzIndex>(Ufat, mu); |  | ||||||
| 	Udag   = adj( Cshift(U, mu, -1)); |  | ||||||
|  |  | ||||||
| 	U    = U    *phases; |  | ||||||
| 	Udag = Udag *phases; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| 	for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { |  | ||||||
| 	  SiteScalarGaugeLink   ScalarU; |  | ||||||
| 	  SiteDoubledGaugeField ScalarUds; |  | ||||||
| 	   |  | ||||||
| 	  std::vector<int> lcoor; |  | ||||||
| 	  GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); |  | ||||||
| 	  peekLocalSite(ScalarUds, Uds, lcoor); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, U, lcoor); |  | ||||||
| 	  ScalarUds(mu) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, Udag, lcoor); |  | ||||||
| 	  ScalarUds(mu + 4) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  pokeLocalSite(ScalarUds, Uds, lcoor); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	// 3 hop based on thin links. Crazy huh ? |  | ||||||
| 	U  = PeekIndex<LorentzIndex>(Uthin, mu); |  | ||||||
| 	UU = Gimpl::CovShiftForward(U,mu,U); |  | ||||||
| 	UUU= Gimpl::CovShiftForward(U,mu,UU); |  | ||||||
| 	 |  | ||||||
| 	UUUdag = adj( Cshift(UUU, mu, -3)); |  | ||||||
|  |  | ||||||
| 	UUU    = UUU    *phases; |  | ||||||
| 	UUUdag = UUUdag *phases; |  | ||||||
|  |  | ||||||
| 	for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { |  | ||||||
|  |  | ||||||
| 	  SiteScalarGaugeLink  ScalarU; |  | ||||||
| 	  SiteDoubledGaugeField ScalarUds; |  | ||||||
| 	   |  | ||||||
| 	  std::vector<int> lcoor; |  | ||||||
| 	  GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); |  | ||||||
|        |  | ||||||
| 	  peekLocalSite(ScalarUds, UUUds, lcoor); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, UUU, lcoor); |  | ||||||
| 	  ScalarUds(mu) = ScalarU(); |  | ||||||
|  |  | ||||||
| 	  peekLocalSite(ScalarU, UUUdag, lcoor); |  | ||||||
| 	  ScalarUds(mu + 4) = ScalarU(); |  | ||||||
| 	   |  | ||||||
| 	  pokeLocalSite(ScalarUds, UUUds, lcoor); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ |  | ||||||
|       assert(0); |  | ||||||
|     }    |  | ||||||
|        |  | ||||||
|     inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ |  | ||||||
|       assert (0);  |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec |  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec | ||||||
|  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float |  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float | ||||||
|  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double |  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double | ||||||
| @@ -853,14 +540,6 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent | |||||||
|  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float |  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float | ||||||
|  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double |  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double | ||||||
|  |  | ||||||
|  typedef StaggeredImpl<vComplex,  FundamentalRepresentation > StaggeredImplR;   // Real.. whichever prec |  | ||||||
|  typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF;  // Float |  | ||||||
|  typedef StaggeredImpl<vComplexD, FundamentalRepresentation > StaggeredImplD;  // Double |  | ||||||
|  |  | ||||||
|  typedef StaggeredVec5dImpl<vComplex,  FundamentalRepresentation > StaggeredVec5dImplR;   // Real.. whichever prec |  | ||||||
|  typedef StaggeredVec5dImpl<vComplexF, FundamentalRepresentation > StaggeredVec5dImplF;  // Float |  | ||||||
|  typedef StaggeredVec5dImpl<vComplexD, FundamentalRepresentation > StaggeredVec5dImplD;  // Double |  | ||||||
|  |  | ||||||
| }} | }} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -1,403 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc |  | ||||||
|  |  | ||||||
| Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Azusa Yamaguchi, Peter Boyle |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid.h> |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
| namespace QCD { |  | ||||||
|  |  | ||||||
| const std::vector<int>  |  | ||||||
| ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}); |  | ||||||
| const std::vector<int>  |  | ||||||
| ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3}); |  | ||||||
|  |  | ||||||
| ///////////////////////////////// |  | ||||||
| // Constructor and gauge import |  | ||||||
| ///////////////////////////////// |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,  |  | ||||||
| 							 RealD _mass, |  | ||||||
| 							 const ImplParams &p) |  | ||||||
|     : Kernels(p), |  | ||||||
|       _grid(&Fgrid), |  | ||||||
|       _cbgrid(&Hgrid), |  | ||||||
|       Stencil(&Fgrid, npoint, Even, directions, displacements), |  | ||||||
|       StencilEven(&Hgrid, npoint, Even, directions, displacements),  // source is Even |  | ||||||
|       StencilOdd(&Hgrid, npoint, Odd, directions, displacements),  // source is Odd |  | ||||||
|       mass(_mass), |  | ||||||
|       Lebesgue(_grid), |  | ||||||
|       LebesgueEvenOdd(_cbgrid), |  | ||||||
|       Umu(&Fgrid), |  | ||||||
|       UmuEven(&Hgrid), |  | ||||||
|       UmuOdd(&Hgrid), |  | ||||||
|       UUUmu(&Fgrid), |  | ||||||
|       UUUmuEven(&Hgrid), |  | ||||||
|       UUUmuOdd(&Hgrid) , |  | ||||||
|       _tmp(&Hgrid) |  | ||||||
| { |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 							 GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 							 RealD _c1, RealD _c2,RealD _u0, |  | ||||||
| 							 const ImplParams &p) |  | ||||||
|   : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) |  | ||||||
| { |  | ||||||
|   c1=_c1; |  | ||||||
|   c2=_c2; |  | ||||||
|   u0=_u0; |  | ||||||
|   ImportGauge(_Uthin,_Ufat); |  | ||||||
| } |  | ||||||
| template <class Impl> |  | ||||||
| ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 							 GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 							 const ImplParams &p) |  | ||||||
|   : ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p) |  | ||||||
| { |  | ||||||
|   ImportGaugeSimple(_Utriple,_Ufat); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |  | ||||||
|   // Momentum space propagator should be  |  | ||||||
|   // https://arxiv.org/pdf/hep-lat/9712010.pdf |  | ||||||
|   // |  | ||||||
|   // mom space action. |  | ||||||
|   //   gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m |  | ||||||
|   // |  | ||||||
|   // must track through staggered flavour/spin reduction in literature to  |  | ||||||
|   // turn to free propagator for the one component chi field, a la page 4/5 |  | ||||||
|   // of the above link to implement a Fourier-based solver. |  | ||||||
|   //////////////////////////////////////////////////////////// |  | ||||||
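
Rendering the comment above as a formula, the free inverse propagator of the improved action would read (nothing beyond what the comment states, with p_\mu the lattice momentum):

    S^{-1}(p) \;=\; i\sum_\mu \gamma_\mu\left(c_1 \sin p_\mu + c_2 \sin 3p_\mu\right) + m

with the caveat, as the comment says, that the staggered spin/flavour reduction must be tracked before this can drive a Fourier-based solver for the one-component chi field.
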
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)  |  | ||||||
| { |  | ||||||
|   ImportGauge(_Uthin,_Uthin); |  | ||||||
| }; |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)  |  | ||||||
| { |  | ||||||
|   ///////////////////////////////////////////////////////////////// |  | ||||||
|   // Trivial import; phases, fattening, and the like pre-applied |  | ||||||
|   ///////////////////////////////////////////////////////////////// |  | ||||||
|   GaugeLinkField U(GaugeGrid()); |  | ||||||
|  |  | ||||||
|   for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
|     U = PeekIndex<LorentzIndex>(_Utriple, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, U, mu ); |  | ||||||
|  |  | ||||||
|     U = adj( Cshift(U, mu, -3)); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 ); |  | ||||||
|  |  | ||||||
|     U = PeekIndex<LorentzIndex>(_Ufat, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, U, mu); |  | ||||||
|  |  | ||||||
|     U = adj( Cshift(U, mu, -1)); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, -U, mu+4); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|   pickCheckerboard(Even, UmuEven,  Umu); |  | ||||||
|   pickCheckerboard(Odd,  UmuOdd ,  Umu); |  | ||||||
|   pickCheckerboard(Even, UUUmuEven,UUUmu); |  | ||||||
|   pickCheckerboard(Odd,  UUUmuOdd, UUUmu); |  | ||||||
| } |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)  |  | ||||||
| { |  | ||||||
|   GaugeLinkField U(GaugeGrid()); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // Double Store should take two fields for Naik and one hop separately. |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat ); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // Apply scale factors to get the right fermion Kinetic term |  | ||||||
|   // Could pass coeffs into the double store to save work. |  | ||||||
|   // 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
|     U = PeekIndex<LorentzIndex>(Umu, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu ); |  | ||||||
|      |  | ||||||
|     U = PeekIndex<LorentzIndex>(Umu, mu+4); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4); |  | ||||||
|  |  | ||||||
|     U = PeekIndex<LorentzIndex>(UUUmu, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu ); |  | ||||||
|      |  | ||||||
|     U = PeekIndex<LorentzIndex>(UUUmu, mu+4); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::cout << " Umu " << Umu._odata[0]<<std::endl; |  | ||||||
|   std::cout << " UUUmu " << UUUmu._odata[0]<<std::endl; |  | ||||||
|   pickCheckerboard(Even, UmuEven, Umu); |  | ||||||
|   pickCheckerboard(Odd,  UmuOdd , Umu); |  | ||||||
|   pickCheckerboard(Even, UUUmuEven, UUUmu); |  | ||||||
|   pickCheckerboard(Odd,   UUUmuOdd, UUUmu); |  | ||||||
| } |  | ||||||
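
The scale factors applied above aim at the kinetic term sketched in the comment; putting the one-hop and three-hop (Naik) pieces together, the intended operator is (a reading of the code, with W_\mu the three-link product and the staggered phases already folded into the links):

    D\psi(x) = \sum_\mu \Big[\, \tfrac{c_1}{2u_0}\big(U_\mu(x)\,\psi(x{+}\hat\mu) - U_\mu^\dagger(x{-}\hat\mu)\,\psi(x{-}\hat\mu)\big)
             + \tfrac{c_2}{2u_0^3}\big(W_\mu(x)\,\psi(x{+}3\hat\mu) - W_\mu^\dagger(x{-}3\hat\mu)\,\psi(x{-}3\hat\mu)\big) \Big]

matching the \pm 0.5\,c_1/u_0 and \pm 0.5\,c_2/u_0^3 factors poked into Umu and UUUmu.
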
|  |  | ||||||
| ///////////////////////////// |  | ||||||
| // Implement the interface |  | ||||||
| ///////////////////////////// |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Dhop(in, out, DaggerNo); |  | ||||||
|   return axpy_norm(out, mass, in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Dhop(in, out, DaggerYes); |  | ||||||
|   return axpy_norm(out, mass, in, out); |  | ||||||
| } |  | ||||||
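
In operator form the two routines above implement, assuming axpy_norm(z,a,x,y) fuses z = a*x + y with a reduction returning |z|^2:

    M = m + D_{\rm hop}, \qquad M^\dagger = m + D_{\rm hop}^\dagger

so each returns \|M\,\mathrm{in}\|^2 (respectively \|M^\dagger\,\mathrm{in}\|^2) as a by-product of the fused update.
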
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) { |  | ||||||
|   if (in.checkerboard == Odd) { |  | ||||||
|     DhopEO(in, out, DaggerNo); |  | ||||||
|   } else { |  | ||||||
|     DhopOE(in, out, DaggerNo); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) { |  | ||||||
|   if (in.checkerboard == Odd) { |  | ||||||
|     DhopEO(in, out, DaggerYes); |  | ||||||
|   } else { |  | ||||||
|     DhopOE(in, out, DaggerYes); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   typename FermionField::scalar_type scal(mass); |  | ||||||
|   out = scal * in; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Mooee(in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   out = (1.0 / (mass)) * in; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in, |  | ||||||
|                                       FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   MooeeInv(in, out); |  | ||||||
| } |  | ||||||
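
Taken together, the even/odd interface realises the checkerboarded form of the staggered matrix, with a trivial (mass) diagonal block:

    M = \begin{pmatrix} m & D_{eo} \\ D_{oe} & m \end{pmatrix},
    \qquad M_{ee} = M_{oo} = m, \qquad M_{ee}^{-1} = M_{oo}^{-1} = \tfrac{1}{m}

which is why Mooee above is a scalar multiply and MooeeInv a scalar divide.
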
|  |  | ||||||
| /////////////////////////////////// |  | ||||||
| // Internal |  | ||||||
| /////////////////////////////////// |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,  |  | ||||||
| 						   GaugeField & mat, |  | ||||||
| 						   const FermionField &A, const FermionField &B, int dag) { |  | ||||||
|   assert((dag == DaggerNo) || (dag == DaggerYes)); |  | ||||||
|  |  | ||||||
|   Compressor compressor; |  | ||||||
|  |  | ||||||
|   FermionField Btilde(B._grid); |  | ||||||
|   FermionField Atilde(B._grid); |  | ||||||
|   Atilde = A; |  | ||||||
|  |  | ||||||
|   st.HaloExchange(B, compressor); |  | ||||||
|  |  | ||||||
|   for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
|     //////////////////////// |  | ||||||
|     // Call the single hop |  | ||||||
|     //////////////////////// |  | ||||||
|     PARALLEL_FOR_LOOP |  | ||||||
|     for (int sss = 0; sss < B._grid->oSites(); sss++) { |  | ||||||
|       Kernels::DhopDir(st, U, UUU, st.CommBuf(), sss, sss, B, Btilde, mu,1); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Force in three link terms |  | ||||||
|     // |  | ||||||
|     //    Impl::InsertForce4D(mat, Btilde, Atilde, mu); |  | ||||||
|     // |  | ||||||
|     // dU_ac(x)/dt = i p_ab U_bc(x) |  | ||||||
|     // |  | ||||||
|     // => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt =  i p_ab U_bc(x) dS_f/dU_ac(x)  |  | ||||||
|     // |  | ||||||
|     // One link: form fragments S_f = A U B  |  | ||||||
|     // |  | ||||||
|     //         write Btilde = U(x) B(x+mu) |  | ||||||
|     // |  | ||||||
|     // mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A));  |  | ||||||
|     //  |  | ||||||
|     // Three link: form fragments S_f = A UUU B  |  | ||||||
|     // |  | ||||||
|     // mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one link or identity matrix |  | ||||||
|     // mat+= outer ( AU, UUB) <-- and then use covariant cshift? |  | ||||||
|     // mat+= outer ( AUU, UB) <-- Returned from call to DhopDir |  | ||||||
|  |  | ||||||
|     assert(0);// need to figure out the force interface with a blasted three link term. |  | ||||||
|      |  | ||||||
|   } |  | ||||||
| } |  | ||||||
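
Spelling out the derivation compressed in the comment block above (no content beyond the comments themselves):

    \frac{dU_{ac}(x)}{dt} = i\,p_{ab}\,U_{bc}(x)
    \;\Longrightarrow\;
    \frac{dS_f}{dt} = \frac{\partial S_f}{\partial U_{ac}(x)}\,\frac{dU_{ac}(x)}{dt}
                    = i\,p_{ab}\,U_{bc}(x)\,\frac{\partial S_f}{\partial U_{ac}(x)}

For a one-link fragment S_f = A\,U\,B one writes \tilde B = U(x)\,B(x{+}\hat\mu), and the force insertion is mat += TraceIndex<SpinIndex>(outerProduct(Btilde, A)); the three-link analogue, as the assert(0) notes, still needs a force interface.
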
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { |  | ||||||
|  |  | ||||||
|   conformable(U._grid, _grid); |  | ||||||
|   conformable(U._grid, V._grid); |  | ||||||
|   conformable(U._grid, mat._grid); |  | ||||||
|  |  | ||||||
|   mat.checkerboard = U.checkerboard; |  | ||||||
|  |  | ||||||
|   DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { |  | ||||||
|  |  | ||||||
|   conformable(U._grid, _cbgrid); |  | ||||||
|   conformable(U._grid, V._grid); |  | ||||||
|   conformable(U._grid, mat._grid); |  | ||||||
|  |  | ||||||
|   assert(V.checkerboard == Even); |  | ||||||
|   assert(U.checkerboard == Odd); |  | ||||||
|   mat.checkerboard = Odd; |  | ||||||
|  |  | ||||||
|   DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) { |  | ||||||
|  |  | ||||||
|   conformable(U._grid, _cbgrid); |  | ||||||
|   conformable(U._grid, V._grid); |  | ||||||
|   conformable(U._grid, mat._grid); |  | ||||||
|  |  | ||||||
|   assert(V.checkerboard == Odd); |  | ||||||
|   assert(U.checkerboard == Even); |  | ||||||
|   mat.checkerboard = Even; |  | ||||||
|  |  | ||||||
|   DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) { |  | ||||||
|   conformable(in._grid, _grid);  // verifies full grid |  | ||||||
|   conformable(in._grid, out._grid); |  | ||||||
|  |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|  |  | ||||||
|   DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) { |  | ||||||
|   conformable(in._grid, _cbgrid);    // verifies half grid |  | ||||||
|   conformable(in._grid, out._grid);  // drops the cb check |  | ||||||
|  |  | ||||||
|   assert(in.checkerboard == Even); |  | ||||||
|   out.checkerboard = Odd; |  | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) { |  | ||||||
|   conformable(in._grid, _cbgrid);    // verifies half grid |  | ||||||
|   conformable(in._grid, out._grid);  // drops the cb check |  | ||||||
|  |  | ||||||
|   assert(in.checkerboard == Odd); |  | ||||||
|   out.checkerboard = Even; |  | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { |  | ||||||
|   DhopDir(in, out, dir, disp); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { |  | ||||||
|  |  | ||||||
|   Compressor compressor; |  | ||||||
|   Stencil.HaloExchange(in, compressor); |  | ||||||
|  |  | ||||||
|   PARALLEL_FOR_LOOP |  | ||||||
|   for (int sss = 0; sss < in._grid->oSites(); sss++) { |  | ||||||
|     Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sss, sss, in, out, dir, disp); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, |  | ||||||
| 						  DoubledGaugeField &U, |  | ||||||
| 						  DoubledGaugeField &UUU, |  | ||||||
| 						  const FermionField &in, |  | ||||||
| 						  FermionField &out, int dag) { |  | ||||||
|   assert((dag == DaggerNo) || (dag == DaggerYes)); |  | ||||||
|  |  | ||||||
|   Compressor compressor; |  | ||||||
|   st.HaloExchange(in, compressor); |  | ||||||
|  |  | ||||||
|   if (dag == DaggerYes) { |  | ||||||
|     PARALLEL_FOR_LOOP |  | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |  | ||||||
|       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     PARALLEL_FOR_LOOP |  | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |  | ||||||
|       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion); |  | ||||||
|  |  | ||||||
|   //AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion); |  | ||||||
|   //TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion); |  | ||||||
|  |  | ||||||
| }} |  | ||||||
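
Between the file diffs, a brief orientation aid: the following is a minimal, self-contained sketch of driving the 4d operator defined above. It is not part of this changeset; the SpaceTimeGrid and GridDefault helpers follow Grid's usual test conventions, and the volume, seeds, and mass are invented for illustration (random, non-unitary links stand in for a real gauge configuration).

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  // Illustrative 8^4 lattice; decomposition helpers as used in Grid's tests.
  std::vector<int> latt({8, 8, 8, 8});
  GridCartesian *UGrid =
      SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);

  GridParallelRNG RNG(UGrid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  // Random (non-unitary) links are enough for a call sketch; thin == fat here.
  LatticeGaugeFieldD Umu(UGrid);
  random(RNG, Umu);

  RealD mass = 0.1; // invented for the example
  ImprovedStaggeredFermionD Ds(Umu, Umu, *UGrid, *UrbGrid, mass);

  typedef ImprovedStaggeredFermionD::FermionField FermionField;
  FermionField src(UGrid), res(UGrid);
  random(RNG, src);

  Ds.Dhop(src, res, DaggerNo); // full-grid hopping term
  Ds.M(src, res);              // out = mass*in + Dhop(in)

  Grid_finalize();
  return 0;
}

DhopOE/DhopEO and the Meooe family then expose the red-black pieces; a note on the resulting Schur form follows the 5d header further below.
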
| @@ -1,167 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h |  | ||||||
|  |  | ||||||
| Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Azusa Yamaguchi, Peter Boyle |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #ifndef GRID_QCD_IMPR_STAG_FERMION_H |  | ||||||
| #define GRID_QCD_IMPR_STAG_FERMION_H |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
|  |  | ||||||
| namespace QCD { |  | ||||||
|  |  | ||||||
| class ImprovedStaggeredFermionStatic { |  | ||||||
|  public: |  | ||||||
|   static const std::vector<int> directions; |  | ||||||
|   static const std::vector<int> displacements; |  | ||||||
|   static const int npoint = 16; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic { |  | ||||||
|  public: |  | ||||||
|   INHERIT_IMPL_TYPES(Impl); |  | ||||||
|   typedef StaggeredKernels<Impl> Kernels; |  | ||||||
|  |  | ||||||
|   FermionField _tmp; |  | ||||||
|   FermionField &tmp(void) { return _tmp; } |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   // Implement the abstract base |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   GridBase *GaugeGrid(void) { return _grid; } |  | ||||||
|   GridBase *GaugeRedBlackGrid(void) { return _cbgrid; } |  | ||||||
|   GridBase *FermionGrid(void) { return _grid; } |  | ||||||
|   GridBase *FermionRedBlackGrid(void) { return _cbgrid; } |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////// |  | ||||||
|   // override multiply; cut the number of routines by passing a dagger argument |  | ||||||
|   // and also make the interface more uniformly consistent |  | ||||||
|   ////////////////////////////////////////////////////////////////// |  | ||||||
|   RealD M(const FermionField &in, FermionField &out); |  | ||||||
|   RealD Mdag(const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////// |  | ||||||
|   // half checkerboard operations |  | ||||||
|   ///////////////////////////////////////////////////////// |  | ||||||
|   void Meooe(const FermionField &in, FermionField &out); |  | ||||||
|   void MeooeDag(const FermionField &in, FermionField &out); |  | ||||||
|   void Mooee(const FermionField &in, FermionField &out); |  | ||||||
|   void MooeeDag(const FermionField &in, FermionField &out); |  | ||||||
|   void MooeeInv(const FermionField &in, FermionField &out); |  | ||||||
|   void MooeeInvDag(const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|   //////////////////////// |  | ||||||
|   // Derivative interface |  | ||||||
|   //////////////////////// |  | ||||||
|   // Interface calls an internal routine |  | ||||||
|   void DhopDeriv  (GaugeField &mat, const FermionField &U, const FermionField &V, int dag); |  | ||||||
|   void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); |  | ||||||
|   void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   // non-hermitian hopping term; half cb or both |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   void Dhop  (const FermionField &in, FermionField &out, int dag); |  | ||||||
|   void DhopOE(const FermionField &in, FermionField &out, int dag); |  | ||||||
|   void DhopEO(const FermionField &in, FermionField &out, int dag); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   // Multigrid assistance; force term uses too |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   void Mdir(const FermionField &in, FermionField &out, int dir, int disp); |  | ||||||
|   void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   // Extra methods added by this derived class |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   void DerivInternal(StencilImpl &st,  |  | ||||||
| 		     DoubledGaugeField &U,DoubledGaugeField &UUU, |  | ||||||
| 		     GaugeField &mat,  |  | ||||||
| 		     const FermionField &A, const FermionField &B, int dag); |  | ||||||
|  |  | ||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, |  | ||||||
|                     const FermionField &in, FermionField &out, int dag); |  | ||||||
|  |  | ||||||
|   // Constructor |  | ||||||
|   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 			   GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 			   RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, |  | ||||||
| 			   const ImplParams &p = ImplParams()); |  | ||||||
|  |  | ||||||
|   ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid, |  | ||||||
| 			   GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 			   const ImplParams &p = ImplParams()); |  | ||||||
|  |  | ||||||
|   ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass, |  | ||||||
| 			   const ImplParams &p = ImplParams()); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   // DoubleStore impl dependent |  | ||||||
|   void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat); |  | ||||||
|   void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat); |  | ||||||
|   void ImportGauge(const GaugeField &_Uthin); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|   // Data members required to support the functionality |  | ||||||
|   /////////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   //    protected: |  | ||||||
|  public: |  | ||||||
|   // any other parameters of action ??? |  | ||||||
|  |  | ||||||
|   RealD mass; |  | ||||||
|   RealD u0; |  | ||||||
|   RealD c1; |  | ||||||
|   RealD c2; |  | ||||||
|  |  | ||||||
|   GridBase *_grid; |  | ||||||
|   GridBase *_cbgrid; |  | ||||||
|  |  | ||||||
|   // Defines the stencils for even and odd |  | ||||||
|   StencilImpl Stencil; |  | ||||||
|   StencilImpl StencilEven; |  | ||||||
|   StencilImpl StencilOdd; |  | ||||||
|  |  | ||||||
|   // Copy of the gauge field , with even and odd subsets |  | ||||||
|   DoubledGaugeField Umu; |  | ||||||
|   DoubledGaugeField UmuEven; |  | ||||||
|   DoubledGaugeField UmuOdd; |  | ||||||
|  |  | ||||||
|   DoubledGaugeField UUUmu; |  | ||||||
|   DoubledGaugeField UUUmuEven; |  | ||||||
|   DoubledGaugeField UUUmuOdd; |  | ||||||
|  |  | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; |  | ||||||
| typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; |  | ||||||
|  |  | ||||||
| } |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
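
As a reading aid for the defaulted coefficients above (c1 = 9/8, c2 = -1/24, tadpole factor u0): ImportGauge in the implementation files scales the stored one-hop links by +/- c1/(2 u0) and the three-hop Naik links by +/- c2/(2 u0^3), so the assembled massless operator is, in conventional notation (ours, not from the source, with the staggered phases absorbed into the stored links):

\[
D\,\psi(x) \;=\; \sum_{\mu=0}^{3}\Big[
\frac{c_1}{2u_0}\big(U_\mu(x)\,\psi(x+\hat\mu)-U_\mu^\dagger(x-\hat\mu)\,\psi(x-\hat\mu)\big)
\;+\;
\frac{c_2}{2u_0^{3}}\big(U_\mu^{(3)}(x)\,\psi(x+3\hat\mu)-U_\mu^{(3)\dagger}(x-3\hat\mu)\,\psi(x-3\hat\mu)\big)\Big]
\]

and M(in,out) returns \( (m + D)\,\psi \), matching the axpy_norm(out, mass, in, out) pattern in the 5d implementation below.
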
| @@ -1,355 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> |  | ||||||
| #include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h> |  | ||||||
| #include <Grid/perfmon/PerfCount.h> |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
| namespace QCD { |  | ||||||
|    |  | ||||||
| // S-direction is INNERMOST and takes no part in the parity. |  | ||||||
| const std::vector<int>  |  | ||||||
| ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4}); |  | ||||||
| const std::vector<int>  |  | ||||||
| ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3}); |  | ||||||
|  |  | ||||||
|   // 5d lattice with the same layout as used for DWF; here the fifth dimension stacks independent right-hand sides. |  | ||||||
| template<class Impl> |  | ||||||
| ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat, |  | ||||||
| 							     GridCartesian         &FiveDimGrid, |  | ||||||
| 							     GridRedBlackCartesian &FiveDimRedBlackGrid, |  | ||||||
| 							     GridCartesian         &FourDimGrid, |  | ||||||
| 							     GridRedBlackCartesian &FourDimRedBlackGrid, |  | ||||||
| 							     RealD _mass, |  | ||||||
| 							     RealD _c1,RealD _c2, RealD _u0, |  | ||||||
| 							     const ImplParams &p) : |  | ||||||
|   Kernels(p), |  | ||||||
|   _FiveDimGrid        (&FiveDimGrid), |  | ||||||
|   _FiveDimRedBlackGrid(&FiveDimRedBlackGrid), |  | ||||||
|   _FourDimGrid        (&FourDimGrid), |  | ||||||
|   _FourDimRedBlackGrid(&FourDimRedBlackGrid), |  | ||||||
|   Stencil    (&FiveDimGrid,npoint,Even,directions,displacements), |  | ||||||
|   StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even |  | ||||||
|   StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd |  | ||||||
|   mass(_mass), |  | ||||||
|   c1(_c1), |  | ||||||
|   c2(_c2), |  | ||||||
|   u0(_u0), |  | ||||||
|   Umu(&FourDimGrid), |  | ||||||
|   UmuEven(&FourDimRedBlackGrid), |  | ||||||
|   UmuOdd (&FourDimRedBlackGrid), |  | ||||||
|   UUUmu(&FourDimGrid), |  | ||||||
|   UUUmuEven(&FourDimRedBlackGrid), |  | ||||||
|   UUUmuOdd(&FourDimRedBlackGrid), |  | ||||||
|   Lebesgue(&FourDimGrid), |  | ||||||
|   LebesgueEvenOdd(&FourDimRedBlackGrid), |  | ||||||
|   _tmp(&FiveDimRedBlackGrid) |  | ||||||
| { |  | ||||||
|  |  | ||||||
|   // some assertions |  | ||||||
|   assert(FiveDimGrid._ndimension==5); |  | ||||||
|   assert(FourDimGrid._ndimension==4); |  | ||||||
|   assert(FourDimRedBlackGrid._ndimension==4); |  | ||||||
|   assert(FiveDimRedBlackGrid._ndimension==5); |  | ||||||
|   assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction |  | ||||||
|  |  | ||||||
|   // extent of fifth dim and not spread out |  | ||||||
|   Ls=FiveDimGrid._fdimensions[0]; |  | ||||||
|   assert(FiveDimRedBlackGrid._fdimensions[0]==Ls); |  | ||||||
|   assert(FiveDimGrid._processors[0]         ==1); |  | ||||||
|   assert(FiveDimRedBlackGrid._processors[0] ==1); |  | ||||||
|  |  | ||||||
|   // Other dimensions must match the decomposition of the four-D fields  |  | ||||||
|   for(int d=0;d<4;d++){ |  | ||||||
|     assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]); |  | ||||||
|     assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); |  | ||||||
|     assert(FourDimRedBlackGrid._processors[d]   ==FourDimGrid._processors[d]); |  | ||||||
|  |  | ||||||
|     assert(FiveDimGrid._fdimensions[d+1]        ==FourDimGrid._fdimensions[d]); |  | ||||||
|     assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); |  | ||||||
|     assert(FourDimRedBlackGrid._fdimensions[d]  ==FourDimGrid._fdimensions[d]); |  | ||||||
|  |  | ||||||
|     assert(FiveDimGrid._simd_layout[d+1]        ==FourDimGrid._simd_layout[d]); |  | ||||||
|     assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]); |  | ||||||
|     assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if (Impl::LsVectorised) {  |  | ||||||
|  |  | ||||||
|     int nsimd = Simd::Nsimd(); |  | ||||||
|      |  | ||||||
|     // Dimension zero of the five-d is the Ls direction |  | ||||||
|     assert(FiveDimGrid._simd_layout[0]        ==nsimd); |  | ||||||
|     assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd); |  | ||||||
|  |  | ||||||
|     for(int d=0;d<4;d++){ |  | ||||||
|       assert(FourDimGrid._simd_layout[d]==1); |  | ||||||
|       assert(FourDimRedBlackGrid._simd_layout[d]==1); |  | ||||||
|       assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|   } else { |  | ||||||
|      |  | ||||||
|     // Dimension zero of the five-d is the Ls direction |  | ||||||
|     assert(FiveDimRedBlackGrid._simd_layout[0]==1); |  | ||||||
|     assert(FiveDimGrid._simd_layout[0]        ==1); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Allocate the required comms buffer |  | ||||||
|   ImportGauge(_Uthin,_Ufat); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)  |  | ||||||
| { |  | ||||||
|   ImportGauge(_Uthin,_Uthin); |  | ||||||
| } |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat) |  | ||||||
| { |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // DoubleStore takes two fields, handling the Naik (three-hop) and one-hop links separately. |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat ); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   // Apply scale factors to get the right fermion kinetic term: |  | ||||||
|   //   0.5 * ( U(x) psi(x+mu) - Udag(x-mu) psi(x-mu) ) |  | ||||||
|   // The coefficients could instead be passed into DoubleStore to save work. |  | ||||||
|   //////////////////////////////////////////////////////// |  | ||||||
|   for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|  |  | ||||||
|     auto U = PeekIndex<LorentzIndex>(Umu, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu ); |  | ||||||
|      |  | ||||||
|     U = PeekIndex<LorentzIndex>(Umu, mu+4); |  | ||||||
|     PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4); |  | ||||||
|  |  | ||||||
|     U = PeekIndex<LorentzIndex>(UUUmu, mu); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu ); |  | ||||||
|      |  | ||||||
|     U = PeekIndex<LorentzIndex>(UUUmu, mu+4); |  | ||||||
|     PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4); |  | ||||||
|   } |  | ||||||
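|   // (Annotation, not original source:) after this loop the stored one-hop |  | ||||||
|   // links carry +/- 0.5*c1/u0 and the Naik links +/- 0.5*c2/u0^3; the sign |  | ||||||
|   // distinguishes forward (mu) from backward (mu+4) hops, so the kernels |  | ||||||
|   // apply no further coefficients. |  | ||||||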
|  |  | ||||||
|   pickCheckerboard(Even, UmuEven, Umu); |  | ||||||
|   pickCheckerboard(Odd,  UmuOdd , Umu); |  | ||||||
|   pickCheckerboard(Even, UUUmuEven, UUUmu); |  | ||||||
|   pickCheckerboard(Odd,  UUUmuOdd, UUUmu); |  | ||||||
| } |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp) |  | ||||||
| { |  | ||||||
|   int dir = dir5-1; // Maps to the ordering in "directions" above, as passed to the stencil; |  | ||||||
|                     // the innermost fifth dimension is dropped. |  | ||||||
|  |  | ||||||
|   Compressor compressor; |  | ||||||
|   Stencil.HaloExchange(in,compressor); |  | ||||||
|  |  | ||||||
|   parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){ |  | ||||||
|     for(int s=0;s<Ls;s++){ |  | ||||||
|       int sU=ss; |  | ||||||
|       int sF = s+Ls*sU;  |  | ||||||
|       Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st, |  | ||||||
|             DoubledGaugeField & U, |  | ||||||
|             DoubledGaugeField & UUU, |  | ||||||
|             GaugeField &mat, |  | ||||||
|             const FermionField &A, |  | ||||||
|             const FermionField &B, |  | ||||||
|             int dag) |  | ||||||
| { |  | ||||||
|   // No force terms in the multi-RHS staggered solver |  | ||||||
|   assert(0); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat, |  | ||||||
| 				      const FermionField &A, |  | ||||||
| 				      const FermionField &B, |  | ||||||
| 				      int dag) |  | ||||||
| { |  | ||||||
|   assert(0); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat, |  | ||||||
| 					const FermionField &A, |  | ||||||
| 					const FermionField &B, |  | ||||||
| 					int dag) |  | ||||||
| { |  | ||||||
|   assert(0); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat, |  | ||||||
| 					const FermionField &A, |  | ||||||
| 					const FermionField &B, |  | ||||||
| 					int dag) |  | ||||||
| { |  | ||||||
|   assert(0); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, |  | ||||||
| 						    DoubledGaugeField & U,DoubledGaugeField & UUU, |  | ||||||
| 						    const FermionField &in, FermionField &out,int dag) |  | ||||||
| { |  | ||||||
|   Compressor compressor; |  | ||||||
|   int LLs = in._grid->_rdimensions[0]; |  | ||||||
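|   // Note (added): LLs is the locally stored extent of the fifth dimension, |  | ||||||
|   // reduced by the SIMD layout when LsVectorised; it counts the s-slices |  | ||||||
|   // handled per outer 4d site sU below. |  | ||||||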
|   st.HaloExchange(in,compressor); |  | ||||||
|    |  | ||||||
|   // Dhop takes the 4d grid from U and builds a 5d index for the fermion |  | ||||||
|   if (dag == DaggerYes) { |  | ||||||
|     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { |  | ||||||
|       int sU=ss; |  | ||||||
|       Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out); |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { |  | ||||||
|       int sU=ss; |  | ||||||
|       Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), LLs, sU, in, out); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) |  | ||||||
| { |  | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |  | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |  | ||||||
|  |  | ||||||
|   assert(in.checkerboard==Even); |  | ||||||
|   out.checkerboard = Odd; |  | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag); |  | ||||||
| } |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) |  | ||||||
| { |  | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |  | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |  | ||||||
|  |  | ||||||
|   assert(in.checkerboard==Odd); |  | ||||||
|   out.checkerboard = Even; |  | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag); |  | ||||||
| } |  | ||||||
| template<class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) |  | ||||||
| { |  | ||||||
|   conformable(in._grid,FermionGrid()); // verifies full grid |  | ||||||
|   conformable(in._grid,out._grid); |  | ||||||
|  |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|  |  | ||||||
|   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////// |  | ||||||
| // Implement the general interface. Here we use SAME mass on all slices |  | ||||||
| ///////////////////////////////////////////////////////////////////////// |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) { |  | ||||||
|   DhopDir(in, out, dir, disp); |  | ||||||
| } |  | ||||||
| template <class Impl> |  | ||||||
| RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Dhop(in, out, DaggerNo); |  | ||||||
|   return axpy_norm(out, mass, in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Dhop(in, out, DaggerYes); |  | ||||||
|   return axpy_norm(out, mass, in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) { |  | ||||||
|   if (in.checkerboard == Odd) { |  | ||||||
|     DhopEO(in, out, DaggerNo); |  | ||||||
|   } else { |  | ||||||
|     DhopOE(in, out, DaggerNo); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) { |  | ||||||
|   if (in.checkerboard == Odd) { |  | ||||||
|     DhopEO(in, out, DaggerYes); |  | ||||||
|   } else { |  | ||||||
|     DhopOE(in, out, DaggerYes); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   typename FermionField::scalar_type scal(mass); |  | ||||||
|   out = scal * in; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   Mooee(in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   out = (1.0 / (mass)) * in; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class Impl> |  | ||||||
| void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in, |  | ||||||
|                                       FermionField &out) { |  | ||||||
|   out.checkerboard = in.checkerboard; |  | ||||||
|   MooeeInv(in, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D); |  | ||||||
| FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D); |  | ||||||
|    |  | ||||||
| }} |  | ||||||
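
Before the header diff below, a construction sketch for this 5d multi-RHS variant. Again hedged: it is not part of this changeset, the helper names follow Grid's test conventions, and Ls, volume, seeds, and mass are invented. Per the constructor assertions above, the fifth dimension is innermost, not MPI-decomposed, and simply stacks Ls independent 4d sources.

#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  const int Ls = 8; // number of stacked right-hand sides (invented)
  std::vector<int> latt({8, 8, 8, 8});
  GridCartesian *UGrid =
      SpaceTimeGrid::makeFourDimGrid(latt, GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(std::vector<int>({5, 6, 7, 8}));

  LatticeGaugeFieldD Umu(UGrid);
  random(RNG4, Umu); // non-unitary links, sketch only; thin == fat

  RealD mass = 0.1;
  typedef ImprovedStaggeredFermion5D<StaggeredImplD> StagOp5D;
  StagOp5D Ds(Umu, Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass);

  StagOp5D::FermionField src(FGrid), res(FGrid);
  random(RNG5, src);           // Ls independent 4d sources, stacked
  Ds.Dhop(src, res, DaggerNo); // one halo exchange serves all Ls slices

  Grid_finalize();
  return 0;
}
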
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,167 +0,0 @@ | |||||||
|  |  | ||||||
|     /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #ifndef  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H |  | ||||||
| #define  GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H |  | ||||||
|  |  | ||||||
| namespace Grid { |  | ||||||
| namespace QCD { |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // This is the 4d red black case appropriate to support the 5d multi-RHS operator |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|     class ImprovedStaggeredFermion5DStatic {  |  | ||||||
|     public: |  | ||||||
|       // S-direction is INNERMOST and takes no part in the parity. |  | ||||||
|       static const std::vector<int> directions; |  | ||||||
|       static const std::vector<int> displacements; |  | ||||||
|       static const int npoint = 16; |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     template<class Impl> |  | ||||||
|     class ImprovedStaggeredFermion5D :  public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic  |  | ||||||
|     { |  | ||||||
|     public: |  | ||||||
|       INHERIT_IMPL_TYPES(Impl); |  | ||||||
|       typedef StaggeredKernels<Impl> Kernels; |  | ||||||
|  |  | ||||||
|       FermionField _tmp; |  | ||||||
|       FermionField &tmp(void) { return _tmp; } |  | ||||||
|  |  | ||||||
|       /////////////////////////////////////////////////////////////// |  | ||||||
|       // Implement the abstract base |  | ||||||
|       /////////////////////////////////////////////////////////////// |  | ||||||
|       GridBase *GaugeGrid(void)              { return _FourDimGrid ;} |  | ||||||
|       GridBase *GaugeRedBlackGrid(void)      { return _FourDimRedBlackGrid ;} |  | ||||||
|       GridBase *FermionGrid(void)            { return _FiveDimGrid;} |  | ||||||
|       GridBase *FermionRedBlackGrid(void)    { return _FiveDimRedBlackGrid;} |  | ||||||
|  |  | ||||||
|       // full checkerboard operations (implemented in the accompanying .cc) |  | ||||||
|       RealD  M    (const FermionField &in, FermionField &out); |  | ||||||
|       RealD  Mdag (const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|       // half checkerboard operations |  | ||||||
|       void   Meooe       (const FermionField &in, FermionField &out); |  | ||||||
|       void   Mooee       (const FermionField &in, FermionField &out); |  | ||||||
|       void   MooeeInv    (const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|       void   MeooeDag    (const FermionField &in, FermionField &out); |  | ||||||
|       void   MooeeDag    (const FermionField &in, FermionField &out); |  | ||||||
|       void   MooeeInvDag (const FermionField &in, FermionField &out); |  | ||||||
|  |  | ||||||
|       void   Mdir   (const FermionField &in, FermionField &out,int dir,int disp); |  | ||||||
|       void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); |  | ||||||
|  |  | ||||||
|       // These can be overridden by fancy 5d chiral action |  | ||||||
|       void DhopDeriv  (GaugeField &mat,const FermionField &U,const FermionField &V,int dag); |  | ||||||
|       void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); |  | ||||||
|       void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); |  | ||||||
|  |  | ||||||
|       // Implement hopping term non-hermitian hopping term; half cb or both |  | ||||||
|       void Dhop  (const FermionField &in, FermionField &out,int dag); |  | ||||||
|       void DhopOE(const FermionField &in, FermionField &out,int dag); |  | ||||||
|       void DhopEO(const FermionField &in, FermionField &out,int dag); |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     /////////////////////////////////////////////////////////////// |  | ||||||
|     // New methods added  |  | ||||||
|     /////////////////////////////////////////////////////////////// |  | ||||||
|     void DerivInternal(StencilImpl & st, |  | ||||||
| 		       DoubledGaugeField & U, |  | ||||||
| 		       DoubledGaugeField & UUU, |  | ||||||
| 		       GaugeField &mat, |  | ||||||
| 		       const FermionField &A, |  | ||||||
| 		       const FermionField &B, |  | ||||||
| 		       int dag); |  | ||||||
|      |  | ||||||
|     void DhopInternal(StencilImpl & st, |  | ||||||
| 		      LebesgueOrder &lo, |  | ||||||
| 		      DoubledGaugeField &U, |  | ||||||
| 		      DoubledGaugeField &UUU, |  | ||||||
| 		      const FermionField &in,  |  | ||||||
| 		      FermionField &out, |  | ||||||
| 		      int dag); |  | ||||||
|      |  | ||||||
|     // Constructors |  | ||||||
|     ImprovedStaggeredFermion5D(GaugeField &_Uthin, |  | ||||||
| 			       GaugeField &_Ufat, |  | ||||||
| 			       GridCartesian         &FiveDimGrid, |  | ||||||
| 			       GridRedBlackCartesian &FiveDimRedBlackGrid, |  | ||||||
| 			       GridCartesian         &FourDimGrid, |  | ||||||
| 			       GridRedBlackCartesian &FourDimRedBlackGrid, |  | ||||||
| 			       RealD _mass, |  | ||||||
| 			       RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0, |  | ||||||
| 			       const ImplParams &p= ImplParams()); |  | ||||||
|      |  | ||||||
|     // DoubleStore |  | ||||||
|     void ImportGauge(const GaugeField &_U); |  | ||||||
|     void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat); |  | ||||||
|      |  | ||||||
|     /////////////////////////////////////////////////////////////// |  | ||||||
|     // Data members required to support the functionality |  | ||||||
|     /////////////////////////////////////////////////////////////// |  | ||||||
|   public: |  | ||||||
|      |  | ||||||
|     GridBase *_FourDimGrid; |  | ||||||
|     GridBase *_FourDimRedBlackGrid; |  | ||||||
|     GridBase *_FiveDimGrid; |  | ||||||
|     GridBase *_FiveDimRedBlackGrid; |  | ||||||
|      |  | ||||||
|     RealD mass; |  | ||||||
|     RealD c1; |  | ||||||
|     RealD c2; |  | ||||||
|     RealD u0; |  | ||||||
|     int Ls; |  | ||||||
|      |  | ||||||
|     //Defines the stencils for even and odd |  | ||||||
|     StencilImpl Stencil;  |  | ||||||
|     StencilImpl StencilEven;  |  | ||||||
|     StencilImpl StencilOdd;  |  | ||||||
|      |  | ||||||
|     // Copy of the gauge field , with even and odd subsets |  | ||||||
|     DoubledGaugeField Umu; |  | ||||||
|     DoubledGaugeField UmuEven; |  | ||||||
|     DoubledGaugeField UmuOdd; |  | ||||||
|  |  | ||||||
|     DoubledGaugeField UUUmu; |  | ||||||
|     DoubledGaugeField UUUmuEven; |  | ||||||
|     DoubledGaugeField UUUmuOdd; |  | ||||||
|      |  | ||||||
|     LebesgueOrder Lebesgue; |  | ||||||
|     LebesgueOrder LebesgueEvenOdd; |  | ||||||
|      |  | ||||||
|     // Comms buffer |  | ||||||
|     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; |  | ||||||
|      |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
| }} |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
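
A closing note on the red-black structure declared above: because Mooee and MooeeInv reduce to multiplication by m and 1/m (see the implementations earlier in this diff), even-odd preconditioning takes a particularly simple Schur form. Writing D_eo for the hop from odd to even sites and D_oe for its partner (notation ours, not from the source):

\[
M=\begin{pmatrix} m & D_{eo} \\ D_{oe} & m \end{pmatrix},
\qquad
\hat M_{oo}\,\psi_o \;=\; \Big(m-\frac{1}{m}\,D_{oe}D_{eo}\Big)\psi_o ,
\]

so a checkerboarded solver needs only DhopOE/DhopEO plus scalar multiplies, which is exactly what Meooe, Mooee, and MooeeInv provide.
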
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  GRID_QCD_MOBIUS_FERMION_H | #ifndef  GRID_QCD_MOBIUS_FERMION_H | ||||||
| #define  GRID_QCD_MOBIUS_FERMION_H | #define  GRID_QCD_MOBIUS_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H | #ifndef  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H | ||||||
| #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H | #define  GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H | #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H | ||||||
| #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H | #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H | #ifndef  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H | ||||||
| #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H | #define  OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H | #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H | ||||||
| #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H | #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H | #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H | ||||||
| #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H | #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H | #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H | ||||||
| #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H | #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H | #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H | ||||||
| #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H | #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H | ||||||
|  |  | ||||||
| #include <Grid/qcd/action/fermion/FermionCore.h> | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|   | |||||||
Some files were not shown because too many files have changed in this diff.