diff --git a/.travis.yml b/.travis.yml
index cdea0f9f..ad4e5b73 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.9
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-4.9
- - compiler: gcc
- dist: trusty
- sudo: required
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-5
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: VERSION=-5
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- - compiler: clang
- dist: trusty
- addons:
- apt:
- sources:
- - ubuntu-toolchain-r-test
- packages:
- - g++-4.8
- - libmpfr-dev
- - libgmp-dev
- - libmpc-dev
- - libopenmpi-dev
- - openmpi-bin
- - binutils-dev
- env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@@ -106,8 +44,4 @@ script:
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- - echo make clean
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
diff --git a/README.md b/README.md
index 1e0988f3..13dd6996 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,4 @@
-# Grid
-
-
- Last stable release | [status badge]
-
- Development branch | [status badge]
-
-
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
**Data parallel C++ mathematical object library.**
diff --git a/TODO b/TODO
index 001c6c0c..83bfda5e 100644
--- a/TODO
+++ b/TODO
@@ -2,18 +2,20 @@ TODO:
---------------
Large item work list:
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+1)- BG/Q port and check ; Andrew says ok.
2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
-4)- Precision conversion and sort out localConvert <-- partial
- - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+--
+3a)- RNG I/O in ILDG/SciDAC (minor)
+3b)- Precision conversion and sort out localConvert <-- partial/easy
+3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
Recent DONE
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
-- GaugeFix into central location <-- DONE
-- Scidac and Ildg metadata handling <-- DONE
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc
new file mode 100644
index 00000000..666e4830
--- /dev/null
+++ b/benchmarks/Benchmark_ITT.cc
@@ -0,0 +1,800 @@
+ /*************************************************************************************
+
+ Grid physics library, www.github.com/paboyle/Grid
+
+ Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
+
+ Copyright (C) 2015
+
+Author: Peter Boyle
+Author: paboyle
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#include <Grid/Grid.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
+
+
+std::vector<int> L_list;
+std::vector<int> Ls_list;
+std::vector<double> mflop_list;
+
+double mflop_ref;
+double mflop_ref_err;
+
+int NN_global;
+
+struct time_statistics{
+ double mean;
+ double err;
+ double min;
+ double max;
+
+  void statistics(std::vector<double> v){
+ double sum = std::accumulate(v.begin(), v.end(), 0.0);
+ mean = sum / v.size();
+
+ std::vector diff(v.size());
+ std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
+ double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
+ err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
+
+ auto result = std::minmax_element(v.begin(), v.end());
+ min = *result.first;
+ max = *result.second;
+}
+};
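
(Note on the statistics helper above: reading off the code, the "err" member it fills is the standard error of the mean of the timing samples,

    \bar{t} = \frac{1}{N}\sum_{i=1}^{N} t_i , \qquad
    \sigma_{\bar{t}} = \sqrt{ \frac{ \sum_{i=1}^{N} (t_i - \bar{t})^2 }{ N(N-1) } } ,

so the quoted error bars shrink like 1/sqrt(N) with the number of benchmark iterations.)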
+
+void comms_header(){
+ std::cout < simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
+
+  std::vector<double> t_time(Nloop);
+ time_statistics timestat;
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+      std::vector<HalfSpinColourVectorD *> xbuf(8);
+      std::vector<HalfSpinColourVectorD *> rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
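
A quick sketch of the message sizes this comms benchmark exchanges per direction. The payload is bytes = lat^3 * Ls * sizeof(HalfSpinColourVectorD); the 96-byte figure below assumes a half spinor of doubles (2 spin x 3 colour complex components), an assumption stated here rather than taken from this hunk, and the Ls value is purely illustrative.

    #include <cstdio>
    #include <cstdint>
    #include <cstddef>

    int main() {
      const std::size_t half_spinor_bytes = 2 * 3 * 2 * sizeof(double); // assumed 96 bytes per site
      const int Ls = 12;                                                // hypothetical fifth dimension
      for (int lat = 4; lat <= 32; lat += 4) {
        std::uint64_t bytes = (std::uint64_t)lat * lat * lat * Ls * half_spinor_bytes;
        std::printf("lat=%2d Ls=%2d : %8.3f MB per direction\n", lat, Ls, bytes / 1.0e6);
      }
      return 0;
    }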
+
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ int ncomm;
+ double dbytes;
+      std::vector<double> times(Nloop);
+ for(int i=0;i1 ) {
+
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank,
+ bytes,dir);
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ ncomm++;
+
+#ifdef GRID_OMP
+#pragma omp atomic
+#endif
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+ // for(int i=0;i > LatticeVec;
+ typedef iVector Vec;
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+
+ std::cout<({45,12,81,9}));
+ for(int lat=8;lat<=lmax;lat+=4){
+
+ std::vector latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
+ int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+ NP= Grid.RankCount();
+ NN =Grid.NodeCount();
+
+ Vec rn ; random(sRNG,rn);
+
+ LatticeVec z(&Grid); z=rn;
+ LatticeVec x(&Grid); x=rn;
+ LatticeVec y(&Grid); y=rn;
+ double a=2.0;
+
+ uint64_t Nloop=NLOOP;
+
+ double start=usecond();
+ for(int i=0;i mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+  std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+  std::vector<int> local({L,L,L,L});
+
+  GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+  std::vector<int> internal;
+  if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+  else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+  else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+  else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+ else assert(0);
+
+  std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+  std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(sFGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (sFGrid); random(RNG5,src);
+ LatticeFermion tmp (sFGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
+ LatticeFermion src_e (sFrbGrid);
+ LatticeFermion src_o (sFrbGrid);
+ LatticeFermion r_e (sFrbGrid);
+ LatticeFermion r_o (sFrbGrid);
+ LatticeFermion r_eo (sFGrid);
+ LatticeFermion err (sFGrid);
+ {
+
+ pickCheckerboard(Even,src_e,src);
+ pickCheckerboard(Odd,src_o,src);
+
+#if defined(AVX512)
+ const int num_cases = 6;
+ std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
+#else
+ const int num_cases = 4;
+ std::string fmt("U/S ; U/O ; G/S ; G/O ");
+#endif
+ controls Cases [] = {
+#ifdef AVX512
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+#endif
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential },
+ { QCD::WilsonKernelsStatic::OptGeneric , QCD::WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicySequential }
+ };
+
+ for(int c=0;cBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+
+ sDw.ZeroCounters();
+ time_statistics timestat;
+      std::vector<double> t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops mflops_all;
+
+ ///////////////////////////////////////////////////////
+ // Set/Get the layout & grid size
+ ///////////////////////////////////////////////////////
+ int threads = GridThread::GetThreads();
+  std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
+  std::vector<int> local({L,L,L,L});
+
+  GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
+ GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+ uint64_t NP = TmpGrid->RankCount();
+ uint64_t NN = TmpGrid->NodeCount();
+ NN_global=NN;
+ uint64_t SHM=NP/NN;
+
+  std::vector<int> internal;
+  if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
+  else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
+  else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
+  else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
+ else assert(0);
+
+  std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
+  std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
+
+ ///////// Welcome message ////////////
+ std::cout< seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ ///////// Source preparation ////////////
+ LatticeFermion src (FGrid); random(RNG5,src);
+ LatticeFermion ref (FGrid);
+ LatticeFermion tmp (FGrid);
+
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);
+
+ DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+ ////////////////////////////////////
+ // Naive wilson implementation
+ ////////////////////////////////////
+ {
+ LatticeGaugeField Umu5d(FGrid);
+    std::vector<LatticeColourMatrix> U(4,FGrid);
+ for(int ss=0;ssoSites();ss++){
+ for(int s=0;s(Umu5d,mu);
+ }
+ for(int mu=0;muBarrier();
+ for(int i=0;iBarrier();
+ double t1=usecond();
+ // uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+ // if (ncall < 500) ncall = 500;
+ uint64_t ncall = 1000;
+
+ FGrid->Broadcast(0,&ncall,sizeof(ncall));
+
+      // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
+      std::vector<double> t_time(ncall);
+ for(uint64_t i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops;
+ if ( mflops({8,2,2,2});
+#else
+ LebesgueOrder::Block = std::vector({2,2,2,2});
+#endif
+ Benchmark::Decomposition();
+
+ int do_memory=1;
+ int do_comms =1;
+ int do_su3 =0;
+ int do_wilson=1;
+ int do_dwf =1;
+
+ if ( do_su3 ) {
+ // empty for now
+ }
+#if 1
+ int sel=2;
+  std::vector<int> L_list({8,12,16,24});
+#else
+ int sel=1;
+  std::vector<int> L_list({8,12});
+#endif
+ int selm1=sel-1;
+  std::vector<double> robust_list;
+
+  std::vector<double> wilson;
+  std::vector<double> dwf4;
+  std::vector<double> dwf5;
+
+ if ( do_wilson ) {
+ int Ls=1;
+ std::cout<1) ) {
+ std::cout<1) nmu++;
std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
std::cout< latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
-  std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-  std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+  std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
+  std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat,lat,lat,lat});
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
-  std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-  std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+  std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8);
+  std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8);
+ for(int mu=0;mu<8;mu++){
+ xbuf[mu].resize(lat*lat*lat*Ls);
+ rbuf[mu].resize(lat*lat*lat*Ls);
+ // std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] < latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
- bytes);
+ bytes,mu);
comm_proc = mpi_layout[mu]-1;
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
- bytes);
+ bytes,mu+4);
}
}
- Grid.StencilSendToRecvFromComplete(requests);
+ Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
header();
for(int lat=4;lat<=maxlat;lat+=4){
- for(int Ls=8;Ls<=32;Ls*=2){
+ for(int Ls=8;Ls<=8;Ls*=2){
std::vector latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ bytes,mu);
+ Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
comm_proc = mpi_layout[mu]-1;
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
- bytes);
- Grid.StencilSendToRecvFromComplete(requests);
+ bytes,mu+4);
+ Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
}
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
}
}
+
+
+ std::cout< latt_size ({lat*mpi_layout[0],
+ lat*mpi_layout[1],
+ lat*mpi_layout[2],
+ lat*mpi_layout[3]});
+
+ GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+ RealD Nrank = Grid._Nprocessors;
+ RealD Nnode = Grid.NodeCount();
+ RealD ppn = Nrank/Nnode;
+
+    std::vector<HalfSpinColourVectorD *> xbuf(8);
+    std::vector<HalfSpinColourVectorD *> rbuf(8);
+ Grid.ShmBufferFreeAll();
+ for(int d=0;d<8;d++){
+ xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+ }
+
+ int ncomm;
+ int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
+ double dbytes;
+ for(int i=0;i requests;
+ dbytes=0;
+ ncomm=0;
+
+ parallel_for(int dir=0;dir<8;dir++){
+
+ double tbytes;
+ int mu =dir % 4;
+
+ if (mpi_layout[mu]>1 ) {
+
+ ncomm++;
+ int xmit_to_rank;
+ int recv_from_rank;
+ if ( dir == mu ) {
+ int comm_proc=1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ } else {
+ int comm_proc = mpi_layout[mu]-1;
+ Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+ }
+
+ tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
+ (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
+
+#pragma omp atomic
+ dbytes+=tbytes;
+ }
+ }
+ Grid.Barrier();
+ double stop=usecond();
+ t_time[i] = stop-start; // microseconds
+ }
+
+ timestat.statistics(t_time);
+
+ dbytes=dbytes*ppn;
+ double xbytes = dbytes*0.5;
+ double rbytes = dbytes*0.5;
+ double bidibytes = dbytes;
+
+
+ std::cout< latt4 = GridDefaultLatt();
- const int Ls=16;
+ int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -503,9 +509,9 @@ int main (int argc, char ** argv)
std::cout<
+#include <Grid/Grid.h>
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+ d internal;
+};
+
+ Gamma::Algebra Gmu [] = {
+ Gamma::Algebra::GammaX,
+ Gamma::Algebra::GammaY,
+ Gamma::Algebra::GammaZ,
+ Gamma::Algebra::GammaT
+ };
+
+typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
+typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;
+
+
+
+int main (int argc, char ** argv)
+{
+ Grid_init(&argc,&argv);
+
+ int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+
+ int threads = GridThread::GetThreads();
+ std::cout< latt4 = GridDefaultLatt();
+
+ GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
+ GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+ GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+ GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+ std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+ GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
+ std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+ GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
+ std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+ GparityLatticeFermionF src (FGrid); random(RNG5,src);
+ RealD N2 = 1.0/::sqrt(norm2(src));
+ src = src*N2;
+
+ GparityLatticeFermionF result(FGrid); result=zero;
+ GparityLatticeFermionF ref(FGrid); ref=zero;
+ GparityLatticeFermionF tmp(FGrid);
+ GparityLatticeFermionF err(FGrid);
+
+ std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+ LatticeGaugeFieldF Umu(UGrid);
+ SU3::HotConfiguration(RNG4,Umu);
+ std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+ RealD mass=0.1;
+ RealD M5 =1.8;
+
+ RealD NP = UGrid->_Nprocessors;
+ RealD NN = UGrid->NodeCount();
+
+ std::cout << GridLogMessage<< "*****************************************************************" <Barrier();
+ Dw.ZeroCounters();
+ Dw.Dhop(src,result,0);
+ std::cout<Barrier();
+
+ double volume=Ls; for(int mu=0;muBarrier();
+ DwH.ZeroCounters();
+ DwH.Dhop(src,result,0);
+ double t0=usecond();
+ for(int i=0;iBarrier();
+
+ double volume=Ls; for(int mu=0;muBarrier();
+ DwD.ZeroCounters();
+ DwD.Dhop(src_d,result_d,0);
+ std::cout<Barrier();
+
+ double volume=Ls; for(int mu=0;mu simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
   std::vector<int> mpi_layout  = GridDefaultMpi();
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
- GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
+ GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<
#include
#include
#include
+#include
#include
#include
@@ -44,30 +45,16 @@ Author: Peter Boyle
#include
#include
#include
-
-// Lanczos support
-//#include
+#include
+#include
#include
#include
#include
-// Eigen/lanczos
// EigCg
-// MCR
// Pcg
-// Multishift CG
// Hdcg
// GCR
// etc..
-// integrator/Leapfrog
-// integrator/Omelyan
-// integrator/ForceGradient
-
-// montecarlo/hmc
-// montecarlo/rhmc
-// montecarlo/metropolis
-// etc...
-
-
#endif
diff --git a/lib/algorithms/FFT.h b/lib/algorithms/FFT.h
index 240f338b..ec558ad9 100644
--- a/lib/algorithms/FFT.h
+++ b/lib/algorithms/FFT.h
@@ -230,6 +230,7 @@ namespace Grid {
// Barrel shift and collect global pencil
   std::vector<int> lcoor(Nd), gcoor(Nd);
result = source;
+ int pc = processor_coor[dim];
for(int p=0;plSites();idx++) {
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
- cbuf[dim]+=p*L;
+ cbuf[dim]+=((pc+p) % processors[dim])*L;
+ // cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
}
}
@@ -278,7 +280,6 @@ namespace Grid {
flops+= flops_call*NN;
// writing out result
- int pc = processor_coor[dim];
PARALLEL_REGION
{
std::vector clbuf(Nd), cgbuf(Nd);
diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h
index 6cb77296..f1b8820e 100644
--- a/lib/algorithms/LinearOperator.h
+++ b/lib/algorithms/LinearOperator.h
@@ -162,15 +162,10 @@ namespace Grid {
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
- ComplexD dot;
-
_Mat.M(in,out);
- dot= innerProduct(in,out);
- n1=real(dot);
-
- dot = innerProduct(out,out);
- n2=real(dot);
+ ComplexD dot= innerProduct(in,out); n1=real(dot);
+ n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
@@ -192,10 +187,10 @@ namespace Grid {
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
- void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
MpcDagMpc(in,out,n1,n2);
}
- void HermOp(const Field &in, Field &out){
+ virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
@@ -212,7 +207,6 @@ namespace Grid {
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
-
};
   template<class Matrix,class Field>
   class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
@@ -270,7 +264,6 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
-
   template<class Matrix,class Field>
   class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
@@ -299,6 +292,45 @@ namespace Grid {
return axpy_norm(out,-1.0,tmp,in);
}
};
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ // Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
+ // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
+  template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+ // Staggered use
+ ///////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class Matrix,class Field>
+  class SchurStaggeredOperator : public SchurOperatorBase<Field> {
+ protected:
+ Matrix &_Mat;
+ public:
+ SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
+ virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
+ n2 = Mpc(in,out);
+ ComplexD dot= innerProduct(in,out);
+ n1 = real(dot);
+ }
+ virtual void HermOp(const Field &in, Field &out){
+ Mpc(in,out);
+ }
+ virtual RealD Mpc (const Field &in, Field &out) {
+ Field tmp(in._grid);
+ _Mat.Meooe(in,tmp);
+ _Mat.MooeeInv(tmp,out);
+ _Mat.MeooeDag(out,tmp);
+ _Mat.Mooee(in,out);
+ return axpy_norm(out,-1.0,tmp,out);
+ }
+ virtual RealD MpcDag (const Field &in, Field &out){
+ return Mpc(in,out);
+ }
+ virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
+ assert(0);// Never need with staggered
+ }
+ };
+  template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
/////////////////////////////////////////////////////////////
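
For reference, the left/right-handed comment in the hunk above written out: with the even-odd block decomposition of the Dirac matrix

    M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
    \qquad S \equiv M_{oo} - M_{oe} M_{ee}^{-1} M_{eo},

the left-preconditioned system is S\psi = \eta \;\Rightarrow\; (1 - M_{oo}^{-1} M_{oe} M_{ee}^{-1} M_{eo})\,\psi = M_{oo}^{-1}\eta, while the right-preconditioned form solves S M_{oo}^{-1}\phi = \eta and then recovers \psi = M_{oo}^{-1}\phi. The SchurDiagOneLH/RH aliases above select between these two forms.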
diff --git a/lib/algorithms/approx/Chebyshev.h b/lib/algorithms/approx/Chebyshev.h
index 2793f138..f8c21a05 100644
--- a/lib/algorithms/approx/Chebyshev.h
+++ b/lib/algorithms/approx/Chebyshev.h
@@ -8,6 +8,7 @@
Author: Peter Boyle
Author: paboyle
+Author: Christoph Lehner
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -193,6 +194,47 @@ namespace Grid {
return sum;
};
+ RealD approxD(RealD x)
+ {
+ RealD Un;
+ RealD Unm;
+ RealD Unp;
+
+ RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
+
+ RealD U0=1;
+ RealD U1=2*y;
+
+ RealD sum;
+ sum = Coeffs[1]*U0;
+ sum+= Coeffs[2]*U1*2.0;
+
+ Un =U1;
+ Unm=U0;
+ for(int i=2;i::quiet_NaN();
+ }
+
// Implement the required interface
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
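
(On approxD above: it evaluates the derivative of the Chebyshev expansion via second-kind polynomials, using the standard identity

    \frac{d}{dx} T_n(x) = n\,U_{n-1}(x),
    \qquad f(x) \approx \sum_n c_n T_n(y),\; y = \frac{x - \tfrac{1}{2}(hi+lo)}{\tfrac{1}{2}(hi-lo)}
    \;\Rightarrow\; f'(x) \approx \frac{2}{hi-lo} \sum_{n\ge 1} n\, c_n\, U_{n-1}(y),

which matches the visible terms Coeffs[1]*U0 and 2*Coeffs[2]*U1, up to the overall chain-rule factor 2/(hi-lo) from the change of variable.)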
diff --git a/lib/algorithms/approx/Forecast.h b/lib/algorithms/approx/Forecast.h
new file mode 100644
index 00000000..87eb84a6
--- /dev/null
+++ b/lib/algorithms/approx/Forecast.h
@@ -0,0 +1,152 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: ./lib/algorithms/approx/Forecast.h
+
+Copyright (C) 2015
+
+Author: Peter Boyle
+Author: paboyle
+Author: David Murphy
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+
+#ifndef INCLUDED_FORECAST_H
+#define INCLUDED_FORECAST_H
+
+namespace Grid {
+
+ // Abstract base class.
+ // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
+ // and returns a forecasted solution to the system D*psi = phi (psi).
+  template<class Matrix, class Field>
+  class Forecast
+  {
+    public:
+      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
+ };
+
+ // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
+ // used to forecast solutions across poles of the EOFA heatbath.
+ //
+ // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
+  template<class Matrix, class Field>
+  class ChronoForecast : public Forecast<Matrix,Field>
+ {
+ public:
+      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
+ {
+ int degree = prev_solns.size();
+ Field chi(phi); // forecasted solution
+
+ // Trivial cases
+ if(degree == 0){ chi = zero; return chi; }
+ else if(degree == 1){ return prev_solns[0]; }
+
+ RealD dot;
+ ComplexD xp;
+ Field r(phi); // residual
+ Field Mv(phi);
+      std::vector<Field> v(prev_solns); // orthonormalized previous solutions
+      std::vector<Field> MdagMv(degree,phi);
+
+ // Array to hold the matrix elements
+      std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
+
+ // Solution and source vectors
+      std::vector<ComplexD> a(degree);
+      std::vector<ComplexD> b(degree);
+
+ // Orthonormalize the vector basis
+ for(int i=0; i std::abs(G[k][k])){ k = j; } }
+ if(k != i){
+ xp = b[k];
+ b[k] = b[i];
+ b[i] = xp;
+ for(int j=0; j=0; i--){
+ a[i] = 0.0;
+ for(int j=i+1; j &Linop, const Field &B, Field &X)
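
(On the ChronoForecast added in Forecast.h above: following the chronological inverter of Brower et al., arXiv:hep-lat/9509012 cited in its header, the guess is a linear combination of the previous solutions,

    \chi_{\rm forecast} = \sum_{i=1}^{d} a_i\, v_i ,

with the coefficients a_i chosen to minimise the residual of M\chi = \phi over that d-dimensional subspace; they solve a small d x d system G a = b built from inner products of the orthonormalised previous solutions with the operator applied to them. The pivot search over |G[k][k]| and the back-substitution visible above are Gaussian elimination with partial pivoting for that small system.)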
Linop.HermOp(X, AD);
tmp = B - AD;
+ //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+ //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
+ //std::cout << GridLogMessage << " m_rr " << m_rr< &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
+ //std::cout << GridLogMessage << " norm2 Z " <
+Author: paboyle
+Author: Chulwoo Jung
+Author: Christoph Lehner
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /* END LEGAL */
+#ifndef GRID_BIRL_H
+#define GRID_BIRL_H
+
+#include <string.h> //memset
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace Grid {
+
+/////////////////////////////////////////////////////////////
+// Implicitly restarted lanczos
+/////////////////////////////////////////////////////////////
+
+ template<class Field>
+    class BlockImplicitlyRestartedLanczos {
+
+ const RealD small = 1.0e-16;
+public:
+ int lock;
+ int get;
+ int Niter;
+ int converged;
+
+ int Nminres; // Minimum number of restarts; only check for convergence after
+ int Nstop; // Number of evecs checked for convergence
+ int Nk; // Number of converged sought
+ int Np; // Np -- Number of spare vecs in kryloc space
+ int Nm; // Nm -- total number of vectors
+
+ int orth_period;
+
+ RealD OrthoTime;
+
+ RealD eresid, betastp;
+    SortEigen<Field> _sort;
+    LinearFunction<Field> &_HermOp;
+    LinearFunction<Field> &_HermOpTest;
+ /////////////////////////
+ // Constructor
+ /////////////////////////
+
+ BlockImplicitlyRestartedLanczos(
+       LinearFunction<Field> & HermOp,
+       LinearFunction<Field> & HermOpTest,
+ int _Nstop, // sought vecs
+ int _Nk, // sought vecs
+ int _Nm, // spare vecs
+ RealD _eresid, // resid in lmdue deficit
+ RealD _betastp, // if beta(k) < betastp: converged
+ int _Niter, // Max iterations
+ int _Nminres, int _orth_period = 1) :
+ _HermOp(HermOp),
+ _HermOpTest(HermOpTest),
+ Nstop(_Nstop),
+ Nk(_Nk),
+ Nm(_Nm),
+ eresid(_eresid),
+ betastp(_betastp),
+ Niter(_Niter),
+ Nminres(_Nminres),
+ orth_period(_orth_period)
+ {
+ Np = Nm-Nk; assert(Np>0);
+ };
+
+ BlockImplicitlyRestartedLanczos(
+       LinearFunction<Field> & HermOp,
+       LinearFunction<Field> & HermOpTest,
+ int _Nk, // sought vecs
+ int _Nm, // spare vecs
+ RealD _eresid, // resid in lmdue deficit
+ RealD _betastp, // if beta(k) < betastp: converged
+ int _Niter, // Max iterations
+ int _Nminres,
+ int _orth_period = 1) :
+ _HermOp(HermOp),
+ _HermOpTest(HermOpTest),
+ Nstop(_Nk),
+ Nk(_Nk),
+ Nm(_Nm),
+ eresid(_eresid),
+ betastp(_betastp),
+ Niter(_Niter),
+ Nminres(_Nminres),
+ orth_period(_orth_period)
+ {
+ Np = Nm-Nk; assert(Np>0);
+ };
+
+
+/* Saad PP. 195
+1. Choose an initial vector v_1 of 2-norm unity. Set β_1 ≡ 0, v_0 ≡ 0
+2. For k = 1,2,...,m Do:
+3.   w_k := A v_k − β_k v_{k−1}
+4.   α_k := (w_k, v_k)
+5.   w_k := w_k − α_k v_k          // w_k orthogonal to v_k
+6.   β_{k+1} := ∥w_k∥_2. If β_{k+1} = 0 then Stop
+7.   v_{k+1} := w_k / β_{k+1}
+8. EndDo
+ */
+  void step(std::vector<RealD>& lmd,
+	    std::vector<RealD>& lme,
+	    BasisFieldVector<Field>& evec,
+ Field& w,int Nm,int k)
+ {
+ assert( k< Nm );
+
+ GridStopWatch gsw_op,gsw_o;
+
+ Field& evec_k = evec[k];
+
+ gsw_op.Start();
+ _HermOp(evec_k,w);
+ gsw_op.Stop();
+
+ if(k>0){
+ w -= lme[k-1] * evec[k-1];
+ }
+
+ ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
+ RealD alph = real(zalph);
+
+ w = w - alph * evec_k;// 5. wk:=wk−αkvk
+
+ RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
+ // 7. vk+1 := wk/βk+1
+
+ std::cout<0 && k % orth_period == 0) {
+ orthogonalize(w,evec,k); // orthonormalise
+ }
+ gsw_o.Stop();
+
+ if(k < Nm-1) {
+ evec[k+1] = w;
+ }
+
+ std::cout << GridLogMessage << "Timing: operator=" << gsw_op.Elapsed() <<
+ " orth=" << gsw_o.Elapsed() << std::endl;
+
+ }
+
+  void qr_decomp(std::vector<RealD>& lmd,
+		 std::vector<RealD>& lme,
+		 int Nk,
+		 int Nm,
+		 std::vector<RealD>& Qt,
+ RealD Dsh,
+ int kmin,
+ int kmax)
+ {
+ int k = kmin-1;
+ RealD x;
+
+ RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
+ RealD c = ( lmd[k] -Dsh) *Fden;
+ RealD s = -lme[k] *Fden;
+
+ RealD tmpa1 = lmd[k];
+ RealD tmpa2 = lmd[k+1];
+ RealD tmpb = lme[k];
+
+ lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
+ lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
+ lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
+ x =-s*lme[k+1];
+ lme[k+1] = c*lme[k+1];
+
+ for(int i=0; i& lmd,
+ std::vector& lme,
+ int N1,
+ int N2,
+ std::vector& Qt,
+ GridBase *grid){
+
+ std::cout << GridLogMessage << "diagonalize_lapack start\n";
+ GridStopWatch gsw;
+
+ const int size = Nm;
+ // tevals.resize(size);
+ // tevecs.resize(size);
+ LAPACK_INT NN = N1;
+    std::vector<double> evals_tmp(NN);
+    std::vector<double> evec_tmp(NN*NN);
+    memset(&evec_tmp[0],0,sizeof(double)*NN*NN);
+    // double AA[NN][NN];
+    std::vector<double> DD(NN);
+    std::vector<double> EE(NN);
+ for (int i = 0; i< NN; i++)
+ for (int j = i - 1; j <= i + 1; j++)
+ if ( j < NN && j >= 0 ) {
+ if (i==j) DD[i] = lmd[i];
+ if (i==j) evals_tmp[i] = lmd[i];
+ if (j==(i-1)) EE[j] = lme[j];
+ }
+ LAPACK_INT evals_found;
+ LAPACK_INT lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+ LAPACK_INT liwork = 3+NN*10 ;
+    std::vector<LAPACK_INT> iwork(liwork);
+    std::vector<double> work(lwork);
+    std::vector<LAPACK_INT> isuppz(2*NN);
+ char jobz = 'V'; // calculate evals & evecs
+ char range = 'I'; // calculate all evals
+ // char range = 'A'; // calculate all evals
+ char uplo = 'U'; // refer to upper half of original matrix
+ char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+    std::vector<LAPACK_INT> ifail(NN);
+ LAPACK_INT info;
+ // int total = QMP_get_number_of_nodes();
+ // int node = QMP_get_node_number();
+ // GridBase *grid = evec[0]._grid;
+ int total = grid->_Nprocessors;
+ int node = grid->_processor;
+ int interval = (NN/total)+1;
+ double vl = 0.0, vu = 0.0;
+ LAPACK_INT il = interval*node+1 , iu = interval*(node+1);
+ if (iu > NN) iu=NN;
+ double tol = 0.0;
+ if (1) {
+ memset(&evals_tmp[0],0,sizeof(double)*NN);
+ if ( il <= NN){
+ std::cout << GridLogMessage << "dstegr started" << std::endl;
+ gsw.Start();
+ dstegr(&jobz, &range, &NN,
+ (double*)&DD[0], (double*)&EE[0],
+ &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+ &tol, // tolerance
+ &evals_found, &evals_tmp[0], (double*)&evec_tmp[0], &NN,
+ &isuppz[0],
+ &work[0], &lwork, &iwork[0], &liwork,
+ &info);
+ gsw.Stop();
+ std::cout << GridLogMessage << "dstegr completed in " << gsw.Elapsed() << std::endl;
+ for (int i = iu-1; i>= il-1; i--){
+ evals_tmp[i] = evals_tmp[i - (il-1)];
+ if (il>1) evals_tmp[i-(il-1)]=0.;
+ for (int j = 0; j< NN; j++){
+ evec_tmp[i*NN + j] = evec_tmp[(i - (il-1)) * NN + j];
+ if (il>1) evec_tmp[(i-(il-1)) * NN + j]=0.;
+ }
+ }
+ }
+ {
+ // QMP_sum_double_array(evals_tmp,NN);
+ // QMP_sum_double_array((double *)evec_tmp,NN*NN);
+ grid->GlobalSumVector(&evals_tmp[0],NN);
+ grid->GlobalSumVector(&evec_tmp[0],NN*NN);
+ }
+ }
+ // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+ for(int i=0;i& lmd,
+ std::vector& lme,
+ int N2,
+ int N1,
+ std::vector& Qt,
+ GridBase *grid)
+ {
+
+#ifdef USE_LAPACK_IRL
+ const int check_lapack=0; // just use lapack if 0, check against lapack if 1
+
+ if(!check_lapack)
+ return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
+
+    std::vector<RealD> lmd2(N1);
+    std::vector<RealD> lme2(N1);
+    std::vector<RealD> Qt2(N1*N1);
+ for(int k=0; k= kmin; --j){
+ RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
+ if(fabs(lme[j-1])+dds > dds){
+ kmax = j+1;
+ goto continued;
+ }
+ }
+ Niter = iter;
+#ifdef USE_LAPACK_IRL
+ if(check_lapack){
+ const double SMALL=1e-8;
+ diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+      std::vector<RealD> lmd3(N2);
+ for(int k=0; kSMALL) std::cout<SMALL) std::cout<SMALL) std::cout< dds){
+ kmin = j+1;
+ break;
+ }
+ }
+ }
+ std::cout<
+ static RealD normalise(T& v)
+ {
+ RealD nn = norm2(v);
+ nn = sqrt(nn);
+ v = v * (1.0/nn);
+ return nn;
+ }
+
+ void orthogonalize(Field& w,
+		     BasisFieldVector<Field>& evec,
+ int k)
+ {
+ double t0=-usecond()/1e6;
+
+ evec.orthogonalize(w,k);
+
+ normalise(w);
+ t0+=usecond()/1e6;
+ OrthoTime +=t0;
+ }
+
+  void setUnit_Qt(int Nm, std::vector<RealD> &Qt) {
+ for(int i=0; i K P = M − K â€
+Compute the factorization A V_M = V_M H_M + f_M e_M^†
+repeat
+  Q = I
+  for i = 1,...,P do
+    Q_i R_i = H_M − θ_i I ;   Q = Q Q_i
+    H_M = Q_i^† H_M Q_i
+  end for
+  β_K = H_M(K+1,K) ;  σ_K = Q(M,K)
+  r = v_{K+1} β_K + r σ_K
+  V_K = V_M(1:M) Q(1:M,1:K)
+  H_K = H_M(1:K,1:K)
+  ⇒ A V_K = V_K H_K + f_K e_K^†
+  Extend to an M = K + P step factorization A V_M = V_M H_M + f_M e_M^†
+until convergence
+*/
+
+    void calc(std::vector<RealD>& eval,
+	      BasisFieldVector<Field>& evec,
+ const Field& src,
+ int& Nconv,
+ bool reverse,
+ int SkipTest)
+ {
+
+ GridBase *grid = evec._v[0]._grid;//evec.get(0 + evec_offset)._grid;
+ assert(grid == src._grid);
+
+ std::cout< lme(Nm);
+	std::vector<RealD> lme2(Nm);
+	std::vector<RealD> eval2(Nm);
+	std::vector<RealD> eval2_copy(Nm);
+	std::vector<RealD> Qt(Nm*Nm);
+
+
+ Field f(grid);
+ Field v(grid);
+
+ int k1 = 1;
+ int k2 = Nk;
+
+ Nconv = 0;
+
+ RealD beta_k;
+
+ // Set initial vector
+ evec[0] = src;
+ normalise(evec[0]);
+ std:: cout<0);
+ evec.rotate(Qt,k1-1,k2+1,0,Nm,Nm);
+
+ t1=usecond()/1e6;
+ std::cout<= Nminres) {
+ std::cout << GridLogMessage << "Rotation to test convergence " << std::endl;
+
+ Field ev0_orig(grid);
+ ev0_orig = evec[0];
+
+ evec.rotate(Qt,0,Nk,0,Nk,Nm);
+
+ {
+ std::cout << GridLogMessage << "Test convergence" << std::endl;
+ Field B(grid);
+
+ for(int j = 0; j=Nstop || beta_k < betastp){
+ goto converged;
+ }
+
+ std::cout << GridLogMessage << "Rotate back" << std::endl;
+ //B[j] +=Qt[k+_Nm*j] * _v[k]._odata[ss];
+ {
+ Eigen::MatrixXd qm = Eigen::MatrixXd::Zero(Nk,Nk);
+ for (int k=0;k QtI(Nm*Nm);
+ for (int k=0;k
+class BlockProjector {
+public:
+
+  BasisFieldVector<Field>& _evec;
+  BlockedGrid<Field>& _bgrid;
+
+  BlockProjector(BasisFieldVector<Field>& evec, BlockedGrid<Field>& bgrid) : _evec(evec), _bgrid(bgrid) {
+ }
+
+ void createOrthonormalBasis(RealD thres = 0.0) {
+
+ GridStopWatch sw;
+ sw.Start();
+
+ int cnt = 0;
+
+#pragma omp parallel shared(cnt)
+ {
+ int lcnt = 0;
+
+#pragma omp for
+ for (int b=0;b<_bgrid._o_blocks;b++) {
+
+ for (int i=0;i<_evec._Nm;i++) {
+
+ auto nrm0 = _bgrid.block_sp(b,_evec._v[i],_evec._v[i]);
+
+	      // |i> -= <j|i> |j>
+ for (int j=0;j
+ void coarseToFine(const CoarseField& in, Field& out) {
+
+ out = zero;
+ out.checkerboard = _evec._v[0].checkerboard;
+
+ int Nbasis = sizeof(in._odata[0]._internal._internal) / sizeof(in._odata[0]._internal._internal[0]);
+ assert(Nbasis == _evec._Nm);
+
+#pragma omp parallel for
+ for (int b=0;b<_bgrid._o_blocks;b++) {
+ for (int j=0;j<_evec._Nm;j++) {
+ _bgrid.block_caxpy(b,out,in._odata[b]._internal._internal[j],_evec._v[j],out);
+ }
+ }
+
+ }
+
+  template<typename CoarseField>
+ void fineToCoarse(const Field& in, CoarseField& out) {
+
+ out = zero;
+
+ int Nbasis = sizeof(out._odata[0]._internal._internal) / sizeof(out._odata[0]._internal._internal[0]);
+ assert(Nbasis == _evec._Nm);
+
+
+ Field tmp(_bgrid._grid);
+ tmp = in;
+
+#pragma omp parallel for
+ for (int b=0;b<_bgrid._o_blocks;b++) {
+ for (int j=0;j<_evec._Nm;j++) {
+	// |rhs> -= <j|rhs> |j>
+ auto c = _bgrid.block_sp(b,_evec._v[j],tmp);
+ _bgrid.block_caxpy(b,tmp,-c,_evec._v[j],tmp); // may make this more numerically stable
+ out._odata[b]._internal._internal[j] = c;
+ }
+ }
+
+ }
+
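
(The coarseToFine/fineToCoarse pair above is a block-local change of basis: fineToCoarse stores, for every block b, the coefficients c^{(b)}_j = \langle e_j, \psi\rangle_b of the fine field against the basis vectors (block_sp), subtracting each projected component as it goes, while coarseToFine rebuilds the fine field as \psi(x) = \sum_j c^{(b(x))}_j\, e_j(x) via block_caxpy.)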
+  template<typename CoarseField>
+  void deflateFine(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+ result = zero;
+ for (int i=0;i
+  void deflateCoarse(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+ CoarseField src_coarse(_coef._v[0]._grid);
+ CoarseField result_coarse = src_coarse;
+ result_coarse = zero;
+ fineToCoarse(src_orig,src_coarse);
+ for (int i=0;i
+  void deflate(BasisFieldVector<CoarseField>& _coef,const std::vector<RealD>& eval,int N,const Field& src_orig,Field& result) {
+ // Deflation on coarse Grid is much faster, so use it by default. Deflation on fine Grid is kept for legacy reasons for now.
+ deflateCoarse(_coef,eval,N,src_orig,result);
+ }
+
+};
+}
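
(deflateFine/deflateCoarse above implement standard eigenvector deflation: assuming eval[i] is the eigenvalue belonging to basis vector i, the deflated guess is

    \psi_{\rm defl} = \sum_{i=0}^{N-1} \frac{\langle e_i, \phi \rangle}{\lambda_i}\, e_i ,

computed either directly on the fine grid or, as the comment in deflate() notes, more cheaply through the coarse blocked coefficients.)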
diff --git a/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h
new file mode 100644
index 00000000..821272de
--- /dev/null
+++ b/lib/algorithms/iterative/BlockImplicitlyRestartedLanczos/BlockedGrid.h
@@ -0,0 +1,401 @@
+namespace Grid {
+
+template<typename Field>
+class BlockedGrid {
+public:
+ GridBase* _grid;
+ typedef typename Field::scalar_type Coeff_t;
+ typedef typename Field::vector_type vCoeff_t;
+
+  std::vector<int> _bs;    // block size
+  std::vector<int> _nb;    // number of blocks
+  std::vector<int> _l;     // local dimensions irrespective of cb
+  std::vector<int> _l_cb;  // local dimensions of checkerboarded vector
+ std::vector