1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Now benchmarking the Wilson dhop: 14.6 GF on one core, not as fast as SU(3)xSU(3) [23 GF], but still not too shabby.

Disassembling the output shows ugly sequences in the permute sector. I could benchmark with and without
the if-else structure, for comparison, to see how much I'm losing.

Performance drops to 9 GF once the working set falls out of cache. Moving to Lebesgue ordering should help there. Substantive progress.
This commit is contained in:
Peter Boyle 2015-04-29 06:50:18 +01:00
parent c72db6c6f6
commit b7090ebba4

View File

@ -21,8 +21,8 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
std::vector<int> simd_layout({1,1,2,2});
std::vector<int> mpi_layout ({2,2,2,2});
std::vector<int> latt_size ({8,8,8,8});
std::vector<int> mpi_layout ({1,1,1,1});
std::vector<int> latt_size ({4,4,8,8});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<int> seeds({1,2,3,4});
@ -39,6 +39,11 @@ int main (int argc, char ** argv)
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
std::vector<LatticeColourMatrix> U(4,&Grid);
double volume=1;
for(int mu=0;mu<Nd;mu++){
volume=volume*latt_size[mu];
}
for(int mu=0;mu<Nd;mu++){
// U[mu] = 1.0;
// pokeIndex<3>(Umu,U[mu],mu);
@ -70,11 +75,21 @@ int main (int argc, char ** argv)
RealD mass=0.1;
WilsonMatrix Dw(Umu,mass);
std::cout << "Calling Dw"<<std::endl;
Dw.multiply(src,result);
int ncall=100;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.multiply(src,result);
}
double t1=usecond();
double flops=1320*volume*ncall;
std::cout << "Called Dw"<<std::endl;
std::cout << "norm result "<< norm2(result)<<std::endl;
std::cout << "norm ref "<< norm2(ref)<<std::endl;
std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;
// for(int ss=0;ss<10;ss++ ){
for(int ss=0;ss<0;ss++ ){