Benchmark Wilson dhop now; 14.6 GF on one core, not as fast as SU(3)xSU(3) [23 GF] but still not too shabby.

Disassembling the output shows ugly sequences in the permute section. I could benchmark with and without the if-else structure to see how much I'm losing. Performance drops to 9 GF as it falls out of cache; moving to Lebesgue ordering should help there. Substantive progress.
2026-07-17 23:53:27 +01:00 · 2015-04-29 06:50:18 +01:00
parent c72db6c6f6
commit b7090ebba4
1 changed files with 18 additions and 3 deletions
@@ -21,8 +21,8 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);

  std::vector<int> simd_layout({1,1,2,2});
-  std::vector<int> mpi_layout ({2,2,2,2});
-  std::vector<int> latt_size  ({8,8,8,8});
+  std::vector<int> mpi_layout ({1,1,1,1});
+  std::vector<int> latt_size  ({4,4,8,8});
    
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
  std::vector<int> seeds({1,2,3,4});
@@ -39,6 +39,11 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
  std::vector<LatticeColourMatrix> U(4,&Grid);

+  double volume=1;
+  for(int mu=0;mu<Nd;mu++){
+    volume=volume*latt_size[mu];
+  }  
+
  for(int mu=0;mu<Nd;mu++){
    //    U[mu] = 1.0;
    //    pokeIndex<3>(Umu,U[mu],mu);
@@ -70,11 +75,21 @@ int main (int argc, char ** argv)

  RealD mass=0.1;
  WilsonMatrix Dw(Umu,mass);
+  
  std::cout << "Calling Dw"<<std::endl;
-  Dw.multiply(src,result);
+  int ncall=100;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.multiply(src,result);
+  }
+  double t1=usecond();
+  double flops=1320*volume*ncall;
+
+  
  std::cout << "Called Dw"<<std::endl;
  std::cout << "norm result "<< norm2(result)<<std::endl;
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
+  std::cout << "mflop/s = "<< flops/(t1-t0)<<std::endl;

  //  for(int ss=0;ss<10;ss++ ){
  for(int ss=0;ss<0;ss++ ){