diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index e376bb18..f4092bc5 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -853,8 +853,7 @@ public: CComplex alpha(1.0); CComplex beta (1.0); RealD flops = 8.0*M*N*K*BATCH; - int ncall=10; - RealD t0 = usecond(); + int ncall=1000; deviceVector As(BATCH); deviceVector Bs(BATCH); deviceVector Cs(BATCH); @@ -865,6 +864,16 @@ public: ptr = &C[b*M*N]; acceleratorPut(Cs[b],ptr); } + // Warm up call + gemmBatched(M,N,K, + alpha, + As, // m x k + Bs, // k x n + beta, + Cs); + synchronise(); + + RealD t0 = usecond(); for(int i=0;i