diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index 84ef0a1a..5cc8636f 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -198,8 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) { // Possibly promote to double and sum ///////////////////////////////////////////////////////////////////////////////////////////////////////// -// Uncomment to print per-phase timing for every sumD_gpu_small and sumD_gpu_large call. -// #define GRID_REDUCTION_TIMING +#define GRID_REDUCTION_TIMING template inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)