#include int main (int argc, char ** argv) { using namespace Grid; Grid_init(&argc,&argv); Coordinate latt_size({64,64,64,16}); auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); auto mpi_layout = GridDefaultMpi(); GridCartesian Grid(latt_size, simd_layout, mpi_layout); std::vector seeds({1, 2, 3, 4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); LatticeComplexD test_data(&Grid); gaussian(pRNG,test_data); std::vector reduction_reference; std::vector reduction_result; //warmup for (int sweeps = 0; sweeps < 5; sweeps++) { reduction_result = sliceSumGpu(test_data,0); } int trace_id = traceStart("sliceSum benchmark"); for (int i = 0; i < Nd; i++) { RealD t=-usecond(); tracePush("sliceSum"); sliceSum(test_data,reduction_reference,i); tracePop("sliceSum"); t+=usecond(); std::cout << GridLogMessage << " sliceSum took "<