diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index bdd39a65..8bd1d113 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,7 +34,7 @@ #pragma push_macro("__SYCL_DEVICE_ONLY__") #undef __SYCL_DEVICE_ONLY__ #define EIGEN_DONT_VECTORIZE -//#undef EIGEN_USE_SYCL +#undef EIGEN_USE_SYCL #define __SYCL__REDEFINE__ #endif diff --git a/Grid/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc index c2efd41c..47779eae 100644 --- a/Grid/algorithms/approx/Zolotarev.cc +++ b/Grid/algorithms/approx/Zolotarev.cc @@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k, * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and * type = 1 for the approximation which is infinite at x = 0. */ -zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { +zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, l, invlambda, xi, xisq, *tv, s, opl; int m, czero, ts; @@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { construct_partfrac(d); construct_contfrac(d); - /* Converting everything to PRECISION for external use only */ + /* Converting everything to ZOLO_PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata) } -zolotarev_data* higham(PRECISION epsilon, int n) { +zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; int m, czero; zolotarev_data *zd; @@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) { /* Converting everything to PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -523,17 +523,17 @@ NAMESPACE_END(Grid); #ifdef TEST #undef ZERO -#define ZERO ((PRECISION) 0) +#define ZERO ((ZOLO_PRECISION) 0) #undef ONE -#define ONE ((PRECISION) 1) +#define ONE ((ZOLO_PRECISION) 1) #undef TWO -#define TWO ((PRECISION) 2) +#define TWO ((ZOLO_PRECISION) 2) /* Evaluate the rational approximation R(x) using the factored form */ -static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R; + ZOLO_PRECISION R; if (rdata -> type == 0) { R = rdata -> A * x; @@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { /* Evaluate the rational approximation R(x) using the partial fraction form */ -static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> alpha[rdata -> da - 1]; + ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1]; for (m = 0; m < rdata -> dd; m++) R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); @@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, * but with signalling overflow you will get an error message. */ -static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> beta[0] * x; + ZOLO_PRECISION R = rdata -> beta[0] * x; for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; return R; } /* Evaluate the rational approximation R(x) using Cayley form */ -static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION T; + ZOLO_PRECISION T; T = rdata -> type == 0 ? ONE : -ONE; for (m = 0; m < rdata -> n; m++) @@ -607,7 +607,7 @@ int main(int argc, char** argv) { int m, n, plotpts = 5000, type = 0; float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; zolotarev_data *rdata; - PRECISION y; + ZOLO_PRECISION y; FILE *plot_function, *plot_error, *plot_partfrac, *plot_contfrac, *plot_cayley; @@ -626,13 +626,13 @@ int main(int argc, char** argv) { } rdata = type == 2 - ? higham((PRECISION) eps, n) - : zolotarev((PRECISION) eps, n, type); + ? higham((ZOLO_PRECISION) eps, n) + : zolotarev((ZOLO_PRECISION) eps, n, type); printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) - "\tPRECISION = " STRINGIFY(PRECISION) + "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION) "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" "\tDelta = %g (maximum error)\n\n" "\tA = %g (overall factor)\n", @@ -681,15 +681,15 @@ int main(int argc, char** argv) { x = 2.4 * (float) m / plotpts - 1.2; if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { /* skip x = 0 for type 1, as R(0) is singular */ - y = zolotarev_eval((PRECISION) x, rdata); + y = zolotarev_eval((ZOLO_PRECISION) x, rdata); fprintf(plot_function, "%g %g\n", x, (float) y); fprintf(plot_error, "%g %g\n", x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); - ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y) + ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y) + ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y) + ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { maxypferr = MAX(maxypferr, fabs(ypferr)); diff --git a/Grid/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h index 800cf3c7..3c983cd3 100644 --- a/Grid/algorithms/approx/Zolotarev.h +++ b/Grid/algorithms/approx/Zolotarev.h @@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx); #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #ifndef ZOLOTAREV_INTERNAL -#ifndef PRECISION -#define PRECISION double +#ifndef ZOLO_PRECISION +#define ZOLO_PRECISION double #endif -#define ZPRECISION PRECISION +#define ZPRECISION ZOLO_PRECISION #define ZOLOTAREV_DATA zolotarev_data #endif @@ -77,8 +77,8 @@ typedef struct { * zolotarev_data structure. The arguments must satisfy the constraints that * epsilon > 0, n > 0, and type = 0 or 1. */ -ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ; -ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type); +ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ; +ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type); void zolotarev_free(zolotarev_data *zdata); #endif @@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata); NAMESPACE_END(Approx); NAMESPACE_END(Grid); #endif + diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 2924350d..f6418b7e 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -31,12 +31,17 @@ Author: Peter Boyle #include #endif #ifdef GRID_CUDA -#include +#include #endif #ifdef GRID_SYCL -#error // need oneMKL version +#include +#endif +#if 0 +#define GRID_ONE_MKL +#endif +#ifdef GRID_ONE_MKL +#include #endif - /////////////////////////////////////////////////////////////////////// // Need to rearrange lattice data to be in the right format for a // batched multiply. Might as well make these static, dense packed @@ -46,12 +51,15 @@ NAMESPACE_BEGIN(Grid); typedef hipblasHandle_t gridblasHandle_t; #endif #ifdef GRID_CUDA - typedef cudablasHandle_t gridblasHandle_t; + typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL - typedef int32_t gridblasHandle_t; + typedef cl::sycl::queue *gridblasHandle_t; #endif -#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) +#ifdef GRID_ONE_MKL + typedef cl::sycl::queue *gridblasHandle_t; +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) typedef int32_t gridblasHandle_t; #endif @@ -70,12 +78,19 @@ public: #ifdef GRID_CUDA std::cout << "cublasCreate"<wait(); #endif } @@ -644,10 +662,19 @@ public: (cuDoubleComplex *) Cmn, ldc, sdc, batchCount); #endif -#ifdef GRID_SYCL - #warning "oneMKL implementation not made " +#if defined(GRID_SYCL) || defined(GRID_ONE_MKL) + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + oneapi::mkl::transpose::N, + oneapi::mkl::transpose::N, + m,n,k, + alpha, + (const ComplexD *)Amk,lda,sda, + (const ComplexD *)Bkn,ldb,sdb, + beta, + (ComplexD *)Cmn,ldc,sdc, + batchCount); #endif -#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 6343db99..79572949 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include #include +#include #include #include #include @@ -46,5 +47,4 @@ Author: Peter Boyle #include #include #include -#include #include diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h index 142e2349..e31d8441 100644 --- a/Grid/lattice/Lattice_crc.h +++ b/Grid/lattice/Lattice_crc.h @@ -42,13 +42,13 @@ template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1 } } -template uint32_t crc(Lattice & buf) +template uint32_t crc(const Lattice & buf) { autoView( buf_v , buf, CpuRead); return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); } -#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); ComplexD nrm = rankInnerProduct(left,right); + // std::cerr<<"flight log " << std::hexfloat << nrm <<" "<GlobalSum(nrm); return nrm; } diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index dd62e109..a39b529f 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -280,20 +280,16 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} +#ifndef GRID_CUDA if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} #endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} -#endif } assert(0 && " Kernel optimisation case not covered "); } @@ -322,19 +318,13 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} -#endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} -#endif } } diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index daf41cae..0a3d176f 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc NAMESPACE_END(Grid); +#ifdef GRID_SYCL +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +#endif + + #endif diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 98bc3986..536e17f1 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid); }; NAMESPACE_END(Grid); + +#ifdef GRID_SYCL +template struct +sycl::is_device_copyable::value && (!std::is_trivially_copyable::value), + void>::type> + : public std::true_type {}; +#endif + diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f6efdee9..392cba61 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -255,17 +255,13 @@ inline int acceleratorIsCommunicable(void *ptr) #define GRID_SYCL_LEVEL_ZERO_IPC NAMESPACE_END(Grid); -#if 0 -#include -#include -#include -#include -#else + +// Force deterministic reductions +#define SYCL_REDUCTION_DETERMINISTIC #include #include #include #include -#endif NAMESPACE_BEGIN(Grid); diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index d013763a..9a0b4376 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<({45,12,81,9})); @@ -454,11 +454,17 @@ public: pickCheckerboard(Even,src_e,src); pickCheckerboard(Odd,src_o,src); - const int num_cases = 1; +#ifdef AVX512 + const int num_cases = 3; +#else + const int num_cases = 2; +#endif std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases [] = { - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } }; for(int c=0;c({8,2,2,2}); -#else LebesgueOrder::Block = std::vector({2,2,2,2}); -#endif + Benchmark::Decomposition(); int do_su4=0; @@ -910,7 +919,7 @@ int main (int argc, char ** argv) } if ( do_blas ) { -#if defined(GRID_CUDA) || defined(GRID_HIP) +#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) std::cout< using namespace std; using namespace Grid; -template -struct scal { - d internal; -}; - - Gamma::Algebra Gmu [] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT - }; - int main (int argc, char ** argv) { + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + std::string host(hostname); + Grid_init(&argc,&argv); const int Ls=12; - std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; - - { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -92,7 +81,14 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); - std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); double t1,t2,flops; double MdagMsiteflops = 1452; // Mobius (real coeffs) @@ -101,7 +97,14 @@ int main (int argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - for(int i=0;i<1;i++){ + csumref=0; + int i=0; + do { + std::cerr << "******************* DOUBLE PRECISION SOLVE "<gSites()*iters; flops+= CGsiteflops*FrbGrid->gSites()*iters; - + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< munge; - std::string format = getFormatString(); - - BinaryIO::writeLatticeObject(result_o,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " Mixed checksums "<(result_o_2,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " CG checksums "<