From 9e5fb52eb922e5e90086d3d55f022945ec43fb12 Mon Sep 17 00:00:00 2001 From: Thomas Wurm Date: Mon, 8 Mar 2021 13:53:34 +0100 Subject: [PATCH 1/6] Put GlobalSum outside the slice loop --- Grid/lattice/Lattice_reduction.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 0a5fbcb6..326b9ea3 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -361,6 +361,7 @@ template inline void sliceSum(const Lattice &Data,std::vector< // But easily avoided by using double precision fields /////////////////////////////////////////////////////// typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; GridBase *grid = Data.Grid(); assert(grid!=NULL); @@ -419,20 +420,19 @@ template inline void sliceSum(const Lattice &Data,std::vector< } // sum over nodes. - sobj gsum; for(int t=0;t_processor_coor[orthogdim] ) { - gsum=lsSum[lt]; + result[t]=lsSum[lt]; } else { - gsum=Zero(); + result[t]=Zero(); } - grid->GlobalSum(gsum); - - result[t]=gsum; } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); } template From 2bb374daea0f412ab131d274c4fd2e60e2a92c3b Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 19 Mar 2021 11:33:23 +0100 Subject: [PATCH 2/6] hip-friendly --- Grid/algorithms/CoarsenedMatrix.h | 10 +++++++--- Grid/lattice/Lattice_transfer.h | 12 ++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index b9594678..2fd187ff 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -442,6 +442,8 @@ public: for(int p=0; poSites()*nbasis, Nsimd, { @@ -453,7 +455,7 @@ public: StencilEntry *SE; for(int p=0;p AcceleratorViewContainer; for(int p=0;p_is_local) { @@ -754,7 +758,7 @@ public: StencilEntry *SE; for(int p=0;p_is_local) { diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 91de721f..2da78398 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -360,16 +360,22 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData) autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( fineData_ , fineData, AcceleratorRead); + auto coarseData_p = &coarseData_[0]; + auto fineData_p = &fineData_[0]; + Coordinate fine_rdimensions = fine->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions; + + vobj zz = Zero(); accelerator_for(sc,coarse->oSites(),1,{ // One thread per sub block Coordinate coor_c(_ndimension); Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate - coarseData_[sc]=Zero(); + vobj cd = zz; + for(int sb=0;sb &coarseData,const Lattice &fineData) for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); - coarseData_[sc]=coarseData_[sc]+fineData_[sf]; + cd=cd+fineData_p[sf]; } + coarseData_p[sc] = cd; + }); return; } From b5aeae526fb9857f170feba58dfab14d16e4c955 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 21:44:14 +0200 Subject: [PATCH 3/6] Make Cshift fields static to avoid repeated reallocaate overhead --- Grid/cshift/Cshift_mpi.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 375d004e..7e93e260 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -122,8 +122,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf(buffer_size); - cshiftVector recv_buf(buffer_size); + static cshiftVector send_buf; send_buf.resize(buffer_size); + static cshiftVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -198,8 +198,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -294,8 +294,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf_v(buffer_size); - cshiftVector recv_buf_v(buffer_size); + static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); + static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); vobj *send_buf; vobj *recv_buf; { @@ -381,8 +381,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; { From c50f27e68bd4b3e4fb6a1da00aebe224f0a0bc23 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 20 Jun 2021 11:34:38 +0200 Subject: [PATCH 4/6] Make FFT play nice with split grid --- Grid/algorithms/FFT.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index ad42f049..29f0ec4b 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -136,7 +136,7 @@ public: flops=0; usec =0; Coordinate layout(Nd,1); - sgrid = new GridCartesian(dimensions,layout,processors); + sgrid = new GridCartesian(dimensions,layout,processors,*grid); }; ~FFT ( void) { @@ -182,7 +182,7 @@ public: pencil_gd[dim] = G*processors[dim]; // Pencil global vol LxLxGxLxL per node - GridCartesian pencil_g(pencil_gd,layout,processors); + GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid); // Construct pencils typedef typename vobj::scalar_object sobj; From dd091d0960c3334c17df2456e3dc3ac48a048a3e Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 15 Sep 2021 16:58:05 +0200 Subject: [PATCH 5/6] consistent pointer offloading instead of views --- Grid/lattice/Lattice_arith.h | 2 +- Grid/lattice/Lattice_basis.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index 3c269c58..b39a475d 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -225,7 +225,7 @@ void axpy(Lattice &ret,sobj a,const Lattice &x,const Lattice & autoView( x_v , x, AcceleratorRead); autoView( y_v , y, AcceleratorRead); accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ - auto tmp = a*x_v(ss)+y_v(ss); + auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]); coalescedWrite(ret_v[ss],tmp); }); } diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 863b2548..0928cbd7 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) for(int k=k0; k Date: Wed, 15 Sep 2021 18:38:32 +0200 Subject: [PATCH 6/6] A64FX drop mixed precision as well --- .../implementation/WilsonKernelsAsmA64FX.h | 240 +++++++++--------- 1 file changed, 120 insertions(+), 120 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index ffec05a0..35d1b841 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -73,17 +73,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -102,17 +102,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -131,17 +131,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include @@ -165,17 +165,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -194,17 +194,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR @@ -223,17 +223,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include @@ -280,17 +280,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -309,17 +309,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -338,17 +338,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include ///////////////////////////////////////////////////////////////// @@ -371,17 +371,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -400,17 +400,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include #undef INTERIOR_AND_EXTERIOR @@ -429,17 +429,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include -#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") +// template<> void +// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +// #include