From 51f506553c59923fc60f3a68333b14757b8853a5 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 12 Mar 2021 15:33:04 +0100
Subject: [PATCH] Read out the local ID once, and store

Each hand-unrolled Wilson kernel entry point now computes

    const int Nsimd = SiteHalfSpinor::Nsimd();
    const int lane  = acceleratorSIMTlane(Nsimd);

once, ahead of HAND_DECLARATIONS, and the LOAD_CHIMU (GRID_SIMT variant),
MULT_2SPIN, LOAD_CHI, HAND_RESULT and HAND_RESULT_EXT macros pass that
stored `lane` explicitly to coalescedReadPermute / coalescedRead /
coalescedWrite instead of calling them without a lane argument.

The accumulate reads nested inside HAND_RESULT_EXT take `lane` as well, so
that every coalesced access touched by this patch uses the same stored value.

These macros expand inside the kernel bodies and therefore require a
variable named `lane` to be in scope at every expansion site.
---
Review note: the line structure of this patch was reconstructed from a copy
whose newlines had been collapsed; the original column alignment of the
continuation backslashes is not preserved, so whitespace-insensitive
application (e.g. --ignore-whitespace) may be needed.

 .../WilsonKernelsHandImplementation.h         | 165 ++++++++++--------
 1 file changed, 96 insertions(+), 69 deletions(-)

diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
index fb42fe88..0703b613 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h
@@ -79,18 +79,18 @@ Author: paboyle
 #ifdef GRID_SIMT
 #define LOAD_CHIMU(ptype) \
   {const SiteSpinor & ref (in[offset]); \
-    Chimu_00=coalescedReadPermute(ref()(0)(0),perm); \
-    Chimu_01=coalescedReadPermute(ref()(0)(1),perm); \
-    Chimu_02=coalescedReadPermute(ref()(0)(2),perm); \
-    Chimu_10=coalescedReadPermute(ref()(1)(0),perm); \
-    Chimu_11=coalescedReadPermute(ref()(1)(1),perm); \
-    Chimu_12=coalescedReadPermute(ref()(1)(2),perm); \
-    Chimu_20=coalescedReadPermute(ref()(2)(0),perm); \
-    Chimu_21=coalescedReadPermute(ref()(2)(1),perm); \
-    Chimu_22=coalescedReadPermute(ref()(2)(2),perm); \
-    Chimu_30=coalescedReadPermute(ref()(3)(0),perm); \
-    Chimu_31=coalescedReadPermute(ref()(3)(1),perm); \
-    Chimu_32=coalescedReadPermute(ref()(3)(2),perm); }
+    Chimu_00=coalescedReadPermute(ref()(0)(0),perm,lane); \
+    Chimu_01=coalescedReadPermute(ref()(0)(1),perm,lane); \
+    Chimu_02=coalescedReadPermute(ref()(0)(2),perm,lane); \
+    Chimu_10=coalescedReadPermute(ref()(1)(0),perm,lane); \
+    Chimu_11=coalescedReadPermute(ref()(1)(1),perm,lane); \
+    Chimu_12=coalescedReadPermute(ref()(1)(2),perm,lane); \
+    Chimu_20=coalescedReadPermute(ref()(2)(0),perm,lane); \
+    Chimu_21=coalescedReadPermute(ref()(2)(1),perm,lane); \
+    Chimu_22=coalescedReadPermute(ref()(2)(2),perm,lane); \
+    Chimu_30=coalescedReadPermute(ref()(3)(0),perm,lane); \
+    Chimu_31=coalescedReadPermute(ref()(3)(1),perm,lane); \
+    Chimu_32=coalescedReadPermute(ref()(3)(2),perm,lane); }
 #define PERMUTE_DIR(dir) ;
 #else
 #define LOAD_CHIMU(ptype) \
@@ -119,43 +119,43 @@ Author: paboyle
 #endif
 
 #define MULT_2SPIN(A)\
-  {auto & ref(U[sU](A)); \
-    U_00=coalescedRead(ref()(0,0)); \
-    U_10=coalescedRead(ref()(1,0)); \
-    U_20=coalescedRead(ref()(2,0)); \
-    U_01=coalescedRead(ref()(0,1)); \
-    U_11=coalescedRead(ref()(1,1)); \
-    U_21=coalescedRead(ref()(2,1)); \
-    UChi_00 = U_00*Chi_00; \
-    UChi_10 = U_00*Chi_10; \
-    UChi_01 = U_10*Chi_00; \
-    UChi_11 = U_10*Chi_10; \
-    UChi_02 = U_20*Chi_00; \
-    UChi_12 = U_20*Chi_10; \
-    UChi_00+= U_01*Chi_01; \
-    UChi_10+= U_01*Chi_11; \
-    UChi_01+= U_11*Chi_01; \
-    UChi_11+= U_11*Chi_11; \
-    UChi_02+= U_21*Chi_01; \
-    UChi_12+= U_21*Chi_11; \
-    U_00=coalescedRead(ref()(0,2)); \
-    U_10=coalescedRead(ref()(1,2)); \
-    U_20=coalescedRead(ref()(2,2)); \
-    UChi_00+= U_00*Chi_02; \
-    UChi_10+= U_00*Chi_12; \
-    UChi_01+= U_10*Chi_02; \
-    UChi_11+= U_10*Chi_12; \
-    UChi_02+= U_20*Chi_02; \
+  {auto & ref(U[sU](A)); \
+    U_00=coalescedRead(ref()(0,0),lane); \
+    U_10=coalescedRead(ref()(1,0),lane); \
+    U_20=coalescedRead(ref()(2,0),lane); \
+    U_01=coalescedRead(ref()(0,1),lane); \
+    U_11=coalescedRead(ref()(1,1),lane); \
+    U_21=coalescedRead(ref()(2,1),lane); \
+    UChi_00 = U_00*Chi_00; \
+    UChi_10 = U_00*Chi_10; \
+    UChi_01 = U_10*Chi_00; \
+    UChi_11 = U_10*Chi_10; \
+    UChi_02 = U_20*Chi_00; \
+    UChi_12 = U_20*Chi_10; \
+    UChi_00+= U_01*Chi_01; \
+    UChi_10+= U_01*Chi_11; \
+    UChi_01+= U_11*Chi_01; \
+    UChi_11+= U_11*Chi_11; \
+    UChi_02+= U_21*Chi_01; \
+    UChi_12+= U_21*Chi_11; \
+    U_00=coalescedRead(ref()(0,2),lane); \
+    U_10=coalescedRead(ref()(1,2),lane); \
+    U_20=coalescedRead(ref()(2,2),lane); \
+    UChi_00+= U_00*Chi_02; \
+    UChi_10+= U_00*Chi_12; \
+    UChi_01+= U_10*Chi_02; \
+    UChi_11+= U_10*Chi_12; \
+    UChi_02+= U_20*Chi_02; \
     UChi_12+= U_20*Chi_12;}
 
 #define LOAD_CHI \
   {const SiteHalfSpinor &ref(buf[offset]); \
-    Chi_00 = coalescedRead(ref()(0)(0)); \
-    Chi_01 = coalescedRead(ref()(0)(1)); \
-    Chi_02 = coalescedRead(ref()(0)(2)); \
-    Chi_10 = coalescedRead(ref()(1)(0)); \
-    Chi_11 = coalescedRead(ref()(1)(1)); \
-    Chi_12 = coalescedRead(ref()(1)(2));}
+    Chi_00 = coalescedRead(ref()(0)(0),lane); \
+    Chi_01 = coalescedRead(ref()(0)(1),lane); \
+    Chi_02 = coalescedRead(ref()(0)(2),lane); \
+    Chi_10 = coalescedRead(ref()(1)(0),lane); \
+    Chi_11 = coalescedRead(ref()(1)(1),lane); \
+    Chi_12 = coalescedRead(ref()(1)(2),lane);}
 
 // hspin(0)=fspin(0)+timesI(fspin(3));
 // hspin(1)=fspin(1)+timesI(fspin(2));
@@ -453,35 +453,35 @@ Author: paboyle
 #define HAND_RESULT(ss) \
   { \
     SiteSpinor & ref (out[ss]); \
-    coalescedWrite(ref()(0)(0),result_00); \
-    coalescedWrite(ref()(0)(1),result_01); \
-    coalescedWrite(ref()(0)(2),result_02); \
-    coalescedWrite(ref()(1)(0),result_10); \
-    coalescedWrite(ref()(1)(1),result_11); \
-    coalescedWrite(ref()(1)(2),result_12); \
-    coalescedWrite(ref()(2)(0),result_20); \
-    coalescedWrite(ref()(2)(1),result_21); \
-    coalescedWrite(ref()(2)(2),result_22); \
-    coalescedWrite(ref()(3)(0),result_30); \
-    coalescedWrite(ref()(3)(1),result_31); \
-    coalescedWrite(ref()(3)(2),result_32); \
+    coalescedWrite(ref()(0)(0),result_00,lane); \
+    coalescedWrite(ref()(0)(1),result_01,lane); \
+    coalescedWrite(ref()(0)(2),result_02,lane); \
+    coalescedWrite(ref()(1)(0),result_10,lane); \
+    coalescedWrite(ref()(1)(1),result_11,lane); \
+    coalescedWrite(ref()(1)(2),result_12,lane); \
+    coalescedWrite(ref()(2)(0),result_20,lane); \
+    coalescedWrite(ref()(2)(1),result_21,lane); \
+    coalescedWrite(ref()(2)(2),result_22,lane); \
+    coalescedWrite(ref()(3)(0),result_30,lane); \
+    coalescedWrite(ref()(3)(1),result_31,lane); \
+    coalescedWrite(ref()(3)(2),result_32,lane); \
   }
 
 #define HAND_RESULT_EXT(ss) \
   { \
     SiteSpinor & ref (out[ss]); \
-    coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00); \
-    coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01); \
-    coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02); \
-    coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10); \
-    coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11); \
-    coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12); \
-    coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20); \
-    coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21); \
-    coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22); \
-    coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30); \
-    coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31); \
-    coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32); \
+    coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0),lane)+result_00,lane); \
+    coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1),lane)+result_01,lane); \
+    coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2),lane)+result_02,lane); \
+    coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0),lane)+result_10,lane); \
+    coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1),lane)+result_11,lane); \
+    coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2),lane)+result_12,lane); \
+    coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0),lane)+result_20,lane); \
+    coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1),lane)+result_21,lane); \
+    coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2),lane)+result_22,lane); \
+    coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0),lane)+result_30,lane); \
+    coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1),lane)+result_31,lane); \
+    coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2),lane)+result_32,lane); \
   }
 
 #define HAND_DECLARATIONS(Simd) \
@@ -558,6 +558,9 @@ WilsonKernels::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p,
   // typedef decltype( coalescedRead( vCplx()()() )) Simt;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
 
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   int offset,local,perm, ptype;
@@ -584,6 +587,10 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   int offset,local,perm, ptype;
@@ -609,6 +616,10 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   StencilEntry *SE;
@@ -635,6 +646,10 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   int offset,local,perm, ptype;
@@ -660,6 +675,10 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   StencilEntry *SE;
@@ -686,6 +705,10 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   int offset, ptype;
@@ -712,6 +735,10 @@ void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi
   typedef typename Simd::scalar_type S;
   typedef typename Simd::vector_type V;
   typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
+
+  const int Nsimd = SiteHalfSpinor::Nsimd();
+  const int lane=acceleratorSIMTlane(Nsimd);
+
   HAND_DECLARATIONS(Simt);
 
   StencilEntry *SE;