From bfb1cd36e2bf54a60636c5bb419288262420e000 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 26 May 2015 19:55:18 +0100 Subject: [PATCH] Strip out the dslash kernel implementation --- lib/qcd/Grid_qcd_dhop.cc | 309 ++++++++++++++ lib/qcd/Grid_qcd_dhop_hand.cc | 769 ++++++++++++++++++++++++++++++++++ 2 files changed, 1078 insertions(+) create mode 100644 lib/qcd/Grid_qcd_dhop.cc create mode 100644 lib/qcd/Grid_qcd_dhop_hand.cc diff --git a/lib/qcd/Grid_qcd_dhop.cc b/lib/qcd/Grid_qcd_dhop.cc new file mode 100644 index 00000000..1e5dcd16 --- /dev/null +++ b/lib/qcd/Grid_qcd_dhop.cc @@ -0,0 +1,309 @@ +#include + +namespace Grid { +namespace QCD { + +void DiracOpt::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + + //#define VERBOSE( A) if ( ss<10 ) { std::cout << "site " < > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xm][ss]; + local = st._is_local[Xm][ss]; + perm = st._permute[Xm][ss]; + + ptype = st._permute_type[Xm]; + if ( local && perm ) { + spProjXp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Xm),&chi()); + spReconXp(result,Uchi); + + // Yp + offset = st._offsets [Ym][ss]; + local = st._is_local[Ym][ss]; + perm = st._permute[Ym][ss]; + ptype = st._permute_type[Ym]; + if ( local && perm ) { + spProjYp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Ym),&chi()); + accumReconYp(result,Uchi); + + // Zp + offset = st._offsets [Zm][ss]; + local = st._is_local[Zm][ss]; + perm = st._permute[Zm][ss]; + ptype = st._permute_type[Zm]; + if ( local && perm ) { + spProjZp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Zm),&chi()); + accumReconZp(result,Uchi); + + // Tp + offset = st._offsets [Tm][ss]; + local = st._is_local[Tm][ss]; + perm = st._permute[Tm][ss]; + ptype = st._permute_type[Tm]; + if ( local && perm ) { + spProjTp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTp(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Tm),&chi()); + accumReconTp(result,Uchi); + + // Xm + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local && perm ) + { + spProjXm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Xp),&chi()); + accumReconXm(result,Uchi); + + // Ym + offset = st._offsets [Yp][ss]; + local = st._is_local[Yp][ss]; + perm = st._permute[Yp][ss]; + ptype = st._permute_type[Yp]; + + if ( local && perm ) { + spProjYm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Yp),&chi()); + accumReconYm(result,Uchi); + + // Zm + offset = st._offsets [Zp][ss]; + local = st._is_local[Zp][ss]; + perm = st._permute[Zp][ss]; + ptype = st._permute_type[Zp]; + if ( local && perm ) { + spProjZm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Zp),&chi()); + accumReconZm(result,Uchi); + + // Tm + offset = st._offsets [Tp][ss]; + local = st._is_local[Tp][ss]; + perm = st._permute[Tp][ss]; + ptype = st._permute_type[Tp]; + if ( local && perm ) { + spProjTm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTm(chi,in._odata[offset]); + } else { + chi=buf[offset]; + } + mult(&Uchi(),&U._odata[ss](Tp),&chi()); + accumReconTm(result,Uchi); + + vstream(out._odata[ss],result); +} +}} diff --git a/lib/qcd/Grid_qcd_dhop_hand.cc b/lib/qcd/Grid_qcd_dhop_hand.cc new file mode 100644 index 00000000..f8d464fb --- /dev/null +++ b/lib/qcd/Grid_qcd_dhop_hand.cc @@ -0,0 +1,769 @@ +#include + +#define REGISTER + +#define LOAD_CHIMU \ + const vSpinColourVector & ref (in._odata[offset]); \ + Chimu_00=ref()(0)(0);\ + Chimu_01=ref()(0)(1);\ + Chimu_02=ref()(0)(2);\ + Chimu_10=ref()(1)(0);\ + Chimu_11=ref()(1)(1);\ + Chimu_12=ref()(1)(2);\ + Chimu_20=ref()(2)(0);\ + Chimu_21=ref()(2)(1);\ + Chimu_22=ref()(2)(2);\ + Chimu_30=ref()(3)(0);\ + Chimu_31=ref()(3)(1);\ + Chimu_32=ref()(3)(2); + +#define LOAD_CHI\ + const vHalfSpinColourVector &ref(buf[offset]); \ + Chi_00 = ref()(0)(0);\ + Chi_01 = ref()(0)(1);\ + Chi_02 = ref()(0)(2);\ + Chi_10 = ref()(1)(0);\ + Chi_11 = ref()(1)(1);\ + Chi_12 = ref()(1)(2); + +#define MULT_2SPIN(A)\ + auto & ref(U._odata[ss](A)); \ + U_00 = ref()(0,0);\ + U_10 = ref()(1,0);\ + U_20 = ref()(2,0);\ + U_01 = ref()(0,1);\ + U_11 = ref()(1,1); \ + U_21 = ref()(2,1);\ + UChi_00 = U_00*Chi_00;\ + UChi_10 = U_00*Chi_10;\ + UChi_01 = U_10*Chi_00;\ + UChi_11 = U_10*Chi_10;\ + UChi_02 = U_20*Chi_00;\ + UChi_12 = U_20*Chi_10;\ + UChi_00+= U_01*Chi_01;\ + UChi_10+= U_01*Chi_11;\ + UChi_01+= U_11*Chi_01;\ + UChi_11+= U_11*Chi_11;\ + UChi_02+= U_21*Chi_01;\ + UChi_12+= U_21*Chi_11;\ + U_00 = ref()(0,2);\ + U_10 = ref()(1,2);\ + U_20 = ref()(2,2);\ + UChi_00+= U_00*Chi_02;\ + UChi_10+= U_00*Chi_12;\ + UChi_01+= U_10*Chi_02;\ + UChi_11+= U_10*Chi_12;\ + UChi_02+= U_20*Chi_02;\ + UChi_12+= U_20*Chi_12; + +#define PERMUTE\ + permute(Chi_00,Chi_00,ptype);\ + permute(Chi_01,Chi_01,ptype);\ + permute(Chi_02,Chi_02,ptype);\ + permute(Chi_10,Chi_10,ptype);\ + permute(Chi_11,Chi_11,ptype);\ + permute(Chi_12,Chi_12,ptype); + +// hspin(0)=fspin(0)+timesI(fspin(3)); +// hspin(1)=fspin(1)+timesI(fspin(2)); +#define XP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_30);\ + Chi_01 = Chimu_01+timesI(Chimu_31);\ + Chi_02 = Chimu_02+timesI(Chimu_32);\ + Chi_10 = Chimu_10+timesI(Chimu_20);\ + Chi_11 = Chimu_11+timesI(Chimu_21);\ + Chi_12 = Chimu_12+timesI(Chimu_22); + +#define YP_PROJ \ + Chi_00 = Chimu_00-Chimu_30;\ + Chi_01 = Chimu_01-Chimu_31;\ + Chi_02 = Chimu_02-Chimu_32;\ + Chi_10 = Chimu_10+Chimu_20;\ + Chi_11 = Chimu_11+Chimu_21;\ + Chi_12 = Chimu_12+Chimu_22; + +#define ZP_PROJ \ + Chi_00 = Chimu_00+timesI(Chimu_20); \ + Chi_01 = Chimu_01+timesI(Chimu_21); \ + Chi_02 = Chimu_02+timesI(Chimu_22); \ + Chi_10 = Chimu_10-timesI(Chimu_30); \ + Chi_11 = Chimu_11-timesI(Chimu_31); \ + Chi_12 = Chimu_12-timesI(Chimu_32); + +#define TP_PROJ \ + Chi_00 = Chimu_00+Chimu_20; \ + Chi_01 = Chimu_01+Chimu_21; \ + Chi_02 = Chimu_02+Chimu_22; \ + Chi_10 = Chimu_10+Chimu_30; \ + Chi_11 = Chimu_11+Chimu_31; \ + Chi_12 = Chimu_12+Chimu_32; + + +// hspin(0)=fspin(0)-timesI(fspin(3)); +// hspin(1)=fspin(1)-timesI(fspin(2)); +#define XM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_30);\ + Chi_01 = Chimu_01-timesI(Chimu_31);\ + Chi_02 = Chimu_02-timesI(Chimu_32);\ + Chi_10 = Chimu_10-timesI(Chimu_20);\ + Chi_11 = Chimu_11-timesI(Chimu_21);\ + Chi_12 = Chimu_12-timesI(Chimu_22); + +#define YM_PROJ \ + Chi_00 = Chimu_00+Chimu_30;\ + Chi_01 = Chimu_01+Chimu_31;\ + Chi_02 = Chimu_02+Chimu_32;\ + Chi_10 = Chimu_10-Chimu_20;\ + Chi_11 = Chimu_11-Chimu_21;\ + Chi_12 = Chimu_12-Chimu_22; + +#define ZM_PROJ \ + Chi_00 = Chimu_00-timesI(Chimu_20); \ + Chi_01 = Chimu_01-timesI(Chimu_21); \ + Chi_02 = Chimu_02-timesI(Chimu_22); \ + Chi_10 = Chimu_10+timesI(Chimu_30); \ + Chi_11 = Chimu_11+timesI(Chimu_31); \ + Chi_12 = Chimu_12+timesI(Chimu_32); + +#define TM_PROJ \ + Chi_00 = Chimu_00-Chimu_20; \ + Chi_01 = Chimu_01-Chimu_21; \ + Chi_02 = Chimu_02-Chimu_22; \ + Chi_10 = Chimu_10-Chimu_30; \ + Chi_11 = Chimu_11-Chimu_31; \ + Chi_12 = Chimu_12-Chimu_32; + +// fspin(0)=hspin(0); +// fspin(1)=hspin(1); +// fspin(2)=timesMinusI(hspin(1)); +// fspin(3)=timesMinusI(hspin(0)); +#define XP_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesMinusI(UChi_10);\ + result_21 = timesMinusI(UChi_11);\ + result_22 = timesMinusI(UChi_12);\ + result_30 = timesMinusI(UChi_00);\ + result_31 = timesMinusI(UChi_01);\ + result_32 = timesMinusI(UChi_02); + +#define XP_RECON_ACCUM\ + result_00+=UChi_00;\ + result_01+=UChi_01;\ + result_02+=UChi_02;\ + result_10+=UChi_10;\ + result_11+=UChi_11;\ + result_12+=UChi_12;\ + result_20-=timesI(UChi_10);\ + result_21-=timesI(UChi_11);\ + result_22-=timesI(UChi_12);\ + result_30-=timesI(UChi_00);\ + result_31-=timesI(UChi_01);\ + result_32-=timesI(UChi_02); + +#define XM_RECON\ + result_00 = UChi_00;\ + result_01 = UChi_01;\ + result_02 = UChi_02;\ + result_10 = UChi_10;\ + result_11 = UChi_11;\ + result_12 = UChi_12;\ + result_20 = timesI(UChi_10);\ + result_21 = timesI(UChi_11);\ + result_22 = timesI(UChi_12);\ + result_30 = timesI(UChi_00);\ + result_31 = timesI(UChi_01);\ + result_32 = timesI(UChi_02); + +#define XM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_10);\ + result_21+= timesI(UChi_11);\ + result_22+= timesI(UChi_12);\ + result_30+= timesI(UChi_00);\ + result_31+= timesI(UChi_01);\ + result_32+= timesI(UChi_02); + +#define YP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_10;\ + result_21+= UChi_11;\ + result_22+= UChi_12;\ + result_30-= UChi_00;\ + result_31-= UChi_01;\ + result_32-= UChi_02; + +#define YM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_10;\ + result_21-= UChi_11;\ + result_22-= UChi_12;\ + result_30+= UChi_00;\ + result_31+= UChi_01;\ + result_32+= UChi_02; + +#define ZP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= timesI(UChi_00); \ + result_21-= timesI(UChi_01); \ + result_22-= timesI(UChi_02); \ + result_30+= timesI(UChi_10); \ + result_31+= timesI(UChi_11); \ + result_32+= timesI(UChi_12); + +#define ZM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= timesI(UChi_00); \ + result_21+= timesI(UChi_01); \ + result_22+= timesI(UChi_02); \ + result_30-= timesI(UChi_10); \ + result_31-= timesI(UChi_11); \ + result_32-= timesI(UChi_12); + +#define TP_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20+= UChi_00; \ + result_21+= UChi_01; \ + result_22+= UChi_02; \ + result_30+= UChi_10; \ + result_31+= UChi_11; \ + result_32+= UChi_12; + +#define TM_RECON_ACCUM\ + result_00+= UChi_00;\ + result_01+= UChi_01;\ + result_02+= UChi_02;\ + result_10+= UChi_10;\ + result_11+= UChi_11;\ + result_12+= UChi_12;\ + result_20-= UChi_00; \ + result_21-= UChi_01; \ + result_22-= UChi_02; \ + result_30-= UChi_10; \ + result_31-= UChi_11; \ + result_32-= UChi_12; + +namespace Grid { +namespace QCD { + +void DiracOptHand::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U, + std::vector > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + REGISTER vComplex result_00; // 12 regs on knc + REGISTER vComplex result_01; + REGISTER vComplex result_02; + + REGISTER vComplex result_10; + REGISTER vComplex result_11; + REGISTER vComplex result_12; + + REGISTER vComplex result_20; + REGISTER vComplex result_21; + REGISTER vComplex result_22; + + REGISTER vComplex result_30; + REGISTER vComplex result_31; + REGISTER vComplex result_32; // 20 left + + REGISTER vComplex Chi_00; // two spinor; 6 regs + REGISTER vComplex Chi_01; + REGISTER vComplex Chi_02; + + REGISTER vComplex Chi_10; + REGISTER vComplex Chi_11; + REGISTER vComplex Chi_12; // 14 left + + REGISTER vComplex UChi_00; // two spinor; 6 regs + REGISTER vComplex UChi_01; + REGISTER vComplex UChi_02; + + REGISTER vComplex UChi_10; + REGISTER vComplex UChi_11; + REGISTER vComplex UChi_12; // 8 left + + REGISTER vComplex U_00; // two rows of U matrix + REGISTER vComplex U_10; + REGISTER vComplex U_20; + REGISTER vComplex U_01; + REGISTER vComplex U_11; + REGISTER vComplex U_21; // 2 reg left. + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + + + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local ) { + LOAD_CHIMU; + XP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + + { + MULT_2SPIN(Xp); + } + XP_RECON; + // std::cout << "XP_RECON"< > &buf, + int ss,const LatticeFermion &in, LatticeFermion &out) +{ + REGISTER vComplex result_00; // 12 regs on knc + REGISTER vComplex result_01; + REGISTER vComplex result_02; + + REGISTER vComplex result_10; + REGISTER vComplex result_11; + REGISTER vComplex result_12; + + REGISTER vComplex result_20; + REGISTER vComplex result_21; + REGISTER vComplex result_22; + + REGISTER vComplex result_30; + REGISTER vComplex result_31; + REGISTER vComplex result_32; // 20 left + + REGISTER vComplex Chi_00; // two spinor; 6 regs + REGISTER vComplex Chi_01; + REGISTER vComplex Chi_02; + + REGISTER vComplex Chi_10; + REGISTER vComplex Chi_11; + REGISTER vComplex Chi_12; // 14 left + + REGISTER vComplex UChi_00; // two spinor; 6 regs + REGISTER vComplex UChi_01; + REGISTER vComplex UChi_02; + + REGISTER vComplex UChi_10; + REGISTER vComplex UChi_11; + REGISTER vComplex UChi_12; // 8 left + + REGISTER vComplex U_00; // two rows of U matrix + REGISTER vComplex U_10; + REGISTER vComplex U_20; + REGISTER vComplex U_01; + REGISTER vComplex U_11; + REGISTER vComplex U_21; // 2 reg left. + +#define Chimu_00 Chi_00 +#define Chimu_01 Chi_01 +#define Chimu_02 Chi_02 +#define Chimu_10 Chi_10 +#define Chimu_11 Chi_11 +#define Chimu_12 Chi_12 +#define Chimu_20 UChi_00 +#define Chimu_21 UChi_01 +#define Chimu_22 UChi_02 +#define Chimu_30 UChi_10 +#define Chimu_31 UChi_11 +#define Chimu_32 UChi_12 + + + int offset,local,perm, ptype; + + // Xp + offset = st._offsets [Xp][ss]; + local = st._is_local[Xp][ss]; + perm = st._permute[Xp][ss]; + ptype = st._permute_type[Xp]; + + if ( local ) { + LOAD_CHIMU; + XM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Xp); + } + XM_RECON; + + // Yp + offset = st._offsets [Yp][ss]; + local = st._is_local[Yp][ss]; + perm = st._permute[Yp][ss]; + ptype = st._permute_type[Yp]; + + if ( local ) { + LOAD_CHIMU; + YM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Yp); + } + YM_RECON_ACCUM; + + + // Zp + offset = st._offsets [Zp][ss]; + local = st._is_local[Zp][ss]; + perm = st._permute[Zp][ss]; + ptype = st._permute_type[Zp]; + + if ( local ) { + LOAD_CHIMU; + ZM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Zp); + } + ZM_RECON_ACCUM; + + // Tp + offset = st._offsets [Tp][ss]; + local = st._is_local[Tp][ss]; + perm = st._permute[Tp][ss]; + ptype = st._permute_type[Tp]; + + if ( local ) { + LOAD_CHIMU; + TM_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Tp); + } + TM_RECON_ACCUM; + + // Xm + offset = st._offsets [Xm][ss]; + local = st._is_local[Xm][ss]; + perm = st._permute[Xm][ss]; + ptype = st._permute_type[Xm]; + + if ( local ) { + LOAD_CHIMU; + XP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Xm); + } + XP_RECON_ACCUM; + + + // Ym + offset = st._offsets [Ym][ss]; + local = st._is_local[Ym][ss]; + perm = st._permute[Ym][ss]; + ptype = st._permute_type[Ym]; + + if ( local ) { + LOAD_CHIMU; + YP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Ym); + } + YP_RECON_ACCUM; + + // Zm + offset = st._offsets [Zm][ss]; + local = st._is_local[Zm][ss]; + perm = st._permute[Zm][ss]; + ptype = st._permute_type[Zm]; + + if ( local ) { + LOAD_CHIMU; + ZP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Zm); + } + ZP_RECON_ACCUM; + + // Tm + offset = st._offsets [Tm][ss]; + local = st._is_local[Tm][ss]; + perm = st._permute[Tm][ss]; + ptype = st._permute_type[Tm]; + + if ( local ) { + LOAD_CHIMU; + TP_PROJ; + if ( perm) { + PERMUTE; + } + } else { + LOAD_CHI; + } + { + MULT_2SPIN(Tm); + } + TP_RECON_ACCUM; + + { + vSpinColourVector & ref (out._odata[ss]); + vstream(ref()(0)(0),result_00); + vstream(ref()(0)(1),result_01); + vstream(ref()(0)(2),result_02); + vstream(ref()(1)(0),result_10); + vstream(ref()(1)(1),result_11); + vstream(ref()(1)(2),result_12); + vstream(ref()(2)(0),result_20); + vstream(ref()(2)(1),result_21); + vstream(ref()(2)(2),result_22); + vstream(ref()(3)(0),result_30); + vstream(ref()(3)(1),result_31); + vstream(ref()(3)(2),result_32); + } +} +}}