diff --git a/lib/simd/BGQQPX.h b/lib/simd/BGQQPX.h index 34888ab7..3f428ad9 100644 --- a/lib/simd/BGQQPX.h +++ b/lib/simd/BGQQPX.h @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -23,8 +23,8 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #ifndef GRID_ASM_BGQ_QPX_H #define GRID_ASM_BGQ_QPX_H @@ -127,20 +127,20 @@ Author: paboyle /********************************************************* * Macro sequences encoding QCD *********************************************************/ -#define LOCK_GAUGE(dir) \ - { \ - uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ - for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ - CACHE_LOCK(&byte_addr[i]); \ - } \ +#define LOCK_GAUGE(dir) \ + { \ + uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ + for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ + CACHE_LOCK(&byte_addr[i]); \ + } \ } -#define UNLOCK_GAUGE(dir) \ - { \ - uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ - for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ - CACHE_UNLOCK(&byte_addr[i]); \ - } \ +#define UNLOCK_GAUGE(dir) \ + { \ + uint8_t *byte_addr = (uint8_t *)&U._odata[sU](dir); \ + for(int i=0;i< 18*2*BYTES_PER_WORD*8;i+=32){ \ + CACHE_UNLOCK(&byte_addr[i]); \ + } \ } #define MAYBEPERM(A,B) @@ -165,369 +165,369 @@ Author: paboyle VLOAD(%0,%3,U0) \ VLOAD(%1,%3,U1) \ VLOAD(%2,%3,U2) \ - VMUL_RR_RI(U0,Chi_00,UChi_00) \ - VMUL_RR_RI(U1,Chi_00,UChi_01) \ - VMUL_RR_RI(U2,Chi_00,UChi_02) \ - VMUL_RR_RI(U0,Chi_10,UChi_10) \ - VMUL_RR_RI(U1,Chi_10,UChi_11) \ - VMUL_RR_RI(U2,Chi_10,UChi_12) \ - VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \ - : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \ - asm ( \ - VLOAD(%0,%3,U0) \ - VLOAD(%1,%3,U1) \ - VLOAD(%2,%3,U2) \ - VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \ - VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \ - VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \ - VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \ - VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \ - VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \ - VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \ - : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \ - asm ( \ - VLOAD(%0,%3,U0) \ - VLOAD(%1,%3,U1) \ - VLOAD(%2,%3,U2) \ - VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \ - VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \ - VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \ - VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \ - VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \ - VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \ - VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \ - VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \ - VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \ - VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \ - VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \ - VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \ - : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \ + VMUL_RR_RI(U0,Chi_00,UChi_00) \ + VMUL_RR_RI(U1,Chi_00,UChi_01) \ + VMUL_RR_RI(U2,Chi_00,UChi_02) \ + VMUL_RR_RI(U0,Chi_10,UChi_10) \ + VMUL_RR_RI(U1,Chi_10,UChi_11) \ + VMUL_RR_RI(U2,Chi_10,UChi_12) \ + VMADD_MII_IR(U0,Chi_00,UChi_00,UChi_00) \ + VMADD_MII_IR(U1,Chi_00,UChi_01,UChi_01) \ + VMADD_MII_IR(U2,Chi_00,UChi_02,UChi_02) \ + VMADD_MII_IR(U0,Chi_10,UChi_10,UChi_10) \ + VMADD_MII_IR(U1,Chi_10,UChi_11,UChi_11) \ + VMADD_MII_IR(U2,Chi_10,UChi_12,UChi_12) \ + : : "r" (0), "r" (32*3), "r" (32*6), "r" (ub )); \ + asm ( \ + VLOAD(%0,%3,U0) \ + VLOAD(%1,%3,U1) \ + VLOAD(%2,%3,U2) \ + VMADD_RR_RI(U0,Chi_01,UChi_00,UChi_00) \ + VMADD_RR_RI(U1,Chi_01,UChi_01,UChi_01) \ + VMADD_RR_RI(U2,Chi_01,UChi_02,UChi_02) \ + VMADD_RR_RI(U0,Chi_11,UChi_10,UChi_10) \ + VMADD_RR_RI(U1,Chi_11,UChi_11,UChi_11) \ + VMADD_RR_RI(U2,Chi_11,UChi_12,UChi_12) \ + VMADD_MII_IR(U0,Chi_01,UChi_00,UChi_00) \ + VMADD_MII_IR(U1,Chi_01,UChi_01,UChi_01) \ + VMADD_MII_IR(U2,Chi_01,UChi_02,UChi_02) \ + VMADD_MII_IR(U0,Chi_11,UChi_10,UChi_10) \ + VMADD_MII_IR(U1,Chi_11,UChi_11,UChi_11) \ + VMADD_MII_IR(U2,Chi_11,UChi_12,UChi_12) \ + : : "r" (32), "r" (32*4), "r" (32*7), "r" (ub )); \ + asm ( \ + VLOAD(%0,%3,U0) \ + VLOAD(%1,%3,U1) \ + VLOAD(%2,%3,U2) \ + VMADD_RR_RI(U0,Chi_02,UChi_00,UChi_00) \ + VMADD_RR_RI(U1,Chi_02,UChi_01,UChi_01) \ + VMADD_RR_RI(U2,Chi_02,UChi_02,UChi_02) \ + VMADD_RR_RI(U0,Chi_12,UChi_10,UChi_10) \ + VMADD_RR_RI(U1,Chi_12,UChi_11,UChi_11) \ + VMADD_RR_RI(U2,Chi_12,UChi_12,UChi_12) \ + VMADD_MII_IR(U0,Chi_02,UChi_00,UChi_00) \ + VMADD_MII_IR(U1,Chi_02,UChi_01,UChi_01) \ + VMADD_MII_IR(U2,Chi_02,UChi_02,UChi_02) \ + VMADD_MII_IR(U0,Chi_12,UChi_10,UChi_10) \ + VMADD_MII_IR(U1,Chi_12,UChi_11,UChi_11) \ + VMADD_MII_IR(U2,Chi_12,UChi_12,UChi_12) \ + : : "r" (32*2), "r" (32*5), "r" (32*8), "r" (ub )); \ } -#define SAVE_RESULT(base,basep) {\ - uint64_t ub = ((uint64_t)base) - 32; \ - asm("mr %0,"REP";\n\t" \ - "li " IMM ",32;\n\t" \ - VSTORE(IMM,REP,psi_00) \ - VSTORE(IMM,REP,psi_01) \ - VSTORE(IMM,REP,psi_02) \ - VSTORE(IMM,REP,psi_10) \ - VSTORE(IMM,REP,psi_11) \ - VSTORE(IMM,REP,psi_12) \ - VSTORE(IMM,REP,psi_20) \ - VSTORE(IMM,REP,psi_21) \ - VSTORE(IMM,REP,psi_22) \ - VSTORE(IMM,REP,psi_30) \ - VSTORE(IMM,REP,psi_31) \ - VSTORE(IMM,REP,psi_32) \ - ); \ -} +#define SAVE_RESULT(base,basep) { \ + uint64_t ub = ((uint64_t)base) - 32; \ + asm("mr %0,"REP";\n\t" \ + "li " IMM ",32;\n\t" \ + VSTORE(IMM,REP,psi_00) \ + VSTORE(IMM,REP,psi_01) \ + VSTORE(IMM,REP,psi_02) \ + VSTORE(IMM,REP,psi_10) \ + VSTORE(IMM,REP,psi_11) \ + VSTORE(IMM,REP,psi_12) \ + VSTORE(IMM,REP,psi_20) \ + VSTORE(IMM,REP,psi_21) \ + VSTORE(IMM,REP,psi_22) \ + VSTORE(IMM,REP,psi_30) \ + VSTORE(IMM,REP,psi_31) \ + VSTORE(IMM,REP,psi_32) \ + ); \ + } /* *Annoying BG/Q loads with no immediat indexing and big performance hit *when second miss to a L1 line occurs */ -#define LOAD_CHI(base) { \ - uint64_t ub = ((uint64_t)base) - 64; \ - asm("mr %0,"REP";\n\t" \ - "li " IMM ",64;\n\t" \ - VLOAD(IMM,REP,Chi_00) \ - VLOAD(IMM,REP,Chi_02) \ - VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \ - ub = ((uint64_t)base) - 32; \ - asm("mr %0,"REP";\n\t" \ - "li IMM,64;\n\t" \ - VLOAD(IMM,REP,Chimu_01) \ - VLOAD(IMM,REP,Chimu_10) \ - VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \ +#define LOAD_CHI(base) { \ + uint64_t ub = ((uint64_t)base) - 64; \ + asm("mr %0,"REP";\n\t" \ + "li " IMM ",64;\n\t" \ + VLOAD(IMM,REP,Chi_00) \ + VLOAD(IMM,REP,Chi_02) \ + VLOAD(IMM,REP,Chi_11) : : "r" (ub) ); \ + ub = ((uint64_t)base) - 32; \ + asm("mr %0,"REP";\n\t" \ + "li IMM,64;\n\t" \ + VLOAD(IMM,REP,Chimu_01) \ + VLOAD(IMM,REP,Chimu_10) \ + VLOAD(IMM,REP,Chimu_12) : : "r" (ub) ); \ } -#define LOAD_CHIMU(base) { \ - uint64_t ub = ((uint64_t)base) - 64; \ - asm("mr %0,"REP";\n\t" \ - "li IMM,64;\n\t" \ - VLOAD(IMM,REP,Chimu_00) \ - VLOAD(IMM,REP,Chimu_02) \ - VLOAD(IMM,REP,Chimu_11) \ - VLOAD(IMM,REP,Chimu_20) \ - VLOAD(IMM,REP,Chimu_22) \ - VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \ - ub = ((uint64_t)base) - 32; \ - asm("mr %0,"REP";\n\t" \ - "li IMM,64;\n\t" \ - VLOAD(IMM,REP,Chimu_01) \ - VLOAD(IMM,REP,Chimu_10) \ - VLOAD(IMM,REP,Chimu_12) \ - VLOAD(IMM,REP,Chimu_21) \ - VLOAD(IMM,REP,Chimu_30) \ - VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \ +#define LOAD_CHIMU(base) { \ + uint64_t ub = ((uint64_t)base) - 64; \ + asm("mr %0,"REP";\n\t" \ + "li IMM,64;\n\t" \ + VLOAD(IMM,REP,Chimu_00) \ + VLOAD(IMM,REP,Chimu_02) \ + VLOAD(IMM,REP,Chimu_11) \ + VLOAD(IMM,REP,Chimu_20) \ + VLOAD(IMM,REP,Chimu_22) \ + VLOAD(IMM,REP,Chimu_31) : : "r" (ub) ); \ + ub = ((uint64_t)base) - 32; \ + asm("mr %0,"REP";\n\t" \ + "li IMM,64;\n\t" \ + VLOAD(IMM,REP,Chimu_01) \ + VLOAD(IMM,REP,Chimu_10) \ + VLOAD(IMM,REP,Chimu_12) \ + VLOAD(IMM,REP,Chimu_21) \ + VLOAD(IMM,REP,Chimu_30) \ + VLOAD(IMM,REP,Chimu_32) : : "r" (ub) ); \ } // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); -#define XP_PROJMEM(base) { \ +#define XP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \ + VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \ + VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \ + VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \ + VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \ + VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \ + ); \ + } + +#define XM_PROJMEM(base) { \ LOAD_CHIMU(base); \ asm ( \ VONE(one) \ - VMADD_MII_IR(one,Chimu_30,Chimu_00,Chi_00) \ - VMADD_MII_IR(one,Chimu_31,Chimu_01,Chi_01) \ - VMADD_MII_IR(one,Chimu_32,Chimu_02,Chi_02) \ - VMADD_MII_IR(one,Chimu_20,Chimu_10,Chi_10) \ - VMADD_MII_IR(one,Chimu_21,Chimu_11,Chi_11) \ - VMADD_MII_IR(one,Chimu_22,Chimu_12,Chi_12) \ - ); \ - } - -#define XM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \ - VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \ - VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \ - VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \ - VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \ - VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \ + VMADD_II_MIR(one,Chimu_30,Chimu_00,Chi_00) \ + VMADD_II_MIR(one,Chimu_31,Chimu_01,Chi_01) \ + VMADD_II_MIR(one,Chimu_32,Chimu_02,Chi_02) \ + VMADD_II_MIR(one,Chimu_20,Chimu_10,Chi_10) \ + VMADD_II_MIR(one,Chimu_21,Chimu_11,Chi_11) \ + VMADD_II_MIR(one,Chimu_22,Chimu_12,Chi_12) \ ); \ } // hspin(0)=fspin(0)-fspin(3); // hspin(1)=fspin(1)+fspin(2); -#define YP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chimu_00,Chimu_00,Chi_30) \ - VSUB(Chimu_01,Chimu_01,Chi_31) \ - VSUB(Chimu_02,Chimu_02,Chi_32) \ - VADD(Chimu_10,Chimu_10,Chi_20) \ - VADD(Chimu_11,Chimu_11,Chi_21) \ - VADD(Chimu_12,Chimu_12,Chi_22) \ - ); \ +#define YP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chimu_00,Chimu_00,Chi_30) \ + VSUB(Chimu_01,Chimu_01,Chi_31) \ + VSUB(Chimu_02,Chimu_02,Chi_32) \ + VADD(Chimu_10,Chimu_10,Chi_20) \ + VADD(Chimu_11,Chimu_11,Chi_21) \ + VADD(Chimu_12,Chimu_12,Chi_22) \ + ); \ } -#define YM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VADD(Chimu_00,Chimu_00,Chi_30) \ - VADD(Chimu_01,Chimu_01,Chi_31) \ - VADD(Chimu_02,Chimu_02,Chi_32) \ - VSUB(Chimu_10,Chimu_10,Chi_20) \ - VSUB(Chimu_11,Chimu_11,Chi_21) \ - VSUB(Chimu_12,Chimu_12,Chi_22) \ - ); \ +#define YM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VADD(Chimu_00,Chimu_00,Chi_30) \ + VADD(Chimu_01,Chimu_01,Chi_31) \ + VADD(Chimu_02,Chimu_02,Chi_32) \ + VSUB(Chimu_10,Chimu_10,Chi_20) \ + VSUB(Chimu_11,Chimu_11,Chi_21) \ + VSUB(Chimu_12,Chimu_12,Chi_22) \ + ); \ } - /*Gz - * 0 0 i 0 [0]+-i[2] - * 0 0 0 -i [1]-+i[3] - * -i 0 0 0 - * 0 i 0 0 - */ -#define ZP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \ - VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \ - VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \ - VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \ - VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \ - VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \ - ); \ +/*Gz + * 0 0 i 0 [0]+-i[2] + * 0 0 0 -i [1]-+i[3] + * -i 0 0 0 + * 0 i 0 0 + */ +#define ZP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_MII_IR(one,Chimu_20,Chimu_00,Chi_00) \ + VMADD_MII_IR(one,Chimu_21,Chimu_01,Chi_01) \ + VMADD_MII_IR(one,Chimu_22,Chimu_02,Chi_02) \ + VMADD_II_MIR(one,Chimu_30,Chimu_10,Chi_10) \ + VMADD_II_MIR(one,Chimu_31,Chimu_11,Chi_11) \ + VMADD_II_MIR(one,Chimu_32,Chimu_12,Chi_12) \ + ); \ } -#define ZM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VONE(one) \ - VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \ - VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \ - VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \ - VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \ - VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \ - VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \ - ); \ +#define ZM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VONE(one) \ + VMADD_II_MIR(one,Chimu_20,Chimu_00,Chi_00) \ + VMADD_II_MIR(one,Chimu_21,Chimu_01,Chi_01) \ + VMADD_II_MIR(one,Chimu_22,Chimu_02,Chi_02) \ + VMADD_MII_IR(one,Chimu_30,Chimu_10,Chi_10) \ + VMADD_MII_IR(one,Chimu_31,Chimu_11,Chi_11) \ + VMADD_MII_IR(one,Chimu_32,Chimu_12,Chi_12) \ + ); \ } - /*Gt - * 0 0 1 0 [0]+-[2] - * 0 0 0 1 [1]+-[3] - * 1 0 0 0 - * 0 1 0 0 - */ -#define TP_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VADD(Chimu_00,Chimu_00,Chi_20) \ - VADD(Chimu_01,Chimu_01,Chi_21) \ - VADD(Chimu_02,Chimu_02,Chi_22) \ - VADD(Chimu_10,Chimu_10,Chi_30) \ - VADD(Chimu_11,Chimu_11,Chi_31) \ - VADD(Chimu_12,Chimu_12,Chi_32) \ - ); \ +/*Gt + * 0 0 1 0 [0]+-[2] + * 0 0 0 1 [1]+-[3] + * 1 0 0 0 + * 0 1 0 0 + */ +#define TP_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VADD(Chimu_00,Chimu_00,Chi_20) \ + VADD(Chimu_01,Chimu_01,Chi_21) \ + VADD(Chimu_02,Chimu_02,Chi_22) \ + VADD(Chimu_10,Chimu_10,Chi_30) \ + VADD(Chimu_11,Chimu_11,Chi_31) \ + VADD(Chimu_12,Chimu_12,Chi_32) \ + ); \ } -#define TM_PROJMEM(base) { \ - LOAD_CHIMU(base); \ - asm ( \ - VSUB(Chimu_00,Chimu_00,Chi_20) \ - VSUB(Chimu_01,Chimu_01,Chi_21) \ - VSUB(Chimu_02,Chimu_02,Chi_22) \ - VSUB(Chimu_10,Chimu_10,Chi_30) \ - VSUB(Chimu_11,Chimu_11,Chi_31) \ - VSUB(Chimu_12,Chimu_12,Chi_32) \ - ); \ +#define TM_PROJMEM(base) { \ + LOAD_CHIMU(base); \ + asm ( \ + VSUB(Chimu_00,Chimu_00,Chi_20) \ + VSUB(Chimu_01,Chimu_01,Chi_21) \ + VSUB(Chimu_02,Chimu_02,Chi_22) \ + VSUB(Chimu_10,Chimu_10,Chi_30) \ + VSUB(Chimu_11,Chimu_11,Chi_31) \ + VSUB(Chimu_12,Chimu_12,Chi_32) \ + ); \ } /* - fspin(0)=hspin(0); - fspin(1)=hspin(1); - fspin(2)=timesMinusI(hspin(1)); - fspin(3)=timesMinusI(hspin(0)); + fspin(0)=hspin(0); + fspin(1)=hspin(1); + fspin(2)=timesMinusI(hspin(1)); + fspin(3)=timesMinusI(hspin(0)); - fspin(0)+=hspin(0); - fspin(1)+=hspin(1); - fspin(2)-=timesI(hspin(1)); - fspin(3)-=timesI(hspin(0)); - */ -#define XP_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ - ); \ + fspin(0)+=hspin(0); + fspin(1)+=hspin(1); + fspin(2)-=timesI(hspin(1)); + fspin(3)-=timesI(hspin(0)); +*/ +#define XP_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XM_RECON { \ - asm(\ - VONE(one)\ - VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02)\ - VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12)\ - VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ - VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ - VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ - ); \ +#define XM_RECON { \ + asm( \ + VONE(one) \ + VMOV(psi_00,UChi_00) VMOV(psi_01,UChi_01) VMOV(psi_02,UChi_02) \ + VMOV(psi_10,UChi_10) VMOV(psi_11,UChi_11) VMOV(psi_12,UChi_12) \ + VZERO(psi_20) VZERO(psi_21) VZERO(psi_22) \ + VZERO(psi_30) VZERO(psi_31) VZERO(psi_32) \ + VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ + VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XP_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ - ); \ + VMADD_II_MIR(one,UChi_10,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_11,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_12,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_00,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_01,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_02,psi_32,psi_32) \ + ); \ } -#define XM_RECON_ACCUM { \ - asm(\ - VONE(one)\ +#define XM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ - ); \ + VMADD_MII_IR(one,UChi_10,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_11,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_12,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_00,psi_30,psi_30) \ + VMADD_MII_IR(one,UChi_01,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_02,psi_32,psi_32) \ + ); \ } // fspin(2)+=hspin(1); // fspin(3)-=hspin(0); -#define YP_RECON_ACCUM {\ - asm(\ +#define YP_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VADD(psi_20,UChi_10,psi_20) VADD(psi_21,UChi_11,psi_21) VADD(psi_22,UChi_12,psi_22) \ VSUB(psi_30,UChi_00,psi_30) VSUB(psi_31,UChi_01,psi_31) VSUB(psi_32,UChi_02,psi_32) \ - );\ - } -#define YM_RECON_ACCUM {\ - asm(\ + ); \ + } +#define YM_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VSUB(psi_20,UChi_10,psi_20) VSUB(psi_21,UChi_11,psi_21) VSUB(psi_22,UChi_12,psi_22) \ VADD(psi_30,UChi_00,psi_30) VADD(psi_31,UChi_01,psi_31) VADD(psi_32,UChi_02,psi_32) \ - );\ - } + ); \ + } // fspin(2)-=timesI(hspin(0)); // fspin(3)+=timesI(hspin(1)); -#define ZP_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZP_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \ - VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \ - VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \ - VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \ - VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \ - VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \ - );\ - } + VMADD_II_MIR(one,UChi_00,psi_20,psi_20) \ + VMADD_II_MIR(one,UChi_01,psi_21,psi_21) \ + VMADD_II_MIR(one,UChi_02,psi_22,psi_22) \ + VMADD_MII_IR(one,UChi_10,psi_30,psi_30) \ + VMADD_MII_IR(one,UChi_11,psi_31,psi_31) \ + VMADD_MII_IR(one,UChi_12,psi_32,psi_32) \ + ); \ + } -#define ZM_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define ZM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ - VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \ - VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \ - VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \ - VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \ - VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \ - VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \ - );\ - } + VMADD_MII_IR(one,UChi_00,psi_20,psi_20) \ + VMADD_MII_IR(one,UChi_01,psi_21,psi_21) \ + VMADD_MII_IR(one,UChi_02,psi_22,psi_22) \ + VMADD_II_MIR(one,UChi_10,psi_30,psi_30) \ + VMADD_II_MIR(one,UChi_11,psi_31,psi_31) \ + VMADD_II_MIR(one,UChi_12,psi_32,psi_32) \ + ); \ + } // fspin(2)+=hspin(0); // fspin(3)+=hspin(1); -#define TP_RECON_ACCUM {\ - asm(\ +#define TP_RECON_ACCUM { \ + asm( \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VADD(psi_20,UChi_00,psi_20) VADD(psi_21,UChi_01,psi_21) VADD(psi_22,UChi_02,psi_22) \ VADD(psi_30,UChi_10,psi_30) VADD(psi_31,UChi_11,psi_31) VADD(psi_32,UChi_12,psi_32) \ - );\ - } + ); \ + } -#define TM_RECON_ACCUM {\ - asm(\ - VONE(one)\ +#define TM_RECON_ACCUM { \ + asm( \ + VONE(one) \ VADD(psi_00,UChi_00,psi_00) VADD(psi_01,UChi_01,psi_01) VADD(psi_02,UChi_02,psi_02) \ VADD(psi_10,UChi_10,psi_10) VADD(psi_11,UChi_11,psi_11) VADD(psi_12,UChi_12,psi_12) \ VSUB(psi_20,UChi_00,psi_20) VSUB(psi_21,UChi_01,psi_21) VSUB(psi_22,UChi_02,psi_22) \ VSUB(psi_30,UChi_10,psi_30) VSUB(psi_31,UChi_11,psi_31) VSUB(psi_32,UChi_12,psi_32) \ - );\ - } + ); \ + } uint64_t GetPFInfo(int nent,int plocal); uint64_t GetInfo(int ptype,int local,int perm,int Xp,int ent,int plocal); @@ -551,244 +551,244 @@ void testme(int osites,int ssU) int nmax=osites; for(int site=0;site=nmax) ssn=0; - int sUn=ssn; - for(int s=0;s=nmax) ssn=0; + int sUn=ssn; + for(int s=0;s shuffle and xor the real part sign bit + //////////////////////////////// + // Yp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YP_PROJMEM(base); + YP_PROJMEM(base); #else - YM_PROJMEM(base); + YM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYP(Yp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR2,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Zp,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFYP(Yp,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YP_RECON_ACCUM; + YP_RECON_ACCUM; #else - YM_RECON_ACCUM; + YM_RECON_ACCUM; #endif - //////////////////////////////// - // Zp - //////////////////////////////// - basep = GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Zp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZP_PROJMEM(base); + ZP_PROJMEM(base); #else - ZM_PROJMEM(base); + ZM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZP(Zp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR1,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Tp,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFZP(Zp,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZP_RECON_ACCUM; + ZP_RECON_ACCUM; #else - ZM_RECON_ACCUM; + ZM_RECON_ACCUM; #endif - //////////////////////////////// - // Tp - //////////////////////////////// - basep = GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Tp + //////////////////////////////// + basep = GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TP_PROJMEM(base); + TP_PROJMEM(base); #else - TM_PROJMEM(base); + TM_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFTP(Tp,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR0,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Xm,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFTP(Tp,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TP_RECON_ACCUM; + TP_RECON_ACCUM; #else - TM_RECON_ACCUM; + TM_RECON_ACCUM; #endif - //////////////////////////////// - // Xm - //////////////////////////////// + //////////////////////////////// + // Xm + //////////////////////////////// #ifndef STREAM_STORE - basep= (uint64_t) &out._odata[ss]; + basep= (uint64_t) &out._odata[ss]; #endif - // basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + // basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - XM_PROJMEM(base); + XM_PROJMEM(base); #else - XP_PROJMEM(base); + XP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR3,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFXM(Xm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR3,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Ym,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFXM(Xm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - XM_RECON_ACCUM; + XM_RECON_ACCUM; #else - XP_RECON_ACCUM; + XP_RECON_ACCUM; #endif - //////////////////////////////// - // Ym - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Ym + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YM_PROJMEM(base); + YM_PROJMEM(base); #else - YP_PROJMEM(base); + YP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR2,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFYM(Ym,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR2,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Zm,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFYM(Ym,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - YM_RECON_ACCUM; + YM_RECON_ACCUM; #else - YP_RECON_ACCUM; + YP_RECON_ACCUM; #endif - //////////////////////////////// - // Zm - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Zm + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZM_PROJMEM(base); + ZM_PROJMEM(base); #else - ZP_PROJMEM(base); + ZP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR1,perm); - } else { - LOAD_CHI(base); - } - base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; - PREFETCH_CHIMU(base); - { - MULT_2SPIN_DIR_PFZM(Zm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + MAYBEPERM(PERMUTE_DIR1,perm); + } else { + LOAD_CHI(base); + } + base = GetInfo(ptype,local,perm,Tm,ent,plocal); ent++; + PREFETCH_CHIMU(base); + { + MULT_2SPIN_DIR_PFZM(Zm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - ZM_RECON_ACCUM; + ZM_RECON_ACCUM; #else - ZP_RECON_ACCUM; + ZP_RECON_ACCUM; #endif - //////////////////////////////// - // Tm - //////////////////////////////// - basep= GetPFInfo(nent,plocal); nent++; - if ( local ) { - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + //////////////////////////////// + // Tm + //////////////////////////////// + basep= GetPFInfo(nent,plocal); nent++; + if ( local ) { + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TM_PROJMEM(base); + TM_PROJMEM(base); #else - TP_PROJMEM(base); + TP_PROJMEM(base); #endif - MAYBEPERM(PERMUTE_DIR0,perm); - } else { - LOAD_CHI(base); - } - base= (uint64_t) &out._odata[ss]; + MAYBEPERM(PERMUTE_DIR0,perm); + } else { + LOAD_CHI(base); + } + base= (uint64_t) &out._odata[ss]; #ifndef STREAM_STORE - PREFETCH_CHIMU(base); + PREFETCH_CHIMU(base); #endif - { - MULT_2SPIN_DIR_PFTM(Tm,basep); - } - LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit + { + MULT_2SPIN_DIR_PFTM(Tm,basep); + } + LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit #ifdef KERNEL_DAG - TM_RECON_ACCUM; + TM_RECON_ACCUM; #else - TP_RECON_ACCUM; + TP_RECON_ACCUM; #endif - basep= GetPFInfo(nent,plocal); nent++; - SAVE_RESULT(base,basep); + basep= GetPFInfo(nent,plocal); nent++; + SAVE_RESULT(base,basep); - } - ssU++; + } + ssU++; } }