mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Overlap comms compute modifications
This commit is contained in:
		@@ -153,7 +153,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
/*********************************************************
 | 
			
		||||
 * Macro sequences encoding QCD
 | 
			
		||||
 *********************************************************/
 | 
			
		||||
/* Alternate LOCK_GAUGE variant: defined with an empty body, so it expands to nothing; dir is ignored. */
#define LOCK_GAUGEa(dir)							
 | 
			
		||||
#define LOCK_GAUGE(dir)							\
 | 
			
		||||
  {									\
 | 
			
		||||
    uint64_t byte_addr = (uint64_t)&U._odata[sU];			\
 | 
			
		||||
@@ -168,8 +167,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	 : : "b" (count), "b" (byte_addr) );					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/* Alternate UNLOCK_GAUGE variant: defined with an empty body, so it expands to nothing; dir is ignored. */
#define UNLOCK_GAUGEa(dir)						
 | 
			
		||||
 | 
			
		||||
#define UNLOCK_GAUGE(dir)						\
 | 
			
		||||
  {									\
 | 
			
		||||
    uint64_t byte_addr = (uint64_t)&U._odata[sU];			\
 | 
			
		||||
@@ -184,20 +181,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	 : : "b" (count), "b" (byte_addr) );					\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/*
 * ZERO_PSI: emits VZERO(psi_RC) for all twelve operands, R = 0..3 and
 * C = 0..2, in that order.
 * The expansion is a bare sequence of VZERO(...) invocations: it adds no
 * asm() wrapper and no statement terminator of its own.
 * NOTE(review): presumably meant to be used inside an enclosing asm(...)
 * string at the use site -- confirm against callers.
 */
#define ZERO_PSI				\
 | 
			
		||||
  VZERO(psi_00)					\
 | 
			
		||||
  VZERO(psi_01)					\
 | 
			
		||||
  VZERO(psi_02)					\
 | 
			
		||||
  VZERO(psi_10)					\
 | 
			
		||||
  VZERO(psi_11)					\
 | 
			
		||||
  VZERO(psi_12)					\
 | 
			
		||||
  VZERO(psi_20)					\
 | 
			
		||||
  VZERO(psi_21)					\
 | 
			
		||||
  VZERO(psi_22)					\
 | 
			
		||||
  VZERO(psi_30)					\
 | 
			
		||||
  VZERO(psi_31)					\
 | 
			
		||||
  VZERO(psi_32)
 | 
			
		||||
 | 
			
		||||
/* Forwards to MULT_2SPIN_QPX_INTERNAL with ULOAD = VSPLAT and USKIP = 16; ptr and p pass through unchanged. */
#define MULT_2SPIN_QPX_LSd(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,16) 
 | 
			
		||||
/* Forwards to MULT_2SPIN_QPX_INTERNAL with ULOAD = VSPLAT and USKIP = 8; ptr and p pass through unchanged. */
#define MULT_2SPIN_QPX_LSf(ptr,p) MULT_2SPIN_QPX_INTERNAL(ptr,p,VSPLAT,8) 
 | 
			
		||||
/* Forwards to MULT_2SPIN_QPX_INTERNAL with ULOAD = VLOAD and USKIP = 32; ptr and p pass through unchanged. */
#define MULT_2SPIN_QPXd(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,32) 
 | 
			
		||||
/* Forwards to MULT_2SPIN_QPX_INTERNAL with ULOAD = VLOAD and USKIP = 16; ptr and p pass through unchanged. */
#define MULT_2SPIN_QPXf(ptr,p)    MULT_2SPIN_QPX_INTERNAL(ptr,p,VLOAD,16) 
 | 
			
		||||
 | 
			
		||||
/*
 * Alternate form of MULT_2SPIN_QPX_INTERNAL: a braced block holding one
 * asm statement that emits VMOV(UChi_xx,Chi_xx) for the six operands
 * xx = 00, 01, 02, 10, 11, 12.
 * The parameters ptr, p, ULOAD and USKIP are accepted but not referenced
 * by this expansion.
 * NOTE(review): presumably each VMOV copies Chi_xx into UChi_xx -- confirm
 * the operand order of VMOV.
 */
#define MULT_2SPIN_QPX_INTERNALa(ptr,p,ULOAD,USKIP) { \
 | 
			
		||||
    asm (VMOV(UChi_00,Chi_00)			     \
 | 
			
		||||
	 VMOV(UChi_01,Chi_01)			     \
 | 
			
		||||
	 VMOV(UChi_02,Chi_02)			     \
 | 
			
		||||
	 VMOV(UChi_10,Chi_10)			     \
 | 
			
		||||
	 VMOV(UChi_11,Chi_11)			     \
 | 
			
		||||
	 VMOV(UChi_12,Chi_12) );		     \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_QPX_INTERNAL(ptr,p,ULOAD,USKIP) {			\
 | 
			
		||||
    uint64_t ub = ((uint64_t)ptr);				\
 | 
			
		||||
    asm (							\
 | 
			
		||||
@@ -253,14 +255,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	 : : "b" (USKIP*2), "b" (USKIP*5), "b" (USKIP*8), "b" (ub ));		\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
/* Expands to MULT_2SPIN on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN(&U._odata[sU](A),p)
 | 
			
		||||
 | 
			
		||||
/* Expands to MULT_2SPIN_PF on the address of U._odata[sU](A), forwarding p; U and sU must be in scope at the use site. */
#define MULT_2SPIN_DIR_PF(A,p) MULT_2SPIN_PF(&U._odata[sU](A),p)
 | 
			
		||||
/* Plain alias: expands to MULT_2SPIN(ptr,pf) with both arguments forwarded unchanged. */
#define MULT_2SPIN_PF(ptr,pf) MULT_2SPIN(ptr,pf)
 | 
			
		||||
 | 
			
		||||
#define SAVE_RESULT(base,basep) {\
 | 
			
		||||
    uint64_t ub = ((uint64_t)base)  - (VSIZE);			\
 | 
			
		||||
@@ -281,6 +278,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	: : "b" (ub) : HASH(pIMM), HASH(pREP) );				\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 *Annoying BG/Q loads with no immediate indexing and a big performance hit
 | 
			
		||||
 *when a second miss to an L1 line occurs
 | 
			
		||||
@@ -300,36 +298,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	VLOADu(IMM,REP,Chi_12)	: : "b" (ub)  : HASH(pIMM), HASH(pREP) );	\
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/*
 * LOAD_CHIa(base): braced block that
 *   1. computes ub = (uint64_t)base - VSIZE;
 *   2. issues one asm statement that moves input operand %0 (ub) into REP
 *      with `mr`, loads the constant VSIZE into IMM with `li`, then emits
 *      VLOADu(IMM,REP,Chi_xx) for xx = 00, 01, 02, 10, 11, 12.
 * ub is passed with the "b" constraint; pIMM and pREP are listed as
 * clobbers.
 * NOTE(review): the initial "- VSIZE" presumably compensates for VLOADu
 * advancing the address before each load -- confirm against the VLOADu
 * definition.
 */
#define LOAD_CHIa(base) {						\
 | 
			
		||||
    uint64_t ub = ((uint64_t)base)  - (VSIZE);			\
 | 
			
		||||
    asm("mr  " HASH(REP) ",%0 ;\n"					\
 | 
			
		||||
	"li  " HASH(IMM) "," HASH(VSIZE) ";\n"			\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_00)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_01)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_02)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_10)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_11)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_12) : : "b" (ub)  : HASH(pIMM), HASH(pREP)  ); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
/*
 * LOAD_CHIMUa(base): braced block that
 *   1. computes ub = (uint64_t)base - VSIZE;
 *   2. issues one asm statement that moves input operand %0 (ub) into REP
 *      with `mr`, loads the constant VSIZE into IMM with `li`, then emits
 *      VLOADu(IMM,REP,Chi_RC) for all twelve operands, R = 0..3 and
 *      C = 0..2, in that order.
 * ub is passed with the "b" constraint; pIMM and pREP are listed as
 * clobbers.
 * NOTE(review): the initial "- VSIZE" presumably compensates for VLOADu
 * advancing the address before each load -- confirm against the VLOADu
 * definition.
 */
#define LOAD_CHIMUa(base) {						\
 | 
			
		||||
    uint64_t ub = ((uint64_t)base)  - (VSIZE);			\
 | 
			
		||||
    asm("mr  " HASH(REP) ",%0 ;\n"					\
 | 
			
		||||
	"li  " HASH(IMM) "," HASH(VSIZE) ";\n"			\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_00)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_01)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_02)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_10)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_11)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_12)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_20)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_21)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_22)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_30)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_31)						\
 | 
			
		||||
	VLOADu(IMM,REP,Chi_32) : : "b" (ub)  : HASH(pIMM), HASH(pREP)  ); \
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#define LOAD_CHIMU(base) {						\
 | 
			
		||||
    uint64_t ub = ((uint64_t)base)  - (2*VSIZE);			\
 | 
			
		||||
    asm("mr " HASH(REP) ",%0;\n"					\
 | 
			
		||||
@@ -605,6 +573,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	);\
 | 
			
		||||
 }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 * ADD_RESULTi(PTR,pf): expands to three steps in sequence:
 *   1. LOAD_CHIMU(PTR);
 *   2. one asm statement emitting VADD(psi_RC,chi_RC,psi_RC) for all
 *      twelve operands, R = 0..3 and C = 0..2;
 *   3. SAVE_RESULT(PTR,pf);
 * NOTE(review): presumably this accumulates the values loaded from PTR
 * into psi and stores the sum back to PTR -- confirm against the VADD and
 * SAVE_RESULT definitions.
 * NOTE(review): the expansion is several statements with a trailing ';'
 * and no enclosing braces or do { } while (0), so it must not be used as
 * the body of an unbraced if/else/loop.
 * NOTE(review): the operands here are spelled chi_RC (lowercase), while
 * the load macros in this file use Chi_RC -- confirm chi_RC is defined.
 */
#define ADD_RESULTi(PTR,pf)						\
 | 
			
		||||
  LOAD_CHIMU(PTR)							\
 | 
			
		||||
  asm(									\
 | 
			
		||||
  VADD(psi_00,chi_00,psi_00)  VADD(psi_01,chi_01,psi_01)  VADD(psi_02,chi_02,psi_02) \
 | 
			
		||||
  VADD(psi_10,chi_10,psi_10)  VADD(psi_11,chi_11,psi_11)  VADD(psi_12,chi_12,psi_12) \
 | 
			
		||||
  VADD(psi_20,chi_20,psi_20)  VADD(psi_21,chi_21,psi_21)  VADD(psi_22,chi_22,psi_22) \
 | 
			
		||||
  VADD(psi_30,chi_30,psi_30)  VADD(psi_31,chi_31,psi_31)  VADD(psi_32,chi_32,psi_32) ); \
 | 
			
		||||
  SAVE_RESULT(PTR,pf);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Defined with an empty body here: PERMUTE_DIR3 expands to nothing. */
#define PERMUTE_DIR3
 | 
			
		||||
/* Defined with an empty body here: PERMUTE_DIR2 expands to nothing. */
#define PERMUTE_DIR2
 | 
			
		||||
/* Defined with an empty body here: PERMUTE_DIR1 expands to nothing. */
#define PERMUTE_DIR1
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user