#ifdef KERNEL_DAG #define DIR0_PROJMEM(base) XP_PROJMEM(base); #define DIR1_PROJMEM(base) YP_PROJMEM(base); #define DIR2_PROJMEM(base) ZP_PROJMEM(base); #define DIR3_PROJMEM(base) TP_PROJMEM(base); #define DIR4_PROJMEM(base) XM_PROJMEM(base); #define DIR5_PROJMEM(base) YM_PROJMEM(base); #define DIR6_PROJMEM(base) ZM_PROJMEM(base); #define DIR7_PROJMEM(base) TM_PROJMEM(base); #define DIR0_RECON XP_RECON #define DIR1_RECON YP_RECON_ACCUM #define DIR2_RECON ZP_RECON_ACCUM #define DIR3_RECON TP_RECON_ACCUM #define DIR4_RECON XM_RECON_ACCUM #define DIR5_RECON YM_RECON_ACCUM #define DIR6_RECON ZM_RECON_ACCUM #define DIR7_RECON TM_RECON_ACCUM #else #define DIR0_PROJMEM(base) XM_PROJMEM(base); #define DIR1_PROJMEM(base) YM_PROJMEM(base); #define DIR2_PROJMEM(base) ZM_PROJMEM(base); #define DIR3_PROJMEM(base) TM_PROJMEM(base); #define DIR4_PROJMEM(base) XP_PROJMEM(base); #define DIR5_PROJMEM(base) YP_PROJMEM(base); #define DIR6_PROJMEM(base) ZP_PROJMEM(base); #define DIR7_PROJMEM(base) TP_PROJMEM(base); #define DIR0_RECON XM_RECON #define DIR1_RECON YM_RECON_ACCUM #define DIR2_RECON ZM_RECON_ACCUM #define DIR3_RECON TM_RECON_ACCUM #define DIR4_RECON XP_RECON_ACCUM #define DIR5_RECON YP_RECON_ACCUM #define DIR6_RECON ZP_RECON_ACCUM #define DIR7_RECON TP_RECON_ACCUM #endif //////////////////////////////////////////////////////////////////////////////// // Comms then compute kernel //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR_AND_EXTERIOR #define ZERO_NMU(A) #define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) #define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) #define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ LOAD64(%r10,isigns); \ PROJMEM(base); \ MAYBEPERM(PERMUTE_DIR,perm); #define EXTERIOR_BLOCK(a,b,RECON) \ LOAD_CHI(base); #define COMMON_BLOCK(a,b,RECON) \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ MULT_2SPIN_DIR_PF(a,basep); \ LOAD64(%r10,isigns); \ RECON; #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif //////////////////////////////////////////////////////////////////////////////// // Pre comms kernel -- prefetch like normal because it is mostly right //////////////////////////////////////////////////////////////////////////////// #ifdef INTERIOR #define COMMON_BLOCK(a,b,RECON) #define ZERO_NMU(A) // No accumulate for DIR0 #define EXTERIOR_BLOCK_XP(a,b,RECON) \ ZERO_PSI; \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; #define EXTERIOR_BLOCK(a,b,RECON) \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; #define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) #define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ LOAD64(%r10,isigns); \ PROJMEM(base); \ MAYBEPERM(PERMUTE_DIR,perm); \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ PREFETCH_CHIMU(base); \ MULT_2SPIN_DIR_PF(a,basep); \ LOAD64(%r10,isigns); \ RECON; #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR #define ZERO_NMU(A) nmu=0; #define INTERIOR_BLOCK_XP(a,b,PERMUTE_DIR,PROJMEM,RECON) \ ZERO_PSI; base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; #define EXTERIOR_BLOCK_XP(a,b,RECON) EXTERIOR_BLOCK(a,b,RECON) #define INTERIOR_BLOCK(a,b,PERMUTE_DIR,PROJMEM,RECON) \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; #define EXTERIOR_BLOCK(a,b,RECON) \ nmu++; \ LOAD_CHI(base); \ MULT_2SPIN_DIR_PF(a,base); \ base = st.GetInfo(ptype,local,perm,b,ent,plocal); ent++; \ LOAD64(%r10,isigns); \ RECON; #define COMMON_BLOCK(a,b,RECON) #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif { int nmu; int local,perm, ptype; uint64_t base; uint64_t basep; const uint64_t plocal =(uint64_t) & in._odata[0]; COMPLEX_SIGNS(isigns); MASK_REGS; int nmax=U._grid->oSites(); for(int site=0;site=nmax) ssn=0; int sUn=lo.Reorder(ssn); #ifndef EXTERIOR LOCK_GAUGE(0); #endif for(int s=0;s