From 48d1f0df8932ed6a41094db0320035195827c86d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Dec 2023 12:33:47 -0500 Subject: [PATCH] Optimised partially, working --- Grid/lattice/Lattice_transfer.h | 126 +++++++++++++++++--------------- 1 file changed, 68 insertions(+), 58 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index cf8fd090..1cb1d449 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -763,19 +763,22 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro #if 1 size_t nsite = 1; for(int i=0;ioIndex(from_coor); @@ -801,12 +804,11 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate t_rdimensions = Tg->_rdimensions; accelerator_for(idx, nsite, 1, { - Coordinate from_coor, to_coor; - size_t rem = idx; + Coordinate from_coor, to_coor, base; + Lexicographic::CoorFromIndex(base,idx,RegionSize); for(int i=0;i &From,Lattice & To,Coordinate Fro typedef typename vobj::vector_type vector_type; typedef typename vobj::scalar_type scalar_type; - autoView(from_v,From,AcceleratorRead); - autoView(to_v,To,AcceleratorWrite); - RealD t_acc=-usecond(); - const int words=sizeof(vobj)/sizeof(vector_type); - accelerator_for(idx,nsite,words,{ - int* tt = table_d + 4*idx; - int from_oidx = *tt++; - int from_lane = *tt++; - int to_oidx = *tt++; - int to_lane = *tt; + { + autoView(from_v,From,AcceleratorRead); + autoView(to_v,To,AcceleratorWrite); + // autoView(from_v,From,CpuRead); + // autoView(to_v,To,CpuWrite); + RealD t_acc=-usecond(); + const int words=sizeof(vobj)/sizeof(vector_type); + accelerator_for(idx,nsite,1,{ + // for(int idx=0;idx_ldimensions; Coordinate rdf = Fg->_rdimensions; Coordinate isf = Fg->_istride; Coordinate osf = Fg->_ostride; + Coordinate ldt = Tg->_ldimensions; Coordinate rdt = Tg->_rdimensions; Coordinate ist = Tg->_istride; Coordinate ost = Tg->_ostride; + { autoView( t_v , To, CpuWrite); autoView( f_v , From, CpuRead); - thread_for(idx,Fg->lSites(),{ - sobj s; + // thread_for(idx,Fg->lSites(),{ + ComplexD mysum(0.0); + int num=0; + for(int idx=0;idxlSites();idx++) { Coordinate Fcoor(nd); Coordinate Tcoor(nd); Lexicographic::CoorFromIndex(Fcoor,idx,ldf); @@ -881,24 +884,31 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; } if (in_region) { -#if 0 - Integer idx_f = 0; for(int d=0;dGlobalSum(mysum); + // std::cout << " localCopyRegion slow sum "<