diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 1cb1d449..ff8b1ec2 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -745,6 +745,9 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; + //////////////////////////////////////////////////////////////////////////////////////////////// + // the checks should guarantee that the operations are local + //////////////////////////////////////////////////////////////////////////////////////////////// GridBase *Fg = From.Grid(); GridBase *Tg = To.Grid(); assert(!Fg->_isCheckerBoarded); @@ -758,44 +761,12 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int d=0;d_processors[d] == Tg->_processors[d]); } - // the above should guarantee that the operations are local - -#if 1 size_t nsite = 1; for(int i=0;ioIndex(from_coor); - int fiidx = Fg->iIndex(from_coor); - int toidx = Tg->oIndex(to_coor); - int tiidx = Tg->iIndex(to_coor); - int* tt = table + 4*idx; - tt[0] = foidx; - tt[1] = fiidx; - tt[2] = toidx; - tt[3] = tiidx; - }); - - int* table_d = (int*)acceleratorAllocDevice(tbytes); - acceleratorCopyToDevice(table,table_d,tbytes); -#else - int* table_d = (int*)acceleratorAllocDevice(tbytes); + //////////////////////////////////////////////////////////////////////////////////////////////// + // do the index calc on the GPU + //////////////////////////////////////////////////////////////////////////////////////////////// Coordinate f_ostride = Fg->_ostride; Coordinate f_istride = Fg->_istride; Coordinate f_rdimensions = Fg->_rdimensions; @@ -803,112 +774,35 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro Coordinate t_istride = Tg->_istride; Coordinate t_rdimensions = Tg->_rdimensions; - accelerator_for(idx, nsite, 1, { + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + autoView(from_v,From,AcceleratorRead); + autoView(to_v,To,AcceleratorWrite); + + const int words=sizeof(vobj)/sizeof(vector_type); + accelerator_for(idx,nsite,1,{ + Coordinate from_coor, to_coor, base; Lexicographic::CoorFromIndex(base,idx,RegionSize); for(int i=0;i_ldimensions; - Coordinate rdf = Fg->_rdimensions; - Coordinate isf = Fg->_istride; - Coordinate osf = Fg->_ostride; - Coordinate ldt = Tg->_ldimensions; - Coordinate rdt = Tg->_rdimensions; - Coordinate ist = Tg->_istride; - Coordinate ost = Tg->_ostride; - - { - autoView( t_v , To, CpuWrite); - autoView( f_v , From, CpuRead); - // thread_for(idx,Fg->lSites(),{ - ComplexD mysum(0.0); - int num=0; - for(int idx=0;idxlSites();idx++) { - Coordinate Fcoor(nd); - Coordinate Tcoor(nd); - Lexicographic::CoorFromIndex(Fcoor,idx,ldf); - int in_region=1; - for(int d=0;d=FromLowerLeft[d]+RegionSize[d]) ){ - in_region=0; + scalar_type stmp; + for(int w=0;wGlobalSum(mysum); - // std::cout << " localCopyRegion slow sum "< &lowDim,const Lattice & higherDim,int slic } - +//FIXME: make this run entirely on GPU //Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim //The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same template