#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
// Work out whether to permute
// ABCDEFGH ->   AE BF CG DH       permute            wrap num
//
// Shift 0       AE BF CG DH       0 0 0 0   ABCDEFGH  0   0
// Shift 1       BF CG DH AE       0 0 0 1   BCDEFGHA  0   1
// Shift 2       CG DH AE BF       0 0 1 1   CDEFGHAB  0   2
// Shift 3       DH AE BF CG       0 1 1 1   DEFGHABC  0   3
// Shift 4       AE BF CG DH       1 1 1 1   EFGHABCD  1   0
// Shift 5       BF CG DH AE       1 1 1 0   FGHABCDE  1   1
// Shift 6       CG DH AE BF       1 1 0 0   GHABCDEF  1   2
// Shift 7       DH AE BF CG       1 0 0 0   HABCDEFG  1   3
//
// Suppose 4way simd in one dim.
// ABCDEFGH ->   AECG BFDH      permute wrap num
//
// Shift 0       AECG BFDH      0,00  0,00   ABCDEFGH  0 0
// Shift 1       BFDH CGEA      0,00  1,01   BCDEFGHA  0 1
// Shift 2       CGEA DHFB      1,01  1,01   CDEFGHAB  1 0
// Shift 3       DHFB EAGC      1,01  1,11   DEFGHABC  1 1
// Shift 4       EAGC FBHD      1,11  1,11   EFGHABCD  2 0
// Shift 5       FBHD GCAE      1,11  1,10   FGHABCDE  2 1
// Shift 6       GCAE HDBF      1,10  1,10   GHABCDEF  3 0
// Shift 7       HDBF AECG      1,10  0,00   HABCDEFG  3 1
//
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks, consisting of
//         1 bit   256 bit granule
//         2 bits  128 bit granule
//         4 bits   64 bit granule
//         8 bits   32 bit granules
//
//         15 bits....
//
// For optimisation:
//
// split into Cshift_none_rb_permute
// split into Cshift_none_rb_simple
//
// split into Cshift_none_permute
// split into Cshift_none_simple

friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
  Lattice<vobj> ret(rhs._grid);

  int fd = rhs._grid->_fdimensions[dimension];
  int rd = rhs._grid->_rdimensions[dimension];
  int ld = rhs._grid->_ldimensions[dimension];
  int gd = rhs._grid->_gdimensions[dimension];

  // Map to always positive shift modulo global full dimension.
  shift = (shift+fd)%fd;

  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);

  // the permute type
  int permute_dim  = rhs._grid->_simd_layout[dimension]>1;
  int permute_type = 0;
  for(int d=0;d<dimension;d++){
    if ( rhs._grid->_simd_layout[d]>1 ) permute_type++;
  }

  for(int x=0;x<rd;x++){

    int bo = x*rhs._grid->_ostride[dimension];
    int o  = 0;

    if ( permute_dim ) {

      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
          int sx     = (x+sshift)%rd;
          int so     = sx*rhs._grid->_ostride[dimension];

          // A slice permutes when the shift carries it across the SIMD lane boundary.
          int permute_slice = 0;
          int wrap = sshift/rd;
          int num  = sshift%rd;

          if ( x < rd-num ) permute_slice = wrap;
          else              permute_slice = 1-wrap;

          if ( permute_slice ) {
            permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
          } else {
            ret._odata[bo+o+b] = rhs._odata[so+o+b];
          }

        }
        o += rhs._grid->_slice_stride[dimension];
      }

    } else {

      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        for(int b=0;b<rhs._grid->_slice_block[dimension];b++){

          // This call in the inner loop is annoying but necessary for dimension=0
          // in the case of RedBlack grids. Could optimise away with
          // alternate code paths for all other cases.
          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);

          int sx = (x+sshift)%rd;
          int so = sx*rhs._grid->_ostride[dimension];
          ret._odata[bo+o+b] = rhs._odata[so+o+b];

        }
        o += rhs._grid->_slice_stride[dimension];
      }
    }
  }
  return ret;
}
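
// A worked check of the wrap/num rule used above (a minimal standalone
// sketch, kept out of the build; the toy main() and its constants are
// hypothetical). For the 2-way SIMD split of ABCDEFGH into AE BF CG DH we
// have rd=4; wrap=sshift/rd and num=sshift%rd then reproduce the "permute"
// column of the first table for every shift.
#if 0
#include <cstdio>
int main(void)
{
  const int rd = 4;                 // reduced dimension: 8 sites, 2-way SIMD
  for(int sshift=0;sshift<8;sshift++){
    int wrap = sshift/rd;           // did the shift cross the SIMD lane boundary?
    int num  = sshift%rd;           // residual offset within the reduced dim
    printf("Shift %d permute:",sshift);
    for(int x=0;x<rd;x++){
      int permute_slice = (x < rd-num) ? wrap : 1-wrap;
      printf(" %d",permute_slice);  // matches the table row for this shift
    }
    printf("\n");
  }
  return 0;
}
#endif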
#if 0
// Collapse doesn't appear to work the way I think it should in icpc
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
  Lattice<vobj> ret(rhs._grid);
  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
  shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);

  int sx,so,o;
  int rd = rhs._grid->_rdimensions[dimension];
  int ld = rhs._grid->_dimensions[dimension];

  // Map to always positive shift.
  shift = (shift+ld)%ld;

  // Work out whether to permute and the permute type
  // ABCDEFGH -> AE BF CG DH       permute
  // Shift 0    AE BF CG DH        0 0 0 0   ABCDEFGH
  // Shift 1    DH AE BF CG        1 0 0 0   HABCDEFG
  // Shift 2    CG DH AE BF        1 1 0 0   GHABCDEF
  // Shift 3    BF CG DH AE        1 1 1 0   FGHABCDE
  // Shift 4    AE BF CG DH        1 1 1 1   EFGHABCD
  // Shift 5    DH AE BF CG        0 1 1 1   DEFGHABC
  // Shift 6    CG DH AE BF        0 0 1 1   CDEFGHAB
  // Shift 7    BF CG DH AE        0 0 0 1   BCDEFGHA
  int permute_dim  = rhs._grid->_layout[dimension]>1;
  int permute_type = 0;
  for(int d=0;d<dimension;d++){
    if ( rhs._grid->_layout[d]>1 ) permute_type++;
  }

  // loop over perp slices.
  // Threading considerations:
  //   Need to map thread_num to
  //
  //       x_min,x_max for Loop-A
  //       n_min,n_max for Loop-B
  //       b_min,b_max for Loop-C
  //   in a way that maximally load balances.
  //
  //   Optimal:
  //      There are rd*nblock*block items of work.
  //      These serialise as item "w":
  //      b=w%block;  w=w/block;
  //      n=w%nblock; x=w/nblock.  Perhaps 20 cycles?
  //
  //   Logic:
  //      x_chunk = (rd+thread)/nthreads simply divides work across threads.
  //
  //   rd=5 , threads = 8;
  //   0 1 2 3 4 5 6 7
  //   0 0 0 1 1 1 1 1
  for(int x=0;x<rd;x++){           // Loop A, over perp slices
    sx = (x-shift+ld)%rd;
    o  =  x*rhs._grid->_ostride[dimension];
    so = sx*rhs._grid->_ostride[dimension];

    int permute_slice=0;
    if ( permute_dim ) {
      permute_slice = shift/rd;
      if ( x<shift%rd ) permute_slice = 1-permute_slice;
    }

#if 0
    if ( permute_slice ) {

      int internal = sizeof(vobj)/sizeof(vComplex);
      int num      = rhs._grid->_slice_block[dimension]*internal;

      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        vComplex *optr = (vComplex *)&ret._odata[o];
        vComplex *iptr = (vComplex *)&rhs._odata[so];
        for(int b=0;b<num;b++){
          permute(optr[b],iptr[b],permute_type);
        }
        o +=rhs._grid->_slice_stride[dimension];
        so+=rhs._grid->_slice_stride[dimension];
      }

    } else {

      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
          ret._odata[o+i]=rhs._odata[so+i];
        }
        o +=rhs._grid->_slice_stride[dimension];
        so+=rhs._grid->_slice_stride[dimension];
      }

    }
#else
    if ( permute_slice ) {

      int internal = sizeof(vobj)/sizeof(vComplex);
      int num      = rhs._grid->_slice_block[dimension]*internal;

#pragma omp parallel for collapse(2)
      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        for(int b=0;b<num;b++){
          vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
          vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
          permute(optr[b],iptr[b],permute_type);
        }
      }

    } else {

#pragma omp parallel for collapse(2)
      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
        for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
          int oo = o+n*rhs._grid->_slice_stride[dimension];
          int soo=so+n*rhs._grid->_slice_stride[dimension];
          ret._odata[oo+i]=rhs._odata[soo+i];
        }
      }

    }
#endif
  }
  return ret;
}
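
// A quick check of the permute_slice rule used in these disabled variants
// (a minimal standalone sketch, kept out of the build; the toy main() is
// hypothetical). With rd=4, permute_slice = shift/rd, flipped for
// x < shift%rd, reproduces the "permute" column of the table above,
// e.g. shift=1 -> 1 0 0 0 and shift=5 -> 0 1 1 1.
#if 0
#include <cstdio>
int main(void)
{
  const int rd = 4;                  // 2-way SIMD split of an 8-site dimension
  for(int shift=0;shift<8;shift++){
    printf("Shift %d permute:",shift);
    for(int x=0;x<rd;x++){
      int permute_slice = shift/rd;
      if ( x < shift%rd ) permute_slice = 1-permute_slice;
      printf(" %d",permute_slice);   // matches the table row for this shift
    }
    printf("\n");
  }
  return 0;
}
#endif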
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
  Lattice<vobj> ret(rhs._grid);
  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
  shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);

  int rd = rhs._grid->_rdimensions[dimension];
  int ld = rhs._grid->_dimensions[dimension];

  // Map to always positive shift.
  shift = (shift+ld)%ld;

  // Work out whether to permute and the permute type
  // (same table as the variant above).
  int permute_dim  = rhs._grid->_layout[dimension]>1;
  int permute_type = 0;
  for(int d=0;d<dimension;d++){
    if ( rhs._grid->_layout[d]>1 ) permute_type++;
  }

  // loop over all work
  int work = rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];

#pragma omp parallel for
  for(int ww=0;ww<work;ww++){

    // Decode the flat work item back into (x,n,b); b varies fastest.
    int w = ww;
    int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
    int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
    int x = w;

    int sx,so,o;
    sx = (x-shift+ld)%rd;
    o  =  x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
    so = sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];

    int permute_slice=0;
    if ( permute_dim ) {
      permute_slice = shift/rd;
      if ( x<shift%rd ) permute_slice = 1-permute_slice;
    }
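
// The flat work decode above serialises (x,n,b) into one OpenMP loop index.
// A minimal standalone sketch (kept out of the build; the toy extents and
// main() are hypothetical) showing the decode visits every (x,n,b) exactly
// once, with b varying fastest:
#if 0
#include <cstdio>
int main(void)
{
  const int rd = 2, nblock = 3, block = 4;   // assumed toy extents
  const int work = rd*nblock*block;
  for(int ww=0;ww<work;ww++){
    int w = ww;
    int b = w%block ; w = w/block;           // fastest-varying index
    int n = w%nblock; w = w/nblock;
    int x = w;                               // slowest-varying index
    printf("w=%2d -> x=%d n=%d b=%d\n",ww,x,n,b);
  }
  return 0;
}
#endif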