Better openMP for Cshift

2025-07-26 09:17:08 +01:00 · 2015-03-04 11:50:59 +00:00
parent 9e200b9d7a
commit d9cff588c5
2 changed files with 14 additions and 81 deletions
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -68,7 +68,8 @@ public:
        }
        return ret;
    };
-#if 0
+
+
    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
    {
        Lattice<vobj> ret(rhs._grid);
@@ -131,20 +132,24 @@ public:
            }

            if ( permute_slice ) {
-                
-                int internal=sizeof(vobj)/sizeof(vComplex);
+
+	        int internal=sizeof(vobj)/sizeof(vComplex);
                int num =rhs._grid->_slice_block[dimension]*internal;
-                
+
+#pragma omp parallel for collapse(2)
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-                    vComplex *optr = (vComplex *)&ret._odata[o];
-                    vComplex *iptr = (vComplex *)&rhs._odata[so];
                    for(int b=0;b<num;b++){
-                        permute(optr[b],iptr[b],permute_type);
+		      vComplex *optr = (vComplex *)&ret._odata[o];
+		      vComplex *iptr = (vComplex *)&rhs._odata[so];
+                      permute(optr[b],iptr[b],permute_type);
                    }
                    o+=rhs._grid->_slice_stride[dimension];
                    so+=rhs._grid->_slice_stride[dimension];
                }
+
            } else {
+
+#pragma omp parallel for collapse(2)
                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
                    for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
                        ret._odata[o+i]=rhs._odata[so+i];
@@ -157,78 +162,6 @@ public:
        }
        return ret;
    }
-#else
-    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
-    {
-        Lattice<vobj> ret(rhs._grid);
-	int sx,so,o;
-        
-        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
-        shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
-        int rd = rhs._grid->_rdimensions[dimension];
-        int ld = rhs._grid->_dimensions[dimension];
-        
-        // Map to always positive shift.
-        shift = (shift+ld)%ld;
-        
-        // Work out whether to permute and the permute type
-        // ABCDEFGH ->   AE BF CG DH       permute
-        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
-        // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
-        // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
-        // Shift 3       BF CG DH AE       1 1 1 0    FGHACBDE
-        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
-        // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
-        // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
-        // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
-        int permute_dim =rhs._grid->_layout[dimension]>1 ;
-        int permute_type=0;
-        for(int d=0;d<dimension;d++)
-            if (rhs._grid->_layout[d]>1 ) permute_type++;
-        
-        
-        // loop over all work
-        int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-//#pragma omp parallel for
-        for(int ww=0;ww<work;ww++){
-
-            
-            // can optimise this if know w moves congtiguously for a given thread.
-            // b=(b+1);
-            // if (b==_slice_block) {b=0; n=n+1;}
-            // if (n==_slice_nblock) { n=0; x=x+1}
-            //
-            // Perhaps a five cycle iterator, or so.
-            int w=ww;
-            int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
-            int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
-            int x = w;
-
-            sx = (x-shift+ld)%rd;
-            o  = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
-            so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
-            
-            int permute_slice=0;
-            if ( permute_dim ) {
-                permute_slice = shift/rd;
-                if ( x<shift%rd ) permute_slice = 1-permute_slice;
-            }
-            
-            if ( permute_slice ) {
-                
-                int internal=sizeof(vobj)/sizeof(vComplex);
-                vComplex *optr = (vComplex *)&ret._odata[o+b];
-                vComplex *iptr = (vComplex *)&rhs._odata[so+b];
-                for(int i=0;i<internal;i++){
-                    permute(optr[i],iptr[i],permute_type);
-                }
-            } else {
-                ret._odata[o+b]=rhs._odata[so+b];
-            }
-        }
-        return ret;
-    }
-#endif

    template<class sobj>
    inline Lattice<vobj> & operator = (const sobj & r){
--- a/Grid_config.h
+++ b/Grid_config.h
@@ -2,10 +2,10 @@
 /* Grid_config.h.in.  Generated from configure.ac by autoheader.  */

 /* AVX */
-/* #undef AVX1 */
+#define AVX1 1

 /* AVX2 */
-#define AVX2 1
+/* #undef AVX2 */

 /* AVX512 */
 /* #undef AVX512 */