diff --git a/lib/Stencil.h b/lib/Stencil.h index a1f09d6b..1821419a 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -240,6 +240,10 @@ PARALLEL_FOR_LOOP for(int o=0;o &lis int myrank = _processor; int ierr; - if ( (CommunicatorPolicy == CommunicatorPolicyIsend) ) { + if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { MPI_Request xrq; MPI_Request rrq; diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index c01187de..e2fa0481 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -142,12 +142,12 @@ PARALLEL_NESTED_LOOP2 /////////////////////////////////////////////////////////////////// // Gather for when there *is* need to SIMD split with compression /////////////////////////////////////////////////////////////////// -template void +template double Gather_plane_exchange(const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) { int rd = rhs._grid->_rdimensions[dimension]; - + double t1,t2; if ( !rhs._grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } @@ -186,13 +186,20 @@ Gather_plane_exchange(const Lattice &rhs, } assert( (table.size()&0x1)==0); + t1=usecond(); PARALLEL_FOR_LOOP for(int j=0;j AC BD + //AC BD -> AB CD out1= _mm256_permute2f128_ps(in1,in2,0x20); out2= _mm256_permute2f128_ps(in1,in2,0x31); }; static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + //Invertible + // ABCD EFGH ->ABEF CDGH + // ABEF CDGH ->ABCD EFGH out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + // Invertible ? 
+ // ABCD EFGH -> ACEG BDFH + // ACEG BDFH -> AEBF CGDH + // out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + // out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + // Bollocks; need + // AECG BFDH -> ABCD EFGH + out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/ }; static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ assert(0); diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index d840140e..b94febb5 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -419,8 +419,10 @@ void ExchangeTester(const functor &func) assert(found==1); } - // for(int i=0;i