From be3a8249c6f2b628bda6fa4258ca4b4ffac139a5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 16 Feb 2017 23:51:15 +0000 Subject: [PATCH 1/4] Faster gather --- lib/Stencil.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 8b5eac2d..00c9f7aa 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -240,6 +240,10 @@ PARALLEL_FOR_LOOP for(int o=0;o Date: Thu, 16 Feb 2017 23:51:33 +0000 Subject: [PATCH 2/4] Make clang happy with parenthesis --- lib/communicator/Communicator_mpi3.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/communicator/Communicator_mpi3.cc b/lib/communicator/Communicator_mpi3.cc index b86d5259..23626c5b 100644 --- a/lib/communicator/Communicator_mpi3.cc +++ b/lib/communicator/Communicator_mpi3.cc @@ -100,7 +100,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { int flag; int provided; - mtrace(); + // mtrace(); MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { @@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int myrank = _processor; int ierr; - if ( (CommunicatorPolicy == CommunicatorPolicyIsend) ) { + if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { MPI_Request xrq; MPI_Request rrq; From 8a29c16bdef4ff0907c400c200751887cf4a16fb Mon Sep 17 00:00:00 2001 From: paboyle Date: Thu, 16 Feb 2017 23:52:22 +0000 Subject: [PATCH 3/4] Faster gather exchange --- lib/cshift/Cshift_common.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index c01187de..e2fa0481 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -142,12 +142,12 @@ PARALLEL_NESTED_LOOP2 /////////////////////////////////////////////////////////////////// // Gather for when there *is* need to SIMD split with compression /////////////////////////////////////////////////////////////////// -template void +template 
double Gather_plane_exchange(const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) { int rd = rhs._grid->_rdimensions[dimension]; - + double t1,t2; if ( !rhs._grid->CheckerBoarded(dimension) ) { cbmask = 0x3; } @@ -186,13 +186,20 @@ Gather_plane_exchange(const Lattice &rhs, } assert( (table.size()&0x1)==0); + t1=usecond(); PARALLEL_FOR_LOOP for(int j=0;j Date: Thu, 16 Feb 2017 23:52:44 +0000 Subject: [PATCH 4/4] Improvements to avx for invertible to avoid latent bug --- lib/simd/Grid_avx.h | 19 +++++++++++++++++-- tests/Test_simd.cc | 6 ++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h index 7abe4d5a..2dbe26f4 100644 --- a/lib/simd/Grid_avx.h +++ b/lib/simd/Grid_avx.h @@ -474,16 +474,31 @@ namespace Optimization { struct Exchange{ // 3210 ordering static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + //Invertible + //AB CD -> AC BD + //AC BD -> AB CD out1= _mm256_permute2f128_ps(in1,in2,0x20); out2= _mm256_permute2f128_ps(in1,in2,0x31); }; static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ + //Invertible + // ABCD EFGH ->ABEF CDGH + // ABEF CDGH ->ABCD EFGH out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); }; static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ - out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); - out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + // Invertible ? 
+ // ABCD EFGH -> ACEG BDFH + // ACEG BDFH -> AEBF CGDH + // out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); + // out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); + // Bollocks; need + // AECG BFDH -> ABCD EFGH + out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/ + out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/ + out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/ + out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/ }; static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){ assert(0); diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index d840140e..b94febb5 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -419,8 +419,10 @@ void ExchangeTester(const functor &func) assert(found==1); } - // for(int i=0;i