Merge branch 'feature/bgq-asm' of https://github.com/paboyle/Grid into feature/bgq-asm

2025-12-18 03:34:40 +00:00 · 2017-02-16 18:52:30 -05:00
parent d68907fc3e f246fe3304
commit cd0da81196
5 changed files with 38 additions and 12 deletions
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -240,6 +240,10 @@ PARALLEL_FOR_LOOP
        for(int o=0;o<Mergers[i].buffer_size/2;o++){
 	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
 		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 	  //	  cobj temp1,temp2;
 	  //	  exchange(temp1,temp2,Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 	  //	  vstream(Mergers[i].mpointer[2*o],temp1);
 	  //	  vstream(Mergers[i].mpointer[2*o+1],temp2);
 	}
      }
      mergetime+=usecond();
@@ -1037,9 +1041,7 @@ PARALLEL_FOR_LOOP
 	int sx   = (x+sshift)%rd;
-	gathermtime-=usecond();
+	gathermtime+=Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 	Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 	gathermtime+=usecond();
 	//spointers[0] -- low
 	//spointers[1] -- high
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -100,7 +100,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
-  mtrace();
+  //  mtrace();
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
@@ -511,7 +511,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  int myrank = _processor;
  int ierr;
-  if ( (CommunicatorPolicy == CommunicatorPolicyIsend) ) { 
+  if ( CommunicatorPolicy == CommunicatorPolicyIsend ) { 
    MPI_Request xrq;
    MPI_Request rrq;
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -142,12 +142,12 @@ PARALLEL_NESTED_LOOP2
 ///////////////////////////////////////////////////////////////////
 // Gather for when there *is* need to SIMD split with compression
 ///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor> void 
+template<class cobj,class vobj,class compressor> double
 Gather_plane_exchange(const Lattice<vobj> &rhs,
 		      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
 {
  int rd = rhs._grid->_rdimensions[dimension];
-
+  double t1,t2;
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
    cbmask = 0x3;
  }
@@ -186,13 +186,20 @@ Gather_plane_exchange(const Lattice<vobj> &rhs,
  }
  assert( (table.size()&0x1)==0);
  t1=usecond();
 PARALLEL_FOR_LOOP     
  for(int j=0;j<table.size()/2;j++){
    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
-    exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
+    cobj temp3;
    cobj temp4;
    exchange(temp3,temp4,temp1,temp2,type);
    vstream(pointers[0][j],temp3);
    vstream(pointers[1][j],temp4);
  }
  t2=usecond();
 return t2-t1;
 }
 //////////////////////////////////////////////////////
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -474,16 +474,31 @@ namespace Optimization {
  struct Exchange{
    // 3210 ordering
    // Exchange0: regroup the 128-bit halves of two 256-bit registers.
    // In the diagrams below each letter stands for one 128-bit half, low half first.
    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // Invertible: feeding the outputs back in as inputs restores the originals.
      //   AB CD ->  AC BD
      //   AC BD ->  AB CD
      out1= _mm256_permute2f128_ps(in1,in2,0x20); // low half of in1, then low half of in2
      out2= _mm256_permute2f128_ps(in1,in2,0x31); // high half of in1, then high half of in2
    };
    // Exchange1: regroup pairs of floats between two registers.
    // _mm256_shuffle_ps applies the same selection to each 128-bit lane, so the
    // diagrams below label the four floats of a single lane.
    // NOTE(review): assumes _MM_SELECT_FOUR_FOUR packs its arguments like
    // _MM_SHUFFLE (highest element first) -- its definition is not visible here.
    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      // Invertible: feeding the outputs back in as inputs restores the originals.
      //   ABCD EFGH  ->ABEF CDGH
      //   ABEF CDGH  ->ABCD EFGH
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0)); /*ABEF: lower pair of each input*/
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2)); /*CDGH: upper pair of each input*/
    };
    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
-      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      // Invertible ? 
-      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+      // ABCD EFGH -> ACEG BDFH
      // ACEG BDFH -> AEBF CGDH
      //      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
      //      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
      // Bollocks; need 
      // AECG BFDH -> ABCD EFGH
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
      out1= _mm256_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
      out2= _mm256_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*BFDH*/
    };
    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
      assert(0);
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -419,8 +419,10 @@ void ExchangeTester(const functor &func)
    assert(found==1);
  }
-  //  for(int i=0;i<Nsimd;i++){
+  for(int i=0;i<Nsimd;i++){
-    //    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
+    assert(test1[i]==input1[i]);
    assert(test2[i]==input2[i]);
  }//    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
    //    std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
  //  }
 }