Vectorise the XYZT face gathering better.

Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent with efficiency.
2025-07-26 09:17:08 +01:00 · 2017-02-15 11:11:04 +00:00
parent aca7a3ef0a
commit bd600702cf
10 changed files with 510 additions and 34 deletions
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv)
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
-    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
+    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
  } else {
-    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
+    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  struct Merge {
    cobj * mpointer;
    std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer packet_id;
+    Integer exchange;
+    Integer type;
  };
  
  std::vector<Merge> Mergers;

  void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
    Merge m;
+    m.exchange = 0;
    m.mpointer = merge_p;
    m.rpointers= rpointers;
    m.buffer_size = buffer_size;
@@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    Mergers.push_back(m);
  }

+  void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
+    Merge m;
+    m.exchange = 1;
+    m.type     = type;
+    m.mpointer = merge_p;
+    m.vpointers= rpointers;
+    m.buffer_size = buffer_size;
+    m.packet_id   = packet_id;
+    Mergers.push_back(m);
+  }
+
  void CommsMerge(void ) { 

    for(int i=0;i<Mergers.size();i++){	
@@ -214,10 +229,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
      //      std::string fname(ss.str());
      //      std::ofstream fout(fname);

+      if ( Mergers[i].exchange == 0 ) { 
 PARALLEL_FOR_LOOP
-      for(int o=0;o<Mergers[i].buffer_size;o++){
-	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
-	//	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
+        for(int o=0;o<Mergers[i].buffer_size;o++){
+	  merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
+	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
+	}
+      } else { 
+PARALLEL_FOR_LOOP
+        for(int o=0;o<Mergers[i].buffer_size/2;o++){
+	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
+		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
+	}
      }
      mergetime+=usecond();
    }
@@ -285,6 +308,8 @@ PARALLEL_FOR_LOOP
  cobj* u_send_buf_p;
  std::vector<scalar_object *> u_simd_send_buf;
  std::vector<scalar_object *> u_simd_recv_buf;
+  std::vector<cobj *> new_simd_send_buf;
+  std::vector<cobj *> new_simd_recv_buf;

  int u_comm_offset;
  int _unified_buffer_size;
@@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP

    u_simd_send_buf.resize(Nsimd);
    u_simd_recv_buf.resize(Nsimd);
-
+    new_simd_send_buf.resize(Nsimd);
+    new_simd_recv_buf.resize(Nsimd);
    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    for(int l=0;l<Nsimd;l++){
      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
+      new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
    }

    PrecomputeByteOffsets();
@@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP
    HaloGather(source,compress);
    this->CommunicateBegin(reqs);
    this->CommunicateComplete(reqs);
-    CommsMerge(); // spins
+    CommsMerge(); 
  }
  
  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP
      if ( sshift[0] == sshift[1] ) {
 	if (splice_dim) {
 	  splicetime-=usecond();
-	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+	  //	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+	  //	  std::cout << "GatherSimdNew"<<std::endl;
+	  GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
 	  splicetime+=usecond();
 	} else { 
 	  nosplicetime-=usecond();
@@ -716,8 +746,9 @@ PARALLEL_FOR_LOOP
      } else {
 	if(splice_dim){
 	  splicetime-=usecond();
-	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
-	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+	  //	  std::cout << "GatherSimdNew2calls"<<std::endl;
+	  GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
+	  GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 	  splicetime+=usecond();
 	} else {
 	  nosplicetime-=usecond();
@@ -949,6 +980,120 @@ PARALLEL_FOR_LOOP
      }
    }
  }
+
+
+  template<class compressor>
+  void  GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
+  {
+    const int Nsimd = _grid->Nsimd();
+
+    const int maxl =2;// max layout in a direction
+    int fd = _grid->_fdimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int pd              = _grid->_processors[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ;
+    assert(comm_dim==1);
+    // This will not work with a rotate dim
+    assert(simd_layout==maxl);
+    assert(shift>=0);
+    assert(shift<fd);
+    
+    int permute_type=_grid->PermuteType(dimension);
+    //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
+
+    ///////////////////////////////////////////////
+    // Simd direction uses an extract/merge pair
+    ///////////////////////////////////////////////
+    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+    int words = sizeof(cobj)/sizeof(vector_type);
+    
+    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
+    
+    int bytes = (buffer_size*sizeof(cobj))/simd_layout;
+    assert(bytes*simd_layout == buffer_size*sizeof(cobj));
+
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);
+
+    ///////////////////////////////////////////
+    // Work out what to send where
+    ///////////////////////////////////////////
+    
+    int cb    = (cbmask==0x2)? Odd : Even;
+    int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
+    
+    // loop over outer coord planes orthog to dim
+    for(int x=0;x<rd;x++){       
+      
+      int any_offnode = ( ((x+sshift)%fd) >= rd );
+
+      if ( any_offnode ) {
+	
+	for(int i=0;i<maxl;i++){       
+	  spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
+	}
+	
+	int sx   = (x+sshift)%rd;
+	
+	gathermtime-=usecond();
+	Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+	gathermtime+=usecond();
+
+	//spointers[0] -- low
+	//spointers[1] -- high
+
+	for(int i=0;i<maxl;i++){
+
+	  int my_coor  = rd*i + x;            // self explanatory
+	  int nbr_coor = my_coor+sshift;      // self explanatory
+
+	  int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+	  int nbr_lcoor= (nbr_coor%ld);       // local plane coor on neighbour node
+	  int nbr_ic   = (nbr_lcoor)/rd;      // inner coord of peer simd lane "i"
+	  int nbr_ox   = (nbr_lcoor%rd);      // outer coord of peer "x"
+
+	  int nbr_plane = nbr_ic;
+	  assert (sx == nbr_ox);
+
+	  auto rp = &new_simd_recv_buf[i        ][u_comm_offset];
+	  auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
+
+	  if(nbr_proc){
+
+	    int recv_from_rank;
+	    int xmit_to_rank;
+	    
+	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
+ 
+	    // shm == receive pointer         if offnode
+	    // shm == Translate[send pointer] if on node -- my view of his send pointer
+	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	    if (shm==NULL) { 
+	      shm = rp;
+	    }
+
+	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
+	    // assuming above pointer flip
+	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
+
+	    rpointers[i] = shm;
+	    
+	  } else { 
+	    
+	    rpointers[i] = sp;
+	    
+	  }
+	}
+
+	AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
+
+	u_comm_offset     +=buffer_size;
+      }
+    }
+  }
+
  
 };
 }
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  int provided;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
-    //    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
-    //    assert (provided == MPI_THREAD_SERIALIZED);
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    if ( provided != MPI_THREAD_MULTIPLE ) {
+      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
  int e1=rhs._grid->_slice_nblock[dimension];
  int e2=rhs._grid->_slice_block[dimension];
  int n1=rhs._grid->_slice_stride[dimension];
+
  if ( cbmask ==0x3){
 PARALLEL_NESTED_LOOP2
    for(int n=0;n<e1;n++){
@@ -110,6 +111,7 @@ PARALLEL_NESTED_LOOP2

 	int o      =   n*n1;
 	int offset = b+n*e2;
+	
 	cobj temp =compress(rhs._odata[so+o+b]);
 	extract<cobj>(temp,pointers,offset);

@@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2
  }
 }

+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+template<class cobj,class vobj,class compressor> void 
+Gather_plane_exchange(const Lattice<vobj> &rhs,
+		      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
+
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
+  int n1=rhs._grid->_slice_stride[dimension];
+
+  // Need to switch to a table loop
+  std::vector<std::pair<int,int> > table;
+
+  if ( cbmask ==0x3){
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      =   n*n1;
+	int offset = b+n*e2;
+	table.push_back(std::pair<int,int> (offset,o+b));
+      }
+    }
+  } else { 
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o=n*n1;
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	int offset = b+n*e2;
+
+	if ( ocb & cbmask ) {
+	  table.push_back(std::pair<int,int> (offset,o+b));
+	}
+      }
+    }
+  }
+
+  assert( (table.size()&0x1)==0);
+PARALLEL_FOR_LOOP     
+  for(int j=0;j<table.size()/2;j++){
+    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
+  }
+}
+
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -469,9 +469,47 @@ namespace Optimization {
    static inline __m256d Permute3(__m256d in){
      return in;
    };
-
  };

+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_permute2f128_ps(in1,in2,0x20);
+      out2= _mm256_permute2f128_ps(in1,in2,0x31);
+    };
+    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      assert(0);
+      return;
+    };
+
+    static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      out1= _mm256_permute2f128_pd(in1,in2,0x20);
+      out2= _mm256_permute2f128_pd(in1,in2,0x31);
+      return;
+    };
+    static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      out1= _mm256_shuffle_pd(in1,in2,0x0);
+      out2= _mm256_shuffle_pd(in1,in2,0xF);
+    };
+    static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      assert(0);
+      return;
+    };
+  };
+
+
 #if defined (AVX2)
 #define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
 #define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -343,6 +343,46 @@ namespace Optimization {

  };

+  // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
+  // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
+  // The operation is its own inverse
+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+ 
+    static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1 = _mm512_shuffle_pd(in1,in2,0x00);
+      out2 = _mm512_shuffle_pd(in1,in2,0xFF);
+    };
+    static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      assert(0);
+      return;
+    };
+  };
+

  struct Rotate{

--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -326,7 +326,43 @@ namespace Optimization {
    static inline __m128d Permute3(__m128d in){
      return in;
    };
+  };

+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      assert(0);
+      return;
+    };
+
+    static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      out1= _mm_shuffle_pd(in1,in2,0x0);
+      out2= _mm_shuffle_pd(in1,in2,0x3);
+    };
+    static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
  };

  struct Rotate{
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -350,6 +350,18 @@ class Grid_simd {
    return ret;
  }

+  ///////////////////////
+  // Exchange 
+  // Al Ah , Bl Bh -> Al Bl Ah,Bh
+  ///////////////////////
+  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
+  {
+    if     (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
+    else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
+    else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
+    else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
+  }
+
  ////////////////////////////////////////////////////////////////////
  // General permute; assumes vector length is same across
  // all subtypes; may not be a good assumption, but could
@@ -372,23 +384,11 @@ class Grid_simd {
      int dist = perm & 0xF;
      y = rotate(b, dist);
      return;
-    }
-    switch (perm) {
-      case 3:
-        permute3(y, b);
-        break;
-      case 2:
-        permute2(y, b);
-        break;
-      case 1:
-        permute1(y, b);
-        break;
-      case 0:
-        permute0(y, b);
-        break;
-      default:
-        assert(0);
-    }
+    } 
+    else if(perm==3) permute3(y, b);
+    else if(perm==2) permute2(y, b);
+    else if(perm==1) permute1(y, b);
+    else if(perm==0) permute0(y, b);
  }

 };  // end of Grid_simd class definition
@@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
 }    

+
+
 ///////////////////////
 // Splat
 ///////////////////////
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -105,6 +105,11 @@ class iScalar {
  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
    rotate(out._internal,in._internal,rot);
  }
+  friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
+				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
+    exchange(out1._internal,out2._internal,
+	      in1._internal, in2._internal,type);
+  }

  // Unary negation
  friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
@@ -248,6 +253,13 @@ class iVector {
      rotate(out._internal[i],in._internal[i],rot);
    }
  }
+  friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
+				     const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      exchange(out1._internal[i],out2._internal[i],
+	        in1._internal[i], in2._internal[i],type);
+    }
+  }

  // Unary negation
  friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
@@ -374,6 +386,14 @@ class iMatrix {
 	rotate(out._internal[i][j],in._internal[i][j],rot);
    }}
  }
+  friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
+				     const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	exchange(out1._internal[i][j],out2._internal[i][j],
+		  in1._internal[i][j], in2._internal[i][j],type);
+    }}
+  }

  // Unary negation
  friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -113,8 +113,6 @@ public:
 //  outerproduct, 
 //  zeroit
 //  permute
-
-
 class funcReduce {
 public:
  funcReduce() {};
@@ -168,7 +166,7 @@ void Tester(const functor &func)

  int ok=0;
  for(int i=0;i<Nsimd;i++){
-    if ( abs(reference[i]-result[i])>1.0e-7){
+    if ( abs(reference[i]-result[i])>1.0e-6){
      std::cout<<GridLogMessage<< "*****" << std::endl;
      std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
      ok++;
@@ -245,6 +243,28 @@ public:
  }
  std::string name(void) const { return std::string("Permute"); }
 };
+
+class funcExchange {
+public:
+  int n;
+  funcExchange(int _n) { n=_n;};
+  template<class vec>    void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);}
+  template<class scal>   void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2)  const { 
+    int sz=in1.size();
+
+    
+    int msk = sz>>(n+1);
+
+    int j1=0;
+    int j2=0;
+    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in1[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in2[ i ];
+  }
+  std::string name(void) const { return std::string("Exchange"); }
+};
+
 class funcRotate {
 public:
  int n;
@@ -325,6 +345,87 @@ void PermTester(const functor &func)
  assert(ok==0);
 }

+
+template<class scal, class vec,class functor > 
+void ExchangeTester(const functor &func)
+{
+  GridSerialRNG          sRNG;
+  sRNG.SeedRandomDevice();
+  
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  std::vector<scal> result1(Nsimd);
+  std::vector<scal> result2(Nsimd);
+  std::vector<scal> reference1(Nsimd);
+  std::vector<scal> reference2(Nsimd);
+  std::vector<scal> test1(Nsimd);
+  std::vector<scal> test2(Nsimd);
+
+  std::vector<vec,alignedAllocator<vec> > buf(6);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+  vec & v_result1 = buf[2];
+  vec & v_result2 = buf[3];
+  vec & v_test1 = buf[4];
+  vec & v_test2 = buf[5];
+
+  for(int i=0;i<Nsimd;i++){
+    random(sRNG,input1[i]);
+    random(sRNG,input2[i]);
+    random(sRNG,result1[i]);
+    random(sRNG,result2[i]);
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+  merge<vec,scal>(v_result1,result1);
+  merge<vec,scal>(v_result2,result1);
+
+  func(v_result1,v_result2,v_input1,v_input2);
+  func.apply(reference1,reference2,input1,input2);
+
+  func(v_test1,v_test2,v_result1,v_result2);
+
+  extract<vec,scal>(v_result1,result1);
+  extract<vec,scal>(v_result2,result2);
+  extract<vec,scal>(v_test1,test1);
+  extract<vec,scal>(v_test2,test2);
+
+  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
+
+  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
+  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
+
+  for(int i=0;i<Nsimd;i++){
+    int found=0;
+    for(int j=0;j<Nsimd;j++){
+      if(reference1[j]==result1[i]) {
+	found=1;
+	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
+      }
+    }
+    assert(found==1);
+  }
+  for(int i=0;i<Nsimd;i++){
+    int found=0;
+    for(int j=0;j<Nsimd;j++){
+      if(reference2[j]==result2[i]) {
+	found=1;
+	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
+      }
+    }
+    assert(found==1);
+  }
+
+  //  for(int i=0;i<Nsimd;i++){
+    //    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
+    //    std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
+  //  }
+}
+
+
 int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);
@@ -363,6 +464,15 @@ int main (int argc, char ** argv)
    PermTester<RealF,vRealF>(funcPermute(i));
  }

+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealF exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealF::Nsimd();i++){
+    ExchangeTester<RealF,vRealF>(funcExchange(i));
+  }
+
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
  std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -394,6 +504,14 @@ int main (int argc, char ** argv)
    PermTester<RealD,vRealD>(funcPermute(i));
  }

+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealD exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealD::Nsimd();i++){
+    ExchangeTester<RealD,vRealD>(funcExchange(i));
+  }
+
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
  std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -429,6 +547,16 @@ int main (int argc, char ** argv)
    PermTester<ComplexF,vComplexF>(funcPermute(i));
  }

+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexF exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
+    ExchangeTester<ComplexF,vComplexF>(funcExchange(i));
+  }
+
+
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
  std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -466,6 +594,15 @@ int main (int argc, char ** argv)
  }


+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexD exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
+    ExchangeTester<ComplexD,vComplexD>(funcExchange(i));
+  }
+
+
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
  std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
  std::cout<<GridLogMessage << "==================================="<<  std::endl;