From bd600702cf2a0e5bd8335c1192cbe480dc76a7af Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 15 Feb 2017 11:11:04 +0000
Subject: [PATCH] Vectorise the XYZT face gathering better. Hard coded for
 simd_layout <= 2 in any given spread out direction; full generality is
 inconsistent with efficiency.

---
 lib/Init.cc                          |   4 +-
 lib/Stencil.h                        | 161 +++++++++++++++++++++++++--
 lib/communicator/Communicator_mpi.cc |   6 +-
 lib/cshift/Cshift_common.h           |  58 ++++++++++
 lib/simd/Grid_avx.h                  |  40 ++++++-
 lib/simd/Grid_avx512.h               |  40 +++++++
 lib/simd/Grid_sse4.h                 |  36 ++++++
 lib/simd/Grid_vector_types.h         |  36 +++---
 lib/tensors/Tensor_class.h           |  20 ++++
 tests/Test_simd.cc                   | 143 +++++++++++++++++++++++-
 10 files changed, 510 insertions(+), 34 deletions(-)
diff --git a/lib/Init.cc b/lib/Init.cc
index 34dc1720..aeab5835 100644
--- a/lib/Init.cc
+++ b/lib/Init.cc
@@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv)
     QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
-    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
+    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
   } else {
-    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
+    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
diff --git a/lib/Stencil.h b/lib/Stencil.h
index 71f086af..8b5eac2d 100644
--- a/lib/Stencil.h
+++ b/lib/Stencil.h
@@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   struct Merge {
     cobj * mpointer;
     std::vector<scalar_object *> rpointers;
+    std::vector<cobj *> vpointers;
     Integer buffer_size;
     Integer packet_id;
+    Integer exchange;
+    Integer type;
   };
   
   std::vector<Merge> Mergers;
 
   void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
     Merge m;
+    m.exchange = 0;
     m.mpointer = merge_p;
     m.rpointers= rpointers;
     m.buffer_size = buffer_size;
@@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     Mergers.push_back(m);
   }
 
+  void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
+    Merge m;
+    m.exchange = 1;
+    m.type     = type;
+    m.mpointer = merge_p;
+    m.vpointers= rpointers;
+    m.buffer_size = buffer_size;
+    m.packet_id   = packet_id;
+    Mergers.push_back(m);
+  }
+
   void CommsMerge(void ) { 
 
     for(int i=0;i<Mergers.size();i++){	
@@ -214,10 +229,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
       //      std::string fname(ss.str());
       //      std::ofstream fout(fname);
 
+      if ( Mergers[i].exchange == 0 ) { 
 PARALLEL_FOR_LOOP
-      for(int o=0;o<Mergers[i].buffer_size;o++){
-	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
-	//	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
+        for(int o=0;o<Mergers[i].buffer_size;o++){
+	  merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
+	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
+	}
+      } else { 
+PARALLEL_FOR_LOOP
+        for(int o=0;o<Mergers[i].buffer_size/2;o++){
+	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
+		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
+	}
       }
       mergetime+=usecond();
     }
@@ -285,6 +308,8 @@ PARALLEL_FOR_LOOP
   cobj* u_send_buf_p;
   std::vector<scalar_object *> u_simd_send_buf;
   std::vector<scalar_object *> u_simd_recv_buf;
+  std::vector<cobj *> new_simd_send_buf;
+  std::vector<cobj *> new_simd_recv_buf;
 
   int u_comm_offset;
   int _unified_buffer_size;
@@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP
 
     u_simd_send_buf.resize(Nsimd);
     u_simd_recv_buf.resize(Nsimd);
-
+    new_simd_send_buf.resize(Nsimd);
+    new_simd_recv_buf.resize(Nsimd);
     u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
     u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
     for(int l=0;l<Nsimd;l++){
       u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
       u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
+      new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
     }
 
     PrecomputeByteOffsets();
@@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP
     HaloGather(source,compress);
     this->CommunicateBegin(reqs);
     this->CommunicateComplete(reqs);
-    CommsMerge(); // spins
+    CommsMerge(); 
   }
   
   template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP
       if ( sshift[0] == sshift[1] ) {
 	if (splice_dim) {
 	  splicetime-=usecond();
-	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+	  //	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
+	  //	  std::cout << "GatherSimdNew"<<std::endl;
+	  GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
 	  splicetime+=usecond();
 	} else { 
 	  nosplicetime-=usecond();
@@ -716,8 +746,9 @@ PARALLEL_FOR_LOOP
       } else {
 	if(splice_dim){
 	  splicetime-=usecond();
-	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
-	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
+	  //	  std::cout << "GatherSimdNew2calls"<<std::endl;
+	  GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
+	  GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 	  splicetime+=usecond();
 	} else {
 	  nosplicetime-=usecond();
@@ -949,6 +980,120 @@ PARALLEL_FOR_LOOP
       }
     }
   }
+
+
+  template<class compressor>
+  void  GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
+  {
+    const int Nsimd = _grid->Nsimd();
+
+    const int maxl =2;// max layout in a direction
+    int fd = _grid->_fdimensions[dimension];
+    int rd = _grid->_rdimensions[dimension];
+    int ld = _grid->_ldimensions[dimension];
+    int pd              = _grid->_processors[dimension];
+    int simd_layout     = _grid->_simd_layout[dimension];
+    int comm_dim        = _grid->_processors[dimension] >1 ;
+    assert(comm_dim==1);
+    // This will not work with a rotate dim
+    assert(simd_layout==maxl);
+    assert(shift>=0);
+    assert(shift<fd);
+    
+    int permute_type=_grid->PermuteType(dimension);
+    //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
+
+    ///////////////////////////////////////////////
+    // Simd direction uses an extract/merge pair
+    ///////////////////////////////////////////////
+    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
+    int words = sizeof(cobj)/sizeof(vector_type);
+    
+    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
+    
+    int bytes = (buffer_size*sizeof(cobj))/simd_layout;
+    assert(bytes*simd_layout == buffer_size*sizeof(cobj));
+
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);
+
+    ///////////////////////////////////////////
+    // Work out what to send where
+    ///////////////////////////////////////////
+    
+    int cb    = (cbmask==0x2)? Odd : Even;
+    int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
+    
+    // loop over outer coord planes orthog to dim
+    for(int x=0;x<rd;x++){       
+      
+      int any_offnode = ( ((x+sshift)%fd) >= rd );
+
+      if ( any_offnode ) {
+	
+	for(int i=0;i<maxl;i++){       
+	  spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
+	}
+	
+	int sx   = (x+sshift)%rd;
+	
+	gathermtime-=usecond();
+	Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
+	gathermtime+=usecond();
+
+	//spointers[0] -- low
+	//spointers[1] -- high
+
+	for(int i=0;i<maxl;i++){
+
+	  int my_coor  = rd*i + x;            // self explanatory
+	  int nbr_coor = my_coor+sshift;      // self explanatory
+
+	  int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+	  int nbr_lcoor= (nbr_coor%ld);       // local plane coor on neighbour node
+	  int nbr_ic   = (nbr_lcoor)/rd;      // inner coord of peer simd lane "i"
+	  int nbr_ox   = (nbr_lcoor%rd);      // outer coord of peer "x"
+
+	  int nbr_plane = nbr_ic;
+	  assert (sx == nbr_ox);
+
+	  auto rp = &new_simd_recv_buf[i        ][u_comm_offset];
+	  auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
+
+	  if(nbr_proc){
+
+	    int recv_from_rank;
+	    int xmit_to_rank;
+	    
+	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
+ 
+	    // shm == receive pointer         if offnode
+	    // shm == Translate[send pointer] if on node -- my view of his send pointer
+	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
+	    if (shm==NULL) { 
+	      shm = rp;
+	    }
+
+	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
+	    // assuming above pointer flip
+	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
+
+	    rpointers[i] = shm;
+	    
+	  } else { 
+	    
+	    rpointers[i] = sp;
+	    
+	  }
+	}
+
+	AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
+
+	u_comm_offset     +=buffer_size;
+      }
+    }
+  }
+
   
 };
 }
diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc
index 61126a17..2033a446 100644
--- a/lib/communicator/Communicator_mpi.cc
+++ b/lib/communicator/Communicator_mpi.cc
@@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
   int provided;
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
-    //    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
-    //    assert (provided == MPI_THREAD_SERIALIZED);
     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
-    assert (provided == MPI_THREAD_MULTIPLE);
+    if ( provided != MPI_THREAD_MULTIPLE ) {
+      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+    }
   }
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
   ShmInitGeneric();
diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h
index 813929d8..c01187de 100644
--- a/lib/cshift/Cshift_common.h
+++ b/lib/cshift/Cshift_common.h
@@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
   int e1=rhs._grid->_slice_nblock[dimension];
   int e2=rhs._grid->_slice_block[dimension];
   int n1=rhs._grid->_slice_stride[dimension];
+
   if ( cbmask ==0x3){
 PARALLEL_NESTED_LOOP2
     for(int n=0;n<e1;n++){
@@ -110,6 +111,7 @@ PARALLEL_NESTED_LOOP2
 
 	int o      =   n*n1;
 	int offset = b+n*e2;
+	
 	cobj temp =compress(rhs._odata[so+o+b]);
 	extract<cobj>(temp,pointers,offset);
 
@@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2
   }
 }
 
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split with compression
+///////////////////////////////////////////////////////////////////
+template<class cobj,class vobj,class compressor> void 
+Gather_plane_exchange(const Lattice<vobj> &rhs,
+		      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
+{
+  int rd = rhs._grid->_rdimensions[dimension];
+
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
+
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
+  int n1=rhs._grid->_slice_stride[dimension];
+
+  // Need to switch to a table loop
+  std::vector<std::pair<int,int> > table;
+
+  if ( cbmask ==0x3){
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o      =   n*n1;
+	int offset = b+n*e2;
+	table.push_back(std::pair<int,int> (offset,o+b));
+      }
+    }
+  } else { 
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    for(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+	int o=n*n1;
+	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+	int offset = b+n*e2;
+
+	if ( ocb & cbmask ) {
+	  table.push_back(std::pair<int,int> (offset,o+b));
+	}
+      }
+    }
+  }
+
+  assert( (table.size()&0x1)==0);
+PARALLEL_FOR_LOOP     
+  for(int j=0;j<table.size()/2;j++){
+    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
+    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
+    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
+    exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
+  }
+}
+
 //////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 //////////////////////////////////////////////////////
diff --git a/lib/simd/Grid_avx.h b/lib/simd/Grid_avx.h
index 724f52bb..7abe4d5a 100644
--- a/lib/simd/Grid_avx.h
+++ b/lib/simd/Grid_avx.h
@@ -469,9 +469,47 @@ namespace Optimization {
     static inline __m256d Permute3(__m256d in){
       return in;
     };
-
   };
 
+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_permute2f128_ps(in1,in2,0x20);
+      out2= _mm256_permute2f128_ps(in1,in2,0x31);
+    };
+    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
+      assert(0);
+      return;
+    };
+
+    static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      out1= _mm256_permute2f128_pd(in1,in2,0x20);
+      out2= _mm256_permute2f128_pd(in1,in2,0x31);
+      return;
+    };
+    static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      out1= _mm256_shuffle_pd(in1,in2,0x0);
+      out2= _mm256_shuffle_pd(in1,in2,0xF);
+    };
+    static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
+      assert(0);
+      return;
+    };
+  };
+
+
 #if defined (AVX2)
 #define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
 #define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
diff --git a/lib/simd/Grid_avx512.h b/lib/simd/Grid_avx512.h
index ebf99e16..22d45aeb 100644
--- a/lib/simd/Grid_avx512.h
+++ b/lib/simd/Grid_avx512.h
@@ -343,6 +343,46 @@ namespace Optimization {
 
   };
 
+  // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
+  // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
+  // The operation is its own inverse
+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
+      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+ 
+    static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      out1 = _mm512_shuffle_pd(in1,in2,0x00);
+      out2 = _mm512_shuffle_pd(in1,in2,0xFF);
+    };
+    static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
+      assert(0);
+      return;
+    };
+  };
+
 
   struct Rotate{
 
diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h
index 943756b2..fcad4c28 100644
--- a/lib/simd/Grid_sse4.h
+++ b/lib/simd/Grid_sse4.h
@@ -326,7 +326,43 @@ namespace Optimization {
     static inline __m128d Permute3(__m128d in){
       return in;
     };
+  };
 
+  struct Exchange{
+    // 3210 ordering
+    static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
+      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
+    };
+    static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
+      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
+    };
+    static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
+      assert(0);
+      return;
+    };
+
+    static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      out1= _mm_shuffle_pd(in1,in2,0x0);
+      out2= _mm_shuffle_pd(in1,in2,0x3);
+    };
+    static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
+    static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
+      assert(0);
+      return;
+    };
   };
 
   struct Rotate{
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 8a6ab2e7..cd499d88 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -350,6 +350,18 @@ class Grid_simd {
     return ret;
   }
 
+  ///////////////////////
+  // Exchange 
+  // Al Ah , Bl Bh -> Al Bl Ah,Bh
+  ///////////////////////
+  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
+  {
+    if     (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
+    else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
+    else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
+    else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
+  }
+
   ////////////////////////////////////////////////////////////////////
   // General permute; assumes vector length is same across
   // all subtypes; may not be a good assumption, but could
@@ -372,23 +384,11 @@ class Grid_simd {
       int dist = perm & 0xF;
       y = rotate(b, dist);
       return;
-    }
-    switch (perm) {
-      case 3:
-        permute3(y, b);
-        break;
-      case 2:
-        permute2(y, b);
-        break;
-      case 1:
-        permute1(y, b);
-        break;
-      case 0:
-        permute0(y, b);
-        break;
-      default:
-        assert(0);
-    }
+    } 
+    else if(perm==3) permute3(y, b);
+    else if(perm==2) permute2(y, b);
+    else if(perm==1) permute1(y, b);
+    else if(perm==0) permute0(y, b);
   }
 
 };  // end of Grid_simd class definition
@@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
   ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
 }    
 
+
+
 ///////////////////////
 // Splat
 ///////////////////////
diff --git a/lib/tensors/Tensor_class.h b/lib/tensors/Tensor_class.h
index 473dd6b1..e0b69eb0 100644
--- a/lib/tensors/Tensor_class.h
+++ b/lib/tensors/Tensor_class.h
@@ -105,6 +105,11 @@ class iScalar {
   friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
     rotate(out._internal,in._internal,rot);
   }
+  friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
+				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
+    exchange(out1._internal,out2._internal,
+	      in1._internal, in2._internal,type);
+  }
 
   // Unary negation
   friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
@@ -248,6 +253,13 @@ class iVector {
       rotate(out._internal[i],in._internal[i],rot);
     }
   }
+  friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
+				     const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      exchange(out1._internal[i],out2._internal[i],
+	        in1._internal[i], in2._internal[i],type);
+    }
+  }
 
   // Unary negation
   friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
@@ -374,6 +386,14 @@ class iMatrix {
 	rotate(out._internal[i][j],in._internal[i][j],rot);
     }}
   }
+  friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
+				     const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	exchange(out1._internal[i][j],out2._internal[i][j],
+		  in1._internal[i][j], in2._internal[i][j],type);
+    }}
+  }
 
   // Unary negation
   friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 92f9bcd8..d840140e 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -113,8 +113,6 @@ public:
 //  outerproduct, 
 //  zeroit
 //  permute
-
-
 class funcReduce {
 public:
   funcReduce() {};
@@ -168,7 +166,7 @@ void Tester(const functor &func)
 
   int ok=0;
   for(int i=0;i<Nsimd;i++){
-    if ( abs(reference[i]-result[i])>1.0e-7){
+    if ( abs(reference[i]-result[i])>1.0e-6){
       std::cout<<GridLogMessage<< "*****" << std::endl;
       std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
       ok++;
@@ -245,6 +243,28 @@ public:
   }
   std::string name(void) const { return std::string("Permute"); }
 };
+
+class funcExchange {
+public:
+  int n;
+  funcExchange(int _n) { n=_n;};
+  template<class vec>    void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);}
+  template<class scal>   void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2)  const { 
+    int sz=in1.size();
+
+    
+    int msk = sz>>(n+1);
+
+    int j1=0;
+    int j2=0;
+    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in1[ i ];
+    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in2[ i ];
+  }
+  std::string name(void) const { return std::string("Exchange"); }
+};
+
 class funcRotate {
 public:
   int n;
@@ -325,6 +345,87 @@ void PermTester(const functor &func)
   assert(ok==0);
 }
 
+
+template<class scal, class vec,class functor > 
+void ExchangeTester(const functor &func)
+{
+  GridSerialRNG          sRNG;
+  sRNG.SeedRandomDevice();
+  
+  int Nsimd = vec::Nsimd();
+
+  std::vector<scal> input1(Nsimd);
+  std::vector<scal> input2(Nsimd);
+  std::vector<scal> result1(Nsimd);
+  std::vector<scal> result2(Nsimd);
+  std::vector<scal> reference1(Nsimd);
+  std::vector<scal> reference2(Nsimd);
+  std::vector<scal> test1(Nsimd);
+  std::vector<scal> test2(Nsimd);
+
+  std::vector<vec,alignedAllocator<vec> > buf(6);
+  vec & v_input1 = buf[0];
+  vec & v_input2 = buf[1];
+  vec & v_result1 = buf[2];
+  vec & v_result2 = buf[3];
+  vec & v_test1 = buf[4];
+  vec & v_test2 = buf[5];
+
+  for(int i=0;i<Nsimd;i++){
+    random(sRNG,input1[i]);
+    random(sRNG,input2[i]);
+    random(sRNG,result1[i]);
+    random(sRNG,result2[i]);
+  }
+
+  merge<vec,scal>(v_input1,input1);
+  merge<vec,scal>(v_input2,input2);
+  merge<vec,scal>(v_result1,result1);
+  merge<vec,scal>(v_result2,result1);
+
+  func(v_result1,v_result2,v_input1,v_input2);
+  func.apply(reference1,reference2,input1,input2);
+
+  func(v_test1,v_test2,v_result1,v_result2);
+
+  extract<vec,scal>(v_result1,result1);
+  extract<vec,scal>(v_result2,result2);
+  extract<vec,scal>(v_test1,test1);
+  extract<vec,scal>(v_test2,test2);
+
+  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
+
+  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
+  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
+
+  for(int i=0;i<Nsimd;i++){
+    int found=0;
+    for(int j=0;j<Nsimd;j++){
+      if(reference1[j]==result1[i]) {
+	found=1;
+	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
+      }
+    }
+    assert(found==1);
+  }
+  for(int i=0;i<Nsimd;i++){
+    int found=0;
+    for(int j=0;j<Nsimd;j++){
+      if(reference2[j]==result2[i]) {
+	found=1;
+	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
+      }
+    }
+    assert(found==1);
+  }
+
+  //  for(int i=0;i<Nsimd;i++){
+    //    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
+    //    std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
+  //  }
+}
+
+
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
@@ -363,6 +464,15 @@ int main (int argc, char ** argv)
     PermTester<RealF,vRealF>(funcPermute(i));
   }
 
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealF exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealF::Nsimd();i++){
+    ExchangeTester<RealF,vRealF>(funcExchange(i));
+  }
+
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
   std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -394,6 +504,14 @@ int main (int argc, char ** argv)
     PermTester<RealD,vRealD>(funcPermute(i));
   }
 
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vRealD exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vRealD::Nsimd();i++){
+    ExchangeTester<RealD,vRealD>(funcExchange(i));
+  }
+
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
   std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -429,6 +547,16 @@ int main (int argc, char ** argv)
     PermTester<ComplexF,vComplexF>(funcPermute(i));
   }
 
+
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexF exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
+    ExchangeTester<ComplexF,vComplexF>(funcExchange(i));
+  }
+
+
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
   std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
@@ -466,6 +594,15 @@ int main (int argc, char ** argv)
   }
 
 
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  std::cout<<GridLogMessage << "Testing vComplexD exchanges "<<std::endl;
+  std::cout<<GridLogMessage << "==================================="<<  std::endl;
+  // Log2 iteration
+  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
+    ExchangeTester<ComplexD,vComplexD>(funcExchange(i));
+  }
+
+
   std::cout<<GridLogMessage << "==================================="<<  std::endl;
   std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
   std::cout<<GridLogMessage << "==================================="<<  std::endl;