mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Vectorise the XYZT face gathering better.
Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent with efficiency.
This commit is contained in:
		@@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
 | 
			
		||||
    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
 | 
			
		||||
    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
 | 
			
		||||
  } else {
 | 
			
		||||
    WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
 | 
			
		||||
    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
 | 
			
		||||
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										161
									
								
								lib/Stencil.h
									
									
									
									
									
								
							
							
						
						
									
										161
									
								
								lib/Stencil.h
									
									
									
									
									
								
							@@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
  struct Merge {
 | 
			
		||||
    cobj * mpointer;
 | 
			
		||||
    std::vector<scalar_object *> rpointers;
 | 
			
		||||
    std::vector<cobj *> vpointers;
 | 
			
		||||
    Integer buffer_size;
 | 
			
		||||
    Integer packet_id;
 | 
			
		||||
    Integer exchange;
 | 
			
		||||
    Integer type;
 | 
			
		||||
  };
 | 
			
		||||
  
 | 
			
		||||
  std::vector<Merge> Mergers;
 | 
			
		||||
 | 
			
		||||
  void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
 | 
			
		||||
    Merge m;
 | 
			
		||||
    m.exchange = 0;
 | 
			
		||||
    m.mpointer = merge_p;
 | 
			
		||||
    m.rpointers= rpointers;
 | 
			
		||||
    m.buffer_size = buffer_size;
 | 
			
		||||
@@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
    Mergers.push_back(m);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
 | 
			
		||||
    Merge m;
 | 
			
		||||
    m.exchange = 1;
 | 
			
		||||
    m.type     = type;
 | 
			
		||||
    m.mpointer = merge_p;
 | 
			
		||||
    m.vpointers= rpointers;
 | 
			
		||||
    m.buffer_size = buffer_size;
 | 
			
		||||
    m.packet_id   = packet_id;
 | 
			
		||||
    Mergers.push_back(m);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void CommsMerge(void ) { 
 | 
			
		||||
 | 
			
		||||
    for(int i=0;i<Mergers.size();i++){	
 | 
			
		||||
@@ -214,10 +229,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 | 
			
		||||
      //      std::string fname(ss.str());
 | 
			
		||||
      //      std::ofstream fout(fname);
 | 
			
		||||
 | 
			
		||||
      if ( Mergers[i].exchange == 0 ) { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
      for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
			
		||||
	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
			
		||||
	//	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
 | 
			
		||||
        for(int o=0;o<Mergers[i].buffer_size;o++){
 | 
			
		||||
	  merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
 | 
			
		||||
	  //	fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
      } else { 
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
        for(int o=0;o<Mergers[i].buffer_size/2;o++){
 | 
			
		||||
	  exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
 | 
			
		||||
		   Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      mergetime+=usecond();
 | 
			
		||||
    }
 | 
			
		||||
@@ -285,6 +308,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  cobj* u_send_buf_p;
 | 
			
		||||
  std::vector<scalar_object *> u_simd_send_buf;
 | 
			
		||||
  std::vector<scalar_object *> u_simd_recv_buf;
 | 
			
		||||
  std::vector<cobj *> new_simd_send_buf;
 | 
			
		||||
  std::vector<cobj *> new_simd_recv_buf;
 | 
			
		||||
 | 
			
		||||
  int u_comm_offset;
 | 
			
		||||
  int _unified_buffer_size;
 | 
			
		||||
@@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
 | 
			
		||||
    u_simd_send_buf.resize(Nsimd);
 | 
			
		||||
    u_simd_recv_buf.resize(Nsimd);
 | 
			
		||||
 | 
			
		||||
    new_simd_send_buf.resize(Nsimd);
 | 
			
		||||
    new_simd_recv_buf.resize(Nsimd);
 | 
			
		||||
    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
    for(int l=0;l<Nsimd;l++){
 | 
			
		||||
      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
 | 
			
		||||
      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
 | 
			
		||||
      new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
      new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    PrecomputeByteOffsets();
 | 
			
		||||
@@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
    HaloGather(source,compress);
 | 
			
		||||
    this->CommunicateBegin(reqs);
 | 
			
		||||
    this->CommunicateComplete(reqs);
 | 
			
		||||
    CommsMerge(); // spins
 | 
			
		||||
    CommsMerge(); 
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
 | 
			
		||||
@@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      if ( sshift[0] == sshift[1] ) {
 | 
			
		||||
	if (splice_dim) {
 | 
			
		||||
	  splicetime-=usecond();
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
	  //	  GatherSimd(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
	  //	  std::cout << "GatherSimdNew"<<std::endl;
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
 | 
			
		||||
	  splicetime+=usecond();
 | 
			
		||||
	} else { 
 | 
			
		||||
	  nosplicetime-=usecond();
 | 
			
		||||
@@ -716,8 +746,9 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      } else {
 | 
			
		||||
	if(splice_dim){
 | 
			
		||||
	  splicetime-=usecond();
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | 
			
		||||
	  GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | 
			
		||||
	  //	  std::cout << "GatherSimdNew2calls"<<std::endl;
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
 | 
			
		||||
	  GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
 | 
			
		||||
	  splicetime+=usecond();
 | 
			
		||||
	} else {
 | 
			
		||||
	  nosplicetime-=usecond();
 | 
			
		||||
@@ -949,6 +980,120 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  template<class compressor>
 | 
			
		||||
  void  GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
 | 
			
		||||
  {
 | 
			
		||||
    const int Nsimd = _grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
    const int maxl =2;// max layout in a direction
 | 
			
		||||
    int fd = _grid->_fdimensions[dimension];
 | 
			
		||||
    int rd = _grid->_rdimensions[dimension];
 | 
			
		||||
    int ld = _grid->_ldimensions[dimension];
 | 
			
		||||
    int pd              = _grid->_processors[dimension];
 | 
			
		||||
    int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
    int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
			
		||||
    assert(comm_dim==1);
 | 
			
		||||
    // This will not work with a rotate dim
 | 
			
		||||
    assert(simd_layout==maxl);
 | 
			
		||||
    assert(shift>=0);
 | 
			
		||||
    assert(shift<fd);
 | 
			
		||||
    
 | 
			
		||||
    int permute_type=_grid->PermuteType(dimension);
 | 
			
		||||
    //    std::cout << "SimdNew permute type "<<permute_type<<std::endl;
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
    // Simd direction uses an extract/merge pair
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
    int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
			
		||||
    int words = sizeof(cobj)/sizeof(vector_type);
 | 
			
		||||
    
 | 
			
		||||
    assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 | 
			
		||||
    
 | 
			
		||||
    int bytes = (buffer_size*sizeof(cobj))/simd_layout;
 | 
			
		||||
    assert(bytes*simd_layout == buffer_size*sizeof(cobj));
 | 
			
		||||
 | 
			
		||||
    std::vector<cobj *> rpointers(maxl);
 | 
			
		||||
    std::vector<cobj *> spointers(maxl);
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    // Work out what to send where
 | 
			
		||||
    ///////////////////////////////////////////
 | 
			
		||||
    
 | 
			
		||||
    int cb    = (cbmask==0x2)? Odd : Even;
 | 
			
		||||
    int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
			
		||||
    
 | 
			
		||||
    // loop over outer coord planes orthog to dim
 | 
			
		||||
    for(int x=0;x<rd;x++){       
 | 
			
		||||
      
 | 
			
		||||
      int any_offnode = ( ((x+sshift)%fd) >= rd );
 | 
			
		||||
 | 
			
		||||
      if ( any_offnode ) {
 | 
			
		||||
	
 | 
			
		||||
	for(int i=0;i<maxl;i++){       
 | 
			
		||||
	  spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	int sx   = (x+sshift)%rd;
 | 
			
		||||
	
 | 
			
		||||
	gathermtime-=usecond();
 | 
			
		||||
	Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
 | 
			
		||||
	gathermtime+=usecond();
 | 
			
		||||
 | 
			
		||||
	//spointers[0] -- low
 | 
			
		||||
	//spointers[1] -- high
 | 
			
		||||
 | 
			
		||||
	for(int i=0;i<maxl;i++){
 | 
			
		||||
 | 
			
		||||
	  int my_coor  = rd*i + x;            // self explanatory
 | 
			
		||||
	  int nbr_coor = my_coor+sshift;      // self explanatory
 | 
			
		||||
 | 
			
		||||
	  int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
 | 
			
		||||
	  int nbr_lcoor= (nbr_coor%ld);       // local plane coor on neighbour node
 | 
			
		||||
	  int nbr_ic   = (nbr_lcoor)/rd;      // inner coord of peer simd lane "i"
 | 
			
		||||
	  int nbr_ox   = (nbr_lcoor%rd);      // outer coord of peer "x"
 | 
			
		||||
 | 
			
		||||
	  int nbr_plane = nbr_ic;
 | 
			
		||||
	  assert (sx == nbr_ox);
 | 
			
		||||
 | 
			
		||||
	  auto rp = &new_simd_recv_buf[i        ][u_comm_offset];
 | 
			
		||||
	  auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
 | 
			
		||||
 | 
			
		||||
	  if(nbr_proc){
 | 
			
		||||
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    
 | 
			
		||||
	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
			
		||||
 
 | 
			
		||||
	    // shm == receive pointer         if offnode
 | 
			
		||||
	    // shm == Translate[send pointer] if on node -- my view of his send pointer
 | 
			
		||||
	    cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
 | 
			
		||||
	    if (shm==NULL) { 
 | 
			
		||||
	      shm = rp;
 | 
			
		||||
	    }
 | 
			
		||||
 | 
			
		||||
	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
 | 
			
		||||
	    // assuming above pointer flip
 | 
			
		||||
	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
 | 
			
		||||
 | 
			
		||||
	    rpointers[i] = shm;
 | 
			
		||||
	    
 | 
			
		||||
	  } else { 
 | 
			
		||||
	    
 | 
			
		||||
	    rpointers[i] = sp;
 | 
			
		||||
	    
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
 | 
			
		||||
 | 
			
		||||
	u_comm_offset     +=buffer_size;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
};
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  int provided;
 | 
			
		||||
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
			
		||||
  if ( !flag ) {
 | 
			
		||||
    //    MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
 | 
			
		||||
    //    assert (provided == MPI_THREAD_SERIALIZED);
 | 
			
		||||
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
 | 
			
		||||
    assert (provided == MPI_THREAD_MULTIPLE);
 | 
			
		||||
    if ( provided != MPI_THREAD_MULTIPLE ) {
 | 
			
		||||
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
 
 | 
			
		||||
@@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
  int n1=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( cbmask ==0x3){
 | 
			
		||||
PARALLEL_NESTED_LOOP2
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
@@ -110,6 +111,7 @@ PARALLEL_NESTED_LOOP2
 | 
			
		||||
 | 
			
		||||
	int o      =   n*n1;
 | 
			
		||||
	int offset = b+n*e2;
 | 
			
		||||
	
 | 
			
		||||
	cobj temp =compress(rhs._odata[so+o+b]);
 | 
			
		||||
	extract<cobj>(temp,pointers,offset);
 | 
			
		||||
 | 
			
		||||
@@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there *is* need to SIMD split with compression
 | 
			
		||||
///////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class cobj,class vobj,class compressor> void 
 | 
			
		||||
Gather_plane_exchange(const Lattice<vobj> &rhs,
 | 
			
		||||
		      std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
 | 
			
		||||
{
 | 
			
		||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
			
		||||
 | 
			
		||||
  if ( !rhs._grid->CheckerBoarded(dimension) ) {
 | 
			
		||||
    cbmask = 0x3;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
 | 
			
		||||
 | 
			
		||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
			
		||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
			
		||||
  int n1=rhs._grid->_slice_stride[dimension];
 | 
			
		||||
 | 
			
		||||
  // Need to switch to a table loop
 | 
			
		||||
  std::vector<std::pair<int,int> > table;
 | 
			
		||||
 | 
			
		||||
  if ( cbmask ==0x3){
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o      =   n*n1;
 | 
			
		||||
	int offset = b+n*e2;
 | 
			
		||||
	table.push_back(std::pair<int,int> (offset,o+b));
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else { 
 | 
			
		||||
    // Case of SIMD split AND checker dim cannot currently be hit, except in 
 | 
			
		||||
    // Test_cshift_red_black code.
 | 
			
		||||
    for(int n=0;n<e1;n++){
 | 
			
		||||
      for(int b=0;b<e2;b++){
 | 
			
		||||
	int o=n*n1;
 | 
			
		||||
	int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
			
		||||
	int offset = b+n*e2;
 | 
			
		||||
 | 
			
		||||
	if ( ocb & cbmask ) {
 | 
			
		||||
	  table.push_back(std::pair<int,int> (offset,o+b));
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  assert( (table.size()&0x1)==0);
 | 
			
		||||
PARALLEL_FOR_LOOP     
 | 
			
		||||
  for(int j=0;j<table.size()/2;j++){
 | 
			
		||||
    //    buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
 | 
			
		||||
    cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
 | 
			
		||||
    cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
 | 
			
		||||
    exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
// Gather for when there is no need to SIMD split
 | 
			
		||||
//////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -469,9 +469,47 @@ namespace Optimization {
 | 
			
		||||
    static inline __m256d Permute3(__m256d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
    // 3210 ordering
 | 
			
		||||
    static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
 | 
			
		||||
      out1= _mm256_permute2f128_ps(in1,in2,0x20);
 | 
			
		||||
      out2= _mm256_permute2f128_ps(in1,in2,0x31);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
 | 
			
		||||
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
 | 
			
		||||
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
 | 
			
		||||
      out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
 | 
			
		||||
      out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
 | 
			
		||||
      out1= _mm256_permute2f128_pd(in1,in2,0x20);
 | 
			
		||||
      out2= _mm256_permute2f128_pd(in1,in2,0x31);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
 | 
			
		||||
      out1= _mm256_shuffle_pd(in1,in2,0x0);
 | 
			
		||||
      out2= _mm256_shuffle_pd(in1,in2,0xF);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if defined (AVX2)
 | 
			
		||||
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
 | 
			
		||||
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
 | 
			
		||||
 
 | 
			
		||||
@@ -343,6 +343,46 @@ namespace Optimization {
 | 
			
		||||
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
 | 
			
		||||
  // On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
 | 
			
		||||
  // The operation is its own inverse
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
    // 3210 ordering
 | 
			
		||||
    static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
 | 
			
		||||
      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
 | 
			
		||||
      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
 | 
			
		||||
      out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
 | 
			
		||||
      out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
 | 
			
		||||
      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
 | 
			
		||||
      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
 | 
			
		||||
      out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
 | 
			
		||||
      out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
 | 
			
		||||
    };
 | 
			
		||||
 
 | 
			
		||||
    static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
 | 
			
		||||
      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
 | 
			
		||||
      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
 | 
			
		||||
      out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
 | 
			
		||||
      out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
 | 
			
		||||
      out1 = _mm512_shuffle_pd(in1,in2,0x00);
 | 
			
		||||
      out2 = _mm512_shuffle_pd(in1,in2,0xFF);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -326,7 +326,43 @@ namespace Optimization {
 | 
			
		||||
    static inline __m128d Permute3(__m128d in){
 | 
			
		||||
      return in;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Exchange{
 | 
			
		||||
    // 3210 ordering
 | 
			
		||||
    static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
 | 
			
		||||
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
 | 
			
		||||
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
 | 
			
		||||
      out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
 | 
			
		||||
      out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
 | 
			
		||||
      out1= _mm_shuffle_pd(in1,in2,0x0);
 | 
			
		||||
      out2= _mm_shuffle_pd(in1,in2,0x3);
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
    static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
 | 
			
		||||
      assert(0);
 | 
			
		||||
      return;
 | 
			
		||||
    };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  struct Rotate{
 | 
			
		||||
 
 | 
			
		||||
@@ -350,6 +350,18 @@ class Grid_simd {
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
  // Exchange 
 | 
			
		||||
  // Al Ah , Bl Bh -> Al Bl Ah,Bh
 | 
			
		||||
  ///////////////////////
 | 
			
		||||
  friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
 | 
			
		||||
  {
 | 
			
		||||
    if     (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
 | 
			
		||||
    else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
 | 
			
		||||
    else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
 | 
			
		||||
    else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // General permute; assumes vector length is same across
 | 
			
		||||
  // all subtypes; may not be a good assumption, but could
 | 
			
		||||
@@ -372,23 +384,11 @@ class Grid_simd {
 | 
			
		||||
      int dist = perm & 0xF;
 | 
			
		||||
      y = rotate(b, dist);
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
    switch (perm) {
 | 
			
		||||
      case 3:
 | 
			
		||||
        permute3(y, b);
 | 
			
		||||
        break;
 | 
			
		||||
      case 2:
 | 
			
		||||
        permute2(y, b);
 | 
			
		||||
        break;
 | 
			
		||||
      case 1:
 | 
			
		||||
        permute1(y, b);
 | 
			
		||||
        break;
 | 
			
		||||
      case 0:
 | 
			
		||||
        permute0(y, b);
 | 
			
		||||
        break;
 | 
			
		||||
      default:
 | 
			
		||||
        assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    } 
 | 
			
		||||
    else if(perm==3) permute3(y, b);
 | 
			
		||||
    else if(perm==2) permute2(y, b);
 | 
			
		||||
    else if(perm==1) permute1(y, b);
 | 
			
		||||
    else if(perm==0) permute0(y, b);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
};  // end of Grid_simd class definition
 | 
			
		||||
@@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
 | 
			
		||||
  ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
 | 
			
		||||
}    
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////
 | 
			
		||||
// Splat
 | 
			
		||||
///////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -105,6 +105,11 @@ class iScalar {
 | 
			
		||||
  friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
 | 
			
		||||
    rotate(out._internal,in._internal,rot);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
 | 
			
		||||
				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
 | 
			
		||||
    exchange(out1._internal,out2._internal,
 | 
			
		||||
	      in1._internal, in2._internal,type);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
 | 
			
		||||
@@ -248,6 +253,13 @@ class iVector {
 | 
			
		||||
      rotate(out._internal[i],in._internal[i],rot);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
 | 
			
		||||
				     const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      exchange(out1._internal[i],out2._internal[i],
 | 
			
		||||
	        in1._internal[i], in2._internal[i],type);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
 | 
			
		||||
@@ -374,6 +386,14 @@ class iMatrix {
 | 
			
		||||
	rotate(out._internal[i][j],in._internal[i][j],rot);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
 | 
			
		||||
				     const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      for(int j=0;j<N;j++){
 | 
			
		||||
	exchange(out1._internal[i][j],out2._internal[i][j],
 | 
			
		||||
		  in1._internal[i][j], in2._internal[i][j],type);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
 | 
			
		||||
 
 | 
			
		||||
@@ -113,8 +113,6 @@ public:
 | 
			
		||||
//  outerproduct, 
 | 
			
		||||
//  zeroit
 | 
			
		||||
//  permute
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class funcReduce {
 | 
			
		||||
public:
 | 
			
		||||
  funcReduce() {};
 | 
			
		||||
@@ -168,7 +166,7 @@ void Tester(const functor &func)
 | 
			
		||||
 | 
			
		||||
  int ok=0;
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    if ( abs(reference[i]-result[i])>1.0e-7){
 | 
			
		||||
    if ( abs(reference[i]-result[i])>1.0e-6){
 | 
			
		||||
      std::cout<<GridLogMessage<< "*****" << std::endl;
 | 
			
		||||
      std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
 | 
			
		||||
      ok++;
 | 
			
		||||
@@ -245,6 +243,28 @@ public:
 | 
			
		||||
  }
 | 
			
		||||
  std::string name(void) const { return std::string("Permute"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class funcExchange {
 | 
			
		||||
public:
 | 
			
		||||
  int n;
 | 
			
		||||
  funcExchange(int _n) { n=_n;};
 | 
			
		||||
  template<class vec>    void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);}
 | 
			
		||||
  template<class scal>   void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2)  const { 
 | 
			
		||||
    int sz=in1.size();
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    int msk = sz>>(n+1);
 | 
			
		||||
 | 
			
		||||
    int j1=0;
 | 
			
		||||
    int j2=0;
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in1[ i ];
 | 
			
		||||
    for(int i=0;i<sz;i++) if ( (i&msk)  ) r2[j2++] = in2[ i ];
 | 
			
		||||
  }
 | 
			
		||||
  std::string name(void) const { return std::string("Exchange"); }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class funcRotate {
 | 
			
		||||
public:
 | 
			
		||||
  int n;
 | 
			
		||||
@@ -325,6 +345,87 @@ void PermTester(const functor &func)
 | 
			
		||||
  assert(ok==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class scal, class vec,class functor > 
 | 
			
		||||
void ExchangeTester(const functor &func)
 | 
			
		||||
{
 | 
			
		||||
  GridSerialRNG          sRNG;
 | 
			
		||||
  sRNG.SeedRandomDevice();
 | 
			
		||||
  
 | 
			
		||||
  int Nsimd = vec::Nsimd();
 | 
			
		||||
 | 
			
		||||
  std::vector<scal> input1(Nsimd);
 | 
			
		||||
  std::vector<scal> input2(Nsimd);
 | 
			
		||||
  std::vector<scal> result1(Nsimd);
 | 
			
		||||
  std::vector<scal> result2(Nsimd);
 | 
			
		||||
  std::vector<scal> reference1(Nsimd);
 | 
			
		||||
  std::vector<scal> reference2(Nsimd);
 | 
			
		||||
  std::vector<scal> test1(Nsimd);
 | 
			
		||||
  std::vector<scal> test2(Nsimd);
 | 
			
		||||
 | 
			
		||||
  std::vector<vec,alignedAllocator<vec> > buf(6);
 | 
			
		||||
  vec & v_input1 = buf[0];
 | 
			
		||||
  vec & v_input2 = buf[1];
 | 
			
		||||
  vec & v_result1 = buf[2];
 | 
			
		||||
  vec & v_result2 = buf[3];
 | 
			
		||||
  vec & v_test1 = buf[4];
 | 
			
		||||
  vec & v_test2 = buf[5];
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    random(sRNG,input1[i]);
 | 
			
		||||
    random(sRNG,input2[i]);
 | 
			
		||||
    random(sRNG,result1[i]);
 | 
			
		||||
    random(sRNG,result2[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  merge<vec,scal>(v_input1,input1);
 | 
			
		||||
  merge<vec,scal>(v_input2,input2);
 | 
			
		||||
  merge<vec,scal>(v_result1,result1);
 | 
			
		||||
  merge<vec,scal>(v_result2,result1);
 | 
			
		||||
 | 
			
		||||
  func(v_result1,v_result2,v_input1,v_input2);
 | 
			
		||||
  func.apply(reference1,reference2,input1,input2);
 | 
			
		||||
 | 
			
		||||
  func(v_test1,v_test2,v_result1,v_result2);
 | 
			
		||||
 | 
			
		||||
  extract<vec,scal>(v_result1,result1);
 | 
			
		||||
  extract<vec,scal>(v_result2,result2);
 | 
			
		||||
  extract<vec,scal>(v_test1,test1);
 | 
			
		||||
  extract<vec,scal>(v_test2,test2);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
 | 
			
		||||
 | 
			
		||||
  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
 | 
			
		||||
  //  for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    int found=0;
 | 
			
		||||
    for(int j=0;j<Nsimd;j++){
 | 
			
		||||
      if(reference1[j]==result1[i]) {
 | 
			
		||||
	found=1;
 | 
			
		||||
	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    assert(found==1);
 | 
			
		||||
  }
 | 
			
		||||
  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    int found=0;
 | 
			
		||||
    for(int j=0;j<Nsimd;j++){
 | 
			
		||||
      if(reference2[j]==result2[i]) {
 | 
			
		||||
	found=1;
 | 
			
		||||
	//	std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    assert(found==1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //  for(int i=0;i<Nsimd;i++){
 | 
			
		||||
    //    std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
 | 
			
		||||
    //    std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
 | 
			
		||||
  //  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
@@ -363,6 +464,15 @@ int main (int argc, char ** argv)
 | 
			
		||||
    PermTester<RealF,vRealF>(funcPermute(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vRealF exchanges "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
 | 
			
		||||
  // Log2 iteration
 | 
			
		||||
  for(int i=0;(1<<i)< vRealF::Nsimd();i++){
 | 
			
		||||
    ExchangeTester<RealF,vRealF>(funcExchange(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
@@ -394,6 +504,14 @@ int main (int argc, char ** argv)
 | 
			
		||||
    PermTester<RealD,vRealD>(funcPermute(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vRealD exchanges "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  // Log2 iteration
 | 
			
		||||
  for(int i=0;(1<<i)< vRealD::Nsimd();i++){
 | 
			
		||||
    ExchangeTester<RealD,vRealD>(funcExchange(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
@@ -429,6 +547,16 @@ int main (int argc, char ** argv)
 | 
			
		||||
    PermTester<ComplexF,vComplexF>(funcPermute(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vComplexF exchanges "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  // Log2 iteration
 | 
			
		||||
  for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
 | 
			
		||||
    ExchangeTester<ComplexF,vComplexF>(funcExchange(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
@@ -466,6 +594,15 @@ int main (int argc, char ** argv)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vComplexD exchanges "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  // Log2 iteration
 | 
			
		||||
  for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
 | 
			
		||||
    ExchangeTester<ComplexD,vComplexD>(funcExchange(i));
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "==================================="<<  std::endl;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user