mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Vectorise the XYZT face gathering better.
Hard coded for simd_layout <= 2 in any given spread out direction; full generality is inconsistent with efficiency.
This commit is contained in:
parent
aca7a3ef0a
commit
bd600702cf
@ -338,9 +338,9 @@ void Grid_init(int *argc,char ***argv)
|
||||
QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
||||
} else {
|
||||
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-isend") ){
|
||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyIsend);
|
||||
|
161
lib/Stencil.h
161
lib/Stencil.h
@ -184,14 +184,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
struct Merge {
|
||||
cobj * mpointer;
|
||||
std::vector<scalar_object *> rpointers;
|
||||
std::vector<cobj *> vpointers;
|
||||
Integer buffer_size;
|
||||
Integer packet_id;
|
||||
Integer exchange;
|
||||
Integer type;
|
||||
};
|
||||
|
||||
std::vector<Merge> Mergers;
|
||||
|
||||
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
|
||||
Merge m;
|
||||
m.exchange = 0;
|
||||
m.mpointer = merge_p;
|
||||
m.rpointers= rpointers;
|
||||
m.buffer_size = buffer_size;
|
||||
@ -199,6 +203,17 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
Mergers.push_back(m);
|
||||
}
|
||||
|
||||
void AddMergeNew(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer packet_id,Integer type) {
|
||||
Merge m;
|
||||
m.exchange = 1;
|
||||
m.type = type;
|
||||
m.mpointer = merge_p;
|
||||
m.vpointers= rpointers;
|
||||
m.buffer_size = buffer_size;
|
||||
m.packet_id = packet_id;
|
||||
Mergers.push_back(m);
|
||||
}
|
||||
|
||||
void CommsMerge(void ) {
|
||||
|
||||
for(int i=0;i<Mergers.size();i++){
|
||||
@ -214,10 +229,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
||||
// std::string fname(ss.str());
|
||||
// std::ofstream fout(fname);
|
||||
|
||||
if ( Mergers[i].exchange == 0 ) {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
|
||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||
// fout<<o<<" "<<Mergers[i].mpointer[o]<<std::endl;
|
||||
}
|
||||
} else {
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int o=0;o<Mergers[i].buffer_size/2;o++){
|
||||
exchange(Mergers[i].mpointer[2*o],Mergers[i].mpointer[2*o+1],
|
||||
Mergers[i].vpointers[0][o],Mergers[i].vpointers[1][o],Mergers[i].type);
|
||||
}
|
||||
}
|
||||
mergetime+=usecond();
|
||||
}
|
||||
@ -285,6 +308,8 @@ PARALLEL_FOR_LOOP
|
||||
cobj* u_send_buf_p;
|
||||
std::vector<scalar_object *> u_simd_send_buf;
|
||||
std::vector<scalar_object *> u_simd_recv_buf;
|
||||
std::vector<cobj *> new_simd_send_buf;
|
||||
std::vector<cobj *> new_simd_recv_buf;
|
||||
|
||||
int u_comm_offset;
|
||||
int _unified_buffer_size;
|
||||
@ -432,12 +457,15 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
u_simd_send_buf.resize(Nsimd);
|
||||
u_simd_recv_buf.resize(Nsimd);
|
||||
|
||||
new_simd_send_buf.resize(Nsimd);
|
||||
new_simd_recv_buf.resize(Nsimd);
|
||||
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||
for(int l=0;l<Nsimd;l++){
|
||||
u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
|
||||
u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
|
||||
new_simd_recv_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||
new_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||
}
|
||||
|
||||
PrecomputeByteOffsets();
|
||||
@ -675,7 +703,7 @@ PARALLEL_FOR_LOOP
|
||||
HaloGather(source,compress);
|
||||
this->CommunicateBegin(reqs);
|
||||
this->CommunicateComplete(reqs);
|
||||
CommsMerge(); // spins
|
||||
CommsMerge();
|
||||
}
|
||||
|
||||
template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
@ -706,7 +734,9 @@ PARALLEL_FOR_LOOP
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
if (splice_dim) {
|
||||
splicetime-=usecond();
|
||||
GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
||||
// GatherSimd(source,dimension,shift,0x3,compress,face_idx);
|
||||
// std::cout << "GatherSimdNew"<<std::endl;
|
||||
GatherSimdNew(source,dimension,shift,0x3,compress,face_idx);
|
||||
splicetime+=usecond();
|
||||
} else {
|
||||
nosplicetime-=usecond();
|
||||
@ -716,8 +746,9 @@ PARALLEL_FOR_LOOP
|
||||
} else {
|
||||
if(splice_dim){
|
||||
splicetime-=usecond();
|
||||
GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
|
||||
GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
|
||||
// std::cout << "GatherSimdNew2calls"<<std::endl;
|
||||
GatherSimdNew(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
|
||||
GatherSimdNew(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
|
||||
splicetime+=usecond();
|
||||
} else {
|
||||
nosplicetime-=usecond();
|
||||
@ -949,6 +980,120 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class compressor>
|
||||
void GatherSimdNew(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
|
||||
{
|
||||
const int Nsimd = _grid->Nsimd();
|
||||
|
||||
const int maxl =2;// max layout in a direction
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int pd = _grid->_processors[dimension];
|
||||
int simd_layout = _grid->_simd_layout[dimension];
|
||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||
assert(comm_dim==1);
|
||||
// This will not work with a rotate dim
|
||||
assert(simd_layout==maxl);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
|
||||
int permute_type=_grid->PermuteType(dimension);
|
||||
// std::cout << "SimdNew permute type "<<permute_type<<std::endl;
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||
int words = sizeof(cobj)/sizeof(vector_type);
|
||||
|
||||
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
|
||||
|
||||
int bytes = (buffer_size*sizeof(cobj))/simd_layout;
|
||||
assert(bytes*simd_layout == buffer_size*sizeof(cobj));
|
||||
|
||||
std::vector<cobj *> rpointers(maxl);
|
||||
std::vector<cobj *> spointers(maxl);
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
|
||||
int cb = (cbmask==0x2)? Odd : Even;
|
||||
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
||||
|
||||
if ( any_offnode ) {
|
||||
|
||||
for(int i=0;i<maxl;i++){
|
||||
spointers[i] = (cobj *) &new_simd_send_buf[i][u_comm_offset];
|
||||
}
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
|
||||
gathermtime-=usecond();
|
||||
Gather_plane_exchange(rhs,spointers,dimension,sx,cbmask,compress,permute_type);
|
||||
gathermtime+=usecond();
|
||||
|
||||
//spointers[0] -- low
|
||||
//spointers[1] -- high
|
||||
|
||||
for(int i=0;i<maxl;i++){
|
||||
|
||||
int my_coor = rd*i + x; // self explanatory
|
||||
int nbr_coor = my_coor+sshift; // self explanatory
|
||||
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
int nbr_lcoor= (nbr_coor%ld); // local plane coor on neighbour node
|
||||
int nbr_ic = (nbr_lcoor)/rd; // inner coord of peer simd lane "i"
|
||||
int nbr_ox = (nbr_lcoor%rd); // outer coord of peer "x"
|
||||
|
||||
int nbr_plane = nbr_ic;
|
||||
assert (sx == nbr_ox);
|
||||
|
||||
auto rp = &new_simd_recv_buf[i ][u_comm_offset];
|
||||
auto sp = &new_simd_send_buf[nbr_plane][u_comm_offset];
|
||||
|
||||
if(nbr_proc){
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
// shm == receive pointer if offnode
|
||||
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
||||
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
||||
if (shm==NULL) {
|
||||
shm = rp;
|
||||
}
|
||||
|
||||
// if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
|
||||
// assuming above pointer flip
|
||||
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
|
||||
|
||||
rpointers[i] = shm;
|
||||
|
||||
} else {
|
||||
|
||||
rpointers[i] = sp;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
AddMergeNew(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1,permute_type);
|
||||
|
||||
u_comm_offset +=buffer_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -42,10 +42,10 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||
int provided;
|
||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||
if ( !flag ) {
|
||||
// MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
|
||||
// assert (provided == MPI_THREAD_SERIALIZED);
|
||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||
assert (provided == MPI_THREAD_MULTIPLE);
|
||||
if ( provided != MPI_THREAD_MULTIPLE ) {
|
||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||
}
|
||||
}
|
||||
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
|
||||
ShmInitGeneric();
|
||||
|
@ -103,6 +103,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
int n1=rhs._grid->_slice_stride[dimension];
|
||||
|
||||
if ( cbmask ==0x3){
|
||||
PARALLEL_NESTED_LOOP2
|
||||
for(int n=0;n<e1;n++){
|
||||
@ -110,6 +111,7 @@ PARALLEL_NESTED_LOOP2
|
||||
|
||||
int o = n*n1;
|
||||
int offset = b+n*e2;
|
||||
|
||||
cobj temp =compress(rhs._odata[so+o+b]);
|
||||
extract<cobj>(temp,pointers,offset);
|
||||
|
||||
@ -137,6 +139,62 @@ PARALLEL_NESTED_LOOP2
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Gather for when there *is* need to SIMD split with compression
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<class cobj,class vobj,class compressor> void
|
||||
Gather_plane_exchange(const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type)
|
||||
{
|
||||
int rd = rhs._grid->_rdimensions[dimension];
|
||||
|
||||
if ( !rhs._grid->CheckerBoarded(dimension) ) {
|
||||
cbmask = 0x3;
|
||||
}
|
||||
|
||||
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
int e1=rhs._grid->_slice_nblock[dimension];
|
||||
int e2=rhs._grid->_slice_block[dimension];
|
||||
int n1=rhs._grid->_slice_stride[dimension];
|
||||
|
||||
// Need to switch to a table loop
|
||||
std::vector<std::pair<int,int> > table;
|
||||
|
||||
if ( cbmask ==0x3){
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o = n*n1;
|
||||
int offset = b+n*e2;
|
||||
table.push_back(std::pair<int,int> (offset,o+b));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||
// Test_cshift_red_black code.
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o=n*n1;
|
||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
|
||||
int offset = b+n*e2;
|
||||
|
||||
if ( ocb & cbmask ) {
|
||||
table.push_back(std::pair<int,int> (offset,o+b));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert( (table.size()&0x1)==0);
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int j=0;j<table.size()/2;j++){
|
||||
// buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
|
||||
cobj temp1 =compress(rhs._odata[so+table[2*j].second]);
|
||||
cobj temp2 =compress(rhs._odata[so+table[2*j+1].second]);
|
||||
exchange(pointers[0][j],pointers[1][j],temp1,temp2,type);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
|
@ -469,9 +469,47 @@ namespace Optimization {
|
||||
static inline __m256d Permute3(__m256d in){
|
||||
return in;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_permute2f128_ps(in1,in2,0x20);
|
||||
out2= _mm256_permute2f128_ps(in1,in2,0x31);
|
||||
};
|
||||
static inline void Exchange1(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange2(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
out1= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm256_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange3(__m256 &out1,__m256 &out2,__m256 in1,__m256 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
out1= _mm256_permute2f128_pd(in1,in2,0x20);
|
||||
out2= _mm256_permute2f128_pd(in1,in2,0x31);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange1(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
out1= _mm256_shuffle_pd(in1,in2,0x0);
|
||||
out2= _mm256_shuffle_pd(in1,in2,0xF);
|
||||
};
|
||||
static inline void Exchange2(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m256d &out1,__m256d &out2,__m256d in1,__m256d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
#if defined (AVX2)
|
||||
#define _mm256_alignr_epi32_grid(ret,a,b,n) ret=(__m256) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
|
||||
#define _mm256_alignr_epi64_grid(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
|
||||
|
@ -343,6 +343,46 @@ namespace Optimization {
|
||||
|
||||
};
|
||||
|
||||
// On extracting face: Ah Al , Bh Bl -> Ah Bh, Al Bl
|
||||
// On merging buffers: Ah,Bh , Al Bl -> Ah Al, Bh, Bl
|
||||
// The operation is its own inverse
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_f32x4(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange3(__m512 &out1,__m512 &out2,__m512 in1,__m512 in2){
|
||||
out1= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm512_shuffle_f64x2(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
out1 = _mm512_shuffle_pd(in1,in2,0x00);
|
||||
out2 = _mm512_shuffle_pd(in1,in2,0xFF);
|
||||
};
|
||||
static inline void Exchange3(__m512d &out1,__m512d &out2,__m512d in1,__m512d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
struct Rotate{
|
||||
|
||||
|
@ -326,7 +326,43 @@ namespace Optimization {
|
||||
static inline __m128d Permute3(__m128d in){
|
||||
return in;
|
||||
};
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
static inline void Exchange0(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
};
|
||||
static inline void Exchange1(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0));
|
||||
out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1));
|
||||
};
|
||||
static inline void Exchange2(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m128 &out1,__m128 &out2,__m128 in1,__m128 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
|
||||
static inline void Exchange0(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
out1= _mm_shuffle_pd(in1,in2,0x0);
|
||||
out2= _mm_shuffle_pd(in1,in2,0x3);
|
||||
};
|
||||
static inline void Exchange1(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange2(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static inline void Exchange3(__m128d &out1,__m128d &out2,__m128d in1,__m128d in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
|
@ -350,6 +350,18 @@ class Grid_simd {
|
||||
return ret;
|
||||
}
|
||||
|
||||
///////////////////////
|
||||
// Exchange
|
||||
// Al Ah , Bl Bh -> Al Bl Ah,Bh
|
||||
///////////////////////
|
||||
friend inline void exchange(Grid_simd &out1,Grid_simd &out2,Grid_simd in1,Grid_simd in2,int n)
|
||||
{
|
||||
if (n==3) Optimization::Exchange::Exchange3(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==2) Optimization::Exchange::Exchange2(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==1) Optimization::Exchange::Exchange1(out1.v,out2.v,in1.v,in2.v);
|
||||
else if(n==0) Optimization::Exchange::Exchange0(out1.v,out2.v,in1.v,in2.v);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// General permute; assumes vector length is same across
|
||||
// all subtypes; may not be a good assumption, but could
|
||||
@ -372,23 +384,11 @@ class Grid_simd {
|
||||
int dist = perm & 0xF;
|
||||
y = rotate(b, dist);
|
||||
return;
|
||||
}
|
||||
switch (perm) {
|
||||
case 3:
|
||||
permute3(y, b);
|
||||
break;
|
||||
case 2:
|
||||
permute2(y, b);
|
||||
break;
|
||||
case 1:
|
||||
permute1(y, b);
|
||||
break;
|
||||
case 0:
|
||||
permute0(y, b);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
else if(perm==3) permute3(y, b);
|
||||
else if(perm==2) permute2(y, b);
|
||||
else if(perm==1) permute1(y, b);
|
||||
else if(perm==0) permute0(y, b);
|
||||
}
|
||||
|
||||
}; // end of Grid_simd class definition
|
||||
@ -444,6 +444,8 @@ inline void rbroadcast(Grid_simd<S,V> &ret,const Grid_simd<S,V> &src,int lane){
|
||||
ret.v = unary<V>(real(typepun[lane]), VsplatSIMD());
|
||||
}
|
||||
|
||||
|
||||
|
||||
///////////////////////
|
||||
// Splat
|
||||
///////////////////////
|
||||
|
@ -105,6 +105,11 @@ class iScalar {
|
||||
friend strong_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
|
||||
rotate(out._internal,in._internal,rot);
|
||||
}
|
||||
friend strong_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
|
||||
const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type){
|
||||
exchange(out1._internal,out2._internal,
|
||||
in1._internal, in2._internal,type);
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend strong_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
|
||||
@ -248,6 +253,13 @@ class iVector {
|
||||
rotate(out._internal[i],in._internal[i],rot);
|
||||
}
|
||||
}
|
||||
friend strong_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
|
||||
const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
|
||||
for(int i=0;i<N;i++){
|
||||
exchange(out1._internal[i],out2._internal[i],
|
||||
in1._internal[i], in2._internal[i],type);
|
||||
}
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend strong_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
|
||||
@ -374,6 +386,14 @@ class iMatrix {
|
||||
rotate(out._internal[i][j],in._internal[i][j],rot);
|
||||
}}
|
||||
}
|
||||
friend strong_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
|
||||
const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
exchange(out1._internal[i][j],out2._internal[i][j],
|
||||
in1._internal[i][j], in2._internal[i][j],type);
|
||||
}}
|
||||
}
|
||||
|
||||
// Unary negation
|
||||
friend strong_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
|
||||
|
@ -113,8 +113,6 @@ public:
|
||||
// outerproduct,
|
||||
// zeroit
|
||||
// permute
|
||||
|
||||
|
||||
class funcReduce {
|
||||
public:
|
||||
funcReduce() {};
|
||||
@ -168,7 +166,7 @@ void Tester(const functor &func)
|
||||
|
||||
int ok=0;
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
if ( abs(reference[i]-result[i])>1.0e-7){
|
||||
if ( abs(reference[i]-result[i])>1.0e-6){
|
||||
std::cout<<GridLogMessage<< "*****" << std::endl;
|
||||
std::cout<<GridLogMessage<< "["<<i<<"] "<< abs(reference[i]-result[i]) << " " <<reference[i]<< " " << result[i]<<std::endl;
|
||||
ok++;
|
||||
@ -245,6 +243,28 @@ public:
|
||||
}
|
||||
std::string name(void) const { return std::string("Permute"); }
|
||||
};
|
||||
|
||||
class funcExchange {
|
||||
public:
|
||||
int n;
|
||||
funcExchange(int _n) { n=_n;};
|
||||
template<class vec> void operator()(vec &r1,vec &r2,vec &i1,vec &i2) const { exchange(r1,r2,i1,i2,n);}
|
||||
template<class scal> void apply(std::vector<scal> &r1,std::vector<scal> &r2,std::vector<scal> &in1,std::vector<scal> &in2) const {
|
||||
int sz=in1.size();
|
||||
|
||||
|
||||
int msk = sz>>(n+1);
|
||||
|
||||
int j1=0;
|
||||
int j2=0;
|
||||
for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in1[ i ];
|
||||
for(int i=0;i<sz;i++) if ( (i&msk) == 0 ) r1[j1++] = in2[ i ];
|
||||
for(int i=0;i<sz;i++) if ( (i&msk) ) r2[j2++] = in1[ i ];
|
||||
for(int i=0;i<sz;i++) if ( (i&msk) ) r2[j2++] = in2[ i ];
|
||||
}
|
||||
std::string name(void) const { return std::string("Exchange"); }
|
||||
};
|
||||
|
||||
class funcRotate {
|
||||
public:
|
||||
int n;
|
||||
@ -325,6 +345,87 @@ void PermTester(const functor &func)
|
||||
assert(ok==0);
|
||||
}
|
||||
|
||||
|
||||
template<class scal, class vec,class functor >
|
||||
void ExchangeTester(const functor &func)
|
||||
{
|
||||
GridSerialRNG sRNG;
|
||||
sRNG.SeedRandomDevice();
|
||||
|
||||
int Nsimd = vec::Nsimd();
|
||||
|
||||
std::vector<scal> input1(Nsimd);
|
||||
std::vector<scal> input2(Nsimd);
|
||||
std::vector<scal> result1(Nsimd);
|
||||
std::vector<scal> result2(Nsimd);
|
||||
std::vector<scal> reference1(Nsimd);
|
||||
std::vector<scal> reference2(Nsimd);
|
||||
std::vector<scal> test1(Nsimd);
|
||||
std::vector<scal> test2(Nsimd);
|
||||
|
||||
std::vector<vec,alignedAllocator<vec> > buf(6);
|
||||
vec & v_input1 = buf[0];
|
||||
vec & v_input2 = buf[1];
|
||||
vec & v_result1 = buf[2];
|
||||
vec & v_result2 = buf[3];
|
||||
vec & v_test1 = buf[4];
|
||||
vec & v_test2 = buf[5];
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
random(sRNG,input1[i]);
|
||||
random(sRNG,input2[i]);
|
||||
random(sRNG,result1[i]);
|
||||
random(sRNG,result2[i]);
|
||||
}
|
||||
|
||||
merge<vec,scal>(v_input1,input1);
|
||||
merge<vec,scal>(v_input2,input2);
|
||||
merge<vec,scal>(v_result1,result1);
|
||||
merge<vec,scal>(v_result2,result1);
|
||||
|
||||
func(v_result1,v_result2,v_input1,v_input2);
|
||||
func.apply(reference1,reference2,input1,input2);
|
||||
|
||||
func(v_test1,v_test2,v_result1,v_result2);
|
||||
|
||||
extract<vec,scal>(v_result1,result1);
|
||||
extract<vec,scal>(v_result2,result2);
|
||||
extract<vec,scal>(v_test1,test1);
|
||||
extract<vec,scal>(v_test2,test2);
|
||||
|
||||
std::cout<<GridLogMessage << " " << func.name() << " " <<func.n <<std::endl;
|
||||
|
||||
// for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference1[i]<<" "<<result1[i]<<std::endl;
|
||||
// for(int i=0;i<Nsimd;i++) std::cout << " i "<<i<<" "<<reference2[i]<<" "<<result2[i]<<std::endl;
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
int found=0;
|
||||
for(int j=0;j<Nsimd;j++){
|
||||
if(reference1[j]==result1[i]) {
|
||||
found=1;
|
||||
// std::cout << " i "<<i<<" j "<<j<<" "<<reference1[j]<<" "<<result1[i]<<std::endl;
|
||||
}
|
||||
}
|
||||
assert(found==1);
|
||||
}
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
int found=0;
|
||||
for(int j=0;j<Nsimd;j++){
|
||||
if(reference2[j]==result2[i]) {
|
||||
found=1;
|
||||
// std::cout << " i "<<i<<" j "<<j<<" "<<reference2[j]<<" "<<result2[i]<<std::endl;
|
||||
}
|
||||
}
|
||||
assert(found==1);
|
||||
}
|
||||
|
||||
// for(int i=0;i<Nsimd;i++){
|
||||
// std::cout << " i "<< i<<" test1"<<test1[i]<<" "<<input1[i]<<std::endl;
|
||||
// std::cout << " i "<< i<<" test2"<<test2[i]<<" "<<input2[i]<<std::endl;
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
@ -363,6 +464,15 @@ int main (int argc, char ** argv)
|
||||
PermTester<RealF,vRealF>(funcPermute(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealF exchanges "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vRealF::Nsimd();i++){
|
||||
ExchangeTester<RealF,vRealF>(funcExchange(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealF rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
@ -394,6 +504,14 @@ int main (int argc, char ** argv)
|
||||
PermTester<RealD,vRealD>(funcPermute(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealD exchanges "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vRealD::Nsimd();i++){
|
||||
ExchangeTester<RealD,vRealD>(funcExchange(i));
|
||||
}
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vRealD rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
@ -429,6 +547,16 @@ int main (int argc, char ** argv)
|
||||
PermTester<ComplexF,vComplexF>(funcPermute(i));
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexF exchanges "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vComplexF::Nsimd();i++){
|
||||
ExchangeTester<ComplexF,vComplexF>(funcExchange(i));
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexF rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
@ -466,6 +594,15 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexD exchanges "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
// Log2 iteration
|
||||
for(int i=0;(1<<i)< vComplexD::Nsimd();i++){
|
||||
ExchangeTester<ComplexD,vComplexD>(funcExchange(i));
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
std::cout<<GridLogMessage << "Testing vComplexD rotate "<<std::endl;
|
||||
std::cout<<GridLogMessage << "==================================="<< std::endl;
|
||||
|
Loading…
Reference in New Issue
Block a user