mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-24 17:54:47 +01:00 
			
		
		
		
	Optimisation...
This commit is contained in:
		| @@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) | ||||
|   return ; | ||||
| } | ||||
|  | ||||
| void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) | ||||
| void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out) | ||||
| { | ||||
|   assert((dag==0) ||(dag==1)); | ||||
|  | ||||
|   WilsonCompressor compressor(dag); | ||||
|   Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor); | ||||
|  | ||||
| PARALLEL_FOR_LOOP | ||||
|   for(int sss=0;sss<grid->oSites();sss++){ | ||||
|  | ||||
|     vHalfSpinColourVector  tmp;     | ||||
|     vHalfSpinColourVector  chi;     | ||||
|     vSpinColourVector result; | ||||
| @@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP | ||||
|     int offset,local,perm, ptype; | ||||
|  | ||||
|     //    int ss = Stencil._LebesgueReorder[sss]; | ||||
|     int ss = sss; | ||||
|     int ssu= ss; | ||||
|  | ||||
|     // Xp | ||||
|     int idx = (Xp+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     offset = Stencil._offsets [Xp][ss]; | ||||
|     local  = Stencil._is_local[Xp][ss]; | ||||
|     perm   = Stencil._permute[Xp][ss]; | ||||
|  | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     ptype  = Stencil._permute_type[Xp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjXp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); | ||||
|     spReconXp(result,Uchi); | ||||
|  | ||||
|     // Yp | ||||
|     idx = (Yp+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|  | ||||
|     offset = Stencil._offsets [Yp][ss]; | ||||
|     local  = Stencil._is_local[Yp][ss]; | ||||
|     perm   = Stencil._permute[Yp][ss]; | ||||
|     ptype  = Stencil._permute_type[Yp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjYp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); | ||||
|     accumReconYp(result,Uchi); | ||||
|  | ||||
|     // Zp | ||||
|     idx = (Zp+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Zp][ss]; | ||||
|     local  = Stencil._is_local[Zp][ss]; | ||||
|     perm   = Stencil._permute[Zp][ss]; | ||||
|     ptype  = Stencil._permute_type[Zp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjZp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); | ||||
|     accumReconZp(result,Uchi); | ||||
|  | ||||
|     // Tp | ||||
|     idx = (Tp+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Tp][ss]; | ||||
|     local  = Stencil._is_local[Tp][ss]; | ||||
|     perm   = Stencil._permute[Tp][ss]; | ||||
|     ptype  = Stencil._permute_type[Tp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjTp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); | ||||
|     accumReconTp(result,Uchi); | ||||
|  | ||||
|     // Xm | ||||
|     idx = (Xm+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Xm][ss]; | ||||
|     local  = Stencil._is_local[Xm][ss]; | ||||
|     perm   = Stencil._permute[Xm][ss]; | ||||
|     ptype  = Stencil._permute_type[Xm]; | ||||
|  | ||||
|     if ( local && perm )  | ||||
|     { | ||||
| @@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); | ||||
|     accumReconXm(result,Uchi); | ||||
|  | ||||
|  | ||||
|     // Ym | ||||
|     idx = (Ym+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Ym][ss]; | ||||
|     local  = Stencil._is_local[Ym][ss]; | ||||
|     perm   = Stencil._permute[Ym][ss]; | ||||
|     ptype  = Stencil._permute_type[Ym]; | ||||
|  | ||||
|     if ( local && perm ) { | ||||
|       spProjYm(tmp,in._odata[offset]); | ||||
| @@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); | ||||
|     accumReconYm(result,Uchi); | ||||
|  | ||||
|     // Zm | ||||
|     idx = (Zm+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Zm][ss]; | ||||
|     local  = Stencil._is_local[Zm][ss]; | ||||
|     perm   = Stencil._permute[Zm][ss]; | ||||
|     ptype  = Stencil._permute_type[Zm]; | ||||
|     if ( local && perm ) { | ||||
|       spProjZm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); | ||||
|     accumReconZm(result,Uchi); | ||||
|  | ||||
|     // Tm | ||||
|     idx = (Tm+dag*4)%8; | ||||
|     offset = Stencil._offsets [idx][ss]; | ||||
|     local  = Stencil._is_local[idx][ss]; | ||||
|     perm   = Stencil._permute[idx][ss]; | ||||
|     ptype  = Stencil._permute_type[idx]; | ||||
|     offset = Stencil._offsets [Tm][ss]; | ||||
|     local  = Stencil._is_local[Tm][ss]; | ||||
|     perm   = Stencil._permute[Tm][ss]; | ||||
|     ptype  = Stencil._permute_type[Tm]; | ||||
|     if ( local && perm ) { | ||||
|       spProjTm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
| @@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); | ||||
|     accumReconTm(result,Uchi); | ||||
|  | ||||
|     vstream(out._odata[ss],result); | ||||
|   } | ||||
| } | ||||
| void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out) | ||||
| { | ||||
|     vHalfSpinColourVector  tmp;     | ||||
|     vHalfSpinColourVector  chi;     | ||||
|     vSpinColourVector result; | ||||
|     vHalfSpinColourVector Uchi; | ||||
|     int offset,local,perm, ptype; | ||||
|  | ||||
|     int ssu= ss; | ||||
|  | ||||
|     // Xp | ||||
|     offset = Stencil._offsets [Xm][ss]; | ||||
|     local  = Stencil._is_local[Xm][ss]; | ||||
|     perm   = Stencil._permute[Xm][ss]; | ||||
|  | ||||
|     ptype  = Stencil._permute_type[Xm]; | ||||
|     if ( local && perm ) { | ||||
|       spProjXp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjXp(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); | ||||
|     spReconXp(result,Uchi); | ||||
|  | ||||
|     // Yp | ||||
|     offset = Stencil._offsets [Ym][ss]; | ||||
|     local  = Stencil._is_local[Ym][ss]; | ||||
|     perm   = Stencil._permute[Ym][ss]; | ||||
|     ptype  = Stencil._permute_type[Ym]; | ||||
|     if ( local && perm ) { | ||||
|       spProjYp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjYp(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); | ||||
|     accumReconYp(result,Uchi); | ||||
|  | ||||
|     // Zp | ||||
|     offset = Stencil._offsets [Zm][ss]; | ||||
|     local  = Stencil._is_local[Zm][ss]; | ||||
|     perm   = Stencil._permute[Zm][ss]; | ||||
|     ptype  = Stencil._permute_type[Zm]; | ||||
|     if ( local && perm ) { | ||||
|       spProjZp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjZp(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); | ||||
|     accumReconZp(result,Uchi); | ||||
|  | ||||
|     // Tp | ||||
|     offset = Stencil._offsets [Tm][ss]; | ||||
|     local  = Stencil._is_local[Tm][ss]; | ||||
|     perm   = Stencil._permute[Tm][ss]; | ||||
|     ptype  = Stencil._permute_type[Tm]; | ||||
|     if ( local && perm ) { | ||||
|       spProjTp(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjTp(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); | ||||
|     accumReconTp(result,Uchi); | ||||
|  | ||||
|     // Xm | ||||
|     offset = Stencil._offsets [Xp][ss]; | ||||
|     local  = Stencil._is_local[Xp][ss]; | ||||
|     perm   = Stencil._permute[Xp][ss]; | ||||
|     ptype  = Stencil._permute_type[Xp]; | ||||
|  | ||||
|     if ( local && perm )  | ||||
|     { | ||||
|       spProjXm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjXm(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); | ||||
|     accumReconXm(result,Uchi); | ||||
|  | ||||
|  | ||||
|     // Ym | ||||
|     offset = Stencil._offsets [Yp][ss]; | ||||
|     local  = Stencil._is_local[Yp][ss]; | ||||
|     perm   = Stencil._permute[Yp][ss]; | ||||
|     ptype  = Stencil._permute_type[Yp]; | ||||
|  | ||||
|     if ( local && perm ) { | ||||
|       spProjYm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjYm(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); | ||||
|     accumReconYm(result,Uchi); | ||||
|  | ||||
|     // Zm | ||||
|     offset = Stencil._offsets [Zp][ss]; | ||||
|     local  = Stencil._is_local[Zp][ss]; | ||||
|     perm   = Stencil._permute[Zp][ss]; | ||||
|     ptype  = Stencil._permute_type[Zp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjZm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjZm(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); | ||||
|     accumReconZm(result,Uchi); | ||||
|  | ||||
|     // Tm | ||||
|     offset = Stencil._offsets [Tp][ss]; | ||||
|     local  = Stencil._is_local[Tp][ss]; | ||||
|     perm   = Stencil._permute[Tp][ss]; | ||||
|     ptype  = Stencil._permute_type[Tp]; | ||||
|     if ( local && perm ) { | ||||
|       spProjTm(tmp,in._odata[offset]); | ||||
|       permute(chi,tmp,ptype); | ||||
|     } else if ( local ) { | ||||
|       spProjTm(chi,in._odata[offset]); | ||||
|     } else {  | ||||
|       chi=comm_buf[offset]; | ||||
|     } | ||||
|     mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); | ||||
|     accumReconTm(result,Uchi); | ||||
|  | ||||
|     vstream(out._odata[ss],result); | ||||
| } | ||||
|  | ||||
| void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) | ||||
| { | ||||
|   assert((dag==0) ||(dag==1)); | ||||
|  | ||||
|   WilsonCompressor compressor(dag); | ||||
|   Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor); | ||||
|  | ||||
|   if ( dag ) { | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int sss=0;sss<grid->oSites();sss++){ | ||||
|       DhopSiteDag(sss,in,out); | ||||
|     } | ||||
|   } else { | ||||
| PARALLEL_FOR_LOOP | ||||
|     for(int sss=0;sss<grid->oSites();sss++){ | ||||
|       DhopSite(sss,in,out); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -46,6 +46,8 @@ namespace Grid { | ||||
|  | ||||
|       // non-hermitian hopping term; half cb or both | ||||
|       void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); | ||||
|       void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out); | ||||
|       void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out); | ||||
|  | ||||
|       typedef iScalar<iMatrix<vComplex, Nc> > matrix; | ||||
|  | ||||
|   | ||||
							
								
								
									
										9
									
								
								scripts/bench_wilson.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										9
									
								
								scripts/bench_wilson.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,9 @@ | ||||
| for omp in 1 2 4 | ||||
| do | ||||
| echo > wilson.t$omp | ||||
| for vol in 4.4.4.4 4.4.4.8 4.4.8.8  4.8.8.8  8.8.8.8   8.8.8.16 8.8.16.16  8.16.16.16 | ||||
| do    | ||||
| perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp  | grep mflop | awk '{print $3}'` | ||||
| echo $vol $perf >> wilson.t$omp | ||||
| done | ||||
| done | ||||
							
								
								
									
										7
									
								
								scripts/wilson.gnu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								scripts/wilson.gnu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,7 @@ | ||||
| plot 'wilson.t1' u 2 w l t "AVX1-OMP=1" | ||||
| replot 'wilson.t2' u 2 w l t "AVX1-OMP=2" | ||||
| replot 'wilson.t4' u 2 w l t "AVX1-OMP=4" | ||||
| set terminal 'pdf' | ||||
| set output 'wilson_clang.pdf' | ||||
| replot | ||||
| quit | ||||
		Reference in New Issue
	
	Block a user