mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Cleaning up the single/double kernel implementation switch
This commit is contained in:
		@@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <simd/Intel512wilson.h>
 | 
					#include <simd/Intel512wilson.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef VLOAD
 | 
					#include <simd/Intel512single.h>
 | 
				
			||||||
#undef VSTORE
 | 
					 | 
				
			||||||
#undef VMUL
 | 
					 | 
				
			||||||
#undef VMADD
 | 
					 | 
				
			||||||
#undef ZEND
 | 
					 | 
				
			||||||
#undef ZLOAD
 | 
					 | 
				
			||||||
#undef ZMUL
 | 
					 | 
				
			||||||
#undef ZMADD
 | 
					 | 
				
			||||||
#undef VZERO
 | 
					 | 
				
			||||||
#undef VTIMESI
 | 
					 | 
				
			||||||
#undef VTIMESMINUSI
 | 
					 | 
				
			||||||
#undef VMOVIDUP 
 | 
					 | 
				
			||||||
#undef VMOVRDUP 
 | 
					 | 
				
			||||||
#undef VMADDSUB
 | 
					 | 
				
			||||||
#undef VSHUF
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VZERO(A)                  VZEROf(A)
 | 
					 | 
				
			||||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
					 | 
				
			||||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
					 | 
				
			||||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
					 | 
				
			||||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
					 | 
				
			||||||
#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
 | 
					 | 
				
			||||||
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
					 | 
				
			||||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
					 | 
				
			||||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
					 | 
				
			||||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
					 | 
				
			||||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
					 | 
				
			||||||
#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
					 | 
				
			||||||
#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
					 | 
				
			||||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
					 | 
				
			||||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
					 | 
				
			||||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 | 
					 | 
				
			||||||
#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 | 
					 | 
				
			||||||
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
 | 
					 | 
				
			||||||
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
 | 
					 | 
				
			||||||
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
 | 
					 | 
				
			||||||
#define VSHUF(A,B) VSHUFf(A,B)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
					 | 
				
			||||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
@@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
					  SE=st.GetEntry(ptype,Xm,ss);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if 0
 | 
					 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					 | 
				
			||||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  LOAD64(%r9,pf);
 | 
					 | 
				
			||||||
  __asm__( 
 | 
					 | 
				
			||||||
	  VPREFETCH(0,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(1,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(2,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(3,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(4,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(5,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(6,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(7,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(8,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(9,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(10,%r9)
 | 
					 | 
				
			||||||
	  VPREFETCH(11,%r9) );
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Xm
 | 
					  // Xm
 | 
				
			||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
@@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
				
			|||||||
  offset = SE->_offset;
 | 
					  offset = SE->_offset;
 | 
				
			||||||
  local  = SE->_is_local;
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
  //  PREFETCH_R(A);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Prefetch
 | 
					  // Prefetch
 | 
				
			||||||
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
					  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
				
			||||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										135
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										135
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,135 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512double.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					// No include guard: this header may be included multiple times, because every macro is #undef'd before it is redefined
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VMOV
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#define VZERO(A)                  VZEROd(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B)                 VMOVd(A,B)
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VADD
 | 
				
			||||||
 | 
					#undef VSUB
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#define VADD(A,B,C)               VADDd(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C)               VSUBd(A,B,C)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESI0 
 | 
				
			||||||
 | 
					#undef VTIMESI1
 | 
				
			||||||
 | 
					#undef VTIMESI2 
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI
 | 
				
			||||||
 | 
					#undef VACCTIMESI0
 | 
				
			||||||
 | 
					#undef VACCTIMESI1
 | 
				
			||||||
 | 
					#undef VACCTIMESI2
 | 
				
			||||||
 | 
					#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VPERM0
 | 
				
			||||||
 | 
					#undef VPERM1
 | 
				
			||||||
 | 
					#undef VPERM2
 | 
				
			||||||
 | 
					#undef VPERM3
 | 
				
			||||||
 | 
					#define VPERM0(A,B)               VPERM0d(A,B)
 | 
				
			||||||
 | 
					#define VPERM1(A,B)               VPERM1d(A,B)
 | 
				
			||||||
 | 
					#define VPERM2(A,B)               VPERM2d(A,B)
 | 
				
			||||||
 | 
					#define VPERM3(A,B)               VPERM3d(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VSHUFMEM
 | 
				
			||||||
 | 
					#undef VADDMEM
 | 
				
			||||||
 | 
					#undef VSUBMEM
 | 
				
			||||||
 | 
					#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
 | 
				
			||||||
 | 
					#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
 | 
				
			||||||
 | 
					#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMOVIDUP
 | 
				
			||||||
 | 
					#undef VMOVRDUP
 | 
				
			||||||
 | 
					#undef VMADDSUB
 | 
				
			||||||
 | 
					#undef VSHUF
 | 
				
			||||||
 | 
					#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 | 
				
			||||||
 | 
					#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 | 
				
			||||||
 | 
					#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
 | 
				
			||||||
 | 
					#define VSHUF(A,B)                                       VSHUFd(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef ZMULMEM2SP
 | 
				
			||||||
 | 
					#undef ZMADDMEM2SP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										135
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										135
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,135 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./lib/simd/Intel512single.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					// No include guard: this header may be included multiple times, because every macro is #undef'd before it is redefined
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VMOV
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#define VZERO(A)                  VZEROf(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B)                 VMOVf(A,B)
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VADD
 | 
				
			||||||
 | 
					#undef VSUB
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#define VADD(A,B,C)               VADDf(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESI0 
 | 
				
			||||||
 | 
					#undef VTIMESI1
 | 
				
			||||||
 | 
					#undef VTIMESI2 
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI
 | 
				
			||||||
 | 
					#undef VACCTIMESI0
 | 
				
			||||||
 | 
					#undef VACCTIMESI1
 | 
				
			||||||
 | 
					#undef VACCTIMESI2
 | 
				
			||||||
 | 
					#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI0
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI1MEM
 | 
				
			||||||
 | 
					#undef VACCTIMESMINUSI2MEM
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VPERM0
 | 
				
			||||||
 | 
					#undef VPERM1
 | 
				
			||||||
 | 
					#undef VPERM2
 | 
				
			||||||
 | 
					#undef VPERM3
 | 
				
			||||||
 | 
					#define VPERM0(A,B)               VPERM0f(A,B)
 | 
				
			||||||
 | 
					#define VPERM1(A,B)               VPERM1f(A,B)
 | 
				
			||||||
 | 
					#define VPERM2(A,B)               VPERM2f(A,B)
 | 
				
			||||||
 | 
					#define VPERM3(A,B)               VPERM3f(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VSHUFMEM
 | 
				
			||||||
 | 
					#undef VADDMEM
 | 
				
			||||||
 | 
					#undef VSUBMEM
 | 
				
			||||||
 | 
					#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
				
			||||||
 | 
					#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VMOVIDUP
 | 
				
			||||||
 | 
					#undef VMOVRDUP
 | 
				
			||||||
 | 
					#undef VMADDSUB
 | 
				
			||||||
 | 
					#undef VSHUF
 | 
				
			||||||
 | 
					#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
 | 
				
			||||||
 | 
					#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
 | 
				
			||||||
 | 
					#define VSHUF(A,B)                                       VSHUFf(A,B)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef ZMULMEM2SP
 | 
				
			||||||
 | 
					#undef ZMADDMEM2SP
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 | 
					// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 | 
				
			||||||
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
 | 
					// KNL is DUAL issue for FP, and lifting these loads is potentially important.
 | 
				
			||||||
// Need detailed profile data to be sure.
 | 
					// Need detailed profile data to be sure.
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
#define PREFETCH_U(A) \
 | 
					#define PREFETCH_U(A) \
 | 
				
			||||||
  LOAD64(%r8,&U._odata[sU](A)) \
 | 
					  LOAD64(%r8,&U._odata[sU](A)) \
 | 
				
			||||||
  __asm__ (		       \
 | 
					  __asm__ (		       \
 | 
				
			||||||
@@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  VPREFETCHW(9,%r8)	       \
 | 
					  VPREFETCHW(9,%r8)	       \
 | 
				
			||||||
  VPREFETCHW(10,%r8)	       \
 | 
					  VPREFETCHW(10,%r8)	       \
 | 
				
			||||||
  VPREFETCHW(11,%r8)	       );
 | 
					  VPREFETCHW(11,%r8)	       );
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
 | 
					#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 | 
					#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 | 
				
			||||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 | 
					#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
#define MULT_2SPIN_UNOPT(ptr)				\
 | 
					#define MULT_2SPIN_UNOPT(ptr)				\
 | 
				
			||||||
	   LOAD64(%r8,ptr)			\
 | 
						   LOAD64(%r8,ptr)			\
 | 
				
			||||||
  __asm__ (					\
 | 
					  __asm__ (					\
 | 
				
			||||||
@@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
	   ZEND2(UChi_11,Z3,Chi_10)			\
 | 
						   ZEND2(UChi_11,Z3,Chi_10)			\
 | 
				
			||||||
	   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
						   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
				
			||||||
	   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
						   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 | 
					#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 | 
				
			||||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
 | 
					#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
 | 
				
			||||||
@@ -299,10 +301,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 | 
					#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 | 
				
			||||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
 | 
					#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 | 
					// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 | 
				
			||||||
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 | 
					#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
#define MULT_2SPIN_PF(ptr,pf,VPF)			\
 | 
					#define MULT_2SPIN_PF(ptr,pf,VPF)			\
 | 
				
			||||||
	   LOAD64(%r8,ptr)			\
 | 
						   LOAD64(%r8,ptr)			\
 | 
				
			||||||
	   LOAD64(%r9,pf)			\
 | 
						   LOAD64(%r9,pf)			\
 | 
				
			||||||
@@ -343,8 +345,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
	   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
						   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
				
			||||||
	   VPF(11,%r9)						\
 | 
						   VPF(11,%r9)						\
 | 
				
			||||||
	   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
						   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0 
 | 
				
			||||||
#define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 | 
					#define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 | 
				
			||||||
	   LOAD64(%r8,ptr)			\
 | 
						   LOAD64(%r8,ptr)			\
 | 
				
			||||||
	   LOAD64(%r9,pf)			\
 | 
						   LOAD64(%r9,pf)			\
 | 
				
			||||||
@@ -364,7 +367,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
	   VPF(9,%r9)						\
 | 
						   VPF(9,%r9)						\
 | 
				
			||||||
	   VPF(10,%r9)						\
 | 
						   VPF(10,%r9)						\
 | 
				
			||||||
	   VPF(11,%r9)						);
 | 
						   VPF(11,%r9)						);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Pretty much Perfectly Pipelined
 | 
					// Pretty much Perfectly Pipelined
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -720,7 +723,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  VSUB(UChi_11,result_31,result_31)\
 | 
					  VSUB(UChi_11,result_31,result_31)\
 | 
				
			||||||
  VSUB(UChi_12,result_32,result_32) );
 | 
					  VSUB(UChi_12,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define PREFETCH_CHIMU(A) 
 | 
					//define PREFETCH_CHIMU(A) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define PERMUTE_DIR0 __asm__ ( 	\
 | 
					#define PERMUTE_DIR0 __asm__ ( 	\
 | 
				
			||||||
  VPERM0(Chi_00,Chi_00)	\
 | 
					  VPERM0(Chi_00,Chi_00)	\
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user