mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Changes to remove warnings under icc; disambiguate AVX512 from IMCI correctly
and drop swizzles in AVX512. Don't know why these compiled.
This commit is contained in:
		
							
								
								
									
										15
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										15
									
								
								configure
									
									
									
									
										vendored
									
									
								
							@@ -1384,9 +1384,9 @@ Optional Features:
 | 
				
			|||||||
  --disable-dependency-tracking
 | 
					  --disable-dependency-tracking
 | 
				
			||||||
                          speeds up one-time build
 | 
					                          speeds up one-time build
 | 
				
			||||||
  --disable-openmp        do not use OpenMP
 | 
					  --disable-openmp        do not use OpenMP
 | 
				
			||||||
  --enable-simd=SSE4|AVX|AVX2|AVX512|MIC
 | 
					  --enable-simd=SSE4|AVX|AVX2|AVX512|IMCI
 | 
				
			||||||
                          Select instructions to be SSE4.0, AVX 1.0, AVX
 | 
					                          Select instructions to be SSE4.0, AVX 1.0, AVX
 | 
				
			||||||
                          2.0+FMA, AVX 512, MIC
 | 
					                          2.0+FMA, AVX 512, IMCI
 | 
				
			||||||
  --enable-precision=single|double
 | 
					  --enable-precision=single|double
 | 
				
			||||||
                          Select default word size of Real
 | 
					                          Select default word size of Real
 | 
				
			||||||
  --enable-comms=none|mpi Select communications
 | 
					  --enable-comms=none|mpi Select communications
 | 
				
			||||||
@@ -6414,13 +6414,20 @@ $as_echo "#define AVX2 1" >>confdefs.h
 | 
				
			|||||||
$as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;}
 | 
					$as_echo "$as_me: WARNING: Your processor does not support AVX2 instructions" >&2;}
 | 
				
			||||||
       fi
 | 
					       fi
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     AVX512|MIC)
 | 
					     AVX512)
 | 
				
			||||||
       echo Configuring for AVX512 and MIC
 | 
					       echo Configuring for AVX512
 | 
				
			||||||
 | 
					
 | 
				
			||||||
$as_echo "#define AVX512 1" >>confdefs.h
 | 
					$as_echo "#define AVX512 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
       supported="cross compilation"
 | 
					       supported="cross compilation"
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
 | 
					     IMCI)
 | 
				
			||||||
 | 
					       echo Configuring for IMCI
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$as_echo "#define IMCI 1" >>confdefs.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					       supported="cross compilation"
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
     NEONv8)
 | 
					     NEONv8)
 | 
				
			||||||
       echo Configuring for experimental ARMv8a support
 | 
					       echo Configuring for experimental ARMv8a support
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										15
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								configure.ac
									
									
									
									
									
								
							@@ -65,8 +65,8 @@ AC_CHECK_FUNCS([gettimeofday])
 | 
				
			|||||||
#Please install or provide the correct path to your installation
 | 
					#Please install or provide the correct path to your installation
 | 
				
			||||||
#Info at: http://www.mpfr.org/)])
 | 
					#Info at: http://www.mpfr.org/)])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
 | 
					AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
 | 
				
			||||||
	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
 | 
						[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
				
			||||||
	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
						[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
supported=no
 | 
					supported=no
 | 
				
			||||||
@@ -99,9 +99,14 @@ case ${ac_SIMD} in
 | 
				
			|||||||
       AC_MSG_WARN([Your processor does not support AVX2 instructions])
 | 
					       AC_MSG_WARN([Your processor does not support AVX2 instructions])
 | 
				
			||||||
       fi
 | 
					       fi
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     AVX512|MIC)
 | 
					     AVX512)
 | 
				
			||||||
       echo Configuring for AVX512 and MIC
 | 
					       echo Configuring for AVX512 
 | 
				
			||||||
       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Corner] )
 | 
					       AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] )
 | 
				
			||||||
 | 
					       supported="cross compilation"
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
 | 
					     IMCI)
 | 
				
			||||||
 | 
					       echo Configuring for IMCI
 | 
				
			||||||
 | 
					       AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
 | 
				
			||||||
       supported="cross compilation"
 | 
					       supported="cross compilation"
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
     NEONv8)
 | 
					     NEONv8)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -17,6 +17,9 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
					#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Lanczos support
 | 
				
			||||||
 | 
					#include <algorithms/iterative/MatrixUtils.h>
 | 
				
			||||||
 | 
					#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <algorithms/CoarsenedMatrix.h>
 | 
					#include <algorithms/CoarsenedMatrix.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -73,6 +73,5 @@ operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return t
 | 
				
			|||||||
template<typename _Tp>  inline bool
 | 
					template<typename _Tp>  inline bool
 | 
				
			||||||
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 | 
					operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    
 | 
					 | 
				
			||||||
}; // namespace Grid
 | 
					}; // namespace Grid
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -6,7 +6,7 @@
 | 
				
			|||||||
/* AVX2 Intrinsics */
 | 
					/* AVX2 Intrinsics */
 | 
				
			||||||
#undef AVX2
 | 
					#undef AVX2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* AVX512 Intrinsics for Knights Corner */
 | 
					/* AVX512 Intrinsics for Knights Landing */
 | 
				
			||||||
#undef AVX512
 | 
					#undef AVX512
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* EMPTY_SIMD only for DEBUGGING */
 | 
					/* EMPTY_SIMD only for DEBUGGING */
 | 
				
			||||||
@@ -110,6 +110,9 @@
 | 
				
			|||||||
/* Define to 1 if you have the <unistd.h> header file. */
 | 
					/* Define to 1 if you have the <unistd.h> header file. */
 | 
				
			||||||
#undef HAVE_UNISTD_H
 | 
					#undef HAVE_UNISTD_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* IMCI Intrinsics for Knights Corner */
 | 
				
			||||||
 | 
					#undef IMCI
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* NEON ARMv8 Experimental support */
 | 
					/* NEON ARMv8 Experimental support */
 | 
				
			||||||
#undef NEONv8
 | 
					#undef NEONv8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										388
									
								
								lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										388
									
								
								lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,388 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_IRL_H
 | 
				
			||||||
 | 
					#define GRID_IRL_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					    // Base classes for iterative processes based on operators
 | 
				
			||||||
 | 
					    // single input vec, single output vec.
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  template<class Field> 
 | 
				
			||||||
 | 
					    class ImplicitlyRestartedLanczos {
 | 
				
			||||||
 | 
					public:       
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int Niter;
 | 
				
			||||||
 | 
					    int Nk;
 | 
				
			||||||
 | 
					    int Np;
 | 
				
			||||||
 | 
					    RealD enorm;
 | 
				
			||||||
 | 
					    RealD vthr;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    LinearOperatorBase<Field> &_Linop;
 | 
				
			||||||
 | 
					    OperatorFunction<Field>   &_poly;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Bind the solver to an operator and a polynomial functor, and record
					    // the run parameters.
					    //
					    //   Linop, poly : kept as references (the members _Linop and _poly are
					    //                 references), so both objects must outlive this one.
					    //   _Nk, _Np    : stored in Nk and Np.
					    //   _enorm      : stored in enorm.
					    //   _vthrs      : stored in vthr.
					    //   _Niter      : stored in Niter.
					    //
					    // The initializers are listed in member declaration order
					    // (Niter, Nk, Np, enorm, vthr, _Linop, _poly): members are always
					    // initialized in that order regardless of how the list is written, and
					    // a mismatching list triggers a reorder warning.
					    ImplicitlyRestartedLanczos(
					                               LinearOperatorBase<Field> &Linop,
					                               OperatorFunction<Field> & poly,
					                               int _Nk,
					                               int _Np,
					                               RealD _enorm,
					                               RealD _vthrs,
					                               int _Niter) :
					      Niter(_Niter),
					      Nk(_Nk),
					      Np(_Np),
					      enorm(_enorm),
					      vthr(_vthrs),
					      _Linop(Linop),
					      _poly(poly)
					    {
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // One Lanczos step for index k (requires k < Nm).
					    //
					    //   lmda : receives alph, the overlap of evec[k] with the operator image, at [k].
					    //   lmdb : receives beta, the norm of the residual, at [k].
					    //   evec : evec[k] (and evec[k-1] for k>0) are read; evec[k+1] is written
					    //          (unconditionally when k==0, only when k < Nm-1 otherwise).
					    //   f    : work vector; on return it holds the residual of this step.
					    //
					    // The body previously referred to lmd, lme and w, none of which are
					    // declared here; they are now the parameters lmda, lmdb and f.
					    //
					    // NOTE(review): `opr_` is not declared in this class (the members are
					    // _Linop and _poly), and `Field * Field` is used as a scalar product.
					    // Both look inherited from another code base and still need porting.
					    void step(Vector<RealD>& lmda,
					              Vector<RealD>& lmdb,
					              Vector<Field>& evec,
					              Field& f,int Nm,int k)
					    {
					      assert( k< Nm );

					      // The caller rescales f after the last step, so the residual is
					      // accumulated directly in f.
					      Field& w = f;

					      w = opr_->mult(evec[k]);

					      if(k==0){  // Initial step

					        RealD wnorm= w*w;
					        std::cout<<"wnorm="<<wnorm<<std::endl;

					        RealD alph = evec[k] * w;
					        w -= alph * evec[k];
					        lmda[k] = alph;

					        RealD beta = w * w;
					        beta = sqrt(beta);
					        RealD betar = 1.0/beta;

					        // Here w itself is left unnormalized; only the copy stored in
					        // evec[k+1] is scaled.
					        evec[k+1] = betar * w;
					        lmdb[k] = beta;

					      } else {   // Iteration step

					        w -= lmdb[k-1] * evec[k-1];

					        RealD alph = evec[k] * w;
					        w -= alph * evec[k];

					        RealD beta = w * w;
					        beta = sqrt(beta);
					        RealD betar = 1.0/beta;
					        w *= betar;

					        lmda[k] = alph;
					        lmdb[k] = beta;
					        orthogonalize(w,evec,k);

					        // The last step (k == Nm-1) has no evec[k+1] slot to fill.
					        if(k < Nm-1) evec[k+1] = w;
					      }
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Apply one shifted sweep of plane rotations to the tridiagonal matrix
					    // held as diagonal lmda and off-diagonal lmdb, accumulating the same
					    // rotations into Qt.
					    //
					    //   lmda, lmdb : diagonal / off-diagonal entries, updated in place.
					    //   Nk         : number of entries of each Qt row pair that are rotated
					    //                (this parameter hides the class member Nk).
					    //   Nm         : stride used to index Qt as Qt[i + Nm*row].
					    //   Qt         : rotation accumulator, updated in place.
					    //   Dsft       : shift used to build the first rotation.
					    //   kmin, kmax : 1-based bounds of the active block; rows kmin-1 .. kmax-1
					    //                are touched.
					    //
					    // The body previously referred to lmd, lme and Dsh, which are not
					    // declared here; they are now the parameters lmda, lmdb and Dsft.
					    void qr_decomp(Vector<RealD>& lmda,
					                   Vector<RealD>& lmdb,
					                   int Nk,
					                   int Nm,
					                   Vector<RealD>& Qt,
					                   RealD Dsft,
					                   int kmin,
					                   int kmax)
					    {
					      const int k0 = kmin-1;
					      RealD x;

					      // First rotation: built from the shifted leading diagonal entry.
					      {
					        RealD Fden = 1.0/sqrt((lmda[k0]-Dsft)*(lmda[k0]-Dsft) +lmdb[k0]*lmdb[k0]);
					        RealD c = ( lmda[k0] -Dsft) *Fden;
					        RealD s = -lmdb[k0] *Fden;

					        RealD tmpa1 = lmda[k0];
					        RealD tmpa2 = lmda[k0+1];
					        RealD tmpb  = lmdb[k0];

					        lmda[k0]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
					        lmda[k0+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
					        lmdb[k0]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
					        x          = -s*lmdb[k0+1];   // element pushed outside the tridiagonal band
					        lmdb[k0+1] = c*lmdb[k0+1];

					        for(int i=0; i<Nk; ++i){
					          RealD Qtmp1 = Qt[i+Nm*k0    ];
					          RealD Qtmp2 = Qt[i+Nm*(k0+1)];
					          Qt[i+Nm*k0    ] = c*Qtmp1 - s*Qtmp2;
					          Qt[i+Nm*(k0+1)] = s*Qtmp1 + c*Qtmp2;
					        }
					      }

					      // Givens transformations: each rotation is built from x and lmdb[k-1],
					      // and x is recomputed for the next row.
					      for(int k = kmin; k < kmax-1; ++k){
					        RealD Fden = 1.0/sqrt( x*x +lmdb[k-1]*lmdb[k-1]);
					        RealD c = lmdb[k-1]*Fden;
					        RealD s = - x*Fden;

					        RealD tmpa1 = lmda[k];
					        RealD tmpa2 = lmda[k+1];
					        RealD tmpb  = lmdb[k];

					        lmda[k]   = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
					        lmda[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
					        lmdb[k]   = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
					        lmdb[k-1] = c*lmdb[k-1] -s*x;

					        // On the last row of the block there is no lmdb[k+1] to update.
					        if(k != kmax-2){
					          x = -s*lmdb[k+1];
					          lmdb[k+1] = c*lmdb[k+1];
					        }

					        for(int i=0; i<Nk; ++i){
					          RealD Qtmp1 = Qt[i+Nm*k    ];
					          RealD Qtmp2 = Qt[i+Nm*(k+1)];
					          Qt[i+Nm*k    ] = c*Qtmp1 -s*Qtmp2;
					          Qt[i+Nm*(k+1)] = s*Qtmp1 +c*Qtmp2;
					        }
					      }
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Diagonalize the tridiagonal matrix (diagonal lmda, off-diagonal lmdb)
					    // by repeated shifted sweeps of qr_decomp, shrinking the active block
					    // [kmin,kmax] as off-diagonal entries become negligible.
					    //
					    //   lmda, lmdb : updated in place.
					    //   Nm2        : size of the block to diagonalize; must be at least 2
					    //                because entries [Nm2-1] and [Nm2-2] are read.
					    //   Nm         : stride of Qt, also sets the sweep limit (100*Nm).
					    //   Qt         : rotation accumulator handed through to qr_decomp.
					    //
					    // Returns once no off-diagonal entry in the active block is significant.
					    // If the sweep limit is reached first, an error is printed and abort()
					    // is called.
					    //
					    // Fixes relative to the previous body:
					    //   - lmd/lme were undeclared; the parameters lmda/lmdb are used.
					    //   - the block size came from the class member Nk, silently ignoring
					    //     the Nm2 argument supplied by the caller; Nm2 is used now.
					    //   - the local sweep limit no longer hides the member Niter.
					    //   - dsub/fabs(dsub) produced NaN when dsub == 0; the sign is now
					    //     taken with a comparison (dsub == 0 counts as positive).
					    void diagonalize(Vector<RealD>& lmda,
					                     Vector<RealD>& lmdb,
					                     int Nm2,
					                     int Nm,
					                     Vector<RealD>& Qt)
					    {
					      const int maxSweeps = 100*Nm;
					      int kmin = 1;
					      int kmax = Nm2;
					      // (this should be more sophisticated)

					      for(int iter=0; iter<maxSweeps; ++iter){

					        // determination of 2x2 leading submatrix
					        RealD dsub = lmda[kmax-1]-lmda[kmax-2];
					        RealD dd = sqrt(dsub*dsub + 4.0*lmdb[kmax-2]*lmdb[kmax-2]);
					        RealD signed_dd = (dsub < 0.0) ? -dd : dd;
					        RealD Dsh = 0.5*(lmda[kmax-2]+lmda[kmax-1] +signed_dd);
					        // (Dsh: shift)

					        // transformation
					        qr_decomp(lmda,lmdb,Nm2,Nm,Qt,Dsh,kmin,kmax);

					        // Convergence criterion (redefinition of kmin and kmax).
					        // "a + dds > dds" is true only while a is not lost to rounding
					        // when added to dds.
					        bool converged = true;
					        for(int j=kmax-1; j>= kmin; --j){
					          RealD dds = fabs(lmda[j-1])+fabs(lmda[j]);
					          if(fabs(lmdb[j-1])+dds > dds){
					            kmax = j+1;
					            converged = false;
					            break;
					          }
					        }
					        if(converged) return;

					        for(int j=0; j<kmax-1; ++j){
					          RealD dds = fabs(lmda[j])+fabs(lmda[j+1]);
					          if(fabs(lmdb[j])+dds > dds){
					            kmin = j+1;
					            break;
					          }
					        }
					      }
					      std::cout << "[QL method] Error - Too many iteration: "<<maxSweeps<<"\n";
					      abort();
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Schmidt orthogonalization of w against evec[0] .. evec[k-1].
					    //
					    // w is addressed as interleaved pairs: even offsets through the slice
					    // `re`, odd offsets through the slice `im` (hence the even-size assert).
					    // For every j < k two scalars are formed from evec[j] and w, and the
					    // matching combination of evec[j]'s two halves is added to w.
					    //
					    // `valarray` is now qualified with std::, matching std::slice.
					    //
					    // NOTE(review): size(), operator[](std::slice), im_prod() and add() are
					    // assumed to exist on Field, and `Field * Field` is assumed to return a
					    // real scalar. None of that can be confirmed from this file - verify
					    // against the Field type actually used.
					    void orthogonalize(Field& w,
					                       const Vector<Field>& evec,
					                       int k)
					    {
					      size_t size = w.size();
					      assert(size%2 ==0);

					      std::slice re(0,size/2,2);   // offsets 0,2,4,...
					      std::slice im(1,size/2,2);   // offsets 1,3,5,...

					      for(int j=0; j<k; ++j){
					        RealD prdr = evec[j]*w;
					        RealD prdi = evec[j].im_prod(w);

					        std::valarray<RealD> evr(evec[j][re]);
					        std::valarray<RealD> evi(evec[j][im]);

					        w.add(re, -prdr*evr +prdi*evi);
					        w.add(im, -prdr*evi -prdi*evr);
					      }
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // Driver of the restarted iteration.
					    //
					    //   lmd   : work array on entry (size must exceed Nk+Np); on success it
					    //           is cleared and refilled with the accepted values.
					    //   evec  : work array on entry (size must exceed Nk+Np, evec[0] is
					    //           overwritten with a uniform start vector); on success it is
					    //           cleared and refilled with the accepted vectors.
					    //   b     : not used by this body.
					    //   Nsbt  : set to Kdis - Kthrs on success.
					    //   Nconv : set to the iteration index at which the loop stopped
					    //           (-1 until then).
					    //
					    // Prints progress to std::cout. If Niter restarts pass without any
					    // accepted value satisfying the saturation test, prints a message and
					    // calls abort().
					    //
					    // Fixes relative to the previous body:
					    //   - Nk_/Np_/Niter_/enorm_ did not exist; the members are Nk, Np,
					    //     Niter and enorm.
					    //   - scratch arrays were declared as `vector<>` but are passed to
					    //     step/qr_decomp/diagonalize, which take Vector<>&.
					    //   - endl/setw/setiosflags/resetiosflags/ios_base are qualified.
					    //   - the goto out of the restart loop is replaced by a flag.
					    //
					    // NOTE(review): `opr_`, `sort_` and `setUnit_Qt` are not declared
					    // anywhere in this class, and Field is used with size()/resize()/
					    // Field(size_t)/scalar assignment/`Field * Field` as a scalar. These
					    // look inherited from another code base and still need porting.
					    void calc(Vector<RealD>& lmd,
					              Vector<Field>& evec,
					              const Field& b,
					              int& Nsbt,
					              int& Nconv)
					    {
					      const size_t fsize = evec[0].size();

					      Nconv = -1;
					      Nsbt = 0;
					      int Nm = Nk+Np;

					      std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
					      std::cout << " -- Nm = " << Nm << std::endl;
					      std::cout << " -- size of lmd   = " << lmd.size() << std::endl;
					      std::cout << " -- size of evec  = " << evec.size() << std::endl;

					      assert(Nm < evec.size() && Nm < lmd.size());

					      Vector<RealD> lme(Nm);
					      Vector<RealD> lmd2(Nm);
					      Vector<RealD> lme2(Nm);
					      Vector<RealD> Qt(Nm*Nm);
					      std::vector<int> Iconv(Nm);

					      Vector<Field>  B(Nm);
					      for(int k=0; k<Nm; ++k) B[k].resize(fsize);

					      Field f(fsize);
					      Field v(fsize);

					      int k1 = 1;
					      int k2 = Nk;
					      int kconv = 0;   // never changed below, so Nm2 == Nm on every restart

					      int Kdis  = 0;
					      int Kthrs = 0;
					      RealD beta_k;

					      // Set initial vector
					      evec[0] = 1.0;
					      RealD vnorm = evec[0]*evec[0];
					      evec[0] = 1.0/sqrt(vnorm);
					      // (uniform vector)

					      // Initial Nk steps
					      for(int k=0; k<k2; ++k) step(lmd,lme,evec,f,Nm,k);

					      // Restarting loop begins
					      bool converged = false;
					      for(int iter = 0; iter<Niter; ++iter){
					        std::cout<<"\n iteration = "<< iter << std::endl;

					        int Nm2 = Nm - kconv;
					        for(int k=k2; k<Nm; ++k) step(lmd,lme,evec,f,Nm,k);
					        f *= lme[Nm-1];

					        // getting eigenvalues
					        for(int k=0; k<Nm2; ++k){
					          lmd2[k] = lmd[k+k1-1];
					          lme2[k] = lme[k+k1-1];
					        }
					        setUnit_Qt(Nm,Qt);
					        diagonalize(lmd2,lme2,Nm2,Nm,Qt);
					        // sorting
					        sort_->push(lmd2,Nm);

					        // Implicitly shifted QR transformations, one sweep per
					        // value lmd2[k2 .. Nm-1] used as shift
					        setUnit_Qt(Nm,Qt);
					        for(int ip=k2; ip<Nm; ++ip)
					          qr_decomp(lmd,lme,Nm,Nm,Qt,lmd2[ip],k1,Nm);

					        for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;

					        for(int j=k1-1; j<k2+1; ++j){
					          for(int k=0; k<Nm; ++k){
					            B[j] += Qt[k+Nm*j] * evec[k];
					          }
					        }
					        for(int j=k1-1; j<k2+1; ++j) evec[j] = B[j];

					        // Compressed vector f and beta(k2)
					        f *= Qt[Nm-1+Nm*(k2-1)];
					        f += lme[k2-1] * evec[k2];
					        beta_k = f * f;
					        beta_k = sqrt(beta_k);
					        std::cout<<" beta(k) = "<<beta_k<<std::endl;

					        RealD betar = 1.0/beta_k;
					        evec[k2] = betar * f;
					        lme[k2-1] = beta_k;

					        // Convergence test
					        for(int k=0; k<Nm2; ++k){
					          lmd2[k] = lmd[k];
					          lme2[k] = lme[k];
					        }
					        setUnit_Qt(Nm,Qt);
					        diagonalize(lmd2,lme2,Nk,Nm,Qt);

					        for(int k = 0; k<Nk; ++k) B[k]=0.0;

					        for(int j = 0; j<Nk; ++j){
					          for(int k = 0; k<Nk; ++k){
					            B[j] += Qt[k+j*Nm] * evec[k];
					          }
					        }
					        Kdis = 0;
					        Kthrs = 0;

					        std::cout << std::setiosflags(std::ios_base::scientific);
					        for(int i=0; i<Nk; ++i){
					          v = opr_->mult(B[i]);
					          //std::cout<<"vv="<<v*v<<std::endl;

					          // Quotient of B[i] against the operator, and the squared
					          // norm of what is left after removing that component.
					          RealD vnum = B[i]*v;
					          RealD vden = B[i]*B[i];
					          lmd2[i] = vnum/vden;
					          v -= lmd2[i]*B[i];
					          RealD vv = v*v;

					          std::cout << " [" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
					          std::cout << std::setw(25)<< std::setiosflags(std::ios_base::left)<< lmd2[i];
					          std::cout <<"  "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;

					          if(vv<enorm){
					            Iconv[Kdis] = i;
					            ++Kdis;
					            if(sort_->saturated(lmd2[i],vthr)) ++Kthrs;
					            std::cout<<"Kthrs="<<Kthrs<<std::endl;
					          }
					        }  // i-loop end
					        std::cout << std::resetiosflags(std::ios_base::scientific);


					        std::cout<<" #modes converged: "<<Kdis<<std::endl;

					        if(Kthrs > 0){
					          // (there is a converged eigenvalue larger than Vthrs.)
					          Nconv = iter;
					          converged = true;
					          break;
					        }
					      } // end of iter loop

					      if(!converged){
					        std::cout<<"\n NOT converged.\n";
					        abort();
					      }

					      // Sorting: hand back only the accepted values and vectors
					      lmd.clear();
					      evec.clear();
					      for(int i=0; i<Kdis; ++i){
					        lmd.push_back(lmd2[Iconv[i]]);
					        evec.push_back(B[Iconv[i]]);
					      }
					      sort_->push(lmd,evec,Kdis);
					      Nsbt = Kdis - Kthrs;

					      std::cout << "\n Converged\n Summary :\n";
					      std::cout << " -- Iterations  = "<< Nconv  << "\n";
					      std::cout << " -- beta(k)     = "<< beta_k << "\n";
					      std::cout << " -- Kdis        = "<< Kdis   << "\n";
					      std::cout << " -- Nsbt        = "<< Nsbt   << "\n";
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										48
									
								
								lib/algorithms/iterative/MatrixUtils.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								lib/algorithms/iterative/MatrixUtils.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,48 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_MATRIX_UTILS_H
 | 
				
			||||||
 | 
					#define GRID_MATRIX_UTILS_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  namespace MatrixUtils { 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<class T> inline void Size(Matrix<T>& A,int &N,int &M){
 | 
				
			||||||
 | 
					      N=A.size(); assert(N>0);
 | 
				
			||||||
 | 
					      M=A[0].size();
 | 
				
			||||||
 | 
					      for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
						assert(A[i].size()==M);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<class T> inline void SizeSquare(Matrix<T>& A,int &N)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      int M;
 | 
				
			||||||
 | 
					      Size(A,N,M);
 | 
				
			||||||
 | 
					      assert(N==M);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    template<class T> inline void Fill(Matrix<T>& A,T & val)
 | 
				
			||||||
 | 
					    { 
 | 
				
			||||||
 | 
					      int N,M;
 | 
				
			||||||
 | 
					      Size(A,N,M);
 | 
				
			||||||
 | 
					      for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					      for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
						A[i][j]=val;
 | 
				
			||||||
 | 
					      }}
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    template<class T> inline void Diagonal(Matrix<T>& A,T & val)
 | 
				
			||||||
 | 
					    { 
 | 
				
			||||||
 | 
					      int N;
 | 
				
			||||||
 | 
					      SizeSquare(A,N);
 | 
				
			||||||
 | 
					      for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
						A[i][i]=val;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    template<class T> inline void Identity(Matrix<T>& A)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      Fill(A,0.0);
 | 
				
			||||||
 | 
					      Diagonal(A,1.0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@@ -29,6 +29,9 @@ extern int GridCshiftPermuteMap[4][16];
 | 
				
			|||||||
class LatticeBase {};
 | 
					class LatticeBase {};
 | 
				
			||||||
class LatticeExpressionBase {};
 | 
					class LatticeExpressionBase {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator??
 | 
				
			||||||
 | 
					template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template <typename Op, typename T1>                           
 | 
					template <typename Op, typename T1>                           
 | 
				
			||||||
class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 | 
					class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
 | 
				
			||||||
 public:
 | 
					 public:
 | 
				
			||||||
@@ -59,7 +62,7 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    GridBase *_grid;
 | 
					    GridBase *_grid;
 | 
				
			||||||
    int checkerboard;
 | 
					    int checkerboard;
 | 
				
			||||||
    std::vector<vobj,alignedAllocator<vobj> > _odata;
 | 
					    Vector<vobj> _odata;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    // to pthread need a computable loop where loop induction is not required
 | 
					    // to pthread need a computable loop where loop induction is not required
 | 
				
			||||||
    int begin(void) { return 0;};
 | 
					    int begin(void) { return 0;};
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -42,7 +42,7 @@ namespace Grid{
 | 
				
			|||||||
	  // Staple in direction mu
 | 
						  // Staple in direction mu
 | 
				
			||||||
	  WilsonLoops<GaugeField>::Staple(dSdU_mu,U,mu);
 | 
						  WilsonLoops<GaugeField>::Staple(dSdU_mu,U,mu);
 | 
				
			||||||
	  dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor;
 | 
						  dSdU_mu = Ta(Umu*adj(dSdU_mu))*factor;
 | 
				
			||||||
	  pokeLorentz(dSdU, dSdU_mu, mu);
 | 
						  PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      };
 | 
					      };
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -10,15 +10,15 @@ public:
 | 
				
			|||||||
  virtual void push(const std::string &s) = 0;
 | 
					  virtual void push(const std::string &s) = 0;
 | 
				
			||||||
  virtual void pop(void) =0;
 | 
					  virtual void pop(void) =0;
 | 
				
			||||||
  virtual void write( const std::string& s,const std::string &output      ) =0;
 | 
					  virtual void write( const std::string& s,const std::string &output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s,  int16_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const  int16_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s, uint16_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const uint16_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s,  int32_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const  int32_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s, uint32_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const uint32_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s,  int64_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const  int64_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s, uint64_t    output      ) =0;
 | 
					  virtual void write( const std::string& s,const uint64_t    output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s,  float      output      ) =0;
 | 
					  virtual void write( const std::string& s,const  float      output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s, double      output      ) =0;
 | 
					  virtual void write( const std::string& s,const double      output      ) =0;
 | 
				
			||||||
  virtual void write( const std::string& s, bool        output      ) =0;
 | 
					  virtual void write( const std::string& s,const bool        output      ) =0;
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,19 +35,19 @@ public:
 | 
				
			|||||||
      write(s,cstr[c]);
 | 
					      write(s,cstr[c]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  void write( const std::string& s,  char       output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  char       output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int16_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint16_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  float      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  float      output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, double      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const double      output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, bool        output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const bool        output      ) { writeInternal(s,output); };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
  template<class T> void writeInternal( const std::string& s, T output ){
 | 
					  template<class T> void writeInternal( const std::string& s,const T output ){
 | 
				
			||||||
    // FIXME --- htons, htonl, htno64 etc..
 | 
					    // FIXME --- htons, htonl, htno64 etc..
 | 
				
			||||||
    file.write((char *)&output,sizeof(T));
 | 
					    file.write((char *)&output,sizeof(T));
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -120,14 +120,14 @@ THE SOFTWARE.
 | 
				
			|||||||
  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
 | 
					  GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))		\
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  template<class Writer> friend void write(Writer &WR,const std::string &s, const cname &obj){ \
 | 
					  friend void write(Writer &WR,const std::string &s, const cname &obj){ \
 | 
				
			||||||
    push(WR,s);\
 | 
					    push(WR,s);\
 | 
				
			||||||
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
 | 
					    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__))	\
 | 
				
			||||||
    pop(WR);\
 | 
					    pop(WR);\
 | 
				
			||||||
  } \
 | 
					  } \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  \
 | 
					  \
 | 
				
			||||||
  template<class Reader> friend void read(Reader &RD,const std::string &s, cname &obj){	\
 | 
					  friend void read(Reader &RD,const std::string &s, cname &obj){	\
 | 
				
			||||||
    push(RD,s);\
 | 
					    push(RD,s);\
 | 
				
			||||||
    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
 | 
					    GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__))	\
 | 
				
			||||||
    pop(RD);\
 | 
					    pop(RD);\
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,6 +3,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <serialisation/MacroMagic.h>
 | 
					#include <serialisation/MacroMagic.h>
 | 
				
			||||||
#include <serialisation/BaseIO.h>
 | 
					#include <serialisation/BaseIO.h>
 | 
				
			||||||
 | 
					#include <stdint.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -12,17 +13,17 @@ namespace Grid {
 | 
				
			|||||||
  inline void push(Writer & WR,const char *s)        { WR.push(std::string(s));}
 | 
					  inline void push(Writer & WR,const char *s)        { WR.push(std::string(s));}
 | 
				
			||||||
  inline void pop (Writer & WR)                      { WR.pop();}
 | 
					  inline void pop (Writer & WR)                      { WR.pop();}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,const char * output      ) { wr.write(s,std::string(output)); };
 | 
					  //  inline void write(Writer& wr, const std::string& s,const char * output      ) { wr.write(s,std::string(output)); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,  int16_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const  int16_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s, uint16_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const uint16_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,  int32_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const  int32_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s, uint32_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const uint32_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,  int64_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const  int64_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s, uint64_t    output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const uint64_t    output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s,  float      output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const  float      output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s, double      output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const double      output      ) { wr.write(s,output); };
 | 
				
			||||||
  inline void write(Writer& wr, const std::string& s, bool        output      ) { wr.write(s,output); };
 | 
					  inline void write(Writer& wr, const std::string& s,const bool        output      ) { wr.write(s,output); };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  inline void push(Reader & WR,const std::string &s) { WR.push(s);}
 | 
					  inline void push(Reader & WR,const std::string &s) { WR.push(s);}
 | 
				
			||||||
  inline void push(Reader & WR,const char *s)        { WR.push(std::string(s));}
 | 
					  inline void push(Reader & WR,const char *s)        { WR.push(std::string(s));}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -43,19 +43,19 @@ public:
 | 
				
			|||||||
    indent();
 | 
					    indent();
 | 
				
			||||||
    file<<output<<std::endl;
 | 
					    file<<output<<std::endl;
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  void write( const std::string& s,  int16_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint16_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  float      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  float      output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, double      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const double      output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, bool        output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const bool        output      ) { writeInternal(s,output); };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class T> void writeInternal( const std::string& s, T output ){
 | 
					  template<class T> void writeInternal( const std::string& s,const T output ){
 | 
				
			||||||
    indent();
 | 
					    indent();
 | 
				
			||||||
    file << std::boolalpha << output<<std::endl;
 | 
					    file << std::boolalpha << output<<std::endl;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -49,19 +49,20 @@ public:
 | 
				
			|||||||
    pugi::xml_node leaf=node.append_child(s.c_str());
 | 
					    pugi::xml_node leaf=node.append_child(s.c_str());
 | 
				
			||||||
    leaf.append_child(pugi::node_pcdata).set_value(output.c_str());
 | 
					    leaf.append_child(pugi::node_pcdata).set_value(output.c_str());
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  void write( const std::string& s,  int16_t    output      ) { writeInternal(s,output); };
 | 
					
 | 
				
			||||||
  void write( const std::string& s, uint16_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint16_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint32_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  int64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint32_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, uint64_t    output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  int64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s,  float      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const uint64_t    output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, double      output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const  float      output      ) { writeInternal(s,output); };
 | 
				
			||||||
  void write( const std::string& s, bool        output      ) { writeInternal(s,output); };
 | 
					  void write( const std::string& s,const double      output      ) { writeInternal(s,output); };
 | 
				
			||||||
 | 
					  void write( const std::string& s,const bool        output      ) { writeInternal(s,output); };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
private:
 | 
					private:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class T> void writeInternal( const std::string& s, T output ){
 | 
					  template<class T> void writeInternal( const std::string& s,const T output ){
 | 
				
			||||||
    std::ostringstream os;
 | 
					    std::ostringstream os;
 | 
				
			||||||
    os << std::boolalpha << output;
 | 
					    os << std::boolalpha << output;
 | 
				
			||||||
    write(s,os.str());
 | 
					    write(s,os.str());
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -149,49 +149,33 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Note, we can beat the shuf overhead in chain with two temporaries
 | 
				
			||||||
 | 
					  // Ar Ai , Br Bi,  Ai Ar  // one shuf
 | 
				
			||||||
 | 
					  //tmpr Ar Br,  Ai Bi    // Mul/Mac/Mac
 | 
				
			||||||
 | 
					  //tmpi Br Ai,  Bi Ar    // Mul/Mac/Mac
 | 
				
			||||||
 | 
					  // add tmpi,shuf(tmpi)
 | 
				
			||||||
 | 
					  // sub tmpr,shuf(tmpi)
 | 
				
			||||||
 | 
					  // shuf(tmpr,tmpi).    // Could drop/trade for write mask
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Gives
 | 
				
			||||||
 | 
					  //  2mul,4 mac +add+sub = 8 flop type insns
 | 
				
			||||||
 | 
					  //  3shuf + 2 (+shuf)   = 5/6 simd perm and 1/2 the load.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct MultComplex{
 | 
					  struct MultComplex{
 | 
				
			||||||
    // Complex float
 | 
					    // Complex float
 | 
				
			||||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
      __m512 vzero,ymm0,ymm1,real, imag;
 | 
					      // dup, dup, perm, mul, madd
 | 
				
			||||||
      vzero = _mm512_setzero_ps();
 | 
					      __m512 a_real = _mm512_moveldup_ps( a ); // Ar Ar
 | 
				
			||||||
      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // 
 | 
					      __m512 a_imag = _mm512_movehdup_ps( a ); // Ai Ai
 | 
				
			||||||
      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
 | 
					      a_imag = _mm512_mul_ps( a_imag, _mm512_permute_ps( b, 0xB1 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
 | 
					      return _mm512_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
      ymm1  = _mm512_mul_ps(real, b);
 | 
					 | 
				
			||||||
      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
 | 
					 | 
				
			||||||
      return _mm512_fmadd_ps(ymm0,imag,ymm1);
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Complex double
 | 
					    // Complex double
 | 
				
			||||||
    inline __m512d operator()(__m512d a, __m512d b){
 | 
					    inline __m512d operator()(__m512d a, __m512d b){
 | 
				
			||||||
      /* This is from
 | 
					      __m512d a_real = _mm512_shuffle_pd( a, a, 0x00 );
 | 
				
			||||||
       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets 
 | 
					      __m512d a_imag = _mm512_shuffle_pd( a, a, 0xFF );
 | 
				
			||||||
       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
 | 
					      a_imag = _mm512_mul_pd( a_imag, _mm512_permute_pd( b, 0x55 ) );
 | 
				
			||||||
       * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
 | 
					      return _mm512_fmaddsub_pd( a_real, b, a_imag );
 | 
				
			||||||
       * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
 | 
					 | 
				
			||||||
       * booktitle = {Proceedings of the International Conference on Supercomputing},
 | 
					 | 
				
			||||||
       * series = {ICS '11},
 | 
					 | 
				
			||||||
       * year = {2011},
 | 
					 | 
				
			||||||
       * isbn = {978-1-4503-0102-2},
 | 
					 | 
				
			||||||
       * location = {Tucson, Arizona, USA},
 | 
					 | 
				
			||||||
       * pages = {265--274},
 | 
					 | 
				
			||||||
       * numpages = {10},
 | 
					 | 
				
			||||||
       * url = {http://doi.acm.org/10.1145/1995896.1995938},
 | 
					 | 
				
			||||||
       * doi = {10.1145/1995896.1995938},
 | 
					 | 
				
			||||||
       * acmid = {1995938},
 | 
					 | 
				
			||||||
       * publisher = {ACM},
 | 
					 | 
				
			||||||
       * address = {New York, NY, USA},
 | 
					 | 
				
			||||||
       * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
 | 
					 | 
				
			||||||
       *                } 
 | 
					 | 
				
			||||||
       */
 | 
					 | 
				
			||||||
      __m512d vzero,ymm0,ymm1,real,imag;
 | 
					 | 
				
			||||||
      vzero =_mm512_setzero_pd();
 | 
					 | 
				
			||||||
      ymm0 =  _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // 
 | 
					 | 
				
			||||||
      real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
 | 
					 | 
				
			||||||
      imag =  _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
 | 
					 | 
				
			||||||
      ymm1 =  _mm512_mul_pd(real, b);
 | 
					 | 
				
			||||||
      ymm0 =  _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
 | 
					 | 
				
			||||||
      return  _mm512_fmadd_pd(ymm0,imag,ymm1);
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
@@ -227,12 +211,12 @@ namespace Optimization {
 | 
				
			|||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    // Negates the odd-indexed float lanes of `in`, then exchanges the two floats of every
    // adjacent (even,odd) pair, so each pair (e,o) becomes (-o,e).
    // `ret` is not used by this overload.
    inline __m512 operator()(__m512 in, __m512 ret){
      // Mask 0xaaaa selects the odd lanes, which become 0 - in; even lanes are copied from in.
      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in);
      // Replacement for the CDAB swizzle: _MM_SHUFFLE(2,3,0,1) swaps neighbouring elements
      // inside each 128-bit lane. (_MM_SHUFFLE(1,0,3,2) would be the BADC pattern, which swaps
      // 64-bit halves instead.)
      return _mm512_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    // Negates the odd-indexed double lanes of `in`, then exchanges the two doubles of every
    // adjacent (even,odd) pair, so each pair (e,o) becomes (-o,e).
    // `ret` is not used by this overload.
    inline __m512d operator()(__m512d in, __m512d ret){
      // Mask 0xaa selects the odd lanes, which become 0 - in; even lanes are copied from in.
      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in);
      // Replacement for the CDAB swizzle. _mm512_shuffle_pd uses one selector bit per output
      // element, so the adjacent-swap pattern is 0x55 (odd source for even slots, even source
      // for odd slots); a _MM_SHUFFLE() value is a 2-bit-per-element encoding and does not apply.
      return _mm512_shuffle_pd(tmp,tmp,0x55);
    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -241,13 +225,13 @@ namespace Optimization {
 | 
				
			|||||||
  struct TimesI{
 | 
					  struct TimesI{
 | 
				
			||||||
    //Complex single
 | 
					    //Complex single
 | 
				
			||||||
    // Exchanges the two floats of every adjacent (even,odd) pair of `in`, then negates the
    // odd-indexed lanes, so each pair (e,o) becomes (o,-e).
    // `ret` is not used by this overload.
    // NOTE(review): with (re,im) stored as (even,odd) lanes this yields (im,-re); confirm that
    // this is the sign TimesI is meant to produce.
    inline __m512 operator()(__m512 in, __m512 ret){
      // Shuffle the INPUT (not the still-uninitialised temporary). _MM_SHUFFLE(2,3,0,1) swaps
      // neighbouring elements inside each 128-bit lane, matching the CDAB swizzle it replaces.
      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SHUFFLE(2,3,0,1));
      // Mask 0xaaaa selects the odd lanes, which become 0 - tmp; even lanes are copied from tmp.
      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
    }
 | 
				
			||||||
    //Complex double
 | 
					    //Complex double
 | 
				
			||||||
    // Exchanges the two doubles of every adjacent (even,odd) pair of `in`, then negates the
    // odd-indexed lanes, so each pair (e,o) becomes (o,-e).
    // `ret` is not used by this overload.
    // NOTE(review): with (re,im) stored as (even,odd) lanes this yields (im,-re); confirm that
    // this is the sign TimesI is meant to produce.
    inline __m512d operator()(__m512d in, __m512d ret){
      // Shuffle the INPUT (not the still-uninitialised temporary). _mm512_shuffle_pd uses one
      // selector bit per output element; 0x55 is the adjacent-swap pattern that matches the
      // CDAB swizzle it replaces.
      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
      // Mask 0xaa selects the odd lanes, which become 0 - tmp; even lanes are copied from tmp.
      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -325,8 +309,8 @@ namespace Grid {
 | 
				
			|||||||
    } conv;
 | 
					    } conv;
 | 
				
			||||||
    conv.v = b.v;
 | 
					    conv.v = b.v;
 | 
				
			||||||
    switch(perm){
 | 
					    switch(perm){
 | 
				
			||||||
    case 3:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
 | 
					    case 3 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(2,3,0,1)); break;
 | 
				
			||||||
    case 2:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; 
 | 
					    case 2 : conv.f = _mm512_shuffle_ps(conv.f,conv.f,_MM_SHUFFLE(1,0,3,2)); break;
 | 
				
			||||||
    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
 | 
					    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
 | 
				
			||||||
    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
 | 
					    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
 | 
				
			||||||
    default: assert(0); break;
 | 
					    default: assert(0); break;
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										355
									
								
								lib/simd/Grid_imci.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										355
									
								
								lib/simd/Grid_imci.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,355 @@
 | 
				
			|||||||
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					/*! @file Grid_imci.h
 | 
				
			||||||
 | 
					  @brief Optimization library for the IMCI instruction set (Knights Corner)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Using intrinsics
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					// Time-stamp: <2015-06-09 14:27:28 neo>
 | 
				
			||||||
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <immintrin.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifndef KNC_ONLY_STORES
 | 
				
			||||||
 | 
					#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 | 
				
			||||||
 | 
					#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Optimization {
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  struct Vsplat{
 | 
				
			||||||
 | 
					    //Complex float
 | 
				
			||||||
 | 
					    inline __m512 operator()(float a, float b){
 | 
				
			||||||
 | 
					      return _mm512_set_ps(b,a,b,a,b,a,b,a,b,a,b,a,b,a,b,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real float
 | 
				
			||||||
 | 
					    inline __m512 operator()(float a){
 | 
				
			||||||
 | 
					      return _mm512_set1_ps(a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex double
 | 
				
			||||||
 | 
					    inline __m512d operator()(double a, double b){
 | 
				
			||||||
 | 
					      return _mm512_set_pd(b,a,b,a,b,a,b,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Real double
 | 
				
			||||||
 | 
					    inline __m512d operator()(double a){
 | 
				
			||||||
 | 
					      return _mm512_set1_pd(a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m512i operator()(Integer a){
 | 
				
			||||||
 | 
					      return _mm512_set1_epi32(a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Vstore{
 | 
				
			||||||
 | 
					    //Float 
 | 
				
			||||||
 | 
					    inline void operator()(__m512 a, float* F){
 | 
				
			||||||
 | 
					      _mm512_store_ps(F,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Double
 | 
				
			||||||
 | 
					    inline void operator()(__m512d a, double* D){
 | 
				
			||||||
 | 
					      _mm512_store_pd(D,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline void operator()(__m512i a, Integer* I){
 | 
				
			||||||
 | 
					      _mm512_store_si512((__m512i *)I,a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Vstream{
 | 
				
			||||||
 | 
					    //Float
 | 
				
			||||||
 | 
					    inline void operator()(float * a, __m512 b){
 | 
				
			||||||
 | 
					      _mm512_storenrngo_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Double
 | 
				
			||||||
 | 
					    inline void operator()(double * a, __m512d b){
 | 
				
			||||||
 | 
					      _mm512_storenrngo_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Vset{
 | 
				
			||||||
 | 
					    // Complex float 
 | 
				
			||||||
 | 
					    inline __m512 operator()(Grid::ComplexF *a){
 | 
				
			||||||
 | 
					      return _mm512_set_ps(a[7].imag(),a[7].real(),a[6].imag(),a[6].real(),
 | 
				
			||||||
 | 
								   a[5].imag(),a[5].real(),a[4].imag(),a[4].real(),
 | 
				
			||||||
 | 
								   a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
 | 
				
			||||||
 | 
								   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Complex double 
 | 
				
			||||||
 | 
					    inline __m512d operator()(Grid::ComplexD *a){
 | 
				
			||||||
 | 
					      return _mm512_set_pd(a[3].imag(),a[3].real(),a[2].imag(),a[2].real(),
 | 
				
			||||||
 | 
								   a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real float 
 | 
				
			||||||
 | 
					    inline __m512 operator()(float *a){
 | 
				
			||||||
 | 
					      return _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 | 
				
			||||||
 | 
								    a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real double
 | 
				
			||||||
 | 
					    inline __m512d operator()(double *a){
 | 
				
			||||||
 | 
					      return _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Integer
 | 
				
			||||||
 | 
					    inline __m512i operator()(Integer *a){
 | 
				
			||||||
 | 
					      return _mm512_set_epi32( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 | 
				
			||||||
 | 
								       a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Primary template for horizontal reductions. The output type is a
					  // template parameter so that it can take part in selecting the
					  // implementation. Only the explicit specialisations of operator() are
					  // meant to be used: this generic body reports the mistake at run time
					  // and terminates the program with exit status 1.
					  template <typename Out_type, typename In_type>
					  struct Reduce
					  {
					    inline Out_type operator()(In_type in)
					    {
					      printf("Error, using wrong Reduce function\n");
					      exit(1);
					      return 0; // never reached; present so the function has a return statement
					    }
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Arithmetic operations
 | 
				
			||||||
 | 
					  /////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  struct Sum{
 | 
				
			||||||
 | 
					    //Complex/Real float
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
 | 
					      return _mm512_add_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex/Real double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d a, __m512d b){
 | 
				
			||||||
 | 
					      return _mm512_add_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m512i operator()(__m512i a, __m512i b){
 | 
				
			||||||
 | 
					      return _mm512_add_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Sub{
 | 
				
			||||||
 | 
					    //Complex/Real float
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
 | 
					      return _mm512_sub_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex/Real double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d a, __m512d b){
 | 
				
			||||||
 | 
					      return _mm512_sub_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Integer
 | 
				
			||||||
 | 
					    inline __m512i operator()(__m512i a, __m512i b){
 | 
				
			||||||
 | 
					      return _mm512_sub_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct MultComplex{
 | 
				
			||||||
 | 
					    // Complex float
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
 | 
					      __m512 vzero,ymm0,ymm1,real, imag;
 | 
				
			||||||
 | 
					      vzero = _mm512_setzero_ps();
 | 
				
			||||||
 | 
					      ymm0  = _mm512_swizzle_ps(a, _MM_SWIZ_REG_CDAB); // 
 | 
				
			||||||
 | 
					      real  = (__m512)_mm512_mask_or_epi32((__m512i)a, 0xAAAA,(__m512i)vzero,(__m512i)ymm0);
 | 
				
			||||||
 | 
					      imag  = _mm512_mask_sub_ps(a, 0x5555,vzero, ymm0);
 | 
				
			||||||
 | 
					      ymm1  = _mm512_mul_ps(real, b);
 | 
				
			||||||
 | 
					      ymm0  = _mm512_swizzle_ps(b, _MM_SWIZ_REG_CDAB); // OK
 | 
				
			||||||
 | 
					      return _mm512_fmadd_ps(ymm0,imag,ymm1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Complex double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d a, __m512d b){
 | 
				
			||||||
 | 
					      /* This is from
 | 
				
			||||||
 | 
					       * Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets 
 | 
				
			||||||
 | 
					       * @inproceedings{McFarlin:2011:ASV:1995896.1995938,
 | 
				
			||||||
 | 
					       * author = {McFarlin, Daniel S. and Arbatov, Volodymyr and Franchetti, Franz and P\"{u}schel, Markus},
 | 
				
			||||||
 | 
					       * title = {Automatic SIMD Vectorization of Fast Fourier Transforms for the Larrabee and AVX Instruction Sets},
 | 
				
			||||||
 | 
					       * booktitle = {Proceedings of the International Conference on Supercomputing},
 | 
				
			||||||
 | 
					       * series = {ICS '11},
 | 
				
			||||||
 | 
					       * year = {2011},
 | 
				
			||||||
 | 
					       * isbn = {978-1-4503-0102-2},
 | 
				
			||||||
 | 
					       * location = {Tucson, Arizona, USA},
 | 
				
			||||||
 | 
					       * pages = {265--274},
 | 
				
			||||||
 | 
					       * numpages = {10},
 | 
				
			||||||
 | 
					       * url = {http://doi.acm.org/10.1145/1995896.1995938},
 | 
				
			||||||
 | 
					       * doi = {10.1145/1995896.1995938},
 | 
				
			||||||
 | 
					       * acmid = {1995938},
 | 
				
			||||||
 | 
					       * publisher = {ACM},
 | 
				
			||||||
 | 
					       * address = {New York, NY, USA},
 | 
				
			||||||
 | 
					       * keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
 | 
				
			||||||
 | 
					       *                } 
 | 
				
			||||||
 | 
					       */
 | 
				
			||||||
 | 
					      __m512d vzero,ymm0,ymm1,real,imag;
 | 
				
			||||||
 | 
					      vzero =_mm512_setzero_pd();
 | 
				
			||||||
 | 
					      ymm0 =  _mm512_swizzle_pd(a, _MM_SWIZ_REG_CDAB); // 
 | 
				
			||||||
 | 
					      real =(__m512d)_mm512_mask_or_epi64((__m512i)a, 0xAA,(__m512i)vzero,(__m512i) ymm0);
 | 
				
			||||||
 | 
					      imag =  _mm512_mask_sub_pd(a, 0x55,vzero, ymm0);
 | 
				
			||||||
 | 
					      ymm1 =  _mm512_mul_pd(real, b);
 | 
				
			||||||
 | 
					      ymm0 =  _mm512_swizzle_pd(b, _MM_SWIZ_REG_CDAB); // OK
 | 
				
			||||||
 | 
					      return  _mm512_fmadd_pd(ymm0,imag,ymm1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					    // Real float
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
 | 
					      return _mm512_mul_ps(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Real double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d a, __m512d b){
 | 
				
			||||||
 | 
					      return _mm512_mul_pd(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Integer
 | 
				
			||||||
 | 
					    inline __m512i operator()(__m512i a, __m512i b){
 | 
				
			||||||
 | 
					      return _mm512_mullo_epi32(a,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct Conj{
 | 
				
			||||||
 | 
					    // Complex single
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 in){
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // Zero out 0+real 0-imag  
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // Complex double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d in){
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_pd(in, 0xaa,_mm512_setzero_pd(), in);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    // do not define for integer input
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct TimesMinusI{
 | 
				
			||||||
 | 
					    //Complex single
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 in, __m512 ret){
 | 
				
			||||||
 | 
					      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag 
 | 
				
			||||||
 | 
					      return _mm512_swizzle_ps(tmp, _MM_SWIZ_REG_CDAB);// OK
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d in, __m512d ret){
 | 
				
			||||||
 | 
					      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag 
 | 
				
			||||||
 | 
					      return  _mm512_swizzle_pd(tmp, _MM_SWIZ_REG_CDAB);// OK
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct TimesI{
 | 
				
			||||||
 | 
					    //Complex single
 | 
				
			||||||
 | 
					    inline __m512 operator()(__m512 in, __m512 ret){
 | 
				
			||||||
 | 
					      __m512 tmp = _mm512_swizzle_ps(in, _MM_SWIZ_REG_CDAB);// OK
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp); // real -imag
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Complex double
 | 
				
			||||||
 | 
					    inline __m512d operator()(__m512d in, __m512d ret){
 | 
				
			||||||
 | 
					      __m512d tmp = _mm512_swizzle_pd(in, _MM_SWIZ_REG_CDAB);// OK
 | 
				
			||||||
 | 
					      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp); // real -imag
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //////////////////////////////////////////////
 | 
				
			||||||
 | 
					  // Some Template specialization
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  //Complex float Reduce
 | 
				
			||||||
 | 
					  template<>
 | 
				
			||||||
 | 
					  inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){
 | 
				
			||||||
 | 
					    return Grid::ComplexF(_mm512_mask_reduce_add_ps(0x5555, in),_mm512_mask_reduce_add_ps(0xAAAA, in));
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  //Real float Reduce
 | 
				
			||||||
 | 
					  template<>
 | 
				
			||||||
 | 
					  inline Grid::RealF Reduce<Grid::RealF, __m512>::operator()(__m512 in){
 | 
				
			||||||
 | 
					    return _mm512_reduce_add_ps(in);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  //Complex double Reduce
 | 
				
			||||||
 | 
					  template<>
 | 
				
			||||||
 | 
					  inline Grid::ComplexD Reduce<Grid::ComplexD, __m512d>::operator()(__m512d in){
 | 
				
			||||||
 | 
					    return Grid::ComplexD(_mm512_mask_reduce_add_pd(0x55, in),_mm512_mask_reduce_add_pd(0xAA, in));
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  //Real double Reduce
 | 
				
			||||||
 | 
					  template<>
 | 
				
			||||||
 | 
					  inline Grid::RealD Reduce<Grid::RealD, __m512d>::operator()(__m512d in){
 | 
				
			||||||
 | 
					    return _mm512_reduce_add_pd(in);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //Integer Reduce
 | 
				
			||||||
 | 
					  template<>
 | 
				
			||||||
 | 
					  inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
 | 
				
			||||||
 | 
					    // FIXME unimplemented
 | 
				
			||||||
 | 
					    printf("Reduce : Missing integer implementation -> FIX\n");
 | 
				
			||||||
 | 
					    assert(0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Here assign types 
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					  typedef __m512 SIMD_Ftype;  // Single precision type
 | 
				
			||||||
 | 
					  typedef __m512d SIMD_Dtype; // Double precision type
 | 
				
			||||||
 | 
					  typedef __m512i SIMD_Itype; // Integer type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // prefecth
 | 
				
			||||||
 | 
					  inline void v_prefetch0(int size, const char *ptr){
 | 
				
			||||||
 | 
					    for(int i=0;i<size;i+=64){ //  Define L1 linesize above
 | 
				
			||||||
 | 
					      _mm_prefetch(ptr+i+4096,_MM_HINT_T1);
 | 
				
			||||||
 | 
					      _mm_prefetch(ptr+i+512,_MM_HINT_T0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  inline void prefetch_HINT_T0(const char *ptr){
 | 
				
			||||||
 | 
					    _mm_prefetch(ptr,_MM_HINT_T0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Gpermute utilities consider coalescing into 1 Gpermute
 | 
				
			||||||
 | 
					  template < typename VectorSIMD > 
 | 
				
			||||||
 | 
					    inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
 | 
				
			||||||
 | 
					    union { 
 | 
				
			||||||
 | 
					      __m512 f;
 | 
				
			||||||
 | 
					      decltype(VectorSIMD::v) v;
 | 
				
			||||||
 | 
					    } conv;
 | 
				
			||||||
 | 
					    conv.v = b.v;
 | 
				
			||||||
 | 
					    switch(perm){
 | 
				
			||||||
 | 
					    case 3:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_CDAB); break;
 | 
				
			||||||
 | 
					    case 2:  conv.f = _mm512_swizzle_ps(conv.f,_MM_SWIZ_REG_BADC); break; 
 | 
				
			||||||
 | 
					    case 1 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(2,3,0,1)); break;
 | 
				
			||||||
 | 
					    case 0 : conv.f = _mm512_permute4f128_ps(conv.f,(_MM_PERM_ENUM)_MM_SHUFFLE(1,0,3,2)); break;
 | 
				
			||||||
 | 
					    default: assert(0); break;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    y.v=conv.v;
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // Function name aliases
 | 
				
			||||||
 | 
					  typedef Optimization::Vsplat   VsplatSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Vstore   VstoreSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Vset     VsetSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Vstream  VstreamSIMD;
 | 
				
			||||||
 | 
					  template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Arithmetic operations
 | 
				
			||||||
 | 
					  typedef Optimization::Sum         SumSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Sub         SubSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Mult        MultSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::MultComplex MultComplexSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::Conj        ConjSIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::TimesMinusI TimesMinusISIMD;
 | 
				
			||||||
 | 
					  typedef Optimization::TimesI      TimesISIMD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -19,6 +19,9 @@
 | 
				
			|||||||
#if defined AVX512
 | 
					#if defined AVX512
 | 
				
			||||||
#include "Grid_avx512.h"
 | 
					#include "Grid_avx512.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined IMCI
 | 
				
			||||||
 | 
					#include "Grid_imci.h"
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
#if defined QPX
 | 
					#if defined QPX
 | 
				
			||||||
#include "Grid_qpx.h"
 | 
					#include "Grid_qpx.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
@@ -263,15 +266,13 @@ namespace Grid {
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  // this is only for the complex version
 | 
					  // this is only for the complex version
 | 
				
			||||||
  template <class S, class V, IfComplex<S> =0, class ABtype> 
 | 
					  template <class S, class V, IfComplex<S> =0, class ABtype> 
 | 
				
			||||||
    inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){
 | 
					  inline void vsplat(Grid_simd<S,V> &ret,ABtype a, ABtype b){
 | 
				
			||||||
    ret.v = binary<V>(a, b, VsplatSIMD());
 | 
					    ret.v = binary<V>(a, b, VsplatSIMD());
 | 
				
			||||||
  }    
 | 
					  }    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // overload if complex
 | 
					  // overload if complex
 | 
				
			||||||
  template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
 | 
					  template <class S,class V> inline void vsplat(Grid_simd<S,V> &ret, EnableIf<is_complex < S >, S> c) {
 | 
				
			||||||
    Real a = real(c);
 | 
					    vsplat(ret,real(c),imag(c));
 | 
				
			||||||
    Real b = imag(c);
 | 
					 | 
				
			||||||
    vsplat(ret,a,b);
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //if real fill with a, if complex fill with a in the real part (first function above)
 | 
					  //if real fill with a, if complex fill with a in the real part (first function above)
 | 
				
			||||||
@@ -290,8 +291,8 @@ namespace Grid {
 | 
				
			|||||||
  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 
 | 
					  template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // if not complex overload here 
 | 
					  // if not complex overload here 
 | 
				
			||||||
  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,1.0); }
 | 
					  template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
 | 
				
			||||||
  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)     { vsplat(ret,0.0); }
 | 
					  template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
 | 
				
			||||||
   
 | 
					   
 | 
				
			||||||
  // For integral types
 | 
					  // For integral types
 | 
				
			||||||
  template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret)  {vsplat(ret,1); }
 | 
					  template <class S,class V,IfInteger<S> = 0 > inline void vone(Grid_simd<S,V> &ret)  {vsplat(ret,1); }
 | 
				
			||||||
@@ -304,13 +305,18 @@ namespace Grid {
 | 
				
			|||||||
  ///////////////////////
 | 
					  ///////////////////////
 | 
				
			||||||
  // Vstream
 | 
					  // Vstream
 | 
				
			||||||
  ///////////////////////
 | 
					  ///////////////////////
 | 
				
			||||||
  template <class S,class V, IfNotInteger<S> = 0 > 
 | 
					  template <class S,class V, IfReal<S> = 0 > 
 | 
				
			||||||
    inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
 | 
					  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
 | 
				
			||||||
      binary<void>((Real*)&out.v, in.v, VstreamSIMD());
 | 
					    binary<void>((S *)&out.v, in.v, VstreamSIMD());
 | 
				
			||||||
    }
 | 
					  }
 | 
				
			||||||
 | 
					  template <class S,class V, IfComplex<S> = 0 > 
 | 
				
			||||||
 | 
					  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
 | 
				
			||||||
 | 
					    typedef typename S::value_type T;
 | 
				
			||||||
 | 
					    binary<void>((T *)&out.v, in.v, VstreamSIMD());
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template <class S,class V, IfInteger<S> = 0 > 
 | 
					  template <class S,class V, IfInteger<S> = 0 > 
 | 
				
			||||||
    inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
 | 
					  inline void vstream(Grid_simd<S,V> &out,const Grid_simd<S,V> &in){
 | 
				
			||||||
    out=in;
 | 
					    out=in;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -44,7 +44,10 @@ icpc-avx512)
 | 
				
			|||||||
  CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3  -std=c++11" --host=none  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
					  CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3  -std=c++11" --host=none  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
				
			||||||
  ;;
 | 
					  ;;
 | 
				
			||||||
icpc-mic)
 | 
					icpc-mic)
 | 
				
			||||||
  CXX=icpc ../../configure --host=none --enable-simd=AVX512 CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
					  CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
				
			||||||
 | 
					  ;;
 | 
				
			||||||
 | 
					icpc-mic-avx512)
 | 
				
			||||||
 | 
					  CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3  -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
				
			||||||
  ;;
 | 
					  ;;
 | 
				
			||||||
clang-sse)
 | 
					clang-sse)
 | 
				
			||||||
CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
					CXX=clang++ ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
 | 
					bin_PROGRAMS = Test_GaugeAction Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_hdcr Test_gamma Test_hmc_EODWFRatio Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_partfrac_force Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd  Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 | 
					Test_GaugeAction_SOURCES=Test_GaugeAction.cc
 | 
				
			||||||
@@ -85,6 +85,8 @@ Test_dwf_fpgcr_LDADD=-lGrid
 | 
				
			|||||||
Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
 | 
					Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
 | 
				
			||||||
Test_dwf_hdcr_LDADD=-lGrid
 | 
					Test_dwf_hdcr_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 | 
				
			||||||
 | 
					#Test_dwf_lanczos_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_gamma_SOURCES=Test_gamma.cc
 | 
					Test_gamma_SOURCES=Test_gamma.cc
 | 
				
			||||||
Test_gamma_LDADD=-lGrid
 | 
					Test_gamma_LDADD=-lGrid
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										57
									
								
								tests/Test_dwf_lanczos.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								tests/Test_dwf_lanczos.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,57 @@
 | 
				
			|||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace std;
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int Ls=8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion    src(FGrid); gaussian(RNG5,src);
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); 
 | 
				
			||||||
 | 
					  SU3::HotConfiguration(RNG4, Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
				
			||||||
 | 
					  for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5=1.8;
 | 
				
			||||||
 | 
					  DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int Nk = 10;
 | 
				
			||||||
 | 
					  const int Np = 1;
 | 
				
			||||||
 | 
					  RealD enorm  = 1.0;
 | 
				
			||||||
 | 
					  RealD vthrs  = 1;
 | 
				
			||||||
 | 
					  const int Nit= 1000;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
 | 
				
			||||||
 | 
											 Nk,Np,enorm,vthrs,Nit);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  std::vector<RealD>          eval(Nk);
 | 
				
			||||||
 | 
					  std::vector<LatticeFermion> evec(Nk,FGrid);
 | 
				
			||||||
 | 
					  IRL.calc(eval,evec,
 | 
				
			||||||
 | 
						   src,
 | 
				
			||||||
 | 
						   Nsbt,
 | 
				
			||||||
 | 
						   Nconv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -1,6 +1,6 @@
 | 
				
			|||||||
#include <Grid.h>
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
using namespace Grid;
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class myclass {
 | 
					class myclass {
 | 
				
			||||||
public:
 | 
					public:
 | 
				
			||||||
@@ -24,29 +24,32 @@ public:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
uint16_t i16 = 1;
 | 
					 int16_t i16 = 1;
 | 
				
			||||||
uint16_t u16 = 2;
 | 
					uint16_t u16 = 2;
 | 
				
			||||||
uint32_t i32 = 3;
 | 
					 int32_t i32 = 3;
 | 
				
			||||||
uint32_t u32 = 4;
 | 
					uint32_t u32 = 4;
 | 
				
			||||||
uint64_t i64 = 5;
 | 
					 int64_t i64 = 5;
 | 
				
			||||||
uint64_t u64 = 6;
 | 
					uint64_t u64 = 6;
 | 
				
			||||||
float    f = M_PI;
 | 
					float    f = M_PI;
 | 
				
			||||||
double   d = 2*M_PI;
 | 
					double   d = 2*M_PI;
 | 
				
			||||||
bool     b = false;
 | 
					bool     b = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int main(int argc,char **argv)
 | 
					int main(int argc,char **argv)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    XMLWriter WR("bother.xml");
 | 
					    XMLWriter WR("bother.xml");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    push(WR,"BasicTypes");
 | 
					    push(WR,"BasicTypes");
 | 
				
			||||||
    write(WR,"i16",i16);
 | 
					    write(WR,std::string("i16"),i16);
 | 
				
			||||||
    write(WR,"u16",u16);
 | 
					    write(WR,"u16",u16);
 | 
				
			||||||
    write(WR,"i32",i32);
 | 
					    write(WR,"i32",i32);
 | 
				
			||||||
    write(WR,"i32",u32);
 | 
					    write(WR,"u32",u32);
 | 
				
			||||||
    write(WR,"i64",i64);
 | 
					    write(WR,"i64",i64);
 | 
				
			||||||
    write(WR,"i64",u64);
 | 
					    write(WR,"u64",u64);
 | 
				
			||||||
    write(WR,"f",f);
 | 
					    write(WR,"f",f);
 | 
				
			||||||
    write(WR,"d",d);
 | 
					    write(WR,"d",d);
 | 
				
			||||||
    write(WR,"b",b);
 | 
					    write(WR,"b",b);
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user