mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge remote-tracking branch 'origin/master' into ckelly-dec12-2015
This commit is contained in:
		
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							@@ -418,6 +418,244 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalOMPbench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						 DoubledGaugeField & U,
 | 
			
		||||
						 const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=ss;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=ss+ ssoff;
 | 
			
		||||
 | 
			
		||||
	    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
			
		||||
	      sU = lo.Reorder(sU);
 | 
			
		||||
	    }
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=ss;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalL1bench(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						DoubledGaugeField & U,
 | 
			
		||||
						const FermionField &in, FermionField &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  alltime-=usecond();
 | 
			
		||||
  Compressor compressor(dag);
 | 
			
		||||
 | 
			
		||||
  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  int HT      = GridThread::GetHyperThreads();
 | 
			
		||||
  int cores   = GridThread::GetCores();
 | 
			
		||||
  int nwork = U._grid->oSites();
 | 
			
		||||
  
 | 
			
		||||
  commtime -=usecond();
 | 
			
		||||
  auto handle = st.HaloExchangeBegin(in,compressor);
 | 
			
		||||
  st.HaloExchangeComplete(handle);
 | 
			
		||||
  commtime +=usecond();
 | 
			
		||||
 | 
			
		||||
  jointime -=usecond();
 | 
			
		||||
  jointime +=usecond();
 | 
			
		||||
  
 | 
			
		||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
			
		||||
  // Not loop ordering and data layout.
 | 
			
		||||
  // Designed to create 
 | 
			
		||||
  // - per thread reuse in L1 cache for U
 | 
			
		||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel 
 | 
			
		||||
  {
 | 
			
		||||
  for(int jjj=0;jjj<100;jjj++){
 | 
			
		||||
#pragma omp barrier
 | 
			
		||||
  dslashtime -=usecond();
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
    if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	{
 | 
			
		||||
	  int sd;
 | 
			
		||||
	  for(sd=0;sd<Ls;sd++){
 | 
			
		||||
	    int sU=0;
 | 
			
		||||
	    int sF = sd+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    if( this->AsmOptDslash ) {
 | 
			
		||||
      //      for(int i=0;i<1;i++){
 | 
			
		||||
      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
      //	PerformanceCounter Counter(i);
 | 
			
		||||
      //	Counter.Start();
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int t=0;t<threads;t++){
 | 
			
		||||
 | 
			
		||||
	int hyperthread = t%HT;
 | 
			
		||||
	int core        = t/HT;
 | 
			
		||||
 | 
			
		||||
        int sswork, swork,soff,ssoff,  sU,sF;
 | 
			
		||||
	
 | 
			
		||||
	GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
			
		||||
	GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
			
		||||
 | 
			
		||||
	for(int ss=0;ss<sswork;ss++){
 | 
			
		||||
	  for(int s=soff;s<soff+swork;s++){
 | 
			
		||||
 | 
			
		||||
	    sU=0;
 | 
			
		||||
	    sF = s+Ls*sU;
 | 
			
		||||
	    Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      //      Counter.Stop();
 | 
			
		||||
      //      Counter.Report();
 | 
			
		||||
      //      }
 | 
			
		||||
    } else if( this->HandOptDslash ) {
 | 
			
		||||
#pragma omp for
 | 
			
		||||
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU;
 | 
			
		||||
	  Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    } else { 
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
			
		||||
	int sU=0;
 | 
			
		||||
	for(int s=0;s<Ls;s++){
 | 
			
		||||
	  int sF = s+Ls*sU; 
 | 
			
		||||
	  Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
  dslashtime +=usecond();
 | 
			
		||||
  alltime+=usecond();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>
 | 
			
		||||
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
 | 
			
		||||
						     DoubledGaugeField & U,
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,4 @@
 | 
			
		||||
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
@@ -120,6 +121,20 @@ namespace Grid {
 | 
			
		||||
			FermionField &out,
 | 
			
		||||
			int dag);
 | 
			
		||||
 | 
			
		||||
      // Benchmark form of the Dhop dispatch: after one halo exchange on `in`,
      // the selected site kernel is swept over the lattice 100 times inside a
      // single OpenMP parallel region.  `dag` selects daggered or undaggered
      // kernels; `out` receives the kernel output.
      void DhopInternalOMPbench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      // Benchmark form of the Dhop dispatch in which every kernel call is
      // issued for outer site 0 (sU=0), repeated 100 times inside a single
      // OpenMP parallel region after one halo exchange on `in`.  `lo` is not
      // used by the definition.
      void DhopInternalL1bench(StencilImpl & st,
 | 
			
		||||
				LebesgueOrder &lo,
 | 
			
		||||
				DoubledGaugeField &U,
 | 
			
		||||
				const FermionField &in, 
 | 
			
		||||
				FermionField &out,
 | 
			
		||||
				int dag);
 | 
			
		||||
 | 
			
		||||
      void DhopInternalCommsThenCompute(StencilImpl & st,
 | 
			
		||||
			LebesgueOrder &lo,
 | 
			
		||||
			DoubledGaugeField &U,
 | 
			
		||||
@@ -148,7 +163,7 @@ namespace Grid {
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
      // Data members require to support the functionality
 | 
			
		||||
      ///////////////////////////////////////////////////////////////
 | 
			
		||||
    protected:
 | 
			
		||||
    public:
 | 
			
		||||
 | 
			
		||||
      // Add these to the support from Wilson
 | 
			
		||||
      GridBase *_FourDimGrid;
 | 
			
		||||
 
 | 
			
		||||
@@ -32,81 +32,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512wilson.h>
 | 
			
		||||
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#undef ZEND
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
#undef VMOVIDUP 
 | 
			
		||||
#undef VMOVRDUP 
 | 
			
		||||
#undef VMADDSUB
 | 
			
		||||
#undef VSHUF
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
 | 
			
		||||
#define VZERO(A)                  VZEROf(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi)  VMULf(Uri,Uir,Chi)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi) VMADDf(Uri,Uir,Chi)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
			
		||||
 | 
			
		||||
#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
			
		||||
 | 
			
		||||
#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 | 
			
		||||
#define VMOVIDUP(A,B,C) VMOVIDUPf(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C) VMOVRDUPf(A,B,C)
 | 
			
		||||
#define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum) 
 | 
			
		||||
#define VSHUF(A,B) VSHUFf(A,B)
 | 
			
		||||
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
@@ -136,26 +63,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,ss);
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
  else               pf=(void *)&pbuf[SE->_offset];
 | 
			
		||||
 | 
			
		||||
  LOAD64(%r9,pf);
 | 
			
		||||
  __asm__( 
 | 
			
		||||
	  VPREFETCH(0,%r9)
 | 
			
		||||
	  VPREFETCH(1,%r9)
 | 
			
		||||
	  VPREFETCH(2,%r9)
 | 
			
		||||
	  VPREFETCH(3,%r9)
 | 
			
		||||
	  VPREFETCH(4,%r9)
 | 
			
		||||
	  VPREFETCH(5,%r9)
 | 
			
		||||
	  VPREFETCH(6,%r9)
 | 
			
		||||
	  VPREFETCH(7,%r9)
 | 
			
		||||
	  VPREFETCH(8,%r9)
 | 
			
		||||
	  VPREFETCH(9,%r9)
 | 
			
		||||
	  VPREFETCH(10,%r9)
 | 
			
		||||
	  VPREFETCH(11,%r9) );
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  // Xm
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
@@ -322,8 +229,6 @@ void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField
 | 
			
		||||
  offset = SE->_offset;
 | 
			
		||||
  local  = SE->_is_local;
 | 
			
		||||
    
 | 
			
		||||
  //  PREFETCH_R(A);
 | 
			
		||||
 | 
			
		||||
  // Prefetch
 | 
			
		||||
  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
			
		||||
  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										135
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										135
									
								
								lib/simd/Intel512double.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,135 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Intel512double.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
// No include guard: this header may be included more than once, because every macro is #undef'd before it is redefined.
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VMOV
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#define VZERO(A)                  VZEROd(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVd(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#undef VADD
 | 
			
		||||
#undef VSUB
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#define VADD(A,B,C)               VADDd(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBd(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi)         VMULd(Uri,Uir,Chi)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi)        VMADDd(Uri,Uir,Chi)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESI0 
 | 
			
		||||
#undef VTIMESI1
 | 
			
		||||
#undef VTIMESI2 
 | 
			
		||||
#define VTIMESI(A,B,C)                 VTIMESId(A,B,C)
 | 
			
		||||
#define VTIMESI0(A,B,C)                VTIMESI0d(A,B,C)
 | 
			
		||||
#define VTIMESI1(A,B,C)                VTIMESI1d(A,B,C)
 | 
			
		||||
#define VTIMESI2(A,B,C)                VTIMESI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
#undef VTIMESMINUSI0
 | 
			
		||||
#undef VTIMESMINUSI1
 | 
			
		||||
#undef VTIMESMINUSI2
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)            VTIMESMINUSId(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0d(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1d(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI
 | 
			
		||||
#undef VACCTIMESI0
 | 
			
		||||
#undef VACCTIMESI1
 | 
			
		||||
#undef VACCTIMESI2
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESId(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)             VACCTIMESI0d(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)             VACCTIMESI1d(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)             VACCTIMESI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI
 | 
			
		||||
#undef VACCTIMESMINUSI0
 | 
			
		||||
#undef VACCTIMESMINUSI1
 | 
			
		||||
#undef VACCTIMESMINUSI2
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSId(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0d(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1d(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2d(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI1MEM
 | 
			
		||||
#undef VACCTIMESI2MEM
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMd(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMd(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI1MEM
 | 
			
		||||
#undef VACCTIMESMINUSI2MEM
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VPERM0
 | 
			
		||||
#undef VPERM1
 | 
			
		||||
#undef VPERM2
 | 
			
		||||
#undef VPERM3
 | 
			
		||||
#define VPERM0(A,B)               VPERM0d(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1d(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2d(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3d(A,B)
 | 
			
		||||
 | 
			
		||||
#undef VSHUFMEM
 | 
			
		||||
#undef VADDMEM
 | 
			
		||||
#undef VSUBMEM
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMd(OFF,A,DEST)
 | 
			
		||||
#define VADDMEM(O,A,B,C)                                 VADDMEMd(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)                                 VSUBMEMd(O,A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef VMADDSUB
 | 
			
		||||
#undef VSHUF
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VMOVIDUPd(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VMOVRDUPd(A,B,C)
 | 
			
		||||
#define VMADDSUB(A,B,accum)                              VMADDSUBd(A,B,accum) 
 | 
			
		||||
#define VSHUF(A,B)                                       VSHUFd(A,B)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef ZEND1
 | 
			
		||||
#undef ZEND2
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef ZMULMEM2SP
 | 
			
		||||
#undef ZMADDMEM2SP
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)                                     ZEND1d(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)                                     ZEND2d(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)                                   ZLOADd(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)                                  ZMULd(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)                                 ZMADDd(A,B,C,D,E)
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										135
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										135
									
								
								lib/simd/Intel512single.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,135 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/simd/Intel512single.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
// No include guard: this header may be included more than once, because every macro is #undef'd before it is redefined.
 | 
			
		||||
#undef VZERO
 | 
			
		||||
#undef VMOV
 | 
			
		||||
#undef VLOAD
 | 
			
		||||
#undef VSTORE
 | 
			
		||||
#define VZERO(A)                  VZEROf(A)
 | 
			
		||||
#define VMOV(A,B)                 VMOVf(A,B)
 | 
			
		||||
#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
			
		||||
#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
			
		||||
 | 
			
		||||
#undef VADD
 | 
			
		||||
#undef VSUB
 | 
			
		||||
#undef VMUL
 | 
			
		||||
#undef VMADD
 | 
			
		||||
#define VADD(A,B,C)               VADDf(A,B,C)
 | 
			
		||||
#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
			
		||||
#define VMUL(Uri,Uir,Chi)         VMULf(Uri,Uir,Chi)
 | 
			
		||||
#define VMADD(Uri,Uir,Chi)        VMADDf(Uri,Uir,Chi)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef VTIMESI
 | 
			
		||||
#undef VTIMESI0 
 | 
			
		||||
#undef VTIMESI1
 | 
			
		||||
#undef VTIMESI2 
 | 
			
		||||
#define VTIMESI(A,B,C)                 VTIMESIf(A,B,C)
 | 
			
		||||
#define VTIMESI0(A,B,C)                VTIMESI0f(A,B,C)
 | 
			
		||||
#define VTIMESI1(A,B,C)                VTIMESI1f(A,B,C)
 | 
			
		||||
#define VTIMESI2(A,B,C)                VTIMESI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VTIMESMINUSI
 | 
			
		||||
#undef VTIMESMINUSI0
 | 
			
		||||
#undef VTIMESMINUSI1
 | 
			
		||||
#undef VTIMESMINUSI2
 | 
			
		||||
#define VTIMESMINUSI(A,B,C)            VTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI0(A,B,C)           VTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI1(A,B,C)           VTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VTIMESMINUSI2(A,B,C)           VTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI
 | 
			
		||||
#undef VACCTIMESI0
 | 
			
		||||
#undef VACCTIMESI1
 | 
			
		||||
#undef VACCTIMESI2
 | 
			
		||||
#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
			
		||||
#define VACCTIMESI0(A,B,C)             VACCTIMESI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESI1(A,B,C)             VACCTIMESI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESI2(A,B,C)             VACCTIMESI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI
 | 
			
		||||
#undef VACCTIMESMINUSI0
 | 
			
		||||
#undef VACCTIMESMINUSI1
 | 
			
		||||
#undef VACCTIMESMINUSI2
 | 
			
		||||
#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI0(A,B,C)        VACCTIMESMINUSI0f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI1(A,B,C)        VACCTIMESMINUSI1f(A,B,C)
 | 
			
		||||
#define VACCTIMESMINUSI2(A,B,C)        VACCTIMESMINUSI2f(A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESI1MEM
 | 
			
		||||
#undef VACCTIMESI2MEM
 | 
			
		||||
#define VACCTIMESI1MEM(A,ACC,O,P)      VACCTIMESI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESI2MEM(A,ACC,O,P)      VACCTIMESI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VACCTIMESMINUSI1MEM
 | 
			
		||||
#undef VACCTIMESMINUSI2MEM
 | 
			
		||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
			
		||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
			
		||||
 | 
			
		||||
#undef VPERM0
 | 
			
		||||
#undef VPERM1
 | 
			
		||||
#undef VPERM2
 | 
			
		||||
#undef VPERM3
 | 
			
		||||
#define VPERM0(A,B)               VPERM0f(A,B)
 | 
			
		||||
#define VPERM1(A,B)               VPERM1f(A,B)
 | 
			
		||||
#define VPERM2(A,B)               VPERM2f(A,B)
 | 
			
		||||
#define VPERM3(A,B)               VPERM3f(A,B)
 | 
			
		||||
 | 
			
		||||
#undef VSHUFMEM
 | 
			
		||||
#undef VADDMEM
 | 
			
		||||
#undef VSUBMEM
 | 
			
		||||
#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
			
		||||
#define VADDMEM(O,A,B,C)                                 VADDMEMf(O,A,B,C)
 | 
			
		||||
#define VSUBMEM(O,A,B,C)                                 VSUBMEMf(O,A,B,C)
 | 
			
		||||
 | 
			
		||||
#undef VMOVIDUP
 | 
			
		||||
#undef VMOVRDUP
 | 
			
		||||
#undef VMADDSUB
 | 
			
		||||
#undef VSHUF
 | 
			
		||||
#define VMOVIDUP(A,B,C)                                  VMOVIDUPf(A,B,C)
 | 
			
		||||
#define VMOVRDUP(A,B,C)                                  VMOVRDUPf(A,B,C)
 | 
			
		||||
#define VMADDSUB(A,B,accum)                              VMADDSUBf(A,B,accum) 
 | 
			
		||||
#define VSHUF(A,B)                                       VSHUFf(A,B)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#undef ZEND1
 | 
			
		||||
#undef ZEND2
 | 
			
		||||
#undef ZLOAD
 | 
			
		||||
#undef ZMUL
 | 
			
		||||
#undef ZMADD
 | 
			
		||||
#undef ZMULMEM2SP
 | 
			
		||||
#undef ZMADDMEM2SP
 | 
			
		||||
 | 
			
		||||
#define ZEND1(A,B,C)                                     ZEND1f(A,B,C)
 | 
			
		||||
#define ZEND2(A,B,C)                                     ZEND2f(A,B,C)
 | 
			
		||||
#define ZLOAD(A,B,C,D)                                   ZLOADf(A,B,C,D)
 | 
			
		||||
#define ZMUL(A,B,C,D,E)                                  ZMULf(A,B,C,D,E)
 | 
			
		||||
#define ZMADD(A,B,C,D,E)                                 ZMADDf(A,B,C,D,E)
 | 
			
		||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
			
		||||
 | 
			
		||||
@@ -201,7 +201,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
 | 
			
		||||
// KNL is DUAL issue for FP, and lifting these loads is potentially important.
 | 
			
		||||
// Need detailed profile data to be sure.
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
#define PREFETCH_U(A) \
 | 
			
		||||
  LOAD64(%r8,&U._odata[sU](A)) \
 | 
			
		||||
  __asm__ (		       \
 | 
			
		||||
@@ -230,7 +230,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  VPREFETCHW(9,%r8)	       \
 | 
			
		||||
  VPREFETCHW(10,%r8)	       \
 | 
			
		||||
  VPREFETCHW(11,%r8)	       );
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
 | 
			
		||||
 | 
			
		||||
@@ -244,6 +244,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 | 
			
		||||
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
#define MULT_2SPIN_UNOPT(ptr)				\
 | 
			
		||||
	   LOAD64(%r8,ptr)			\
 | 
			
		||||
  __asm__ (					\
 | 
			
		||||
@@ -289,6 +290,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	   ZEND2(UChi_11,Z3,Chi_10)			\
 | 
			
		||||
	   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
			
		||||
	   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
 | 
			
		||||
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
 | 
			
		||||
@@ -299,10 +301,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
 | 
			
		||||
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 | 
			
		||||
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr);
 | 
			
		||||
 | 
			
		||||
// MULT_2SPINa(ptr)        MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
#define MULT_2SPIN_PF(ptr,pf,VPF)			\
 | 
			
		||||
	   LOAD64(%r8,ptr)			\
 | 
			
		||||
	   LOAD64(%r9,pf)			\
 | 
			
		||||
@@ -343,8 +344,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	   ZEND2(UChi_02,Z4,Chi_02)			\
 | 
			
		||||
	   VPF(11,%r9)						\
 | 
			
		||||
	   ZEND2(UChi_12,Z5,Chi_12)	     );
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if 0 
 | 
			
		||||
#define MULT_2SPIN_PFNONE(ptr,pf,VPF)			\
 | 
			
		||||
	   LOAD64(%r8,ptr)			\
 | 
			
		||||
	   LOAD64(%r9,pf)			\
 | 
			
		||||
@@ -364,7 +366,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
	   VPF(9,%r9)						\
 | 
			
		||||
	   VPF(10,%r9)						\
 | 
			
		||||
	   VPF(11,%r9)						);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// Pretty much Perfectly Pipelined
 | 
			
		||||
 | 
			
		||||
@@ -720,7 +722,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
  VSUB(UChi_11,result_31,result_31)\
 | 
			
		||||
  VSUB(UChi_12,result_32,result_32) );
 | 
			
		||||
 | 
			
		||||
#define PREFETCH_CHIMU(A) 
 | 
			
		||||
//define PREFETCH_CHIMU(A) 
 | 
			
		||||
 | 
			
		||||
#define PERMUTE_DIR0 __asm__ ( 	\
 | 
			
		||||
  VPERM0(Chi_00,Chi_00)	\
 | 
			
		||||
@@ -813,4 +815,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
           VMADDSUB(Z5,Chi_12,UChi_12)\
 | 
			
		||||
                                                );
 | 
			
		||||
 | 
			
		||||
#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user