mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Remove assembly tests
This commit is contained in:
		@@ -1,494 +0,0 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./tests/Test_zmm.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
#ifdef TEST_ZMM
 | 
			
		||||
 | 
			
		||||
// Entry point used when TEST_ZMM is defined.  The real test body that follows
// in this file sits inside `#if 0`, so this stub only reports success.
int main(int argc,char **argv)
{
  (void)argc;   // command line is not inspected by the stub
  (void)argv;
  return 0;
}
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
#include <simd/Intel512wilson.h>
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 ;
 | 
			
		||||
 | 
			
		||||
// Prototypes for the hand-written assembly routines defined after main().
// Names ending in F are the ones defined after <simd/Intel512single.h> is
// included; the others are defined after <simd/Intel512double.h>.

// Routines taking three pointers.
void Zmul               (void *ptr1,void *ptr2,void *ptr3);
void ZmulF              (void *ptr1,void *ptr2,void *ptr3);
void WilsonDslashAvx512 (void *ptr1,void *ptr2,void *ptr3);
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);

// Routines taking two pointers.
void TimesIAvx512       (void *ptr1,void *ptr3);
void TimesIAvx512F      (void *ptr1,void *ptr3);
void TimesMinusIAvx512  (void *ptr1,void *ptr3);
void TimesMinusIAvx512F (void *ptr1,void *ptr3);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main(int argc,char **argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  auto latt4 = GridDefaultLatt();
 | 
			
		||||
  const int Ls=16;
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
			
		||||
  auto mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  vColourMatrixD mat;
 | 
			
		||||
  vHalfSpinColourVectorD vec;
 | 
			
		||||
  vHalfSpinColourVectorD vec1;
 | 
			
		||||
  vHalfSpinColourVectorD vec2;
 | 
			
		||||
  vHalfSpinColourVectorD vec3;
 | 
			
		||||
 
 | 
			
		||||
  vHalfSpinColourVectorD matvec;
 | 
			
		||||
  vHalfSpinColourVectorD ref;
 | 
			
		||||
  vComplexD err;
 | 
			
		||||
 | 
			
		||||
  random(sRNG,vec1); 
 | 
			
		||||
  vec1 = std::complex<double>(0.1,3.0);
 | 
			
		||||
  random(sRNG,vec2);
 | 
			
		||||
  vec2=2.0;
 | 
			
		||||
  random(sRNG,vec3);
 | 
			
		||||
 | 
			
		||||
  //std::cout << "Zmul  vec1"<<vec1<<" &vec1 "<<& vec1<<std::endl;
 | 
			
		||||
  //std::cout << "Zmul  vec2"<<vec2<<" &vec2 "<<& vec2<<std::endl;
 | 
			
		||||
  //std::cout << "Zmul  vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
 | 
			
		||||
  for(int sp=0;sp<2;sp++){
 | 
			
		||||
  for(int co=0;co<3;co++){
 | 
			
		||||
  ref()(sp)(co) = vec1()(sp)(co)*vec2()(sp)(co);
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
  Zmul((void *)&vec1,(void *)&vec2,(void *)&vec3);
 | 
			
		||||
  //std::cout << "Zmul  vec3"<<vec3<<" &vec3 "<<& vec3<<std::endl;
 | 
			
		||||
  //std::cout << "Zmul \n\t ref "<<ref<<"\n\t vec3"<<vec3 <<std::endl;
 | 
			
		||||
  ref = ref - vec3;
 | 
			
		||||
  err = TensorRemove(innerProduct(ref,ref));
 | 
			
		||||
  std::cout <<"Zmul diff   "<< Reduce(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  random(sRNG,mat);
 | 
			
		||||
  mat = Zero();
 | 
			
		||||
  mat()()(0,0) = 1.0;
 | 
			
		||||
  random(sRNG,vec);
 | 
			
		||||
 | 
			
		||||
  ref = mat*vec;
 | 
			
		||||
  
 | 
			
		||||
  WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);
 | 
			
		||||
 | 
			
		||||
  //std::cout << ref   <<std::endl;
 | 
			
		||||
  //std::cout << matvec<<std::endl;
 | 
			
		||||
  ref = ref - matvec;
 | 
			
		||||
  err = TensorRemove(innerProduct(ref,ref));
 | 
			
		||||
  std::cout <<"Double SU3 x 2spin diff   "<< Reduce(err)<<std::endl;
 | 
			
		||||
  vColourMatrixF matF;
 | 
			
		||||
  vHalfSpinColourVectorF vec1F;
 | 
			
		||||
  vHalfSpinColourVectorF vec2F;
 | 
			
		||||
  vHalfSpinColourVectorF vec3F;
 | 
			
		||||
  vHalfSpinColourVectorF vecF;
 | 
			
		||||
  vHalfSpinColourVectorF matvecF;
 | 
			
		||||
  vHalfSpinColourVectorF refF;
 | 
			
		||||
  vComplexF errF;
 | 
			
		||||
 | 
			
		||||
  random(sRNG,matF);
 | 
			
		||||
  matF = Zero();
 | 
			
		||||
  matF()()(0,0)=1.0;
 | 
			
		||||
  random(sRNG,vecF);
 | 
			
		||||
 | 
			
		||||
  refF = matF*vecF;
 | 
			
		||||
 | 
			
		||||
  WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
 | 
			
		||||
  //std::cout << refF   <<std::endl;
 | 
			
		||||
  //std::cout << matvecF<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
  refF = refF-matvecF;
 | 
			
		||||
  errF = TensorRemove(innerProduct(refF,refF));
 | 
			
		||||
  std::cout <<"Single SU3 x 2spin diff   "<< Reduce(errF)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  TimesIAvx512F((void *)&vecF,(void *)&matvecF);
 | 
			
		||||
  //std::cout << timesI(vecF)<<std::endl;
 | 
			
		||||
  //std::cout << matvecF<<std::endl;
 | 
			
		||||
  refF = timesI(vecF)-matvecF;
 | 
			
		||||
  errF = TensorRemove(innerProduct(refF,refF));
 | 
			
		||||
  std::cout <<" timesI single diff  "<< Reduce(errF)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  TimesIAvx512((void *)&vec,(void *)&matvec);
 | 
			
		||||
  //std::cout << timesI(vec)<<std::endl;
 | 
			
		||||
  //std::cout << matvec<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
  ref = timesI(vec)-matvec;
 | 
			
		||||
  err = TensorRemove(innerProduct(ref,ref));
 | 
			
		||||
  std::cout <<" timesI double diff  "<< Reduce(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  TimesMinusIAvx512F((void *)&vecF,(void *)&matvecF);
 | 
			
		||||
  //std::cout << timesMinusI(vecF)<<std::endl;
 | 
			
		||||
  //std::cout << matvecF<<std::endl;
 | 
			
		||||
  refF = timesMinusI(vecF)-matvecF;
 | 
			
		||||
  errF = TensorRemove(innerProduct(refF,refF));
 | 
			
		||||
  std::cout <<" timesMinusI single diff  "<< Reduce(errF)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  TimesMinusIAvx512((void *)&vec,(void *)&matvec);
 | 
			
		||||
  //std::cout << timesMinusI(vec)<<std::endl;
 | 
			
		||||
  //std::cout << matvec<<std::endl;
 | 
			
		||||
 | 
			
		||||
  ref = timesMinusI(vec)-matvec;
 | 
			
		||||
  err = TensorRemove(innerProduct(ref,ref));
 | 
			
		||||
  std::cout <<" timesMinusI double diff  "<< Reduce(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src (FGrid);
 | 
			
		||||
  LatticeFermion tmp (FGrid);
 | 
			
		||||
  LatticeFermion srce(FrbGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion resulto(FrbGrid); resulto=Zero();
 | 
			
		||||
  LatticeFermion resulta(FrbGrid); resulta=Zero();
 | 
			
		||||
  LatticeFermion diff(FrbGrid); 
 | 
			
		||||
  LatticeGaugeField Umu(UGrid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  random(RNG5,src);
 | 
			
		||||
#if 1
 | 
			
		||||
  SU3::HotConfiguration(RNG4,Umu);
 | 
			
		||||
#else
 | 
			
		||||
  int mmu=2;
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    if ( mu!=mmu ) U[mu] = Zero();
 | 
			
		||||
    if ( mu==mmu ) U[mu] = 1.0;
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 pickCheckerboard(Even,srce,src);
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall=50;
 | 
			
		||||
  double t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulto,0);
 | 
			
		||||
  }
 | 
			
		||||
  double t1=usecond();
 | 
			
		||||
 | 
			
		||||
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
  double flops=1344*volume/2;
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm result "<< norm2(resulto)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "mflop/s =   "<< flops*ncall/(t1-t0)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
 | 
			
		||||
#if 1
 | 
			
		||||
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    PerformanceCounter Counter(i);
 | 
			
		||||
    Counter.Start();
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    Counter.Stop();
 | 
			
		||||
    Counter.Report();
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  //resulta = (-0.5) * resulta;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "mflop/s =   "<< flops*ncall/(t1-t0)<<std::endl;
 | 
			
		||||
  diff = resulto-resulta;
 | 
			
		||||
  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
 | 
			
		||||
  std::cout<<std::endl;
 | 
			
		||||
#if 0
 | 
			
		||||
  std::cout<<"=========== result Grid ============="<<std::endl;
 | 
			
		||||
  std::cout<<std::endl;
 | 
			
		||||
  tmp = Zero();
 | 
			
		||||
  setCheckerboard(tmp,resulto);
 | 
			
		||||
  std::cout<<tmp<<std::endl;
 | 
			
		||||
  std::cout<<std::endl;
 | 
			
		||||
  std::cout<<"=========== result ASM ============="<<std::endl;
 | 
			
		||||
  std::cout<<std::endl;
 | 
			
		||||
  tmp = Zero();
 | 
			
		||||
  setCheckerboard(tmp,resulta);
 | 
			
		||||
  std::cout<<tmp<<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512double.h>
 | 
			
		||||
 | 
			
		||||
#define zz Z0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void Zmul(void *ptr1,void *ptr2,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k6 " : : :);
 | 
			
		||||
  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k7 " : : :);
 | 
			
		||||
 | 
			
		||||
#define CC result_00
 | 
			
		||||
  LOAD64(%r9,ptr1);
 | 
			
		||||
  LOAD64(%r8,ptr2);
 | 
			
		||||
  LOAD64(%r10,ptr3)
 | 
			
		||||
  __asm__ (
 | 
			
		||||
  VLOAD(0,%r8,CC)
 | 
			
		||||
  ZLOAD(0,%r9,Chi_00,Z0) 
 | 
			
		||||
  ZMUL(Chi_00,Z0,CC,UChi_00,Z1)
 | 
			
		||||
  //VSTORE(0,%r10,UChi_00)
 | 
			
		||||
  //VSTORE(1,%r10,Z1)
 | 
			
		||||
  ZEND1(UChi_00,Z1,Z0)
 | 
			
		||||
  //VSTORE(2,%r10,UChi_00)
 | 
			
		||||
  ZEND2(UChi_00,Z1,Z0)
 | 
			
		||||
  //VSTORE(3,%r10,UChi_00)
 | 
			
		||||
  VSTORE(0,%r10,UChi_00)
 | 
			
		||||
  VLOAD(1,%r8,CC)
 | 
			
		||||
  ZLOAD(1,%r9,Chi_01,Z0) 
 | 
			
		||||
  ZMUL(Chi_01,Z0,CC,UChi_01,Z1)
 | 
			
		||||
  ZEND1(UChi_01,Z1,Z0)
 | 
			
		||||
  ZEND2(UChi_01,Z1,Z0)
 | 
			
		||||
  VSTORE(1,%r10,UChi_01)
 | 
			
		||||
  VLOAD(2,%r8,CC)
 | 
			
		||||
  ZLOAD(2,%r9,Chi_02,Z0) 
 | 
			
		||||
  ZMUL(Chi_02,Z0,CC,UChi_02,Z1)
 | 
			
		||||
  ZEND1(UChi_02,Z1,Z0)
 | 
			
		||||
  ZEND2(UChi_02,Z1,Z0)
 | 
			
		||||
  VSTORE(2,%r10,UChi_02)
 | 
			
		||||
  VLOAD(3,%r8,CC)
 | 
			
		||||
  ZLOAD(3,%r9,Chi_10,Z0) 
 | 
			
		||||
  ZMUL(Chi_10,Z0,CC,UChi_10,Z1)
 | 
			
		||||
  ZEND1(UChi_10,Z1,Z0)
 | 
			
		||||
  ZEND2(UChi_10,Z1,Z0)
 | 
			
		||||
  VSTORE(3,%r10,UChi_10)
 | 
			
		||||
  VLOAD(4,%r8,CC)
 | 
			
		||||
  ZLOAD(4,%r9,Chi_11,Z0) 
 | 
			
		||||
  ZMUL(Chi_11,Z0,CC,UChi_11,Z1)
 | 
			
		||||
  ZEND1(UChi_11,Z1,Z0)
 | 
			
		||||
  ZEND2(UChi_11,Z1,Z0)
 | 
			
		||||
  VSTORE(4,%r10,UChi_11)
 | 
			
		||||
  VLOAD(5,%r8,CC)
 | 
			
		||||
  ZLOAD(5,%r9,Chi_12,Z0) 
 | 
			
		||||
  ZMUL(Chi_12,Z0,CC,UChi_12,Z1)
 | 
			
		||||
  ZEND1(UChi_12,Z1,Z0)
 | 
			
		||||
  ZEND2(UChi_12,Z1,Z0)
 | 
			
		||||
  VSTORE(5,%r10,UChi_12)
 | 
			
		||||
  );
 | 
			
		||||
}
 | 
			
		||||
void TimesMinusIAvx512(void *ptr1,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k6 " : : :);
 | 
			
		||||
  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k7 " : : :);
 | 
			
		||||
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
 | 
			
		||||
  __asm__ (
 | 
			
		||||
  VZERO(zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_00,UChi_00,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_01,UChi_01,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_02,UChi_02,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_10,UChi_10,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_11,UChi_11,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_12,UChi_12,zz)
 | 
			
		||||
  );
 | 
			
		||||
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TimesIAvx512(void *ptr1,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k6 " : : :);
 | 
			
		||||
  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k7 " : : :);
 | 
			
		||||
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
  
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
  
 | 
			
		||||
  __asm__ (
 | 
			
		||||
  VZERO(zz)
 | 
			
		||||
  VTIMESI(Chi_00,UChi_00,zz)
 | 
			
		||||
  VTIMESI(Chi_01,UChi_01,zz)
 | 
			
		||||
  VTIMESI(Chi_02,UChi_02,zz)
 | 
			
		||||
  VTIMESI(Chi_10,UChi_10,zz)
 | 
			
		||||
  VTIMESI(Chi_11,UChi_11,zz)
 | 
			
		||||
  VTIMESI(Chi_12,UChi_12,zz)
 | 
			
		||||
  );
 | 
			
		||||
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  int return_address;
 | 
			
		||||
  // prototype computed goto to eliminate ABI save restore on call/return in
 | 
			
		||||
  // generated assembly.
 | 
			
		||||
  static void * table[] = { &&save, &&mult };
 | 
			
		||||
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
 | 
			
		||||
  return_address = 0;
 | 
			
		||||
  goto mult;
 | 
			
		||||
 | 
			
		||||
 save:
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
  return;
 | 
			
		||||
 | 
			
		||||
 mult:
 | 
			
		||||
  MULT_2SPIN(ptr2);
 | 
			
		||||
  goto *table[return_address];
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#include <simd/Intel512single.h>
 | 
			
		||||
 | 
			
		||||
void ZmulF(void *ptr1,void *ptr2,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k6 " : : :);
 | 
			
		||||
  __asm__ ("mov     $0x5555, %%eax "  : : :"%eax");
 | 
			
		||||
  __asm__ ("kmovw    %%eax, %%k7 " : : :);
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
  ZLOAD(0,ptr1,Chi_00,Z0);
 | 
			
		||||
  ZLOAD(1,ptr1,Chi_01,Z1);
 | 
			
		||||
  ZLOAD(2,ptr1,Chi_02,Z2);
 | 
			
		||||
  ZLOAD(3,ptr1,Chi_10,Z3);
 | 
			
		||||
  ZLOAD(4,ptr1,Chi_11,Z4);
 | 
			
		||||
  ZLOAD(5,ptr1,Chi_12,Z5);
 | 
			
		||||
 | 
			
		||||
  VLOAD(0,ptr2,Chi_20);
 | 
			
		||||
  VLOAD(1,ptr2,Chi_21);
 | 
			
		||||
  VLOAD(2,ptr2,Chi_22);
 | 
			
		||||
  VLOAD(3,ptr2,Chi_30);  
 | 
			
		||||
  VLOAD(4,ptr2,Chi_31);  
 | 
			
		||||
  VLOAD(5,ptr2,Chi_32);  
 | 
			
		||||
 | 
			
		||||
  ZMUL(Chi_00,Z0,Chi_20,UChi_00,UChi_20);
 | 
			
		||||
  ZMUL(Chi_01,Z1,Chi_21,UChi_01,UChi_21);
 | 
			
		||||
  ZMUL(Chi_02,Z2,Chi_22,UChi_02,UChi_22);
 | 
			
		||||
  ZMUL(Chi_10,Z3,Chi_23,UChi_10,UChi_30);
 | 
			
		||||
  ZMUL(Chi_11,Z4,Chi_24,UChi_11,UChi_31);
 | 
			
		||||
  ZMUL(Chi_12,Z5,Chi_25,UChi_12,UChi_32);
 | 
			
		||||
  
 | 
			
		||||
  ZEND1(UChi_00,UChi_20,Z0);
 | 
			
		||||
  ZEND1(UChi_01,UChi_21,Z1);
 | 
			
		||||
  ZEND1(UChi_02,UChi_22,Z2);
 | 
			
		||||
  ZEND1(UChi_10,UChi_30,Z3);
 | 
			
		||||
  ZEND1(UChi_11,UChi_31,Z4);
 | 
			
		||||
  ZEND1(UChi_12,UChi_32,Z5);
 | 
			
		||||
 | 
			
		||||
  ZEND2(UChi_00,UChi_20,Z0);
 | 
			
		||||
  ZEND2(UChi_01,UChi_21,Z1);
 | 
			
		||||
  ZEND2(UChi_02,UChi_22,Z2);
 | 
			
		||||
  ZEND2(UChi_10,UChi_30,Z3);
 | 
			
		||||
  ZEND2(UChi_11,UChi_31,Z4);
 | 
			
		||||
  ZEND2(UChi_12,UChi_32,Z5);
 | 
			
		||||
 | 
			
		||||
  SAVE_UCHI(ptr3); 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TimesMinusIAvx512F(void *ptr1,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
  __asm__ (
 | 
			
		||||
  VZERO(zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_00,UChi_00,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_01,UChi_01,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_02,UChi_02,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_10,UChi_10,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_11,UChi_11,zz)
 | 
			
		||||
  VTIMESMINUSI(Chi_12,UChi_12,zz)
 | 
			
		||||
           );
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TimesIAvx512F(void *ptr1,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
  
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
  __asm__ (
 | 
			
		||||
  VZERO(zz)
 | 
			
		||||
  VTIMESI(Chi_00,UChi_00,zz)
 | 
			
		||||
  VTIMESI(Chi_01,UChi_01,zz)
 | 
			
		||||
  VTIMESI(Chi_02,UChi_02,zz)
 | 
			
		||||
  VTIMESI(Chi_10,UChi_10,zz)
 | 
			
		||||
  VTIMESI(Chi_11,UChi_11,zz)
 | 
			
		||||
  VTIMESI(Chi_12,UChi_12,zz)
 | 
			
		||||
	   );
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
 | 
			
		||||
{
 | 
			
		||||
  MASK_REGS;
 | 
			
		||||
 | 
			
		||||
  LOAD_CHI(ptr1);
 | 
			
		||||
 | 
			
		||||
  MULT_ADDSUB_2SPIN(ptr2);
 | 
			
		||||
  //MULT_2SPIN(ptr2);
 | 
			
		||||
 | 
			
		||||
  SAVE_UCHI(ptr3);
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
#else
 | 
			
		||||
// Entry point used when TEST_ZMM is not defined: there is nothing to run, so
// say so on stderr and exit with status 1.
int main(int argc, char **argv)
{
  (void)argc;   // command line is not inspected
  (void)argv;

  static const char *const message =
    "error: no ZMM test for the selected architecture";

  std::cerr << message << std::endl;

  return 1;
}
 | 
			
		||||
#endif
 | 
			
		||||
		Reference in New Issue
	
	Block a user