1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'master' into hadrons

# Conflicts:
#	configure
This commit is contained in:
Antonin Portelli 2015-12-02 10:57:51 +00:00
commit 4a7f3d1b7b
66 changed files with 6258 additions and 9317 deletions

View File

@ -24,7 +24,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=8;
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -82,22 +82,25 @@ int main (int argc, char ** argv)
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=10;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
int ncall=100;
{
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
Dw.Report();
}
if (1)
@ -140,6 +143,18 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
{
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);

View File

@ -8,7 +8,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Nvec=4;
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
@ -28,7 +28,7 @@ int main (int argc, char ** argv)
const int lmax = 16536*16;
for(int lat=4;lat<=lmax;lat*=2){
int Nloop=lmax*128*4/lat;
int Nloop=lmax*4/lat;
std::vector<int> latt_size ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]});
@ -45,7 +45,7 @@ int main (int argc, char ** argv)
std::vector<LatticeVec> x(threads,&Grid);
for(int t=0;t<threads;t++){
random(pRNG,x[t]);
// random(pRNG,x[t]);
}
double start=usecond();

View File

@ -28,7 +28,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=44;
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
for(int lat=4;lat<=lmax;lat+=4){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
@ -37,11 +37,11 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
double a=2.0;
@ -72,11 +72,11 @@ int main (int argc, char ** argv)
int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
double a=2.0;
uint64_t Nloop=NLOOP;
@ -110,11 +110,11 @@ int main (int argc, char ** argv)
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
RealD a=2.0;
@ -145,10 +145,10 @@ int main (int argc, char ** argv)
uint64_t Nloop=NLOOP;
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
// GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); //random(pRNG,z);
LatticeVec x(&Grid); //random(pRNG,x);
LatticeVec y(&Grid); //random(pRNG,y);
RealD a=2.0;
Real nn;
double start=usecond();

View File

@ -90,7 +90,7 @@ int main (int argc, char ** argv)
WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=10000;
int ncall=1000;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);

8035
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@ -65,7 +66,7 @@ AC_CHECK_FUNCS([gettimeofday])
#Please install or provide the correct path to your installation
#Info at: http://www.mpfr.org/)])
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
@ -90,6 +91,15 @@ case ${ac_SIMD} in
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVXFMA4)
echo Configuring for AVX
AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
supported=yes
else
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVX2)
echo Configuring for AVX2
AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )

View File

@ -18,8 +18,8 @@
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
// Lanczos support
//#include <algorithms/iterative/MatrixUtils.h>
//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/iterative/MatrixUtils.h>
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h>

View File

@ -9,6 +9,9 @@
/* AVX512 Intrinsics for Knights Landing */
#undef AVX512
/* AVX Intrinsics with FMA4 */
#undef AVXFMA4
/* EMPTY_SIMD only for DEBUGGING */
#undef EMPTY_SIMD

View File

@ -1,7 +1,6 @@
/****************************************************************************/
/* pab: Signal magic. Processor state dump is x86-64 specific */
/****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
@ -16,10 +15,10 @@
#include <algorithm>
#include <iterator>
#undef __X86_64
#define MAC
#define __X86_64
#ifdef MAC
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
@ -31,8 +30,10 @@ namespace Grid {
//////////////////////////////////////////////////////
static std::vector<int> Grid_default_latt;
static std::vector<int> Grid_default_mpi;
int GridThread::_threads;
int GridThread::_threads =1;
int GridThread::_hyperthreads=1;
int GridThread::_cores=1;
const std::vector<int> &GridDefaultLatt(void) {return Grid_default_latt;};
const std::vector<int> &GridDefaultMpi(void) {return Grid_default_mpi;};
@ -118,13 +119,19 @@ void GridParseLayout(char **argv,int argc,
arg= GridCmdOptionPayload(argv,argv+argc,"--grid");
GridCmdOptionIntVector(arg,latt);
}
if( GridCmdOptionExists(argv,argv+argc,"--omp") ){
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
std::vector<int> ompthreads(0);
arg= GridCmdOptionPayload(argv,argv+argc,"--omp");
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
GridCmdOptionIntVector(arg,ompthreads);
assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]);
}
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
std::vector<int> cores(0);
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
GridCmdOptionIntVector(arg,cores);
GridThread::SetCores(cores[0]);
}
}
@ -183,6 +190,11 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
}
GridParseLayout(*argv,*argc,
Grid_default_latt,
Grid_default_mpi);
@ -222,11 +234,14 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
printf(" mem address %llx\n",(unsigned long long)si->si_addr);
printf(" code %d\n",si->si_code);
#ifdef __X86_64
// Linux/Posix
#ifdef __linux__
// And x86 64bit
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
printf(" instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
@ -247,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
REG(r14);
REG(r15);
#endif
#ifdef MAC
#ifdef HAVE_EXECINFO_H
int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
for (int i = 0; i < symbols; i++){

View File

@ -1,4 +1,4 @@
HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/MultiShiftFunction.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/CoarsenedMatrix.h ./algorithms/iterative/AdefGeneric.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/ConjugateGradientMultiShift.h ./algorithms/iterative/ConjugateResidual.h ./algorithms/iterative/ImplicitlyRestartedLanczos.h ./algorithms/iterative/MatrixUtils.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/PrecConjugateResidual.h ./algorithms/iterative/PrecGeneralisedConjugateResidual.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/Preconditioner.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Config.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./Init.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_comparison_utils.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_unary.h ./lattice/Lattice_where.h ./Lattice.h ./Log.h ./Old/Tensor_peek.h ./Old/Tensor_poke.h ./parallelIO/BinaryIO.h ./parallelIO/NerscIO.h ./pugixml/pugixml.h ./qcd/action/ActionBase.h ./qcd/action/ActionParams.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/FermionOperatorImpl.h ./qcd/action/fermion/g5HermitianLinop.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h ./qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/gauge/WilsonGaugeAction.h ./qcd/action/pseudofermion/EvenOddSchurDifferentiable.h ./qcd/action/pseudofermion/OneFlavourEvenOddRational.h ./qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h ./qcd/action/pseudofermion/OneFlavourRational.h ./qcd/action/pseudofermion/OneFlavourRationalRatio.h ./qcd/action/pseudofermion/TwoFlavour.h ./qcd/action/pseudofermion/TwoFlavourEvenOdd.h ./qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h ./qcd/action/pseudofermion/TwoFlavourRatio.h ./qcd/hmc/HMC.h ./qcd/hmc/integrators/Integrator.h ./qcd/hmc/integrators/Integrator_algorithm.h ./qcd/QCD.h ./qcd/spin/Dirac.h ./qcd/spin/TwoSpinor.h ./qcd/utils/CovariantCshift.h ./qcd/utils/LinalgUtils.h ./qcd/utils/SpaceTimeGrid.h ./qcd/utils/SUn.h ./qcd/utils/WilsonLoops.h ./serialisation/BaseIO.h ./serialisation/BinaryIO.h ./serialisation/MacroMagic.h ./serialisation/Serialisation.h ./serialisation/TextIO.h ./serialisation/XmlIO.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_empty.h ./simd/Grid_imci.h ./simd/Grid_neon.h ./simd/Grid_qpx.h ./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Grid_vector_unops.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_determinant.h ./tensors/Tensor_exp.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_index.h ./tensors/Tensor_inner.h ./tensors/Tensor_logical.h ./tensors/Tensor_outer.h ./tensors/Tensor_reality.h ./tensors/Tensor_Ta.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./tensors/Tensor_unary.h ./Tensors.h ./Threads.h ./Timer.h
CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
CCFILES=./algorithms/approx/MultiShiftFunction.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./Init.cc ./Log.cc ./pugixml/pugixml.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/hmc/HMC.cc ./qcd/spin/Dirac.cc ./qcd/utils/SpaceTimeGrid.cc ./serialisation/BinaryIO.cc ./serialisation/TextIO.cc ./serialisation/XmlIO.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc

28
lib/PerfCount.cc Normal file
View File

@ -0,0 +1,28 @@
#include <Grid.h>
#include <PerfCount.h>
namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
// { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
};
}

133
lib/PerfCount.h Normal file
View File

@ -0,0 +1,133 @@
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#include <sys/time.h>
#include <ctime>
#include <chrono>
#include <string.h>
#include <sys/ioctl.h>
#include <syscall.h>
#include <linux/perf_event.h>
namespace Grid {
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags)
{
int ret;
ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
group_fd, flags);
return ret;
}
class PerformanceCounter {
private:
typedef struct {
public:
uint32_t type;
uint64_t config;
const char *name;
} PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs [];
public:
enum PerformanceCounterType {
CPUCYCLES=0,
INSTRUCTIONS,
// STALL_CYCLES,
CACHE_REFERENCES,
CACHE_MISSES,
L1D_READ_MISS,
L1D_READ_ACCESS,
L1D_WRITE_MISS,
L1D_WRITE_ACCESS,
L1D_PREFETCH_MISS,
L1D_PREFETCH_ACCESS,
LL_READ_MISS,
// LL_READ_ACCESS,
LL_WRITE_MISS,
LL_WRITE_ACCESS,
LL_PREFETCH_MISS,
LL_PREFETCH_ACCESS,
L1I_READ_MISS,
L1I_READ_ACCESS,
PERFORMANCE_COUNTER_NUM_TYPES
};
public:
int PCT;
struct perf_event_attr pe;
long long count;
int fd;
uint64_t elapsed;
uint64_t begin;
static int NumTypes(void){
return PERFORMANCE_COUNTER_NUM_TYPES;
}
PerformanceCounter(int _pct) {
assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1;
count=0;
PCT =_pct;
Open();
}
void Open(void)
{
memset(&pe, 0, sizeof(struct perf_event_attr));
pe.size = sizeof(struct perf_event_attr);
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_hv = 1;
pe.inherit = 1;
pe.type = PerformanceCounterConfigs[PCT].type;
pe.config= PerformanceCounterConfigs[PCT].config;
const char * name = PerformanceCounterConfigs[PCT].name;
fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (fd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
}
void Start(void)
{
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}
begin =__rdtsc();
}
void Stop(void) {
count=0;
if ( fd!= -1) {
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long));
}
elapsed = __rdtsc() - begin;
}
void Report(void) {
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
}
~PerformanceCounter()
{
close(fd);
}
};
}
#endif

View File

@ -48,10 +48,14 @@ namespace Grid {
int _around_the_world;
};
template<class vobj,class cobj, class compressor>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public:
typedef uint32_t StencilInteger;
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int _checkerboard;
int _npoints; // Move to template param?
@ -66,33 +70,334 @@ namespace Grid {
// npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries;
// Comms buffers
std::vector<std::vector<scalar_object> > send_buf_extract;
std::vector<std::vector<scalar_object> > recv_buf_extract;
std::vector<scalar_object *> pointers;
std::vector<scalar_object *> rpointers;
Vector<cobj> send_buf;
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
int _unified_buffer_size;
int _request_count;
double buftime;
double gathertime;
double commtime;
double commstime;
double halotime;
double scattertime;
double mergetime;
double gathermtime;
double splicetime;
double nosplicetime;
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances);
CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
{
gathertime=0;
commtime=0;
commstime=0;
halotime=0;
scattertime=0;
mergetime=0;
gathermtime=0;
buftime=0;
splicetime=0;
nosplicetime=0;
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
// std::cout<<"Comms 0x3"<<std::endl;
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
// std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
}
}
// for(int ss=0;ss<osites;ss++){
// std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
// _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
// }
}
}
void Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
int offnode = (comm_proc!= 0);
// std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// CartesianStencil(GridBase *grid,
// int npoints,
// int checkerboard,
// const std::vector<int> &directions,
// const std::vector<int> &distances);
// Add to tables for various cases; is this mistaken. only local if 1 proc in dim
// Can this be avoided with simpler coding of comms?
void Local (int point, int dimension,int shift,int cbmask);
void Comms (int point, int dimension,int shift,int cbmask);
void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// void Local (int point, int dimension,int shift,int cbmask);
// void Comms (int point, int dimension,int shift,int cbmask);
// void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
// void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
// Could allow a functional munging of the halo to another type during the comms.
// this could implement the 16bit/32bit/64bit compression.
template<class vobj,class cobj, class compressor> void
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
{
// conformable(source._grid,_grid);
assert(source._grid==_grid);
halotime-=usecond();
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
int u_comm_offset=0;
@ -126,24 +431,33 @@ namespace Grid {
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
if (splice_dim) {
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
} else {
std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if(splice_dim){
splicetime-=usecond();
GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
splicetime+=usecond();
} else {
nosplicetime-=usecond();
GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
nosplicetime+=usecond();
}
}
}
}
halotime+=usecond();
}
template<class vobj,class cobj, class compressor>
void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor & compress)
@ -164,28 +478,29 @@ namespace Grid {
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc) {
int words = send_buf.size();
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(cobj);
gathertime-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
gathertime+=usecond();
int rank = _grid->_processor;
int recv_from_rank;
@ -195,31 +510,27 @@ namespace Grid {
assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy
commtime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
(void *)&u_comm_buf[u_comm_offset],
recv_from_rank,
bytes);
commtime+=usecond();
for(int i=0;i<buffer_size;i++){
u_comm_buf[u_comm_offset+i]=recv_buf[i];
}
u_comm_offset+=buffer_size;
u_comm_offset+=words;
}
}
}
template<class vobj,class cobj, class compressor>
void GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
int &u_comm_offset,compressor &compress)
{
buftime-=usecond();
const int Nsimd = _grid->Nsimd();
typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type;
typedef typename cobj::scalar_object scalar_object;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@ -241,17 +552,23 @@ namespace Grid {
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
int words = sizeof(cobj)/sizeof(vector_type);
/*
* possibly slow to allocate
* Doesn't matter in this test, but may want to preallocate in the
* dirac operators
*/
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
assert(cbmask==0x3); // Fixme think there is a latent bug if not true
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
// Should grow to max size and then cost very little thereafter
send_buf_extract.resize(Nsimd);
recv_buf_extract.resize(Nsimd);
for(int l=0;l<Nsimd;l++){
if( send_buf_extract[l].size() < buffer_size) {
send_buf_extract[l].resize(buffer_size);
recv_buf_extract[l].resize(buffer_size);
}
}
pointers.resize(Nsimd);
rpointers.resize(Nsimd);
int bytes = buffer_size*sizeof(scalar_object);
buftime+=usecond();
///////////////////////////////////////////
// Work out what to send where
@ -272,7 +589,9 @@ namespace Grid {
}
int sx = (x+sshift)%rd;
gathermtime-=usecond();
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
gathermtime+=usecond();
for(int i=0;i<Nsimd;i++){
@ -299,11 +618,13 @@ namespace Grid {
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
commstime-=usecond();
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
commstime+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
@ -313,11 +634,13 @@ namespace Grid {
}
// Here we don't want to scatter, just place into a buffer.
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int i=0;i<buffer_size;i++){
assert(u_comm_offset+i<_unified_buffer_size);
// assert(u_comm_offset+i<_unified_buffer_size);
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
}
mergetime+=usecond();
u_comm_offset+=buffer_size;
}
}

View File

@ -24,7 +24,16 @@ namespace Grid {
class GridThread {
public:
static int _threads;
static int _hyperthreads;
static int _cores;
static void SetCores(int cr) {
#ifdef GRID_OMP
_cores = cr;
#else
_cores = 1;
#endif
}
static void SetThreads(int thr) {
#ifdef GRID_OMP
_threads = MIN(thr,omp_get_max_threads()) ;
@ -35,22 +44,28 @@ class GridThread {
};
static void SetMaxThreads(void) {
#ifdef GRID_OMP
setenv("KMP_AFFINITY","balanced",1);
_threads = omp_get_max_threads();
omp_set_num_threads(_threads);
#else
_threads = 1;
#endif
};
static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
static int GetCores(void) { return _cores; };
static int GetThreads(void) { return _threads; };
static int SumArraySize(void) {return _threads;};
static void GetWork(int nwork, int me, int & mywork, int & myoff){
int basework = nwork/_threads;
int backfill = _threads-(nwork%_threads);
if ( me >= _threads ) {
GetWork(nwork,me,mywork,myoff,_threads);
}
static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
int basework = nwork/units;
int backfill = units-(nwork%units);
if ( me >= units ) {
mywork = myoff = 0;
} else {
mywork = (nwork+me)/_threads;
mywork = (nwork+me)/units;
myoff = basework * me;
if ( me > backfill )
myoff+= (me-backfill);

View File

@ -170,7 +170,7 @@ namespace Grid {
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil Stencil;
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
std::vector<CoarseMatrix> A;

View File

@ -9,23 +9,34 @@ namespace Grid {
////////////////////////////////////////////////////////////////////////////////////////////
// Simple general polynomial with user supplied coefficients
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<double> Coeffs;
std::vector<RealD> Coeffs;
public:
Polynomial(std::vector<double> &_Coeffs) : Coeffs(_Coeffs) {};
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN = in;
Field AtoN(in._grid);
Field Mtmp(in._grid);
AtoN = in;
out = AtoN*Coeffs[0];
// std::cout <<"Poly in " <<norm2(in)<<std::endl;
// std::cout <<"0 " <<norm2(out)<<std::endl;
for(int n=1;n<Coeffs.size();n++){
Field Mtmp=AtoN;
Linop.Op(Mtmp,AtoN);
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
// std::cout << n<<" " <<norm2(out)<<std::endl;
}
};
};
@ -36,15 +47,15 @@ namespace Grid {
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
std::vector<double> Coeffs;
std::vector<RealD> Coeffs;
int order;
double hi;
double lo;
RealD hi;
RealD lo;
public:
void csv(std::ostream &out){
for (double x=lo; x<hi; x+=(hi-lo)/1000) {
double f = approx(x);
for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
@ -53,15 +64,19 @@ namespace Grid {
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(double x=lo;x<hi;x+=(hi-lo)/50.0){
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
//
Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
////////////////////////////////////////////////////////////////////////////////////////////////////
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
@ -69,24 +84,26 @@ namespace Grid {
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
double s=0;
RealD s=0;
for(int k=0;k<order;k++){
double y=std::cos(M_PI*(k+0.5)/order);
double x=0.5*(y*(hi-lo)+(hi+lo));
double f=func(x);
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
double M=order;
double alpha = M_PI/(M+2);
double lmax = std::cos(alpha);
double sumUsq =0;
std::vector<double> U(M);
std::vector<double> a(M);
std::vector<double> g(M);
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
@ -107,18 +124,18 @@ namespace Grid {
Coeffs[m]*=g[m];
}
}
double approx(double x) // Convenience for plotting the approximation
RealD approx(RealD x) // Convenience for plotting the approximation
{
double Tn;
double Tnm;
double Tnp;
RealD Tn;
RealD Tnm;
RealD Tnp;
double y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
double T0=1;
double T1=y;
RealD T0=1;
RealD T1=y;
double sum;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
@ -151,8 +168,8 @@ namespace Grid {
std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
// Tn=T1 = (xscale M + mscale)in
double xscale = 2.0/(hi-lo);
double mscale = -(hi+lo)/(hi-lo);
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
@ -179,5 +196,121 @@ namespace Grid {
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
alpha(_alpha),
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in._grid;
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in._grid;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
}
#endif

View File

@ -0,0 +1,106 @@
#ifndef GRID_DENSE_MATRIX_H
#define GRID_DENSE_MATRIX_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Matrix untils
/////////////////////////////////////////////////////////////
template<class T> using DenseVector = std::vector<T>;
template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
template<class T> void Size(DenseVector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Fill(DenseMatrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
int N,M;
Size(mat,N,M);
DenseMatrix<T> C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set DenseMatrix to unit matrix **/
template<class T> void Unity(DenseMatrix<T> &A){
int N; SizeSquare(A,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) A[i][j] = 1;
else A[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(DenseMatrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
template<class T>
DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
int dim; SizeSquare(mat,dim);
DenseMatrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/**Get a square submatrix**/
template <class T>
DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
}
#include <algorithms/iterative/Householder.h>
#include <algorithms/iterative/Francis.h>
#endif

View File

@ -0,0 +1,52 @@
#ifndef GRID_EIGENSORT_H
#define GRID_EIGENSORT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Eigen sorter to begin with
/////////////////////////////////////////////////////////////
template<class Field>
class SortEigen {
private:
static bool less_lmd(RealD left,RealD right){
return fabs(left) < fabs(right);
}
static bool less_pair(std::pair<RealD,Field>& left,
std::pair<RealD,Field>& right){
return fabs(left.first) < fabs(right.first);
}
public:
void push(DenseVector<RealD>& lmd,
DenseVector<Field>& evec,int N) {
DenseVector<std::pair<RealD, Field> > emod;
typename DenseVector<std::pair<RealD, Field> >::iterator it;
for(int i=0;i<lmd.size();++i){
emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
}
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
it=emod.begin();
for(int i=0;i<N;++i){
lmd[i]=it->first;
evec[i]=it->second;
++it;
}
}
void push(DenseVector<RealD>& lmd,int N) {
std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
}
bool saturated(RealD lmd, RealD thrs) {
return fabs(lmd) > fabs(thrs);
}
};
}
#endif

View File

@ -0,0 +1,498 @@
#ifndef FRANCIS_H
#define FRANCIS_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
//#include <timer.h>
//#include <lapacke.h>
//#include <Eigen/Dense>
namespace Grid {
template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
template <class T> int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
/**
Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
H =
x x x x x x x x x
x x x x x x x x x
0 x x x x x x x x
0 0 x x x x x x x
0 0 0 x x x x x x
0 0 0 0 x x x x x
0 0 0 0 0 x x x x
0 0 0 0 0 0 x x x
0 0 0 0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
**/
template <class T>
int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
DenseMatrix<T> H = Hin;
int N ; SizeSquare(H,N);
int M = N;
Fill(evals,0);
Fill(evecs,0);
T s,t,x=0,y=0,z=0;
T u,d;
T apd,amd,bc;
DenseVector<T> p(N,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N); Unity(P);
DenseVector<int> trows(N,0);
/// Check if the matrix is really hessenberg, if not abort
RealD sth = 0;
for(int j=0;j<N;j++){
for(int i=j+2;i<N;i++){
sth = abs(H[i][j]);
if(sth > small){
std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
exit(1);
}
}
}
do{
std::cout << "Francis QR Step N = " << N << std::endl;
/** Check for convergence
x x x x x
0 x x x x
0 0 x x x
0 0 x x x
0 0 0 0 x
for this matrix l = 4
**/
do{
l = Chop_subdiag(H,nrm,e,small);
r = 0; ///May have converged on more than one eval
///Single eval
if(l == N-1){
evals[e] = H[l][l];
N--; e++; r++; it = 0;
}
///RealD eval
if(l == N-2){
trows[l+1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l+1][l+1];
amd = H[l][l] - H[l+1][l+1];
bc = (T)4.0*H[l+1][l]*H[l][l+1];
evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) );
evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
N-=2; e+=2; r++; it = 0;
}
} while(r>0);
if(N ==0) break;
DenseVector<T > ck; Resize(ck,3);
DenseVector<T> v; Resize(v,3);
for(int m = N-3; m >= l; m--){
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0){
s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
///Starting vector implicit Q theorem
else{
s = (H[N-2][N-2] + H[N-1][N-1]);
t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
ck[0] = x; ck[1] = y; ck[2] = z;
if(m == l) break;
/** Some stupid thing from numerical recipies, seems to work**/
// PAB.. for heaven's sake quote page, purpose, evidence it works.
// what sort of comment is that!?!?!?
u=abs(H[m][m-1])*(abs(y)+abs(z));
d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
if ((T)abs(u+d) == (T)abs(d) ){
l = m; break;
}
//if (u < small){l = m; break;}
}
if(it > 100000){
std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, 2, v, beta);
Householder_mult<T >(H,v,beta,0,l,l+2,0);
Householder_mult<T >(H,v,beta,0,l,l+2,1);
///Accumulate eigenvector
Householder_mult<T >(P,v,beta,0,l,l+2,1);
int sw = 0; ///Are we on the last row?
for(int k=l;k<N-2;k++){
x = H[k+1][k];
y = H[k+2][k];
z = (T)0.0;
if(k+3 <= N-1){
z = H[k+3][k];
} else{
sw = 1;
v[2] = (T)0.0;
}
ck[0] = x; ck[1] = y; ck[2] = z;
normalize(ck);
Householder_vector<T >(ck, 0, 2-sw, v, beta);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
///Accumulate eigenvector
Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp; Resize(tmp,N);
for(int i=0;i<N;i++){
tmp[i] = evals[N-i-1];
}
evals = tmp;
UTeigenvectors(H, trows, evals, evecs);
for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
return tot_it;
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
/**
Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
H =
x x 0 0 0 0
x x x 0 0 0
0 x x x 0 0
0 0 x x x 0
0 0 0 x x x
0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary. **/
return my_Wilkinson(Hin, evals, evecs, small, small);
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
{
int N; SizeSquare(Hin,N);
int M = N;
///I don't want to modify the input but matricies must be passed by reference
//Scale a matrix by its "norm"
//RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm);
DenseMatrix<T> H; H = Hin;
RealD Hnorm = abs(Norm(Hin));
H = H * (1.0 / Hnorm);
// TODO use openmp and memset
Fill(evals,0);
Fill(evecs,0);
T s, t, x = 0, y = 0, z = 0;
T u, d;
T apd, amd, bc;
DenseVector<T> p; Resize(p,N); Fill(p,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N);
Unity(P);
DenseVector<int> trows(N, 0);
/// Check if the matrix is really symm tridiag
RealD sth = 0;
for(int j = 0; j < N; ++j)
{
for(int i = j + 2; i < N; ++i)
{
if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
{
std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
std::cout << "Warning tridiagonalize and call again" << std::endl;
// exit(1); // see what is going on
//return;
}
}
}
do{
do{
//Jasper
//Check if the subdiagonal term is small enough (<small)
//if true then it is converged.
//check start from H.dim - e - 1
//How to deal with more than 2 are converged?
//What if Chop_symm_subdiag return something int the middle?
//--------------
l = Chop_symm_subdiag(H,nrm, e, small);
r = 0; ///May have converged on more than one eval
//Jasper
//In this case
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x x 0
// 0 0 0 x x 0
// 0 0 0 0 0 x <- l
//--------------
///Single eval
if(l == N - 1)
{
evals[e] = H[l][l];
N--;
e++;
r++;
it = 0;
}
//Jasper
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x 0 0
// 0 0 0 0 x x <- l
// 0 0 0 0 x x
//--------------
///RealD eval
if(l == N - 2)
{
trows[l + 1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l + 1][ l + 1];
amd = H[l][l] - H[l + 1][l + 1];
bc = (T) 4.0 * H[l + 1][l] * H[l][l + 1];
evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
N -= 2;
e += 2;
r++;
it = 0;
}
}while(r > 0);
//Jasper
//Already converged
//--------------
if(N == 0) break;
DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
for(int m = N - 3; m >= l; m--)
{
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0)
{
t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
x = H[m][m] - t;
z = H[m + 1][m];
} else {
///Starting vector implicit Q theorem
d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2]
/ (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
x = H[m][m] - t;
z = H[m + 1][m];
}
//Jasper
//why it is here????
//-----------------------
if(m == l)
break;
u = abs(H[m][m - 1]) * (abs(y) + abs(z));
d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
if ((T)abs(u + d) == (T)abs(d))
{
l = m;
break;
}
}
//Jasper
if(it > 1000000)
{
std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
//
T s, c;
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, l, l + 1, c, -s, 0);
Givens_mult<T>(H, l, l + 1, c, s, 1);
Givens_mult<T>(P, l, l + 1, c, s, 1);
//
for(int k = l; k < N - 2; ++k)
{
x = H.A[k + 1][k];
z = H.A[k + 2][k];
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
Givens_mult<T>(H, k + 1, k + 2, c, s, 1);
Givens_mult<T>(P, k + 1, k + 2, c, s, 1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp(N);
for(int i = 0; i < N; ++i)
tmp[i] = evals[N-i-1];
evals = tmp;
//
UTeigenvectors(H, trows, evals, evecs);
//UTSymmEigenvectors(H, trows, evals, evecs);
for(int i = 0; i < evals.size(); ++i)
{
evecs[i] = P * evecs[i];
normalize(evecs[i]);
evals[i] = evals[i] * Hnorm;
}
// // FIXME this is to test
// Hin.write("evecs3", evecs);
// Hin.write("evals3", evals);
// // check rsd
// for(int i = 0; i < M; i++) {
// vector<T> Aevec = Hin * evecs[i];
// RealD norm2(0.);
// for(int j = 0; j < M; j++) {
// norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
// }
// }
return tot_it;
}
template <class T>
void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
/**
turn a matrix A =
x x x x x
x x x x x
x x x x x
x x x x x
x x x x x
into
x x x x x
x x x x x
0 x x x x
0 0 x x x
0 0 0 x x
with householder rotations
Slow.
*/
int N ; SizeSquare(A,N);
DenseVector<T > p; Resize(p,N); Fill(p,0);
for(int k=start;k<N-2;k++){
//cerr << "hess" << k << std::endl;
DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);} ///kth column
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, ck.size()-1, v, beta); ///Householder vector
Householder_mult<T>(A,v,beta,start,k+1,N-1,0); ///A -> PA
Householder_mult<T >(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H
///Accumulate eigenvector
Householder_mult<T >(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H
}
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,k,l);
}
}*/
}
template <class T>
void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
///Tridiagonalize a matrix
int N; SizeSquare(A,N);
Hess(A,Q,start);
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,l,k);
}
}*/
}
template <class T>
void ForceTridiagonal(DenseMatrix<T> &A){
///Tridiagonalize a matrix
int N ; SizeSquare(A,N);
for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A[l][k]=0;
A[k][l]=0;
}
}
}
template <class T>
int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a symmetric eigensystem, not necessarily in tridiagonal form
int N; SizeSquare(Ain,N);
DenseMatrix<T > A; A = Ain;
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
Tri(A,Q,0);
int it = my_Wilkinson<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
template <class T>
int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_Wilkinson(Ain, evals, evecs, small);
}
template <class T>
int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_SymmEigensystem(Ain, evals, evecs, small);
}
template <class T>
int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a general eigensystem, not necessarily in tridiagonal form
int N = Ain.dim;
DenseMatrix<T > A(N); A = Ain;
DenseMatrix<T > Q(N);Q.Unity();
Hess(A,Q,0);
int it = QReigensystem<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
}
#endif

View File

@ -0,0 +1,215 @@
#ifndef HOUSEHOLDER_H
#define HOUSEHOLDER_H
#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
namespace Grid {
/** Comparison function for finding the max element in a vector **/
template <class T> bool cf(T i, T j) {
return abs(i) < abs(j);
}
/**
Calculate a real Givens angle
**/
template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
RealD mz = (RealD)abs(z);
if(mz==0.0){
c = 1; s = 0;
}
if(mz >= (RealD)abs(y)){
T t = -y/z;
s = (T)1.0 / sqrt ((T)1.0 + t * t);
c = s * t;
} else {
T t = -z/y;
c = (T)1.0 / sqrt ((T)1.0 + t * t);
s = c * t;
}
}
template <class T> inline void Givens_mult(DenseMatrix<T> &A, int i, int k, T c, T s, int dir)
{
int q ; SizeSquare(A,q);
if(dir == 0){
for(int j=0;j<q;j++){
T nu = A[i][j];
T w = A[k][j];
A[i][j] = (c*nu + s*w);
A[k][j] = (-s*nu + c*w);
}
}
if(dir == 1){
for(int j=0;j<q;j++){
T nu = A[j][i];
T w = A[j][k];
A[j][i] = (c*nu - s*w);
A[j][k] = (s*nu + c*w);
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
P | x | | x | k = 0
| x | | 0 |
| x | = | 0 |
| x | | 0 | j = 3
| x | | x |
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
{
int N ; Size(input,N);
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
if(abs(v[k]) > 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
else v[k] = -alpha;
} else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
Px = alpha*e_dir
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
{
int N = input.size();
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
else v[dir] = -alpha;
}else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
**/
template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
{
int N ; SizeSquare(A,N);
if(abs(beta) > 0.0){
for(int p=l; p<N; p++){
T s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
s *= beta;
for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
} else {
for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
s *= beta;
for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
}
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
A is tridiagonal
**/
template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
{
if(abs(beta) > 0.0){
int N ; SizeSquare(A,N);
DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0);
T s;
for(int p=l; p<M; p++){
s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
}else{
for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
}
s = beta*s;
if(trans==0){
for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
}else{
for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
}
}
for(int p=l; p<M; p++){
if(trans==0){
for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
}else{
for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
}
}
}
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,426 @@
#ifndef MATRIX_H
#define MATRIX_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <vector>
#include <iostream>
#include <iomanip>
#include <complex>
#include <typeinfo>
#include <Grid.h>
/** Sign function **/
template <class T> T sign(T p){return ( p/abs(p) );}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector = Vector<T>;
template<class T> using Matrix = Vector<Vector<T> >;
template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); }
template<class T> void Resize(Matrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Size(Vector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(Matrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(Matrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1)
{
int N2,M2;
Size(mat1,N1,M1);
Size(mat2,N2,M2);
assert(N1==N2);
assert(M1==M2);
}
//*****************************************
//* (Complex) Vector operations *
//*****************************************
/**Conj of a Vector **/
template <class T> Vector<T> conj(Vector<T> p){
Vector<T> q(p.size());
for(int i=0;i<p.size();i++){q[i] = conj(p[i]);}
return q;
}
/** Norm of a Vector**/
template <class T> T norm(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
return abs(sqrt(sum));
}
/** Norm squared of a Vector **/
template <class T> T norm2(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
return abs((sum));
}
/** Sum elements of a Vector **/
template <class T> T trace(Vector<T> p){
T sum = 0;
for(int i=0;i<p.size();i++){sum = sum + p[i];}
return sum;
}
/** Fill a Vector with constant c **/
template <class T> void Fill(Vector<T> &p, T c){
for(int i=0;i<p.size();i++){p[i] = c;}
}
/** Normalize a Vector **/
template <class T> void normalize(Vector<T> &p){
T m = norm(p);
if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;}
}
/** Vector by scalar **/
template <class T, class U> Vector<T> times(Vector<T> p, U s){
for(int i=0;i<p.size();i++){p[i] *= s;}
return p;
}
template <class T, class U> Vector<T> times(U s, Vector<T> p){
for(int i=0;i<p.size();i++){p[i] *= s;}
return p;
}
/** inner product of a and b = conj(a) . b **/
template <class T> T inner(Vector<T> a, Vector<T> b){
T m = 0.;
for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];}
return m;
}
/** sum of a and b = a + b **/
template <class T> Vector<T> add(Vector<T> a, Vector<T> b){
Vector<T> m(a.size());
for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];}
return m;
}
/** sum of a and b = a - b **/
template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){
Vector<T> m(a.size());
for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];}
return m;
}
/**
*********************************
* Matrices *
*********************************
**/
template<class T> void Fill(Matrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
Matrix<T> Transpose(Matrix<T> & mat){
int N,M;
Size(mat,N,M);
Matrix C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set Matrix to unit matrix **/
template<class T> void Unity(Matrix<T> &mat){
int N; SizeSquare(mat,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) A[i][j] = 1;
else A[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(Matrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
Matrix<T> HermitianConj(Matrix<T> &mat){
int dim; SizeSquare(mat,dim);
Matrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/** return diagonal entries as a Vector **/
Vector<T> diag(Matrix<T> &A)
{
int dim; SizeSquare(A,dim);
Vector<T> d; Resize(d,dim);
for(int i=0;i<dim;i++){
d[i] = A[i][i];
}
return d;
}
/** Left multiply by a Vector **/
Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
{
int K,M,N;
Size(B,K);
Size(A,M,N);
assert(K==M);
Vector<T> C; Resize(C,N);
for(int j=0;j<N;j++){
T sum = 0.0;
for(int i=0;i<M;i++){
sum += B[i] * A[i][j];
}
C[j] = sum;
}
return C;
}
/** return 1/diagonal entries as a Vector **/
Vector<T> inv_diag(Matrix<T> & A){
int dim; SizeSquare(A,dim);
Vector<T> d; Resize(d,dim);
for(int i=0;i<dim;i++){
d[i] = 1.0/A[i][i];
}
return d;
}
/** Matrix Addition **/
inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
{
int N,M ; SizeSame(A,B,N,M);
Matrix C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j] + B[i][j];
}
}
return C;
}
/** Matrix Subtraction **/
inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
int N,M ; SizeSame(A,B,N,M);
Matrix C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j] - B[i][j];
}}
return C;
}
/** Matrix scalar multiplication **/
inline Matrix<T> operator* (Matrix<T> & A,T c){
int N,M; Size(A,N,M);
Matrix C; Resize(C,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
C[i][j] = A[i][j]*c;
}}
return C;
}
/** Matrix Matrix multiplication **/
inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
int K,L,N,M;
Size(A,K,L);
Size(B,N,M); assert(L==N);
Matrix C; Resize(C,K,M);
for(int i=0;i<K;i++){
for(int j=0;j<M;j++){
T sum = 0.0;
for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
C[i][j] =sum;
}
}
return C;
}
/** Matrix Vector multiplication **/
inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
int M,N,K;
Size(A,N,M);
Size(B,K); assert(K==M);
Vector<T> C; Resize(C,N);
for(int i=0;i<N;i++){
T sum = 0.0;
for(int j=0;j<M;j++) sum += A[i][j]*B[j];
C[i] = sum;
}
return C;
}
/** Some version of Matrix norm **/
/*
inline T Norm(){ // this is not a usual L2 norm
T norm = 0;
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
norm += abs(A[i][j]);
}}
return norm;
}
*/
/** Some version of Matrix norm **/
template<class T> T LargestDiag(Matrix<T> &A)
{
int dim ; SizeSquare(A,dim);
T ld = abs(A[0][0]);
for(int i=1;i<dim;i++){
T cf = abs(A[i][i]);
if(abs(cf) > abs(ld) ){ld = cf;}
}
return ld;
}
/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
{
int dim; SizeSquare(A,dim);
for(int l = dim - 1 - offset; l >= 1; l--) {
if((U)abs(A[l][l - 1]) < (U)small) {
A[l][l-1]=(U)0.0;
return l;
}
}
return 0;
}
/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small)
{
int dim; SizeSquare(A,dim);
for(int l = dim - 1 - offset; l >= 1; l--) {
if((U)abs(A[l][l - 1]) < (U)small) {
A[l][l - 1] = (U)0.0;
A[l - 1][l] = (U)0.0;
return l;
}
}
return 0;
}
/**Assign a submatrix to a larger one**/
template<class T>
void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
{
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
A[i][j] = S[i - row_st][j - col_st];
}
}
}
/**Get a square submatrix**/
template <class T>
Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
Matrix<T> H; Resize(row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
/**Assign a submatrix to a larger one NB remember Vector Vectors are transposes of the matricies they represent**/
template<class T>
void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
{
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
A[i][j] = S[i - row_st][j - col_st];
}}
}
/** compute b_i A_ij b_j **/ // surprised no Conj
template<class T> T proj(Matrix<T> A, Vector<T> B){
int dim; SizeSquare(A,dim);
int dimB; Size(B,dimB);
assert(dimB==dim);
T C = 0;
for(int i=0;i<dim;i++){
T sum = 0.0;
for(int j=0;j<dim;j++){
sum += A[i][j]*B[j];
}
C += B[i]*sum; // No conj?
}
return C;
}
/*
*************************************************************
*
* Matrix Vector products
*
*************************************************************
*/
// Instead make a linop and call my CG;
/// q -> q Q
template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
{
int M; SizeSquare(Q,M);
int N; Size(q,N);
assert(M==N);
times(q,Q,N);
}
/// q -> q Q
template <class T> void times(multi1d<LatticeFermion> &q, Matrix<T> &Q, int N)
{
GridBase *grid = q[0]._grid;
int M; SizeSquare(Q,M);
int K; Size(q,K);
assert(N<M);
assert(N<K);
Vector<Fermion> S(N,grid );
for(int j=0;j<N;j++){
S[j] = zero;
for(int k=0;k<N;k++){
S[j] = S[j] + q[k]* Q[k][j];
}
}
for(int j=0;j<q.size();j++){
q[j] = S[j];
}
}
#endif

View File

@ -0,0 +1,122 @@
#include <math.h>
#include <stdlib.h>
#include <vector>
struct Bisection {
static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
{
int i,j;
std::vector<RealD> evec1(row_num+3);
std::vector<RealD> evec2(row_num+3);
RealD eps2;
ALPHA[1]=0.;
BETHA[1]=0.;
for(i=0;i<row_num-1;i++) {
ALPHA[i+1] = A[i*(row_num+1)].real();
BETHA[i+2] = A[i*(row_num+1)+1].real();
}
ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
bisec(ALPHA,BETHA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
bisec(ALPHA,BETHA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
// Do we really need to sort here?
int begin=1;
int end = row_num;
int swapped=1;
while(swapped) {
swapped=0;
for(i=begin;i<end;i++){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
end--;
for(i=end-1;i>=begin;i--){
if(mag(evec2[i])>mag(evec2[i+1])) {
swap(evec2+i,evec2+i+1);
swapped=1;
}
}
begin++;
}
for(i=0;i<row_num;i++){
for(j=0;j<row_num;j++) {
if(i==j) H[i*row_num+j]=evec2[i+1];
else H[i*row_num+j]=0.;
}
}
}
static void bisec(std::vector<RealD> &c,
std::vector<RealD> &b,
int n,
int m1,
int m2,
RealD eps1,
RealD relfeh,
std::vector<RealD> &x,
RealD &eps2)
{
std::vector<RealD> wu(n+2);
RealD h,q,x1,xu,x0,xmin,xmax;
int i,a,k;
b[1]=0.0;
xmin=c[n]-fabs(b[n]);
xmax=c[n]+fabs(b[n]);
for(i=1;i<n;i++){
h=fabs(b[i])+fabs(b[i+1]);
if(c[i]+h>xmax) xmax= c[i]+h;
if(c[i]-h<xmin) xmin= c[i]-h;
}
xmax *=2.;
eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
if(eps1<=0.0) eps1=eps2;
eps2=0.5*eps1+7.0*(eps2);
x0=xmax;
for(i=m1;i<=m2;i++){
x[i]=xmax;
wu[i]=xmin;
}
for(k=m2;k>=m1;k--){
xu=xmin;
i=k;
do{
if(xu<wu[i]){
xu=wu[i];
i=m1-1;
}
i--;
}while(i>=m1);
if(x0>x[k]) x0=x[k];
while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
x1=(xu+x0)/2;
a=0;
q=1.0;
for(i=1;i<=n;i++){
q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
if(q<0) a++;
}
// printf("x1=%e a=%d\n",x1,a);
if(a<k){
if(a<m1){
xu=x1;
wu[m1]=x1;
}else {
xu=x1;
wu[a+1]=x1;
if(x[a]>x1) x[a]=x1;
}
}else x0=x1;
}
x[k]=(x0+xu)/2;
}
}
}

View File

@ -0,0 +1 @@

View File

@ -29,16 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int bo = n*rhs._grid->_slice_block[dimension];
buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
}
}
}
}
}
@ -59,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if ( cbmask ==0x3){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int o=n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb & cbmask ) {
cobj temp;
temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
} else {
assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int offset = b+n*rhs._grid->_slice_block[dimension];
if ( ocb & cbmask ) {
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
extract<cobj>(temp,pointers,offset);
}
}
}
}
@ -109,16 +135,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if ( cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
}
}
} else {
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
}
}
}
@ -137,16 +175,28 @@ PARALLEL_NESTED_LOOP2
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
if(cbmask ==0x3 ) {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
}
} else {
assert(0); // think this is buggy FIXME
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
}
}
}
}
}
@ -166,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
if(cbmask == 0x3 ){
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
int o =n*rhs._grid->_slice_stride[dimension]+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
} else {
PARALLEL_NESTED_LOOP2
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension]+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
}
}
}
}

View File

@ -9,7 +9,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
@ -26,10 +26,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if ( !comm_dim ) {
// std::cout << "Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "Cshift_comms_simd" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
@ -42,9 +45,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
// std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
@ -113,12 +120,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
// for(int i=0;i<words;i++){
// std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << " 0x" << cbmask<<std::endl;
// }
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}

View File

@ -212,9 +212,10 @@ PARALLEL_FOR_LOOP
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
// _odata.reserve(_grid->oSites());
// _odata.resize(_grid->oSites());
// std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}

View File

@ -1,11 +1,10 @@
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include <arpa/inet.h>
#include <algorithm>
// 64bit endian swap is a portability pain

View File

@ -26,7 +26,7 @@ namespace Grid {
// and Methods:
// void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
// void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
// void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
// void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
//
@ -101,6 +101,7 @@ namespace Grid {
typedef typename Impl::SiteSpinor SiteSpinor; \
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams;
///////
@ -112,7 +113,6 @@ namespace Grid {
typedef ImplGauge<S,Nrepresentation> Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
@ -128,10 +128,11 @@ namespace Grid {
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
ImplParams Params;
WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
mult(&phi(),&U(mu),&chi());
}
@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
typedef GparityWilsonImplParams ImplParams;
ImplParams Params;
GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {};
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj;

View File

@ -109,7 +109,7 @@ namespace QCD {
///////////////////////////////////
template<class Impl>
void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -123,7 +123,7 @@ namespace QCD {
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
for(int mu=0;mu<Nd;mu++){
@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
Compressor compressor(dag);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
template<class Impl>
void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor(dag);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
st.HaloExchange(in,comm_buf,compressor);
if ( dag == DaggerYes ) {
if( HandOptDslash ) {

View File

@ -73,14 +73,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) ;
@ -108,9 +108,9 @@ namespace Grid {
GridBase * _cbgrid;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -1,4 +1,5 @@
#include <Grid.h>
#include <PerfCount.h>
namespace Grid {
namespace QCD {
@ -7,6 +8,7 @@ namespace QCD {
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF.
template<class Impl>
@ -68,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
ImportGauge(_Umu);
commtime=0;
dslashtime=0;
}
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
@ -85,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
Compressor compressor(DaggerNo);
Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
Stencil.HaloExchange(in,comm_buf,compressor);
int skip = (disp==1) ? 0 : 1;
@ -105,7 +109,7 @@ PARALLEL_FOR_LOOP
};
template<class Impl>
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
@ -122,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
st.HaloExchange(B,comm_buf,compressor);
Atilde=A;
@ -194,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Halo time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil All time "<<Stencil.halotime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commstime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge time "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil buf time "<<Stencil.buftime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
@ -212,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
@ -220,19 +245,32 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
Compressor compressor(dag);
st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int threads = GridThread::GetThreads();
int HT = GridThread::GetHyperThreads();
int cores = GridThread::GetCores();
int nwork = U._grid->oSites();
commtime -=usecond();
st.HaloExchange(in,comm_buf,compressor);
commtime +=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Not loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
dslashtime -=usecond();
if ( dag == DaggerYes ) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
#pragma omp parallel for schedule(static)
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sU=ss;
if ( LebesgueOrder::UseLebesgueOrder ) {
sU=lo.Reorder(ss);
}
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
}
@ -251,16 +289,79 @@ PARALLEL_FOR_LOOP
}
}
} else {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
if( this->AsmOptDslash ) {
// for(int i=0;i<1;i++){
// for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
// PerformanceCounter Counter(i);
// Counter.Start();
#pragma omp parallel for
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff,ssoff, sU,sF;
GridThread::GetWork(nwork,core,sswork,ssoff,cores);
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
for(int s=soff;s<soff+swork;s++){
sU=ss+ ssoff;
if ( LebesgueOrder::UseLebesgueOrder ) {
sU = lo.Reorder(sU);
}
sF = s+Ls*sU;
Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
}
}
}
// Counter.Stop();
// Counter.Report();
// }
} else if( this->HandOptDslash ) {
#pragma omp parallel for
for(int t=0;t<threads;t++){
int hyperthread = t%HT;
int core = t/HT;
int sswork, swork,soff, sU,sF;
sswork = (nwork + cores-1)/cores;
GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
for(int ss=0;ss<sswork;ss++){
sU=ss+core*sswork; // max locality within an L2 slice
if ( LebesgueOrder::UseLebesgueOrder ) {
sU = lo.Reorder(sU);
}
if ( sU < nwork ) {
for(int s=soff;s<soff+swork;s++){
sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
}
}
}
}
/*
#pragma omp parallel for schedule(static)
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
// int sU=lo.Reorder(ss);
int sU=ss;
if ( LebesgueOrder::UseLebesgueOrder ) {
sU=lo.Reorder(ss);
}
int sF = s+Ls*sU;
Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
}
}
*/
} else {
PARALLEL_FOR_LOOP
@ -274,6 +375,7 @@ PARALLEL_FOR_LOOP
}
}
}
dslashtime +=usecond();
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)

View File

@ -19,6 +19,7 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions;
static const std::vector<int> displacements;
@ -31,7 +32,8 @@ namespace Grid {
public:
INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels;
double commtime;
double dslashtime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@ -72,14 +74,14 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(CartesianStencil & st,
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(CartesianStencil & st,
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
@ -97,6 +99,7 @@ namespace Grid {
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
void Report(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
@ -112,9 +115,9 @@ namespace Grid {
int Ls;
//Defines the stencils for even and odd
CartesianStencil Stencil;
CartesianStencil StencilEven;
CartesianStencil StencilOdd;
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@ -3,7 +3,7 @@ namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -78,7 +78,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
}
Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
accumReconXm(result,Uchi);
// Ym
SE=st.GetEntry(ptype,Ym,sF);
if ( SE->_is_local && SE->_permute ) {
@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
};
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
}
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
{

View File

@ -17,36 +17,47 @@ namespace Grid {
typedef FermionOperator<Impl> Base;
public:
void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in,FermionField &out);
void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
#if defined(AVX512) || defined(IMCI)
void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
#else
void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
#endif
#define HANDOPT
#ifdef HANDOPT
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out);
#else
void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
}
void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int sF,int sU,const FermionField &in, FermionField &out)
{

View File

@ -0,0 +1,317 @@
#include <Grid.h>
#if defined(AVX512) || defined (IMCI)
#include <simd/Avx512Asm.h>
#undef VLOAD
#undef VSTORE
#undef VMUL
#undef VMADD
#undef ZEND
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef VZERO
#undef VTIMESI
#undef VTIMESMINUSI
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
namespace Grid {
namespace QCD {
template<class Impl>
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
{
uint64_t now;
uint64_t first ;
int offset,local,perm, ptype;
const SiteHalfSpinor *pbuf = & buf[0];
const SiteSpinor *plocal = & in._odata[0];
void *pf;
int osites = in._grid->oSites();
StencilEntry *SE;
//#define STAMP(i) timers[i] = __rdtsc() ;
#define STAMP(i) //timers[i] = __rdtsc() ;
MASK_REGS;
first = __rdtsc();
SE=st.GetEntry(ptype,Xm,ss);
#if 0
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
LOAD64(%r9,pf);
__asm__(
VPREFETCH(0,%r9)
VPREFETCH(1,%r9)
VPREFETCH(2,%r9)
VPREFETCH(3,%r9)
VPREFETCH(4,%r9)
VPREFETCH(5,%r9)
VPREFETCH(6,%r9)
VPREFETCH(7,%r9)
VPREFETCH(8,%r9)
VPREFETCH(9,%r9)
VPREFETCH(10,%r9)
VPREFETCH(11,%r9) );
#endif
// Xm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Ym,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXM(Xm,pf);
}
XM_RECON;
// Ym
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYM(Ym,pf);
}
YM_RECON_ACCUM;
// Zm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Tm,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZM(Zm,pf);
}
ZM_RECON_ACCUM;
// Tm
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
SE=st.GetEntry(ptype,Tp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TM_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTM(Tm,pf);
}
TM_RECON_ACCUM;
// Tp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Zp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
TP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFTP(Tp,pf);
}
TP_RECON_ACCUM;
// Zp
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Yp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
ZP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFZP(Zp,pf);
}
ZP_RECON_ACCUM;
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
// Prefetch
SE=st.GetEntry(ptype,Xp,ss);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
YP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFYP(Yp,pf);
}
YP_RECON_ACCUM;
// Xp
perm = SE->_permute;
offset = SE->_offset;
local = SE->_is_local;
// PREFETCH_R(A);
// Prefetch
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
else pf=(void *)&pbuf[SE->_offset];
if ( local ) {
XP_PROJMEM(&plocal[offset]);
if ( perm) {
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(&pbuf[offset]);
}
{
MULT_2SPIN_DIR_PFXP(Xp,pf);
}
XP_RECON_ACCUM;
debug:
SAVE_RESULT(&out._odata[ss]);
}
template class WilsonKernels<WilsonImplF>;
template class WilsonKernels<WilsonImplD>;
}}
#endif

View File

@ -282,7 +282,7 @@ namespace QCD {
#ifdef HANDOPT
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
}
template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out)
{

View File

@ -1,49 +1,211 @@
#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
#define GRID_SERIALISATION_ABSTRACT_READER_H
#include <type_traits>
namespace Grid {
class Writer {
public:
virtual ~Writer() {};
virtual void push(const std::string &s) = 0;
virtual void pop(void) =0;
virtual void write( const std::string& s,const std::string &output ) =0;
virtual void write( const std::string& s,const int16_t output ) =0;
virtual void write( const std::string& s,const uint16_t output ) =0;
virtual void write( const std::string& s,const int32_t output ) =0;
virtual void write( const std::string& s,const uint32_t output ) =0;
virtual void write( const std::string& s,const int64_t output ) =0;
virtual void write( const std::string& s,const uint64_t output ) =0;
virtual void write( const std::string& s,const float output ) =0;
virtual void write( const std::string& s,const double output ) =0;
virtual void write( const std::string& s,const bool output ) =0;
};
class Reader {
public:
virtual ~Reader() {};
virtual void read( const std::string& s,std::string &output ) =0;
virtual void push(const std::string &s) =0;
virtual void pop(void) = 0;
virtual void read( const std::string& s, int16_t &output ) =0;
virtual void read( const std::string& s, uint16_t &output ) =0;
virtual void read( const std::string& s, int32_t &output ) =0;
virtual void read( const std::string& s, uint32_t &output ) =0;
virtual void read( const std::string& s, int64_t &output ) =0;
virtual void read( const std::string& s, uint64_t &output ) =0;
virtual void read( const std::string& s, float &output ) =0;
virtual void read( const std::string& s, double &output ) =0;
virtual void read( const std::string& s, bool &output ) =0;
class Serializable {};
// static polymorphism implemented using CRTP idiom
// Static abstract writer
template <typename T>
class Writer
{
public:
Writer(void);
virtual ~Writer(void) = default;
void push(const std::string &s);
void pop(void);
template <typename U>
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output);
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output);
private:
T *upcast;
};
// Static abstract reader
template <typename T>
class Reader
{
public:
Reader(void);
virtual ~Reader(void) = default;
void push(const std::string &s);
void pop(void);
template <typename U>
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output);
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output);
protected:
template <typename U>
void fromString(U &output, const std::string &s);
private:
T *upcast;
};
// Generic writer interface
template <typename T>
inline void push(Writer<T> &w, const std::string &s)
{
w.push(s);
}
template <typename T>
inline void push(Writer<T> &w, const char *s)
{
w.push(std::string(s));
}
template <typename T>
inline void pop(Writer<T> &w)
{
w.pop();
}
template <typename T, typename U>
inline void write(Writer<T> &w, const std::string& s, const U &output)
{
w.write(s, output);
}
// Generic reader interface
template <typename T>
inline void push(Reader<T> &r, const std::string &s)
{
r.push(s);
}
template <typename T>
inline void push(Reader<T> &r, const char *s)
{
r.push(std::string(s));
}
template <typename T>
inline void pop(Reader<T> &r)
{
r.pop();
}
template <typename T, typename U>
inline void read(Reader<T> &r, const std::string &s, U &output)
{
r.read(s, output);
}
template < class T >
inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (auto &x: v)
{
os << x << " ";
}
if (v.size() > 0)
{
os << "\b";
}
os << "]";
return os;
}
// Writer template implementation ////////////////////////////////////////////
template <typename T>
Writer<T>::Writer(void)
{
upcast = static_cast<T *>(this);
}
template <typename T>
void Writer<T>::push(const std::string &s)
{
upcast->push(s);
}
template <typename T>
void Writer<T>::pop(void)
{
upcast->pop();
}
template <typename T>
template <typename U>
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
Writer<T>::write(const std::string &s, const U &output)
{
U::write(*this, s, output);
}
template <typename T>
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
Writer<T>::write(const std::string &s, const U &output)
{
upcast->writeDefault(s, output);
}
// Reader template implementation ////////////////////////////////////////////
template <typename T>
Reader<T>::Reader(void)
{
upcast = static_cast<T *>(this);
}
template <typename T>
void Reader<T>::push(const std::string &s)
{
upcast->push(s);
}
template <typename T>
void Reader<T>::pop(void)
{
upcast->pop();
}
template <typename T>
template <typename U>
typename std::enable_if<std::is_base_of<Serializable, U>::value, void>::type
Reader<T>::read(const std::string &s, U &output)
{
U::read(*this, s, output);
}
template <typename T>
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
Reader<T>::read(const std::string &s, U &output)
{
upcast->readDefault(s, output);
}
template <typename T>
template <typename U>
void Reader<T>::fromString(U &output, const std::string &s)
{
std::istringstream is(s);
is.exceptions(std::ios::failbit);
try
{
is >> std::boolalpha >> output;
}
catch(std::istringstream::failure &e)
{
std::cerr << "numerical conversion failure on '" << s << "' ";
std::cerr << "(typeid: " << typeid(U).name() << ")" << std::endl;
abort();
}
}
};
}
#endif

View File

@ -0,0 +1,36 @@
#include <Grid.h>
using namespace Grid;
using namespace std;
// Writer implementation ///////////////////////////////////////////////////////
BinaryWriter::BinaryWriter(const string &fileName)
: file_(fileName, ios::binary|ios::out)
{}
template <>
void BinaryWriter::writeDefault(const string &s, const string &output)
{
uint64_t sz = output.size();
write("", sz);
for (uint64_t i = 0; i < sz; ++i)
{
write("", output[i]);
}
}
// Reader implementation ///////////////////////////////////////////////////////
BinaryReader::BinaryReader(const string &fileName)
: file_(fileName, ios::binary|ios::in)
{}
template <>
void BinaryReader::readDefault(const string &s, string &output)
{
uint64_t sz;
read("", sz);
output.reserve(sz);
file_.read((char *)output.data(), sz);
}

View File

@ -9,101 +9,76 @@
#include <vector>
#include <cassert>
namespace Grid {
class BinaryWriter : public Writer {
private:
std::ofstream file;
public:
BinaryWriter(const std::string &_file) : file(_file,std::ios::binary|std::ios::out) {}
~BinaryWriter() {}
// Binary is scopeless
void push(const std::string &s) {}
void pop(void) {}
void write(const std::string& s,const std::string &output) {
uint32_t sz = output.size();
write(s,sz);
const char * cstr = output.c_str();
for(int c=0;c<output.size();c++){
write(s,cstr[c]);
}
class BinaryWriter: public Writer<BinaryWriter>
{
public:
BinaryWriter(const std::string &fileName);
virtual ~BinaryWriter(void) = default;
void push(const std::string &s) {};
void pop(void) {};
template <typename U>
void writeDefault(const std::string &s, const U &x);
template <typename U>
void writeDefault(const std::string &s, const std::vector<U> &x);
private:
std::ofstream file_;
};
void write( const std::string& s,const char output ) { writeInternal(s,output); };
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
// FIXME --- htons, htonl, htno64 etc..
file.write((char *)&output,sizeof(T));
class BinaryReader: public Reader<BinaryReader>
{
public:
BinaryReader(const std::string &fileName);
virtual ~BinaryReader(void) = default;
void push(const std::string &s) {};
void pop(void) {};
template <typename U>
void readDefault(const std::string &s, U &output);
template <typename U>
void readDefault(const std::string &s, std::vector<U> &output);
private:
std::ifstream file_;
};
// Writer template implementation ////////////////////////////////////////////
template <typename U>
void BinaryWriter::writeDefault(const std::string &s, const U &x)
{
file_.write((char *)&x, sizeof(U));
}
};
class BinaryReader : public Reader{
private:
std::ifstream file;
public:
BinaryReader(const std::string &_file) : file(_file,std::ios::binary|std::ios::in) {}
~BinaryReader() {}
// Binary is scopeless
void push(const std::string &s) { }
void pop(void) { }
void read( const std::string& s,std::string &output ) {
output.clear();
uint32_t sz;
file.read((char *)&sz,sizeof(sz));
for(int c=0;c<sz;c++){
char ch;
file.read(&ch,sizeof(ch));
output.push_back(ch);
template <typename U>
void BinaryWriter::writeDefault(const std::string &s, const std::vector<U> &x)
{
uint64_t sz = x.size();
write("", sz);
for (uint64_t i = 0; i < sz; ++i)
{
write("", x[i]);
}
}
// Reader template implementation ////////////////////////////////////////////
template <typename U>
void BinaryReader::readDefault(const std::string &s, U &output)
{
file_.read((char *)&output, sizeof(U));
}
template <typename U>
void BinaryReader::readDefault(const std::string &s, std::vector<U> &output)
{
uint64_t sz;
read("", sz);
output.resize(sz);
for (uint64_t i = 0; i < sz; ++i)
{
read("", output[i]);
}
};
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
file.read((char *)&output,sizeof(output)); // byte order??
}
};
}
#endif

View File

@ -111,8 +111,8 @@ THE SOFTWARE.
#define GRID_MACRO_MEMBER(A,B) A B;
#define GRID_MACRO_OS_WRITE_MEMBER(A,B) os<< #A <<" "#B <<" = "<< obj. B <<" ; " <<std::endl;
#define GRID_MACRO_READ_MEMBER(A,B) read(RD,#B,obj. B);
#define GRID_MACRO_WRITE_MEMBER(A,B) write(WR,#B,obj. B);
#define GRID_MACRO_READ_MEMBER(A,B) Grid::read(RD,#B,obj. B);
#define GRID_MACRO_WRITE_MEMBER(A,B) Grid::write(WR,#B,obj. B);
#define GRID_DECL_CLASS_MEMBERS(cname,...) \
\
@ -120,14 +120,16 @@ THE SOFTWARE.
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__)) \
\
\
friend void write(Writer &WR,const std::string &s, const cname &obj){ \
template <typename T>\
static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
push(WR,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
pop(WR);\
} \
\
\
friend void read(Reader &RD,const std::string &s, cname &obj){ \
template <typename T>\
static void read(Reader<T> &RD,const std::string &s, cname &obj){ \
push(RD,s);\
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
pop(RD);\

View File

@ -5,85 +5,6 @@
#include <serialisation/BaseIO.h>
#include <stdint.h>
namespace Grid {
// Generic reader writer interface
inline void push(Writer & WR,const std::string &s) { WR.push(s);}
inline void push(Writer & WR,const char *s) { WR.push(std::string(s));}
inline void pop (Writer & WR) { WR.pop();}
// inline void write(Writer& wr, const std::string& s,const char * output ) { wr.write(s,std::string(output)); };
inline void write(Writer& wr, const std::string& s,const std::string &output) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint16_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint32_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const int64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const uint64_t output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const float output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const double output ) { wr.write(s,output); };
inline void write(Writer& wr, const std::string& s,const bool output ) { wr.write(s,output); };
inline void push(Reader & WR,const std::string &s) { WR.push(s);}
inline void push(Reader & WR,const char *s) { WR.push(std::string(s));}
inline void pop (Reader & WR) { WR.pop();}
inline void read(Reader& rd, const std::string& s,std::string &output) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int16_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint16_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int32_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint32_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, int64_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, uint64_t &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, float &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, double &output ) { rd.read(s,output); };
inline void read(Reader& rd, const std::string& s, bool &output ) { rd.read(s,output); };
template<class T> void write(Writer& wr, const std::string& s,const std::vector<T> output ) {
push(wr,s);
uint64_t sz =output.size();
write(wr,"N",sz);
for(int i=0;i<output.size();i++){
std::ostringstream oss; oss << "elem" << i;
write(wr,oss.str(),output[i]);
}
pop(wr);
};
template<class T>
void read(Reader& rd, const std::string& s,std::vector<T> &output ) {
push(rd,s);
uint64_t sz; read(rd,"N",sz);
// skip the vector length
T tmp;
output.resize(0);
for(int i=0;i<sz;i++){
std::ostringstream oss; oss << "elem" << i;
read(rd,oss.str(),tmp);
output.push_back(tmp);
}
pop(rd);
};
template < class T >
inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (typename std::vector<T>::const_iterator ii = v.begin(); ii != v.end(); ++ii)
{
os << " " << *ii;
}
os << " ]";
return os;
}
}
//////////////////////////////////////////
// Todo:
//////////////////////////////////////////
@ -97,7 +18,7 @@ namespace Grid {
// Select the default serialiser use ifdef's
//////////////////////////////////////////
namespace Grid {
typedef XMLReader DefaultReader;
typedef XMLWriter DefaultWriter;
typedef XmlReader DefaultReader;
typedef XmlWriter DefaultWriter;
}
#endif

View File

@ -0,0 +1,66 @@
#include <Grid.h>
using namespace Grid;
using namespace std;
// Writer implementation ///////////////////////////////////////////////////////
TextWriter::TextWriter(const string &fileName)
: file_(fileName, ios::out)
{}
void TextWriter::push(const string &s)
{
level_++;
};
void TextWriter::pop(void)
{
level_--;
};
void TextWriter::indent(void)
{
for (int i = 0; i < level_; ++i)
{
file_ << '\t';
}
};
// Reader implementation ///////////////////////////////////////////////////////
TextReader::TextReader(const string &fileName)
: file_(fileName, ios::in)
{}
void TextReader::push(const string &s)
{
level_++;
};
void TextReader::pop(void)
{
level_--;
};
void TextReader::checkIndent(void)
{
char c;
for (int i = 0; i < level_; ++i)
{
file_.get(c);
if (c != '\t')
{
cerr << "mismatch on tab " << c << " level " << level_;
cerr << " i "<< i <<endl;
abort();
}
}
}
template <>
void TextReader::readDefault(const string &s, string &output)
{
checkIndent();
output.clear();
getline(file_, output);
}

View File

@ -9,140 +9,88 @@
#include <vector>
#include <cassert>
namespace Grid {
class TextWriter : public Writer {
private:
std::ofstream file;
int level;
void indent(void) {
for(int i=0;i<level;i++){
file <<"\t";
}
}
public:
TextWriter(const std::string &_file) : file(_file,std::ios::out) {
level=0;
}
~TextWriter() { }
void push(const std::string &s)
namespace Grid
{
class TextWriter: public Writer<TextWriter>
{
// std::string tmp = s;
// write(s,tmp);
level++;
}
void pop(void) {
level--;
}
void write( const std::string& s,const std::string &output ) {
indent();
file<<output<<std::endl;
public:
TextWriter(const std::string &fileName);
virtual ~TextWriter(void) = default;
void push(const std::string &s);
void pop(void);
template <typename U>
void writeDefault(const std::string &s, const U &output);
template <typename U>
void writeDefault(const std::string &s, const std::vector<U> &output);
private:
void indent(void);
private:
std::ofstream file_;
int level_{0};
};
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
class TextReader: public Reader<TextReader>
{
public:
TextReader(const std::string &fileName);
virtual ~TextReader(void) = default;
void push(const std::string &s);
void pop(void);
template <typename U>
void readDefault(const std::string &s, U &output);
template <typename U>
void readDefault(const std::string &s, std::vector<U> &output);
private:
void checkIndent(void);
private:
std::ifstream file_;
int level_{0};
};
// Writer template implementation ////////////////////////////////////////////
template <typename U>
void TextWriter::writeDefault(const std::string &s, const U &output)
{
indent();
file << std::boolalpha << output<<std::endl;
file_ << std::boolalpha << output << std::endl;
}
};
class TextReader : public Reader {
private:
std::ifstream file;
int level;
public:
TextReader(const std::string &_file) : file(_file,std::ios::in) { level = 0;};
~TextReader() { }
void read( const std::string& s,std::string &output ) {
char c='a';
for(int i=0;i<level;i++){
file.get(c);
if ( c != '\t' )
std::cout << "mismatch on tab "<<c<<" level "<< level<< " i "<< i<<std::endl;
}
output.clear();
std::getline(file,output);
};
void push(const std::string &s) {
// std::string tmp; read(s,tmp);
level++;
}
void pop(void) { level--; }
template<class T>
void read( const std::string& s, std::vector<T> &output ) {
template <typename U>
void TextWriter::writeDefault(const std::string &s, const std::vector<U> &output)
{
uint64_t sz = output.size();
push(s);
uint64_t n; read("N",n);
// skip the vector length
T tmp;
output.resize(0);
for(int i=0;i<n;i++){
std::ostringstream oss; oss << "elem" << i;
read(*this,oss.str(),tmp);
output.push_back(tmp);
write(s, sz);
for (uint64_t i = 0; i < sz; ++i)
{
write(s, output[i]);
}
pop();
};
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
std::string asString;
read(path,asString);
convert(asString,output);
}
template<class T> void convert(const std::string &asString,T &output)
{
std::istringstream is(asString); is.exceptions(std::ios::failbit);
try {
is >> std::boolalpha >> output;
} catch(std::istringstream::failure e) {
std::cerr << "XML read failure on "<<" "<<asString<<" "<<typeid(T).name()<<std::endl;
}
assert( is.tellg()==-1);
}
};
// Reader template implementation ////////////////////////////////////////////
template <typename U>
void TextReader::readDefault(const std::string &s, U &output)
{
std::string buf;
readDefault(s, buf);
fromString(output, buf);
}
template <typename U>
void TextReader::readDefault(const std::string &s, std::vector<U> &output)
{
uint64_t sz;
read("", sz);
output.resize(sz);
for (uint64_t i = 0; i < sz; ++i)
{
read("", output[i]);
}
}
}
#endif

View File

@ -0,0 +1,59 @@
#include <Grid.h>
using namespace Grid;
using namespace std;
// Writer implementation ///////////////////////////////////////////////////////
XmlWriter::XmlWriter(const string &fileName)
: fileName_(fileName)
{
node_ = doc_.append_child();
node_.set_name("grid");
}
XmlWriter::~XmlWriter(void)
{
doc_.save_file(fileName_.c_str(), " ");
}
void XmlWriter::push(const string &s)
{
node_ = node_.append_child(s.c_str());
}
void XmlWriter::pop(void)
{
node_ = node_.parent();
}
// Reader implementation ///////////////////////////////////////////////////////
XmlReader::XmlReader(const string &fileName)
: fileName_(fileName)
{
pugi::xml_parse_result result = doc_.load_file(fileName_.c_str());
if ( !result )
{
cerr << "XML error description: " << result.description() << "\n";
cerr << "XML error offset : " << result.offset << "\n";
abort();
}
node_ = doc_.child("grid");
}
void XmlReader::push(const string &s)
{
node_ = node_.child(s.c_str());
}
void XmlReader::pop(void)
{
node_ = node_.parent();
}
template <>
void XmlReader::readDefault(const string &s, string &output)
{
output = node_.child(s.c_str()).first_child().value();
}

View File

@ -11,136 +11,93 @@
#include "pugixml/pugixml.h"
namespace Grid {
class XMLWriter : public Writer {
private:
pugi::xml_document doc;
pugi::xml_node node;
std::string file;
public:
XMLWriter(const std::string &_file) : file(_file)
namespace Grid
{
class XmlWriter: public Writer<XmlWriter>
{
node=doc.append_child();
node.set_name("document");
}
~XMLWriter()
{
// simple_walker walker;
// doc.traverse(walker);
doc.save_file(file.c_str()," ");
}
void push(const std::string &s)
{
node = node.append_child(s.c_str());
}
void pop(void) {
node = node.parent();
}
void write( const std::string& s,const std::string &output ) {
pugi::xml_node leaf=node.append_child(s.c_str());
leaf.append_child(pugi::node_pcdata).set_value(output.c_str());
public:
XmlWriter(const std::string &fileName);
virtual ~XmlWriter(void);
void push(const std::string &s);
void pop(void);
template <typename U>
void writeDefault(const std::string &s, const U &x);
template <typename U>
void writeDefault(const std::string &s, const std::vector<U> &x);
private:
pugi::xml_document doc_;
pugi::xml_node node_;
std::string fileName_;
};
void write( const std::string& s,const int16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint16_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint32_t output ) { writeInternal(s,output); };
void write( const std::string& s,const int64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const uint64_t output ) { writeInternal(s,output); };
void write( const std::string& s,const float output ) { writeInternal(s,output); };
void write( const std::string& s,const double output ) { writeInternal(s,output); };
void write( const std::string& s,const bool output ) { writeInternal(s,output); };
private:
template<class T> void writeInternal( const std::string& s,const T output ){
class XmlReader: public Reader<XmlReader>
{
public:
XmlReader(const std::string &fileName);
virtual ~XmlReader(void) = default;
void push(const std::string &s);
void pop(void);
template <typename U>
void readDefault(const std::string &s, U &output);
template <typename U>
void readDefault(const std::string &s, std::vector<U> &output);
private:
pugi::xml_document doc_;
pugi::xml_node node_;
std::string fileName_;
};
// Writer template implementation ////////////////////////////////////////////
template <typename U>
void XmlWriter::writeDefault(const std::string &s, const U &x)
{
std::ostringstream os;
os << std::boolalpha << output;
write(s,os.str());
os << std::boolalpha << x;
pugi::xml_node leaf = node_.append_child(s.c_str());
leaf.append_child(pugi::node_pcdata).set_value(os.str().c_str());
}
};
class XMLReader : public Reader {
private:
pugi::xml_document doc;
pugi::xml_node node;
public:
XMLReader(const std::string &_file)
template <typename U>
void XmlWriter::writeDefault(const std::string &s, const std::vector<U> &x)
{
pugi::xml_parse_result result = doc.load_file(_file.c_str());
if ( !result ) {
std::cout << "XML error description: " << result.description() << "\n";
std::cout << "XML error offset: " << result.offset << "\n";
push(s);
for (auto &x_i: x)
{
write("elem", x_i);
}
assert(result);
// simple_walker walker;
// doc.traverse(walker);
node= doc.child("document");
}
~XMLReader() { }
void read( const std::string& s,std::string &output ) {
output=node.child(s.c_str()).first_child().value();
};
void push(const std::string &s)
{
node = node.child(s.c_str());
}
void pop(void) {
node = node.parent();
}
void read( const std::string& s, int16_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint16_t &output ) { readInternal(s,output); };
void read( const std::string& s, int32_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint32_t &output ) { readInternal(s,output); };
void read( const std::string& s, int64_t &output ) { readInternal(s,output); };
void read( const std::string& s, uint64_t &output ) { readInternal(s,output); };
void read( const std::string& s, float &output ) { readInternal(s,output); };
void read( const std::string& s, double &output ) { readInternal(s,output); };
void read( const std::string& s, bool &output ) { readInternal(s,output); };
private:
template<class T> void readInternal( const std::string& path, T &output ){
std::string asString;
read(path,asString);
convert(asString,output);
}
template<class T> void convert(const std::string &asString,T &output)
{
std::istringstream is(asString); is.exceptions(std::ios::failbit);
try {
is >> std::boolalpha >> output;
} catch(std::istringstream::failure e) {
std::cerr << "XML read failure on "<<" "<<asString<<" "<<typeid(T).name()<<std::endl;
}
assert( is.tellg()==-1);
pop();
}
// Reader template implementation ////////////////////////////////////////////
template <typename U>
void XmlReader::readDefault(const std::string &s, U &output)
{
std::string buf;
readDefault(s, buf);
fromString(output, buf);
}
template <typename U>
void XmlReader::readDefault(const std::string &s, std::vector<U> &output)
{
pugi::xml_node nodeCpy;
std::string buf;
unsigned int i = 0;
push(s);
while (node_.child("elem"))
{
output.resize(i + 1);
read("elem", output[i]);
node_.child("elem").set_name("elem-done");
i++;
}
pop();
}
};
}
#endif

1112
lib/simd/Avx512Asm.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,9 @@
//----------------------------------------------------------------------
#include <immintrin.h>
#ifdef AVXFMA4
#include <x86intrin.h>
#endif
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
#ifndef _mm256_set_m128i
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
@ -132,7 +135,7 @@ namespace Optimization {
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
#if defined (AVX1) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
@ -146,7 +149,6 @@ namespace Optimization {
#if defined (AVX2)
return _mm256_add_epi32(a,b);
#endif
}
};
@ -161,7 +163,7 @@ namespace Optimization {
}
//Integer
inline __m256i operator()(__m256i a, __m256i b){
#if defined (AVX1)
#if defined (AVX1) || defined (AVXFMA4)
__m128i a0,a1;
__m128i b0,b1;
a0 = _mm256_extractf128_si256(a,0);
@ -182,6 +184,7 @@ namespace Optimization {
struct MultComplex{
// Complex float
inline __m256 operator()(__m256 a, __m256 b){
#if defined (AVX1)
__m256 ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
@ -190,6 +193,20 @@ namespace Optimization {
ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm256_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_ps(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
__m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
__m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
__m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
// Complex double
inline __m256d operator()(__m256d a, __m256d b){
@ -215,6 +232,7 @@ namespace Optimization {
IF IMM0[3] = 0
THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
*/
#if defined (AVX1)
__m256d ymm0,ymm1,ymm2;
ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
@ -222,10 +240,71 @@ namespace Optimization {
ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai b'11,11
ymm1 = _mm256_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm256_addsub_pd(ymm0,ymm1);
#endif
#if defined (AVXFMA4)
__m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
#if defined (AVX2)
__m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
__m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
#endif
}
};
#if 0
struct ComplexDot {
inline void Prep(__m256 ari,__m256 &air) {
cdotRIperm(ari,air);
}
inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
riir=air*b;
iirr=arr*b;
};
inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
mac(riir,air,b);
mac(iirr,ari,b);
}
inline void End(__m256 ari,__m256 &air) {
// cdotRI
}
};
#endif
struct Mult{
inline void mac(__m256 &a, __m256 b, __m256 c){
#if defined (AVX1)
a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_ps(b,c,a);
#endif
#if defined (AVX2)
a= _mm256_fmadd_ps( b, c, a);
#endif
}
inline void mac(__m256d &a, __m256d b, __m256d c){
#if defined (AVX1)
a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
#endif
#if defined (AVXFMA4)
a= _mm256_macc_pd(b,c,a);
#endif
#if defined (AVX2)
a= _mm256_fmadd_pd( b, c, a);
#endif
}
// Real float
inline __m256 operator()(__m256 a, __m256 b){
return _mm256_mul_ps(a,b);

View File

@ -8,10 +8,7 @@
//----------------------------------------------------------------------
#include <immintrin.h>
#ifndef KNC_ONLY_STORES
#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
#endif
namespace Optimization {
@ -59,14 +56,15 @@ namespace Optimization {
struct Vstream{
//Float
inline void operator()(float * a, __m512 b){
_mm512_storenrngo_ps(a,b);
//_mm512_stream_ps(a,b);
_mm512_store_ps(a,b);
}
//Double
inline void operator()(double * a, __m512d b){
_mm512_storenrngo_pd(a,b);
//_mm512_stream_pd(a,b);
_mm512_store_pd(a,b);
}
};
@ -180,6 +178,15 @@ namespace Optimization {
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);

View File

@ -157,6 +157,12 @@ namespace Optimization {
};
struct Mult{
inline float mac(float a, float b,double c){
return 0;
}
inline double mac(double a, double b,double c){
return 0;
}
// Real float
inline float operator()(float a, float b){
return 0;

View File

@ -9,12 +9,6 @@
#include <immintrin.h>
//#ifndef KNC_ONLY_STORES
//#define _mm512_storenrngo_ps _mm512_store_ps // not present in AVX512
//#define _mm512_storenrngo_pd _mm512_store_pd // not present in AVX512
//#endif
namespace Optimization {
struct Vsplat{
@ -197,6 +191,15 @@ namespace Optimization {
};
struct Mult{
inline void mac(__m512 &a, __m512 b, __m512 c){
a= _mm512_fmadd_ps( b, c, a);
}
inline void mac(__m512d &a, __m512d b, __m512d c){
a= _mm512_fmadd_pd( b, c, a);
}
// Real float
inline __m512 operator()(__m512 a, __m512 b){
return _mm512_mul_ps(a,b);

View File

@ -171,6 +171,12 @@ namespace Optimization {
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
return vaddq_f32(vmulq_f32(b,c),a);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
return vaddq_f64(vmulq_f64(b,c),a);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
}

View File

@ -171,6 +171,15 @@ namespace Optimization {
};
struct Mult{
inline void mac(__m128 &a, __m128 b, __m128 c){
a= _mm128_add_ps(_mm128_mul_ps(b,c),a);
}
inline void mac(__m128d &a, __m128d b, __m128d c){
a= _mm128_add_pd(_mm128_mul_pd(b,c),a);
}
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_mul_ps(a,b);

View File

@ -13,7 +13,7 @@
#ifdef SSE4
#include "Grid_sse4.h"
#endif
#if defined (AVX1)|| defined (AVX2)
#if defined (AVX1)|| defined (AVX2) || defined (AVXFMA4)
#include "Grid_avx.h"
#endif
#if defined AVX512
@ -133,7 +133,11 @@ namespace Grid {
///////////////////////////////////////////////
// mac, mult, sub, add, adj
///////////////////////////////////////////////
// FIXME -- alias this to an inline MAC struct.
friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }

View File

@ -1,8 +1,10 @@
#include <Grid.h>
#include <algorithm>
namespace Grid {
int LebesgueOrder::UseLebesgueOrder;
std::vector<int> LebesgueOrder::Block({2,2,2,2});
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
n--; // 1000 0011 --> 1000 0010
@ -15,12 +17,79 @@ LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
return n;
};
LebesgueOrder::LebesgueOrder(GridBase *grid)
LebesgueOrder::LebesgueOrder(GridBase *_grid)
{
grid = _grid;
if ( Block[0]==0) ZGraph();
else CartesianBlocking();
}
void LebesgueOrder::CartesianBlocking(void)
{
_LebesgueReorder.resize(0);
std::cout << GridLogMessage << " CartesianBlocking ";
for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
std::cout<<std::endl;
IndexInteger ND = grid->_ndimension;
assert(ND==4);
assert(ND==Block.size());
std::vector<IndexInteger> dims(ND);
std::vector<IndexInteger> xo(ND,0);
std::vector<IndexInteger> xi(ND,0);
for(IndexInteger mu=0;mu<ND;mu++){
dims[mu] = grid->_rdimensions[mu];
}
IterateO(ND,ND-1,xo,xi,dims);
};
void LebesgueOrder::IterateO(int ND,int dim,
std::vector<IndexInteger> & xo,
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims)
{
for(xo[dim]=0;xo[dim]<dims[dim];xo[dim]+=Block[dim]){
if ( dim > 0 ) {
IterateO(ND,dim-1,xo,xi,dims);
} else {
IterateI(ND,ND-1,xo,xi,dims);
}
}
};
void LebesgueOrder::IterateI(int ND,
int dim,
std::vector<IndexInteger> & xo,
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims)
{
std::vector<IndexInteger> x(ND);
for(xi[dim]=0;xi[dim]<std::min(dims[dim]-xo[dim],Block[dim]);xi[dim]++){
if ( dim > 0 ) {
IterateI(ND,dim-1,xo,xi,dims);
} else {
for(int d=0;d<ND;d++){
x[d]=xi[d]+xo[d];
}
IndexInteger index;
grid->IndexFromCoor(x,index,grid->_rdimensions);
_LebesgueReorder.push_back(index);
}
}
}
void LebesgueOrder::ZGraph(void)
{
_LebesgueReorder.resize(0);
// Align up dimensions to power of two.
const IndexInteger one=1;
IndexInteger ND = grid->_ndimension;
std::vector<IndexInteger> dims(ND);
std::vector<IndexInteger> adims(ND);
@ -36,17 +105,7 @@ LebesgueOrder::LebesgueOrder(GridBase *grid)
// i) avoids recursion
// ii) has loop lengths at most the width of a 32 bit word.
int sitebit=0;
int split=2;
for(int mu=0;mu<ND;mu++){ // mu 0 takes bit 0; mu 1 takes bit 1 etc...
for(int bit=0;bit<split;bit++){
IndexInteger mask = one<<bit;
if ( mask&(adims[mu]-1) ){
bitlist[mu].push_back(sitebit);
sitebit++;
}
}
}
for(int bit=split;bit<32;bit++){
for(int bit=0;bit<32;bit++){
IndexInteger mask = one<<bit;
for(int mu=0;mu<ND;mu++){ // mu 0 takes bit 0; mu 1 takes bit 1 etc...
if ( mask&(adims[mu]-1) ){
@ -94,10 +153,22 @@ LebesgueOrder::LebesgueOrder(GridBase *grid)
+ dims[0]*ax[1]
+dims[0]*dims[1]*ax[2]
+dims[0]*dims[1]*dims[2]*ax[3];
assert(site < vol);
_LebesgueReorder.push_back(site);
}
}
}
assert( _LebesgueReorder.size() == vol );
std::vector<int> coor(4);
for(IndexInteger asite=0;asite<vol;asite++){
grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
std::cout << " site "<<asite << "->" << _LebesgueReorder[asite]<< " = ["
<< coor[0]<<","
<< coor[1]<<","
<< coor[2]<<","
<< coor[3]<<"]"
<<std::endl;
}
}
}

View File

@ -9,17 +9,37 @@ namespace Grid {
class LebesgueOrder {
public:
typedef int32_t IndexInteger;
static int UseLebesgueOrder;
GridBase *grid;
typedef uint32_t IndexInteger;
public:
LebesgueOrder(GridBase *_grid);
inline IndexInteger Reorder(IndexInteger ss) {
return UseLebesgueOrder ? _LebesgueReorder[ss] : ss;
return _LebesgueReorder[ss] ;
};
////////////////////////////
// Space filling fractal for cache oblivious
////////////////////////////
void ZGraph(void);
IndexInteger alignup(IndexInteger n);
LebesgueOrder(GridBase *grid);
/////////////////////////////////
// Cartesian stencil blocking strategy
/////////////////////////////////
static std::vector<int> Block;
void CartesianBlocking(void);
void IterateO(int ND,int dim,
std::vector<IndexInteger> & xo,
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims);
void IterateI(int ND,int dim,
std::vector<IndexInteger> & xo,
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims);
private:
std::vector<IndexInteger> _LebesgueReorder;

View File

@ -1,268 +1,6 @@
#include "Grid.h"
namespace Grid {
CartesianStencil::CartesianStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints)
{
_npoints = npoints;
_grid = grid;
_directions = directions;
_distances = distances;
_unified_buffer_size=0;
_request_count =0;
int osites = _grid->oSites();
for(int i=0;i<npoints;i++){
int point = i;
_entries[i].resize( osites);
int dimension = directions[i];
int displacement = distances[i];
int shift = displacement;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
_permute_type[point]=_grid->PermuteType(dimension);
_checkerboard = checkerboard;
// the permute type
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
int sshift[2];
// Underlying approach. For each local site build
// up a table containing the npoint "neighbours" and whether they
// live in lattice or a comms buffer.
if ( !comm_dim ) {
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Local(point,dimension,shift,0x3);
} else {
Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Local(point,dimension,shift,0x2);// both with block stride loop iteration
}
} else { // All permute extract done in comms phase prior to Stencil application
// So tables are the same whether comm_dim or splice_dim
sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Comms(point,dimension,shift,0x3);
} else {
Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Comms(point,dimension,shift,0x2);// both with block stride loop iteration
}
}
}
}
void CartesianStencil::Local (int point, int dimension,int shiftpm,int cbmask)
{
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int gd = _grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
int shift = (shiftpm+fd)%fd;
// the permute type
int permute_dim =_grid->PermuteDim(dimension);
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * _grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) ) {
wraparound = 1;
}
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
}
}
void CartesianStencil::Comms (int point,int dimension,int shiftpm,int cbmask)
{
GridBase *grid=_grid;
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
int pd = _grid->_processors[dimension];
int simd_layout = _grid->_simd_layout[dimension];
int comm_dim = _grid->_processors[dimension] >1 ;
// assert(simd_layout==1); // Why?
assert(comm_dim==1);
int shift = (shiftpm + fd) %fd;
assert(shift>=0);
assert(shift<fd);
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
_comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
// send to one or more remote nodes.
int cb= (cbmask==0x2)? Odd : Even;
int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = (((x+sshift)%fd) >= rd );
// int comm_proc = ((x+sshift)/ld)%pd;
// int offnode = (comm_proc!=0);
int sx = (x+sshift)%rd;
int wraparound=0;
if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
wraparound = 1;
}
if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
wraparound = 1;
}
if (!offnode) {
int permute_slice=0;
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
// GatherPlaneSimple (point,dimension,sx,cbmask);
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
int unified_buffer_offset = _unified_buffer_size;
_unified_buffer_size += words;
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int ro = rplane*_grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b;
_entries[point][lo+o+b]._is_local=1;
_entries[point][lo+o+b]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
// Routine builds up integer table for each site in _offsets, _is_local, _permute
void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
{
int rd = _grid->_rdimensions[dimension];
if ( !_grid->CheckerBoarded(dimension) ) {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute=0;
_entries[point][so+o+b]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
} else {
int so = plane*_grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++);
_entries[point][so+o+b]._is_local=0;
_entries[point][so+o+b]._permute =0;
_entries[point][so+o+b]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];
}
}
}
}

View File

@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
template<class vobj> inline
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
{
typedef typename vobj::scalar_type scalar_type ;
typedef typename vobj::vector_type vector_type ;
const int words=sizeof(vobj)/sizeof(vector_type);
const int Nsimd=vobj::vector_type::Nsimd();
int Nextr=extracted.size();
int s = Nsimd/Nextr;
scalar_type * vp = (scalar_type *)&vec;
std::vector<scalar_type *> pointers(Nsimd);
for(int i=0;i<Nextr;i++) {
pointers[i] =(scalar_type *)& extracted[i][offset];
}
vector_type *vp = (vector_type *)&vec;
for(int w=0;w<words;w++){
extract<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
scalar_type * pointer = (scalar_type *)& extracted[i][offset];
pointer[w] = vp[i*s+w*Nsimd];
}
}
}
@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
const int words=sizeof(vobj)/sizeof(vector_type);
int Nextr=extracted.size();
int s=Nsimd/Nextr;
std::vector<scalar_type *> pointers(Nextr);
for(int i=0;i<Nextr;i++)
pointers[i] =(scalar_type *)& extracted[i][offset];
vector_type *vp = (vector_type *)&vec;
scalar_type *pointer;
scalar_type *vp = (scalar_type *)&vec;
for(int w=0;w<words;w++){
merge<vector_type,scalar_type>(&vp[w],pointers,w);
for(int i=0;i<Nextr;i++){
for(int ii=0;ii<s;ii++){
pointer=(scalar_type *)&extracted[i][offset];
vp[w*Nsimd+i*s+ii] = pointer[w];
}
}
}
}
}
}
#endif

View File

@ -86,16 +86,16 @@ Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
Test_dwf_hdcr_LDADD=-lGrid
#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
#Test_dwf_lanczos_LDADD=-lGrid
Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
Test_dwf_lanczos_LDADD=-lGrid
Test_gamma_SOURCES=Test_gamma.cc
Test_gamma_LDADD=-lGrid
#Test_gparity_SOURCES=Test_gparity.cc
#Test_gparity_LDADD=-lGrid
Test_gparity_SOURCES=Test_gparity.cc
Test_gparity_LDADD=-lGrid
#Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
@ -190,6 +190,10 @@ Test_stencil_SOURCES=Test_stencil.cc
Test_stencil_LDADD=-lGrid
Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
Test_synthetic_lanczos_LDADD=-lGrid
Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
Test_wilson_cg_prec_LDADD=-lGrid

View File

@ -54,27 +54,27 @@ int main (int argc, char ** argv)
TComplex cm;
for(int dir=0;dir<Nd;dir++){
if ( dir!=1 ) continue;
// if ( dir!=1 ) continue;
for(int shift=0;shift<latt_size[dir];shift++){
std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
// std::cout<<GridLogMessage<<"Even grid"<<std::endl;
std::cout<<GridLogMessage<<"Even grid"<<std::endl;
ShiftUe = Cshift(Ue,dir,shift); // Shift everything cb by cb
// std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
// std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
ShiftUo = Cshift(Uo,dir,shift);
// std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
// std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
setCheckerboard(rbShiftU,ShiftUe);
setCheckerboard(rbShiftU,ShiftUo);
// std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
// std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
ShiftU = Cshift(U,dir,shift); // Shift everything
// std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
std::vector<int> coor(4);
@ -105,18 +105,18 @@ int main (int argc, char ** argv)
Fine.CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exit(-1);
}
}}}}
int exx=0;
std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
@ -144,20 +144,21 @@ int main (int argc, char ** argv)
Fine.CoorFromIndex(peer,index,latt_size);
if (nrm > 0){
std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
index=real(scm);
Fine.CoorFromIndex(peer,index,latt_size);
std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exit(-1);
} else if (0) {
std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
exx=1;
} else if (1) {
std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
}
}}}}
if (exx) exit(-1);
}
}

View File

@ -35,21 +35,26 @@ int main (int argc, char ** argv)
MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
const int Nk = 10;
const int Np = 1;
RealD enorm = 1.0;
RealD vthrs = 1;
const int Nit= 1000;
const int Nk = 30;
const int Np = 10;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-8;
ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
Nk,Np,enorm,vthrs,Nit);
std::vector<double> Coeffs(1,1.0);
Polynomial<LatticeFermion> PolyX(Coeffs);
ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,Nk,Nm,resid,MaxIt);
std::vector<RealD> eval(Nk);
std::vector<LatticeFermion> evec(Nk,FGrid);
std::vector<RealD> eval(Nm);
std::vector<LatticeFermion> evec(Nm,FGrid);
for(int i=0;i<Nm;i++){
std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
};
int Nconv;
IRL.calc(eval,evec,
src,
Nsbt,
Nconv);

View File

@ -298,7 +298,7 @@ int main (int argc, char ** argv)
c = scm()(1,1)(1,2);
scm()(1,1)(2,1) = c;
pokeIndex<ColourIndex> (c_m,c,0,0);
// pokeIndex<ColourIndex> (c_m,c,0,0);
}
FooBar = Bar;

View File

@ -1,36 +1,35 @@
#include <Grid.h>
namespace Grid {
class myclass {
public:
GRID_DECL_CLASS_MEMBERS(myclass,
int, x,
double, y,
bool , b,
std::string, name,
std::vector<double>, array,
std::vector<std::vector<double> >, twodimarray,
);
myclass(){}
myclass(int i) : array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456)) {
x=i;
y=2*i;
b=true;
name="bother said pooh";
}
};
class myclass: Serializable {
public:
GRID_DECL_CLASS_MEMBERS(myclass,
int, x,
double, y,
bool , b,
std::string, name,
std::vector<double>, array,
std::vector<std::vector<double>>, twodimarray,
);
myclass() {}
myclass(int i)
: array(4,5.1), twodimarray(3,std::vector<double>(2,1.23456))
{
x=i;
y=2*i;
b=true;
name="bother said pooh";
}
};
}
int16_t i16 = 1;
int16_t i16 = 1;
uint16_t u16 = 2;
int32_t i32 = 3;
int32_t i32 = 3;
uint32_t u32 = 4;
int64_t i64 = 5;
int64_t i64 = 5;
uint64_t u64 = 6;
float f = M_PI;
double d = 2*M_PI;
@ -41,8 +40,9 @@ using namespace Grid;
int main(int argc,char **argv)
{
{
XMLWriter WR("bother.xml");
XmlWriter WR("bother.xml");
// test basic type writing
push(WR,"BasicTypes");
write(WR,std::string("i16"),i16);
write(WR,"u16",u16);
@ -54,40 +54,54 @@ int main(int argc,char **argv)
write(WR,"d",d);
write(WR,"b",b);
pop(WR);
// test serializable class writing
myclass obj(1234); // non-trivial constructor
write(WR,"obj",obj);
WR.write("obj2", obj);
std::vector<myclass> vec;
vec.push_back(myclass(1234));
vec.push_back(myclass(5678));
vec.push_back(myclass(3838));
write(WR, "objvec", vec);
};
XMLReader RD("bother.xml");
myclass copy1;
myclass copy2;
myclass copy3;
read(RD,"obj",copy1);
std::cout << "Loaded " << copy1<<std::endl;
// read tests
myclass copy1, copy2, copy3;
std::vector<myclass> veccopy1, veccopy2, veccopy3;
//// XML
{
XmlReader RD("bother.xml");
read(RD,"obj",copy1);
read(RD,"objvec", veccopy1);
std::cout << "Loaded (XML) -----------------" << std::endl;
std::cout << copy1 << std::endl << veccopy1 << std::endl;
}
//// binary
{
BinaryWriter BWR("bother.bin");
write(BWR,"discard",copy1 );
write(BWR,"discard",veccopy1 );
}
{
{
BinaryReader BRD("bother.bin");
read (BRD,"discard",copy2 );
std::cout<<copy2<<std::endl;
read (BRD,"discard",veccopy2 );
std::cout << "Loaded (bin) -----------------" << std::endl;
std::cout << copy2 << std::endl << veccopy2 << std::endl;
}
//// text
{
TextWriter TWR("bother.txt");
write(TWR,"discard",copy1 );
write(TWR,"discard",veccopy1 );
}
{
{
TextReader TRD("bother.txt");
read (TRD,"discard",copy3 );
std::cout<<copy3<<std::endl;
read (TRD,"discard",veccopy3 );
std::cout << "Loaded (txt) -----------------" << std::endl;
std::cout << copy3 << std::endl << veccopy3 << std::endl;
}
}

View File

@ -8,6 +8,10 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
// typedef LatticeColourMatrix Field;
typedef LatticeComplex Field;
typedef typename Field::vector_object vobj;
typedef typename vobj::scalar_object sobj;
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@ -18,23 +22,40 @@ int main (int argc, char ** argv)
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
GridParallelRNG fRNG(&Fine);
// fRNG.SeedRandomDevice();
std::vector<int> seeds({1,2,3,4});
fRNG.SeedFixedIntegers(seeds);
LatticeColourMatrix Foo(&Fine);
LatticeColourMatrix Bar(&Fine);
LatticeColourMatrix Check(&Fine);
LatticeColourMatrix Diff(&Fine);
Field Foo(&Fine);
Field Bar(&Fine);
Field Check(&Fine);
Field Diff(&Fine);
LatticeComplex lex(&Fine);
lex = zero;
random(fRNG,Foo);
gaussian(fRNG,Bar);
/*
Integer stride =1000;
{
double nrm;
LatticeComplex coor(&Fine);
for(int d=0;d<Nd;d++){
LatticeCoordinate(coor,d);
lex = lex + coor*stride;
stride=stride/10;
}
Foo=lex;
}
*/
for(int dir=0;dir<4;dir++){
for(int disp=0;disp<Fine._fdimensions[dir];disp++){
std::cout<<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
std::cout<< std::fixed <<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
// start to test the Cartesian npoint stencil infrastructure
int npoint=1;
std::vector<int> directions(npoint,dir);
@ -48,8 +69,8 @@ int main (int argc, char ** argv)
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
}
std::vector<vColourMatrix,alignedAllocator<vColourMatrix> > comm_buf(myStencil._unified_buffer_size);
SimpleCompressor<vColourMatrix> compress;
std::vector<vobj,alignedAllocator<vobj> > comm_buf(myStencil._unified_buffer_size);
SimpleCompressor<vobj> compress;
myStencil.HaloExchange(Foo,comm_buf,compress);
Bar = Cshift(Foo,dir,disp);
@ -75,9 +96,114 @@ int main (int argc, char ** argv)
Real nrm = norm2(Diff);
std::cout<<GridLogMessage<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
Real snrmC =0;
Real snrmB =0;
Real snrm =0;
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
RealD diff;
sobj check,bar;
peekSite(check,Check,coor);
peekSite(bar,Bar,coor);
sobj ddiff;
ddiff = check -bar;
diff =norm2(ddiff);
if ( diff > 0){
std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
<<") " <<check<<" vs "<<bar<<std::endl;
}
}}}}
}
}
std::cout<<GridLogMessage<<"Testing RedBlack\n ";
Field EFoo(&rbFine);
Field OFoo(&rbFine);
Field ECheck(&rbFine);
Field OCheck(&rbFine);
pickCheckerboard(Even,EFoo,Foo);
pickCheckerboard(Odd ,OFoo,Foo);
for(int dir=0;dir<4;dir++){
for(int disp=0;disp<rbFine._fdimensions[dir];disp++){
std::cout<<GridLogMessage << "Using stencil to shift rb dim "<<dir<< " by "<<disp<<std::endl;
// start to test the Cartesian npoint stencil infrastructure
int npoint=1;
std::vector<int> directions(npoint,dir);
std::vector<int> displacements(npoint,disp);
CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
std::vector<int> ocoor(4);
for(int o=0;o<Fine.oSites();o++){
Fine.oCoorFromOindex(ocoor,o);
ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
}
std::vector<vobj,alignedAllocator<vobj> > Ecomm_buf(EStencil._unified_buffer_size);
std::vector<vobj,alignedAllocator<vobj> > Ocomm_buf(OStencil._unified_buffer_size);
SimpleCompressor<vobj> compress;
EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
Bar = Cshift(Foo,dir,disp);
if ( disp & 0x1 ) {
ECheck.checkerboard = Even;
OCheck.checkerboard = Odd;
} else {
ECheck.checkerboard = Odd;
OCheck.checkerboard = Even;
}
// Implement a stencil code that should agree with that darn cshift!
for(int i=0;i<OCheck._grid->oSites();i++){
int permute_type;
StencilEntry *SE;
SE = EStencil.GetEntry(permute_type,0,i);
std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
if ( SE->_is_local && SE->_permute )
permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
else if (SE->_is_local)
OCheck._odata[i] = EFoo._odata[SE->_offset];
else
OCheck._odata[i] = Ecomm_buf[SE->_offset];
}
for(int i=0;i<ECheck._grid->oSites();i++){
int permute_type;
StencilEntry *SE;
SE = OStencil.GetEntry(permute_type,0,i);
std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
if ( SE->_is_local && SE->_permute )
permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);
else if (SE->_is_local)
ECheck._odata[i] = OFoo._odata[SE->_offset];
else
ECheck._odata[i] = Ocomm_buf[SE->_offset];
}
setCheckerboard(Check,ECheck);
setCheckerboard(Check,OCheck);
Real nrmC = norm2(Check);
Real nrmB = norm2(Bar);
Diff = Check-Bar;
Real nrm = norm2(Diff);
std::cout<<GridLogMessage<<"RB N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
@ -85,33 +211,22 @@ int main (int argc, char ** argv)
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
Complex diff;
ColourMatrix check,bar;
RealD diff;
sobj check,bar;
peekSite(check,Check,coor);
peekSite(bar,Bar,coor);
for(int r=0;r<3;r++){
for(int c=0;c<3;c++){
diff =check()()(r,c)-bar()()(r,c);
double nn=real(conjugate(diff)*diff);
if ( nn > 0){
printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
coor[0],coor[1],coor[2],coor[3],r,c,
nn,
real(check()()(r,c)),
imag(check()()(r,c)),
real(bar()()(r,c))
);
}
snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
snrm=snrm+nn;
}}
sobj ddiff;
ddiff = check -bar;
diff =norm2(ddiff);
if ( diff > 0){
std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3] <<") "
<<"shift "<<disp<<" dir "<< dir
<< " stencil impl " <<check<<" vs cshift impl "<<bar<<std::endl;
}
}}}}
std::cout<<GridLogMessage<<"scalar N2diff = "<<snrm<<" " <<snrmC<<" "<<snrmB<<std::endl;
}
}

View File

@ -0,0 +1,123 @@
#include <fenv.h>
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
static int
FEenableexcept (unsigned int excepts)
{
static fenv_t fenv;
unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
old_excepts; // previous masks
if ( fegetenv (&fenv) ) return -1;
old_excepts = fenv.__control & FE_ALL_EXCEPT;
// unmask
fenv.__control &= ~new_excepts;
fenv.__mxcsr &= ~(new_excepts << 7);
return ( fesetenv (&fenv) ? -1 : old_excepts );
}
template<class Field> class DumbOperator : public LinearOperatorBase<Field> {
public:
LatticeComplex scale;
DumbOperator(GridBase *grid) : scale(grid)
{
GridParallelRNG pRNG(grid);
std::vector<int> seeds({5,6,7,8});
pRNG.SeedFixedIntegers(seeds);
random(pRNG,scale);
scale = exp(-real(scale)*6.0);
std::cout << " True matrix \n"<< scale <<std::endl;
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {};
void OpDir (const Field &in, Field &out,int dir,int disp){};
void Op (const Field &in, Field &out){
out = scale * in;
}
void AdjOp (const Field &in, Field &out){
out = scale * in;
}
void HermOp(const Field &in, Field &out){
double n1, n2;
HermOpAndNorm(in,out,n1,n2);
}
void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
ComplexD dot;
out = scale * in;
dot= innerProduct(in,out);
n1=real(dot);
dot = innerProduct(out,out);
n2=real(dot);
}
};
int main (int argc, char ** argv)
{
FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT);
Grid_init(&argc,&argv);
GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridParallelRNG RNG(grid);
std::vector<int> seeds({1,2,3,4});
RNG.SeedFixedIntegers(seeds);
RealD alpha = 1.0;
RealD beta = 0.03;
RealD mu = 0.0;
int order = 11;
ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
std::ofstream file("pooh.dat");
Cheby.csv(file);
HermOpOperatorFunction<LatticeComplex> X;
DumbOperator<LatticeComplex> HermOp(grid);
const int Nk = 40;
const int Nm = 80;
const int Nit= 10000;
int Nconv;
RealD eresid = 1.0e-8;
ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit);
ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit);
LatticeComplex src(grid); gaussian(RNG,src);
{
std::vector<RealD> eval(Nm);
std::vector<LatticeComplex> evec(Nm,grid);
IRL.calc(eval,evec,src, Nconv);
}
{
std::vector<RealD> eval(Nm);
std::vector<LatticeComplex> evec(Nm,grid);
ChebyIRL.calc(eval,evec,src, Nconv);
}
Grid_finalize();
}

293
tests/Test_zmm.cc Normal file
View File

@ -0,0 +1,293 @@
#include <Grid.h>
#include <PerfCount.h>
#include <simd/Avx512Asm.h>
using namespace Grid;
using namespace Grid::QCD;
void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
void TimesIAvx512F(void *ptr1,void *ptr3);
void TimesIAvx512(void *ptr1,void *ptr3);
int main(int argc,char **argv)
{
Grid_init(&argc,&argv);
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
int threads = GridThread::GetThreads();
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
vColourMatrixD mat;
vHalfSpinColourVectorD vec;
vHalfSpinColourVectorD matvec;
vHalfSpinColourVectorD ref;
vComplexD err;
random(sRNG,mat);
random(sRNG,vec);
ref = mat*vec;
WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);
ref = ref - matvec;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<"Double SU3 x 2spin diff "<< Reduce(err)<<std::endl;
vColourMatrixF matF;
vHalfSpinColourVectorF vecF;
vHalfSpinColourVectorF matvecF;
vHalfSpinColourVectorF refF;
vComplexF errF;
random(sRNG,matF);
random(sRNG,vecF);
refF = matF*vecF;
WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
refF = refF-matvecF;
errF = TensorRemove(innerProduct(refF,refF));
std::cout <<"Single SU3 x 2spin diff "<< Reduce(errF)<<std::endl;
TimesIAvx512F((void *)&vecF,(void *)&matvecF);
refF = timesI(vecF)-matvecF;
errF = TensorRemove(innerProduct(refF,refF));
std::cout <<" timesI single diff "<< Reduce(errF)<<std::endl;
TimesIAvx512((void *)&vec,(void *)&matvec);
ref = timesI(vec)-matvec;
err = TensorRemove(innerProduct(ref,ref));
std::cout <<" timesI double diff "<< Reduce(err)<<std::endl;
LatticeFermion src (FGrid);
LatticeFermion srce(FrbGrid);
LatticeFermion resulto(FrbGrid); resulto=zero;
LatticeFermion resulta(FrbGrid); resulta=zero;
LatticeFermion diff(FrbGrid);
LatticeGaugeField Umu(UGrid);
#if 1
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
random(RNG5,src);
random(RNG4,Umu);
#else
int mmu=3;
std::vector<LatticeColourMatrix> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
if ( mu!=mmu ) U[mu] = zero;
if ( mu==mmu ) U[mu] = 1.0;
PokeIndex<LorentzIndex>(Umu,U[mu],mu);
}
#endif
pickCheckerboard(Even,srce,src);
RealD mass=0.1;
RealD M5 =1.8;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=50;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulto,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume/2;
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(resulto)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops*ncall/(t1-t0)<<std::endl;
QCD::WilsonFermion5DStatic::AsmOptDslash=1;
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0);
}
t1=usecond();
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
Dw.DhopOE(srce,resulta,0);
PerformanceCounter Counter(i);
Counter.Start();
Dw.DhopOE(srce,resulta,0);
Counter.Stop();
Counter.Report();
}
resulta = (-0.5) * resulta;
std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops*ncall/(t1-t0)<<std::endl;
diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
}
#undef VLOAD
#undef VSTORE
#undef VMUL
#undef VMADD
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#define VZERO(A) VZEROd(A)
#define VTIMESI(A,B,C) VTIMESId(A,B,C)
#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
#define VLOAD(OFF,PTR,DEST) VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREd(OFF,PTR,SRC)
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULd(Uri,Uir,Chi,UChi,Z)
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDd(Uri,Uir,Chi,UChi,Z)
#define ZEND1(A,B,C) ZEND1d(A,B,C)
#define ZEND2(A,B,C) ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define zz Z0
void TimesIAvx512(void *ptr1,void *ptr3)
{
__asm__ ("mov $0xAAAA, %%eax " : : :"%eax");
__asm__ ("kmov %%eax, %%k6 " : : :);
__asm__ ("knot %%k6, %%k7 " : : :);
MASK_REGS;
LOAD_CHI(ptr1);
__asm__ (
VZERO(zz)
VTIMESI(Chi_00,UChi_00,zz)
VTIMESI(Chi_01,UChi_01,zz)
VTIMESI(Chi_02,UChi_02,zz)
VTIMESI(Chi_10,UChi_10,zz)
VTIMESI(Chi_11,UChi_11,zz)
VTIMESI(Chi_12,UChi_12,zz)
);
SAVE_UCHI(ptr3);
}
void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
{
int return_address;
// prototype computed goto to eliminate ABI save restore on call/return in
// generated assembly.
static void * table[] = { &&save, &&mult };
MASK_REGS;
LOAD_CHI(ptr1);
return_address = 0;
goto mult;
save:
SAVE_UCHI(ptr3);
return;
mult:
MULT_2SPIN(ptr2);
goto *table[return_address];
}
#undef VLOAD
#undef VSTORE
#undef VMUL
#undef VMADD
#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef VZERO
#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#undef VTIMESMINUSI
#undef ZMULMEM2SP
#undef ZMADDMEM2SP
#define VZERO(A) VZEROf(A)
#define VMOV(A,B) VMOVf(A,B)
#define VADD(A,B,C) VADDf(A,B,C)
#define VSUB(A,B,C) VSUBf(A,B,C)
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
#define ZEND1(A,B,C) ZEND1f(A,B,C)
#define ZEND2(A,B,C) ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
void TimesIAvx512F(void *ptr1,void *ptr3)
{
MASK_REGS;
LOAD_CHI(ptr1);
__asm__ (
VZERO(zz)
VTIMESI(Chi_00,UChi_00,zz)
VTIMESI(Chi_01,UChi_01,zz)
VTIMESI(Chi_02,UChi_02,zz)
VTIMESI(Chi_10,UChi_10,zz)
VTIMESI(Chi_11,UChi_11,zz)
VTIMESI(Chi_12,UChi_12,zz)
);
SAVE_UCHI(ptr3);
}
void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
{
MASK_REGS;
LOAD_CHI(ptr1);
MULT_2SPIN(ptr2);
SAVE_UCHI(ptr3);
return;
}