
Comms and memory benchmarks added

Peter Boyle 2015-05-03 09:44:47 +01:00
parent 99a1ff423d
commit 193860dbc8
14 changed files with 300 additions and 59 deletions

View File

@@ -25,17 +25,19 @@ int main (int argc, char ** argv)
for(int lat=4;lat<=16;lat+=4){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
int ncomm=0;
for(int i=0;i<Nloop;i++){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<CartesianCommunicator::CommsRequest_t> requests;
ncomm=0;
@@ -68,11 +70,10 @@ int main (int argc, char ** argv)
}
}
Grid.SendToRecvFromComplete(requests);
Grid.Barrier();
}
}
double stop=usecond();
double xbytes = Nloop*bytes*2*ncomm;
@@ -96,18 +97,20 @@ int main (int argc, char ** argv)
for(int lat=4;lat<=16;lat+=4){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
double start=usecond();
int ncomm=0;
for(int i=0;i<Nloop;i++){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
ncomm=0;
for(int mu=0;mu<4;mu++){
@@ -131,7 +134,6 @@ int main (int argc, char ** argv)
}
comm_proc = mpi_layout[mu]-1;
{
std::vector<CartesianCommunicator::CommsRequest_t> requests;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
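A minimal, self-contained sketch (not part of the commit) of the bandwidth bookkeeping this benchmark uses: time Nloop iterations with a microsecond clock, count the bytes exchanged, and report bytes per microsecond, which equals MB/s. The helper my_usecond and the elided transfer step are stand-ins for Grid's usecond() and SendToRecvFrom/SendToRecvFromComplete; the 2*ncomm factor mirrors the xbytes formula above.

// Sketch only -- not the commit's code. my_usecond() stands in for usecond();
// the actual nonblocking sends/receives are elided.
#include <chrono>
#include <cstdio>

static double my_usecond() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  const int  Nloop          = 100;
  const long bytes_per_xfer = 1 << 20;   // pretend each exchange moves 1 MiB
  int ncomm = 0;

  double start = my_usecond();
  for (int i = 0; i < Nloop; i++) {
    ncomm = 8;                           // e.g. forward and backward in four directions
    // ... post nonblocking sends/receives here and wait for completion ...
  }
  double stop = my_usecond();

  double time   = stop - start;                                // microseconds
  double xbytes = double(Nloop) * bytes_per_xfer * 2 * ncomm;  // bytes sent plus received
  std::printf("%.1f MB/s\n", xbytes / time);                   // bytes/usec == MB/s
  return 0;
}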

View File

@@ -0,0 +1,150 @@
#include <Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::vector<int> simd_layout({1,2,2,2});
std::vector<int> mpi_layout ({1,1,1,1});
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
int Nloop=100;
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking AXPY bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
// z=a*x+y;
// inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
axpy(z,a,x,y);
}
double stop=usecond();
double time = stop-start;
double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
double a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x+y;
}
double stop=usecond();
double time = stop-start;
double bytes=3*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking COPY bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=1.0;
double start=usecond();
for(int i=0;i<Nloop;i++){
x=z;
}
double stop=usecond();
double time = stop-start;
double bytes=2*lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
std::cout << "===================================================================================================="<<std::endl;
std::cout << "= Benchmarking READ bandwidth"<<std::endl;
std::cout << "===================================================================================================="<<std::endl;
std::cout << " L "<<"\t\t"<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s"<<std::endl;
for(int lat=4;lat<=32;lat+=4){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedRandomDevice();
LatticeVec z(&Grid); random(pRNG,z);
LatticeVec x(&Grid); random(pRNG,x);
LatticeVec y(&Grid); random(pRNG,y);
RealD a=1.0;
ComplexD nn;
double start=usecond();
for(int i=0;i<Nloop;i++){
nn=norm2(x);
}
double stop=usecond();
double time = stop-start;
double bytes=lat*lat*lat*lat*Nvec*sizeof(Real)*Nloop;
std::cout << lat<<"\t\t"<<bytes<<"\t\t"<<bytes/time<<std::endl;
}
Grid_finalize();
}
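The new benchmark file above converts each timed loop to MB/s as bytes divided by elapsed microseconds, with a per-kernel traffic factor (3x for axpy and a*x+y, 2x for copy, 1x for norm2). For comparison only, not part of the commit, the same AXPY measurement over plain std::vector<double> looks like this:

// Sketch only -- a plain-C++ analogue of the AXPY bandwidth loop above.
#include <chrono>
#include <cstdio>
#include <vector>

int main() {
  const int    Nloop = 100;
  const size_t N     = 1 << 22;                      // doubles per vector
  std::vector<double> x(N, 1.0), y(N, 2.0), z(N, 0.0);
  const double a = 1.0;

  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < Nloop; i++)
    for (size_t s = 0; s < N; s++)
      z[s] = a * x[s] + y[s];
  auto t1 = std::chrono::steady_clock::now();

  double usec  = std::chrono::duration<double, std::micro>(t1 - t0).count();
  double bytes = 3.0 * N * sizeof(double) * Nloop;   // read x, read y, write z
  std::printf("%zu doubles per vector: %.1f MB/s\n", N, bytes / usec);
  return 0;
}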

View File

@@ -14,6 +14,7 @@
#include <complex>
#include <vector>
#include <valarray>
#include <iostream>
#include <cassert>
#include <random>

View File

@@ -26,7 +26,8 @@ class Lattice
public:
GridBase *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
//std::vector<vobj,alignedAllocator<vobj> > _odata;
std::valarray<vobj> _odata;
public:
typedef typename vobj::scalar_type scalar_type;
@@ -36,9 +37,9 @@ public:
// Constructor requires "grid" passed.
// what about a default grid?
//////////////////////////////////////////////////////////////////
Lattice(GridBase *grid) : _grid(grid) {
Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
// _odata.reserve(_grid->oSites());
_odata.resize(_grid->oSites());
// _odata.resize(_grid->oSites());
assert((((uint64_t)&_odata[0])&0xF) ==0);
checkerboard=0;
}
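A note on the hunk above: _odata is now a std::valarray sized in the constructor's initializer list, and the assert checks that its first element sits on a 16-byte boundary (low four address bits zero). std::valarray makes no promise about over-alignment beyond what the global allocator provides, so the assert relies on that allocator returning 16-byte-aligned storage, which common 64-bit platforms do. A standalone reproduction of the check, with a hypothetical ToyVobj in place of the real vector object:

// Sketch only -- reproduces the 16-byte alignment check on a std::valarray.
#include <cassert>
#include <cstdint>
#include <valarray>

struct ToyVobj { double v[4]; };                 // stand-in for a SIMD vector object

int main() {
  std::valarray<ToyVobj> odata(1024);            // sized up front, like _odata(_grid->oSites())
  assert((reinterpret_cast<std::uintptr_t>(&odata[0]) & 0xF) == 0);  // 16-byte aligned?
  return 0;
}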

View File

@@ -93,7 +93,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
MPI_Request rrq;
int rank = _processor;
int ierr;
ierr=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);

View File

@@ -3,6 +3,9 @@
namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////////
// unary negation
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> operator -(const Lattice<vobj> &r)
{
@@ -13,25 +16,10 @@ namespace Grid {
}
return ret;
}
template<class vobj>
inline void axpy(Lattice<vobj> &ret,double a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
template<class vobj>
inline void axpy(Lattice<vobj> &ret,std::complex<double> a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
axpy(&ret._odata[ss],a,&lhs._odata[ss],&rhs._odata[ss]);
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
@@ -69,7 +57,89 @@ namespace Grid {
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs._odata[ss],&rhs);
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3>
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
uint32_t vec_len = lhs._grid->oSites();
#pragma omp parallel for
for(int ss=0;ss<vec_len;ss++){
mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
template<class obj1,class obj2,class obj3>
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
conformable(lhs,rhs);
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
}
}
/////////////////////////////////////////////////////////////////////////////////////
// Lattice BinOp Lattice,
/////////////////////////////////////////////////////////////////////////////////////
template<class left,class right>
inline auto operator * (const Lattice<left> &lhs,const Lattice<right> &rhs)-> Lattice<decltype(lhs._odata[0]*rhs._odata[0])>
{
@@ -156,5 +226,17 @@ namespace Grid {
}
return ret;
}
template<class sobj,class vobj>
inline void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &lhs,const Lattice<vobj> &rhs){
conformable(lhs,rhs);
vobj tmp;
#pragma omp parallel for
for(int ss=0;ss<lhs._grid->oSites();ss++){
tmp = a*lhs._odata[ss];
ret._odata[ss]= tmp+rhs._odata[ss];
}
}
}
#endif
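The templated axpy added at the end of this file forms a*lhs into a temporary and then adds rhs, site by site, under an OpenMP parallel for. Below is a reduced illustration of the same pattern over raw doubles (not the commit's code; axpy_sketch is a made-up name). Declaring the temporary inside the loop body keeps it private to each thread.

// Sketch only -- the per-site axpy pattern over raw doubles.
#include <cstdio>
#include <vector>

void axpy_sketch(std::vector<double> &ret, double a,
                 const std::vector<double> &lhs, const std::vector<double> &rhs) {
  #pragma omp parallel for
  for (long ss = 0; ss < (long)lhs.size(); ss++) {
    double tmp = a * lhs[ss];      // temporary declared in the loop: thread-private
    ret[ss]    = tmp + rhs[ss];
  }
}

int main() {
  std::vector<double> x(16, 2.0), y(16, 1.0), z(16, 0.0);
  axpy_sketch(z, 3.0, x, y);
  std::printf("z[0] = %g\n", z[0]);  // prints 7
  return 0;
}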

View File

@@ -7,6 +7,7 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// MAC ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////
///////////////////////////
// Legal multiplication table
@@ -74,8 +75,6 @@ inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __
}
return;
}
}
#endif

View File

@@ -7,7 +7,6 @@ namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// MUL ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class rtype,class vtype,class mtype>
inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){

View File

@@ -16,7 +16,7 @@ namespace Grid {
// However note that doing this eliminates some syntactical sugar such as
// calling the constructor explicitly or implicitly
//
#define TENSOR_IS_POD
#undef TENSOR_IS_POD
template<class vtype> class iScalar
{
@@ -36,7 +36,7 @@ public:
// template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
#ifndef TENSOR_IS_POD
iScalar(){;};
iScalar()=default;
iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
iScalar(const Zero &z){ *this = zero; };
#endif
@@ -126,7 +126,7 @@ public:
#ifndef TENSOR_IS_POD
iVector(const Zero &z){ *this = zero; };
iVector() {};// Empty constructure
iVector() =default;
#endif
iVector<vtype,N> & operator= (const Zero &hero){
@@ -189,7 +189,7 @@ public:
#ifndef TENSOR_IS_POD
iMatrix(const Zero &z){ *this = zero; };
iMatrix() {};
iMatrix() =default;
#endif
iMatrix<vtype,N> & operator= (const Zero &hero){
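The change from empty user-provided constructors (iScalar(){;}, iVector(){}, iMatrix(){}) to =default in the #ifndef TENSOR_IS_POD branch matters for the type traits probed in the test file further down: a user-provided default constructor, even an empty one, makes a type non-trivially-default-constructible and hence non-POD, while a defaulted one preserves triviality when the members allow it. A self-contained check, using hypothetical wrapper names:

// Sketch only -- shows why "= default" keeps a wrapper trivially default
// constructible while an empty user-provided constructor does not.
#include <type_traits>

template <class T> struct WrapUserCtor { T v; WrapUserCtor() {} };           // user-provided
template <class T> struct WrapDefault  { T v; WrapDefault() = default; };    // defaulted

static_assert(!std::is_trivially_default_constructible<WrapUserCtor<double>>::value,
              "empty user-provided constructor removes triviality");
static_assert( std::is_trivially_default_constructible<WrapDefault<double>>::value,
              "defaulted constructor preserves triviality");

int main() { return 0; }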

View File

@@ -13,7 +13,7 @@ namespace Grid {
vzero(*this);
return (*this);
}
vComplexD(){};
vComplexD()=default;
vComplexD(ComplexD a){
vsplat(*this,a);
};

View File

@@ -28,7 +28,7 @@ namespace Grid {
vzero(*this);
return (*this);
}
vComplexF(){};
vComplexF()=default;
vComplexF(ComplexF a){
vsplat(*this,a);
};

View File

@@ -10,10 +10,13 @@ namespace Grid {
typedef dvec vector_type;
typedef RealD scalar_type;
vRealD(){};
vRealD()=default;
vRealD(RealD a){
vsplat(*this,a);
};
vRealD(Zero &zero){
zeroit(*this);
}
friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}

View File

@@ -8,14 +8,16 @@ namespace Grid {
fvec v;
public:
typedef fvec vector_type;
typedef RealF scalar_type;
vRealF(){};
vRealF()=default;
vRealF(RealF a){
vsplat(*this,a);
};
vRealF(Zero &zero){
zeroit(*this);
}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
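The vRealD and vRealF hunks add a constructor taking Grid's Zero tag, so a zeroed SIMD vector can be constructed directly from the global zero object. The same tag-constructor pattern, reduced to a toy four-lane type (ToyVec4 and the local Zero here are illustrative, not Grid's types):

// Sketch only -- the "Zero tag" constructor pattern on a toy vector type.
#include <cstdio>

struct Zero {};            // empty tag type, standing in for Grid's Zero
static Zero zero;          // a tag instance, as Grid exposes "zero"

struct ToyVec4 {
  double lane[4];
  ToyVec4() = default;                                     // trivial default ctor
  ToyVec4(Zero &) { for (double &l : lane) l = 0.0; }      // zero-initialising tag ctor
};

int main() {
  ToyVec4 v(zero);
  std::printf("%g %g %g %g\n", v.lane[0], v.lane[1], v.lane[2], v.lane[3]);
  return 0;
}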

View File

@@ -5,11 +5,10 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;
template<class d>
struct scal {
d internal;
};
//template<class vobj> class is_pod< iScalar<vobj> >
//{
//
//};
int main (int argc, char ** argv)
{
@@ -40,13 +39,16 @@ int main (int argc, char ** argv)
std::cout << " Is pod " << std::is_pod<SpinVector>::value << std::endl;
std::cout << " Is pod double " << std::is_pod<double>::value << std::endl;
std::cout << " Is pod ComplexF " << std::is_pod<ComplexF>::value << std::endl;
std::cout << " Is pod scal<double> " << std::is_pod<scal<double> >::value << std::endl;
std::cout << " Is triv double " << std::is_trivially_default_constructible<double>::value << std::endl;
std::cout << " Is triv ComplexF " << std::is_trivially_default_constructible<ComplexF>::value << std::endl;
std::cout << " Is pod Scalar<double> " << std::is_pod<iScalar<double> >::value << std::endl;
std::cout << " Is pod Scalar<ComplexF> " << std::is_pod<iScalar<ComplexF> >::value << std::endl;
std::cout << " Is pod Scalar<vComplexF> " << std::is_pod<iScalar<vComplexF> >::value << std::endl;
std::cout << " Is pod Scalar<vComplexD> " << std::is_pod<iScalar<vComplexD> >::value << std::endl;
std::cout << " Is pod Scalar<vRealF> " << std::is_pod<iScalar<vRealF> >::value << std::endl;
std::cout << " Is pod Scalar<vRealD> " << std::is_pod<iScalar<vRealD> >::value << std::endl;
std::cout << " Is triv Scalar<double> " <<std::is_trivially_default_constructible<iScalar<double> >::value << std::endl;
std::cout << " Is triv Scalar<vComplexD> "<<std::is_trivially_default_constructible<iScalar<vComplexD> >::value << std::endl;
for(int a=0;a<Ns;a++){
ident()(a,a) = 1.0;