Mirror of https://github.com/paboyle/Grid.git

Merge pull request #273 from lehner/feature/gpt

Feature/gpt
Merged by Peter Boyle on 2020-05-06 10:10:51 -04:00, commit 525418abfb
27 changed files with 1600 additions and 345 deletions

View File

@@ -37,211 +37,6 @@ Author: Christoph Lehner <clehner@bnl.gov>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
#if 0
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
thread_region
{
vobj* B = Bt.data() + Nm * thread_num();
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
// printf("BasisRotate %d %d nrot %d siteBlock %d\n",j0,j1,nrot,siteBlock);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of Eigen matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
for(int k=0;k<Nm;++k){
for(int j=0;j<Nm;++j){
Qt_p[j*Nm+k]=Qt(j,k);
}
}
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
auto z=coalescedRead(Bp[ss]);
z=Zero();
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(basis_v[k0][ss]);
B=Zero();
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////

View File

@@ -114,6 +114,7 @@ public:
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
+void GlobalSumVector(uint64_t*,int N);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);

View File

@@ -255,6 +255,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
+void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
+int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
+assert(ierr==0);
+}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
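
The vector overload lets callers fold many scalar reductions into a single in-place MPI_Allreduce. A minimal usage sketch, assuming a communicator-derived grid pointer and an illustrative counter array (neither is part of the patch):

// Sum per-rank 64-bit counters across all ranks in one MPI_SUM reduction.
std::vector<uint64_t> counters(16, 0);
// ... each rank accumulates into its local counters ...
grid->GlobalSumVector(counters.data(), (int)counters.size());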

View File

@@ -70,9 +70,10 @@ CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
+void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::GlobalSum(uint32_t &){}
void CartesianCommunicator::GlobalSum(uint64_t &){}
-void CartesianCommunicator::GlobalSumVector(double *,int N){}
+void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){}

View File

@@ -31,11 +31,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
-#include <Grid/lattice/Lattice_transpose.h>
+//#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
+//#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
@@ -43,4 +43,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
+#include <Grid/lattice/Lattice_basis.h>

View File

@@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
+Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -94,7 +95,7 @@ const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
-auto view = arg.View();
+auto view = arg.AcceleratorView(ViewRead);
return view[ss];
}

View File

@@ -7,6 +7,7 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -36,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
-auto rhs_v = rhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
+auto rhs_v = rhs.AcceleratorView(ViewRead);
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -55,9 +56,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
-auto rhs_v = rhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
+auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -72,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
-auto rhs_v = rhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
+auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -88,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
-auto rhs_v = rhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
+auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -107,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
@@ -120,8 +121,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -134,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -147,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
-auto ret_v = ret.View();
-auto lhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -164,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-auto ret_v = ret.View();
-auto rhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -178,8 +179,8 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-auto ret_v = ret.View();
-auto rhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -192,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-auto ret_v = ret.View();
-auto rhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -205,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-auto ret_v = ret.View();
-auto rhs_v = lhs.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -220,9 +221,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
-auto ret_v = ret.View();
-auto x_v = x.View();
-auto y_v = y.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto x_v = x.AcceleratorView(ViewRead);
+auto y_v = y.AcceleratorView(ViewRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
@@ -233,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
-auto ret_v = ret.View();
-auto x_v = x.View();
-auto y_v = y.View();
+auto ret_v = ret.AcceleratorView(ViewWrite);
+auto x_v = x.AcceleratorView(ViewRead);
+auto y_v = y.AcceleratorView(ViewRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);

View File

@@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -49,6 +50,26 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
assert(lhs == rhs);
}
+////////////////////////////////////////////////////////////////////////////
+// Advise the LatticeAccelerator class
+////////////////////////////////////////////////////////////////////////////
+enum LatticeAcceleratorAdvise {
+AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
+                           // significantly influence performance of bulk storage.
+AdviseReadMostly = 0x2,    // Data will mostly be read. On some architectures
+                           // enables read-only copies of memory to be kept on
+                           // host and device.
+};
+////////////////////////////////////////////////////////////////////////////
+// View Access Mode
+////////////////////////////////////////////////////////////////////////////
+enum ViewMode {
+ViewRead = 0x1,
+ViewWrite = 0x2,
+ViewReadWrite = 0x3
+};
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
@@ -75,6 +96,37 @@ public:
if (grid) conformable(grid, _grid);
else grid = _grid;
};
+accelerator_inline void Advise(int advise) {
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+if (advise & AdviseInfrequentUse) {
+cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
+}
+if (advise & AdviseReadMostly) {
+cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
+}
+#endif
+#endif
+};
+accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+int target;
+cudaGetDevice(&target);
+cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
+#endif
+#endif
+};
+accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
+#ifdef GRID_NVCC
+#ifndef __CUDA_ARCH__ // only on host
+cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
+#endif
+#endif
+};
};
/////////////////////////////////////////////////////////////////////////////////////////
@@ -206,9 +258,23 @@ public:
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
-LatticeView<vobj> View (void) const
-{
-LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
-return accessor;
-}
+LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
+{ // and HostView for thread_for
+LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+return accessor;
+}
+LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const
+{
+LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+accessor.AcceleratorPrefetch(mode);
+return accessor;
+}
+LatticeView<vobj> HostView(int mode = ViewReadWrite) const
+{
+LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
+accessor.HostPrefetch(mode);
+return accessor;
+}
@@ -232,7 +298,7 @@ public:
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-auto me = View();
+auto me = AcceleratorView(ViewWrite);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
@@ -251,7 +317,7 @@
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-auto me = View();
+auto me = AcceleratorView(ViewWrite);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
@@ -269,7 +335,7 @@
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-auto me = View();
+auto me = AcceleratorView(ViewWrite);
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
@@ -357,7 +423,6 @@
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
-// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
@@ -380,8 +445,8 @@
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
-auto me = View();
-auto him= r.View();
+auto me = AcceleratorView(ViewWrite);
+auto him= r.AcceleratorView(ViewRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@@ -394,8 +459,8 @@
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
-auto me = View();
-auto him= r.View();
+auto me = AcceleratorView(ViewWrite);
+auto him= r.AcceleratorView(ViewRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
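
The new accessors are meant to be taken immediately before a loop, so the prefetch matches where the data will be touched. A minimal sketch of the intended pattern, assuming an existing GridBase *grid and SIMD tensor type vobj (both illustrative, not part of the patch):

Lattice<vobj> z(grid), x(grid);
x.Advise(AdviseReadMostly);                 // hint: x will mostly be read
auto z_v = z.AcceleratorView(ViewWrite);    // prefetch to device before a device loop
auto x_v = x.AcceleratorView(ViewRead);
accelerator_for(ss, x_v.size(), vobj::Nsimd(), {
  coalescedWrite(z_v[ss], x_v(ss));         // device-side copy through the views
});

A host-side thread_for loop would instead take HostView, keeping the data resident on the CPU.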

View File

@@ -0,0 +1,236 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_basis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
// If assume basis[j] are already orthonormal,
// can take all inner products in parallel saving 2x bandwidth
// Save 3x bandwidth on the second line of loop.
// perhaps 2.5x speed up.
// 2x overall in Multigrid Lanczos
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
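
The loop is a classical Gram-Schmidt pass against the first k vectors; in formula form,

\[
w \;\leftarrow\; w \;-\; \sum_{j<k} \langle \mathrm{basis}_j, w\rangle\, \mathrm{basis}_j ,
\]

and the comment above notes that, when the basis[j] are already orthonormal, the k inner products are independent and could be fused into one bandwidth-saving pass.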
template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
Vector<View> basis_v(basis.size(),tmp_v);
typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
}
#ifndef GRID_NVCC
thread_region
{
std::vector < vobj > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
return;
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
Vector<double> Qt_jv(Nm*Nm);
double *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
});
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
// remaining work in this block
int ssites=MIN(siteBlock,oSites-s);
// zero out the accumulators
accelerator_for(ss,siteBlock*nrot,vobj::Nsimd(),{
decltype(coalescedRead(Bp[ss])) z;
z=Zero();
coalescedWrite(Bp[ss],z);
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
accelerator_for(sj,ssites*nrot,vobj::Nsimd(),{
int j =sj%nrot;
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
}
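
Both the thread branch and the CUDA branch implement the same site-local rotation: for each site x,

\[
\mathrm{basis}_j(x) \;\leftarrow\; \sum_{k=k_0}^{k_1-1} Q_{jk}\,\mathrm{basis}_k(x),
\qquad j_0 \le j < j_1 ,
\]

with the GPU branch tiling the site loop (siteBlock sites at a time) so the accumulator Bt never holds more than siteBlock*nrot objects.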
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef decltype(basis[0].AcceleratorView()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.AcceleratorView(ViewWrite);
Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].AcceleratorView(ViewRead);
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
NAMESPACE_END(Grid);
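
Taken together, the header supports the usual Lanczos post-processing: rotate the Krylov basis by the tridiagonal eigenvector matrix, then sort by eigenvalue. A sketch of the intended call order, with illustrative sizes, field type and grid pointer (none of which are part of the patch):

const int Nm = 32;                                  // illustrative basis size
std::vector<LatticeComplex> basis(Nm, LatticeComplex(grid));
std::vector<RealD> evals(Nm);
Eigen::MatrixXd Qt = Eigen::MatrixXd::Identity(Nm, Nm);
// ... Lanczos fills basis, evals and Qt ...
basisRotate(basis, Qt, 0, Nm, 0, Nm, Nm);           // basis <- Qt * basis, all Nm vectors
basisSortInPlace(basis, evals, false);              // ascending in |eval|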

View File

@@ -5,6 +5,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@@ -93,7 +94,7 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
// Double inner product
template<class vobj>
-inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
+inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
@@ -102,8 +103,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
GridBase *grid = left.Grid();
// Might make all code paths go this way.
-auto left_v = left.View();
-auto right_v=right.View();
+auto left_v = left.AcceleratorView(ViewRead);
+auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
@@ -137,11 +138,18 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
-grid->GlobalSum(nrm);
return nrm;
}
+template<class vobj>
+inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
+GridBase *grid = left.Grid();
+ComplexD nrm = rankInnerProduct(left,right);
+grid->GlobalSum(nrm);
+return nrm;
+}
/////////////////////////
// Fast axpby_norm
// z = a x + b y
@@ -167,9 +175,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid();
-auto x_v=x.View();
-auto y_v=y.View();
-auto z_v=z.View();
+auto x_v=x.AcceleratorView(ViewRead);
+auto y_v=y.AcceleratorView(ViewRead);
+auto z_v=z.AcceleratorView(ViewWrite);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
@@ -204,8 +212,64 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
grid->GlobalSum(nrm);
return nrm;
}
template<class vobj> strong_inline void
innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Lattice<vobj> &right)
{
conformable(left,right);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
auto left_v=left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
});
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
});
// Already promoted to double
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
#endif
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0];
nrm = real(tmp[1]);
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
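
The point of innerProductNorm is to return <left,right> and <left,left> from a single pass over the fields and a single two-element GlobalSumVector, halving both memory traffic and global reductions relative to separate innerProduct and norm2 calls. A usage sketch (field names illustrative):

ComplexD zr; RealD zz;
innerProductNorm(zr, zz, z, r);   // zr = <z,r>, zz = <z,z>, one fused reduction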

View File

@@ -37,6 +37,7 @@ NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{
@@ -48,6 +49,7 @@ inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
});
return ret;
};
+*/
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation

View File

@@ -6,6 +6,7 @@
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -63,6 +64,7 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
}
});
}
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.Checkerboard();
auto half_v = half.View();
@@ -81,25 +83,130 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
}
});
}
-template<class vobj,class CComplex,int nbasis>
+////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types
////////////////////////////////////////////////////////////////////////////////////////////
accelerator_inline void convertType(ComplexD & out, const std::complex<double> & in) {
out = in;
}
accelerator_inline void convertType(ComplexF & out, const std::complex<float> & in) {
out = in;
}
#ifdef __CUDA_ARCH__
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
}
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
}
#endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
}
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal);
}
template<typename T1,typename T2>
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in) {
for (int i=0;i<N;i++)
for (int j=0;j<N;j++)
convertType(out._internal[i][j],in._internal[i][j]);
}
template<typename T1,typename T2,int N>
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in) {
for (int i=0;i<N;i++)
convertType(out._internal[i],in._internal[i]);
}
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
auto out_v = out.AcceleratorView(ViewWrite);
auto in_v = in.AcceleratorView(ViewRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss));
});
}
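
convertType recurses through iScalar/iVector/iMatrix down to the SIMD words, so one entry point covers scalars, whole tensors and whole lattices; the vComplexF/vComplexD2 overloads carry the actual precision change. A minimal sketch, with vobjF/vobjD standing in for any pair of tensor types related by those overloads (placeholder names, not part of the patch):

Lattice<vobjF> src(grid);   // single-precision field (illustrative type)
Lattice<vobjD> dst(grid);   // double-precision field of the same tensor structure
convertType(dst, src);      // site-wise promotion, run through accelerator views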
////////////////////////////////////////////////////////////////////////////////////////////
// precision-promoted local inner product
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
{
auto lhs_v = lhs.AcceleratorView(ViewRead);
auto rhs_v = rhs.AcceleratorView(ViewRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid());
auto ret_v = ret.AcceleratorView(ViewWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
});
return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////
// block routines
////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
-const std::vector<Lattice<vobj> > &Basis)
+const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
-Lattice<CComplex> ip(coarse);
+Lattice<iScalar<CComplex>> ip(coarse);
+Lattice<vobj> fineDataRed = fineData;
// auto fineData_ = fineData.View();
-auto coarseData_ = coarseData.View();
-auto ip_ = ip.View();
+auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
+auto ip_ = ip.AcceleratorView(ViewReadWrite);
for(int v=0;v<nbasis;v++) {
-blockInnerProduct(ip,Basis[v],fineData);
+blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
-coalescedWrite(coarseData_[sc](v),ip_(sc));
+convertType(coarseData_[sc](v),ip_[sc]);
});
+// improve numerical stability of projection
+// |fine> = |fine> - <basis|fine> |basis>
+ip=-ip;
+blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
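
The fineDataRed working copy turns the loop into a blocked modified Gram-Schmidt rather than a plain projection. For each coarse block X and basis vector \(\phi_v\),

\[
c_v(X)=\sum_{x\in X}\big\langle \phi_v(x),\psi(x)\big\rangle ,\qquad
\psi \;\leftarrow\; \psi - c_v(X)\,\phi_v \quad\text{on } X,
\]

so each subsequent coefficient is taken against a residual already orthogonal to the earlier basis vectors; this is what the "improve numerical stability" comment refers to.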
@@ -166,11 +273,11 @@ inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
return;
}
-template<class vobj,class CComplex>
+template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
const Lattice<CComplex> &coarseA,
-const Lattice<vobj> &fineX,
+const Lattice<vobj2> &fineX,
const Lattice<vobj> &fineY)
{
GridBase * fine = fineZ.Grid();
GridBase * coarse= coarseA.Grid();
@@ -182,7 +289,7 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
conformable(fineX,fineZ);
int _ndimension = coarse->_ndimension;
Coordinate block_r (_ndimension);
// FIXME merge with subdivide checking routine as this is redundant
@@ -191,29 +298,65 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
}
-auto fineZ_ = fineZ.View();
-auto fineX_ = fineX.View();
-auto fineY_ = fineY.View();
-auto coarseA_= coarseA.View();
+auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
+auto fineX_ = fineX.AcceleratorView(ViewRead);
+auto fineY_ = fineY.AcceleratorView(ViewRead);
+auto coarseA_= coarseA.AcceleratorView(ViewRead);
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
int sc;
Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y
-coalescedWrite(fineZ_[sf],coarseA_(sc)*fineX_(sf)+fineY_(sf));
+#ifdef __CUDA_ARCH__
+typename vobj2::tensor_reduced::scalar_object cA;
+typename vobj::scalar_object cAx;
+#else
+typename vobj2::tensor_reduced cA;
+vobj cAx;
+#endif
+convertType(cA,TensorRemove(coarseA_(sc)));
+auto prod = cA*fineX_(sf);
+convertType(cAx,prod);
+coalescedWrite(fineZ_[sf],cAx+fineY_(sf));
});
return;
}
template<class vobj,class CComplex>
inline void blockInnerProductD(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
{
typedef iScalar<decltype(TensorRemove(innerProductD2(vobj(),vobj())))> dotp;
GridBase *coarse(CoarseInner.Grid());
GridBase *fine (fineX.Grid());
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse);
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
// Precision promotion
fine_inner = localInnerProductD(fineX,fineY);
blockSum(coarse_inner,fine_inner);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
});
}
template<class vobj,class CComplex> // deprecate
inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
const Lattice<vobj> &fineX,
const Lattice<vobj> &fineY)
@@ -227,8 +370,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> coarse_inner(coarse);
// Precision promotion?
-auto CoarseInner_ = CoarseInner.View();
-auto coarse_inner_ = coarse_inner.View();
+auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
+auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
fine_inner = localInnerProduct(fineX,fineY);
blockSum(coarse_inner,fine_inner);
@@ -236,6 +379,7 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
CoarseInner_[ss] = coarse_inner_[ss];
});
}
template<class vobj,class CComplex>
inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
{
@@ -248,7 +392,7 @@ inline void blockNormalise(Lattice<CComplex> &ip,Lattice<vobj> &fineX)
// useful in multigrid project;
// Generic name : Coarsen?
template<class vobj>
inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
@@ -256,42 +400,41 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
subdivides(coarse,fine); // require they map
int _ndimension = coarse->_ndimension;
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
}
int blockVol = fine->oSites()/coarse->oSites();
-// Turn this around to loop threaded over sc and interior loop
-// over sf would thread better
-auto coarseData_ = coarseData.View();
-auto fineData_ = fineData.View();
+auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
+auto fineData_ = fineData.AcceleratorView(ViewRead);
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
coarseData_[sc]=Zero();
for(int sb=0;sb<blockVol;sb++){
int sf;
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
}
});
return;
}
template<class vobj>
inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vobj> &picked,Coordinate coor)
{
@@ -313,8 +456,8 @@ inline void blockPick(GridBase *coarse,const Lattice<vobj> &unpicked,Lattice<vob
}
}
-template<class vobj,class CComplex>
-inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis)
+template<class CComplex,class VLattice>
+inline void blockOrthonormalize(Lattice<CComplex> &ip,VLattice &Basis)
{
GridBase *coarse = ip.Grid();
GridBase *fine = Basis[0].Grid();
@@ -322,23 +465,30 @@ inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> >
int nbasis = Basis.size() ;
// checks
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i].Grid(),fine);
}
for(int v=0;v<nbasis;v++) {
for(int u=0;u<v;u++) {
//Inner product & remove component
-blockInnerProduct(ip,Basis[u],Basis[v]);
+blockInnerProductD(ip,Basis[u],Basis[v]);
ip = -ip;
-blockZAXPY<vobj,CComplex> (Basis[v],ip,Basis[u],Basis[v]);
+blockZAXPY(Basis[v],ip,Basis[u],Basis[v]);
}
blockNormalise(ip,Basis[v]);
}
}
+template<class vobj,class CComplex>
+inline void blockOrthogonalise(Lattice<CComplex> &ip,std::vector<Lattice<vobj> > &Basis) // deprecated inaccurate naming
+{
+blockOrthonormalize(ip,Basis);
+}
#if 0
+// TODO: CPU optimized version here
template<class vobj,class CComplex,int nbasis>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
@@ -383,24 +533,18 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#else
-template<class vobj,class CComplex,int nbasis>
+template<class vobj,class CComplex,int nbasis,class VLattice>
inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<vobj> &fineData,
-const std::vector<Lattice<vobj> > &Basis)
+const VLattice &Basis)
{
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
fineData=Zero();
for(int i=0;i<nbasis;i++) {
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-Lattice<CComplex> cip(coarse);
-auto cip_ = cip.View();
-auto ip_ = ip.View();
-accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
-coalescedWrite(cip_[sc], ip_(sc)());
-});
-blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+auto ip_ = ip.AcceleratorView(ViewRead);
+blockZAXPY(fineData,ip,Basis[i],fineData);
}
}
#endif
@@ -470,8 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate rdt = Tg->_rdimensions;
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
-auto t_v = To.View();
-auto f_v = From.View();
+auto t_v = To.AcceleratorView(ViewWrite);
+auto f_v = From.AcceleratorView(ViewRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);

View File

@@ -341,7 +341,7 @@ class BinaryIO {
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
    assert(ieee64||ieee32||ieee64big||ieee32big);
    assert((ieee64+ieee32+ieee64big+ieee32big)==1);
    //////////////////////////////////////////////////////////////////////////////

View File

@@ -301,6 +301,30 @@ struct GaugeSimpleUnmunger {
  };
};
template<class fobj,class sobj>
struct GaugeDoubleStoredMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeDoubleStoredUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nds; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
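// Editor's note (a reading of the code above, not authoritative): Nds is
// Grid's doubled direction count (2*Nd = 8), matching openQCD's storage of
// both the forward and backward link in every direction; the munger/unmunger
// pair simply copies all Nds colour matrices through unchanged, leaving byte
// ordering and checksums to IOobject.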
template<class fobj,class sobj>
struct Gauge3x2munger{
  void operator() (fobj &in,sobj &out){

View File

@@ -146,7 +146,7 @@ public:
    int ieee32big = (format == std::string("IEEE32BIG"));
    int ieee32    = (format == std::string("IEEE32"));
    int ieee64big = (format == std::string("IEEE64BIG"));
    int ieee64    = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
    uint32_t nersc_csum,scidac_csuma,scidac_csumb;
    // depending on datatype, set up munger;

Grid/parallelIO/OpenQcdIO.h Normal file
View File

@@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIO.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
struct OpenQcdHeader : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OpenQcdHeader,
int, Nt,
int, Nx,
int, Ny,
int, Nz,
double, plaq);
};
class OpenQcdIO : public BinaryIO {
public:
static constexpr double normalisationFactor = Nc; // normalisation difference: grid 18, openqcd 6
static inline int readHeader(std::string file, GridBase* grid, FieldMetaData& field) {
OpenQcdHeader header;
{
std::ifstream fin(file, std::ios::in | std::ios::binary);
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
assert(!fin.fail());
field.data_start = fin.tellg();
fin.close();
}
header.plaq /= normalisationFactor;
// sanity check (should trigger on endian issues)
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
std::cout << GridLogDebug << "header: " << header << std::endl;
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
assert(grid->_ndimension == Nd);
for(int d = 0; d < Nd; d++)
assert(grid->_fdimensions[d] == field.dimension[d]);
field.plaquette = header.plaq;
return field.data_start;
}
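  // Editor's note, one hedged reading of the "grid 18, openqcd 6" comment
  // above: both codes average Re tr over the 6 plaquette planes per site, but
  // openQCD leaves the colour trace unnormalised (value Nc = 3 on a unit gauge
  // field) while Grid's metadata carries an extra 1/Nc (value 1), so dividing
  // header.plaq by Nc maps the 6-plane normalisation onto Grid's 18 = 6 x Nc.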
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
assert(grid != nullptr); assert(grid->_ndimension == Nd);
uint64_t offset = readHeader(file, Umu.Grid(), header);
FieldMetaData clone(header);
std::string format("IEEE64"); // they always store little endian double precision
uint32_t nersc_csum, scidac_csuma, scidac_csumb;
GridCartesian* grid_openqcd = createOpenQcdGrid(grid);
GridRedBlackCartesian* grid_rb = SpaceTimeGrid::makeFourDimRedBlackGrid(grid);
typedef DoubleStoredColourMatrixD fobj;
typedef typename DoubleStoredGaugeField::vector_object::scalar_object sobj;
typedef typename DoubleStoredGaugeField::vector_object::Realified::scalar_type word;
word w = 0;
std::vector<fobj> iodata(grid_openqcd->lSites()); // Munge, checksum, byte order in here
std::vector<sobj> scalardata(grid->lSites());
IOobject(w, grid_openqcd, iodata, file, offset, format, BINARYIO_READ | BINARYIO_LEXICOGRAPHIC,
nersc_csum, scidac_csuma, scidac_csumb);
GridStopWatch timer;
timer.Start();
DoubleStoredGaugeField Umu_ds(grid);
auto munge = GaugeDoubleStoredMunger<DoubleStoredColourMatrixD, DoubleStoredColourMatrix>();
Coordinate ldim = grid->LocalDimensions();
thread_for(idx_g, grid->lSites(), {
Coordinate coor;
grid->LocalIndexToLocalCoor(idx_g, coor);
bool isOdd = grid_rb->CheckerBoard(coor) == Odd;
if(!isOdd) continue;
int idx_o = (coor[Tdir] * ldim[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Xdir] * ldim[Ydir] * ldim[Zdir]
+ coor[Ydir] * ldim[Zdir]
+ coor[Zdir])/2;
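      // Editor's note: the file stores only the odd sites, with T slowest and
      // Z fastest, so idx_o is the site's lexicographic T-X-Y-Z rank halved;
      // halving works because parity alternates along Z. For example, with
      // ldim = {4,4,4,4}, site (t,x,y,z) = (0,0,0,1) maps to idx_o = 0 and
      // (0,0,1,0) to idx_o = 2.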
munge(iodata[idx_o], scalardata[idx_g]);
});
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: munge overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
vectorizeFromLexOrdArray(scalardata, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: vectorize overhead " << timer.Elapsed() << std::endl;
timer.Reset(); timer.Start();
undoDoubleStore(Umu, Umu_ds);
grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
std::string file) {
std::cout << GridLogError << "Writing to openQCD file format is not implemented" << std::endl;
exit(EXIT_FAILURE);
}
private:
static inline GridCartesian* createOpenQcdGrid(GridCartesian* grid) {
// exploit GridCartesian to be able to still use IOobject
Coordinate gdim = grid->GlobalDimensions();
Coordinate ldim = grid->LocalDimensions();
Coordinate pcoor = grid->ThisProcessorCoor();
// openqcd does rb on the z direction
gdim[Zdir] /= 2;
ldim[Zdir] /= 2;
// and has the order T X Y Z (from slowest to fastest)
std::swap(gdim[Xdir], gdim[Zdir]);
std::swap(ldim[Xdir], ldim[Zdir]);
std::swap(pcoor[Xdir], pcoor[Zdir]);
GridCartesian* ret = SpaceTimeGrid::makeFourDimGrid(gdim, grid->_simd_layout, grid->ProcessorGrid());
ret->_ldimensions = ldim;
ret->_processor_coor = pcoor;
return ret;
}
template<class vsimd>
static inline void undoDoubleStore(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
// they store T+, T-, X+, X-, Y+, Y-, Z+, Z-
for(int mu_g = 0; mu_g < Nd; ++mu_g) {
int mu_o = (mu_g + 1) % Nd;
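      // Editor's note: Grid orders directions X,Y,Z,T (mu_g = 0..3) while the
      // file uses T,X,Y,Z, so mu_o = (mu_g + 1) % Nd sends Grid's X to file
      // slot 1 and Grid's T (mu_g = 3) back to file slot 0.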
U = PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o)
+ Cshift(PeekIndex<LorentzIndex>(Umu_ds, 2 * mu_o + 1), mu_g, +1);
PokeIndex<LorentzIndex>(Umu, U, mu_g);
}
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,281 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/OpenQcdIOChromaReference.h
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <ios>
#include <iostream>
#include <limits>
#include <iomanip>
#include <mpi.h>
#include <ostream>
#include <string>
#define CHECK {std::cerr << __FILE__ << " @l " << __LINE__ << ": CHECK" << grid->ThisRank() << std::endl;}
#define CHECK_VAR(a) { std::cerr << __FILE__ << "@l" << __LINE__ << " on "<< grid->ThisRank() << ": " << __func__ << " " << #a << "=" << (a) << std::endl; }
// #undef CHECK
// #define CHECK
NAMESPACE_BEGIN(Grid);
class ParRdr {
private:
bool const swap;
MPI_Status status;
MPI_File fp;
int err;
MPI_Datatype oddSiteType;
MPI_Datatype fileViewType;
GridBase* grid;
public:
ParRdr(MPI_Comm comm, std::string const& filename, GridBase* gridPtr)
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
assert(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
inline void errInfo(int const err, std::string const& func) {
static char estring[MPI_MAX_ERROR_STRING];
int eclass = -1, len = 0;
MPI_Error_class(err, &eclass);
MPI_Error_string(err, estring, &len);
std::cerr << func << " - Error " << eclass << ": " << estring << std::endl;
}
int readHeader(FieldMetaData& field) {
assert((grid->_ndimension == Nd) && (Nd == 4));
assert(Nc == 3);
OpenQcdHeader header;
readBlock(reinterpret_cast<char*>(&header), 0, sizeof(OpenQcdHeader), MPI_CHAR);
header.plaq /= 3.; // TODO change this into the normalisation factor
// sanity check (should trigger on endian issues) TODO remove?
assert(0 < header.Nt && header.Nt <= 1024);
assert(0 < header.Nx && header.Nx <= 1024);
assert(0 < header.Ny && header.Ny <= 1024);
assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
field.dimension[2] = header.Nz;
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
assert(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
field.data_start = sizeof(OpenQcdHeader);
return field.data_start;
}
void readBlock(void* const dest, uint64_t const pos, uint64_t const nbytes, MPI_Datatype const datatype) {
err = MPI_File_read_at_all(fp, pos, dest, nbytes, datatype, &status);
errInfo(err, "MPI_File_read_at_all");
// CHECK_VAR(err)
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
assert(nbytes == (uint64_t)read);
assert(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
Coordinate const i = grid->ThisProcessorCoor();
Coordinate sizes({L[2] / 2, L[1], L[0], L[3]});
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
}
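  // Editor's note: MPI_ORDER_FORTRAN makes the first extent run fastest, so
  // sizes = {Lz/2, Ly, Lx, Lt} reproduces the on-disk layout assumed above:
  // T slowest, then X, then Y, with the halved (odd-site) Z index fastest.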
void freeTypes() {
err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
// the actual READ
constexpr uint64_t cm_size = 2 * Nc * Nc * sizeof(double); // 2_complex
constexpr uint64_t os_size = Nd * 2 * cm_size; // 2_fwdbwd
constexpr uint64_t max_elems = std::numeric_limits<int>::max(); // int-addressable elems: floor is fine
uint64_t const n_os = domainSites / 2;
for(uint64_t os_idx = 0; os_idx < n_os;) {
uint64_t const read_os = os_idx + max_elems <= n_os ? max_elems : n_os - os_idx;
uint64_t const cm = os_idx * Nd * 2;
readBlock(&(domain_buff[cm]), os_idx, read_os, oddSiteType);
os_idx += read_os;
}
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
assert(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
return true;
}
};
class OpenQcdIOChromaReference : public BinaryIO {
public:
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Grid::FieldMetaData& header,
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
typedef ColourMatrixD fobj;
std::vector<fobj> iodata(
Nd * grid->lSites()); // actual size = 2*Nd*lsites but have only lsites/2 sites in file
{
ParRdr rdr(MPI_COMM_WORLD, file, grid);
rdr.readGauge(iodata, header);
} // equivalent to using binaryio
std::vector<iDoubleStoredColourMatrix<typename vsimd::scalar_type>> Umu_ds_scalar(grid->lSites());
copyToLatticeObject(Umu_ds_scalar, iodata, grid); // equivalent to munging
DoubledGaugeField Umu_ds(grid);
vectorizeFromLexOrdArray(Umu_ds_scalar, Umu_ds);
redistribute(Umu, Umu_ds); // equivalent to undoDoublestore
FieldMetaData clone(header);
GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
// clang-format off
std::cout << GridLogMessage << "OpenQcd Configuration " << file
<< " plaquette " << clone.plaquette
<< " header " << header.plaquette
<< " difference " << plaq_diff
<< std::endl;
// clang-format on
RealD precTol = (getPrecision<vsimd>::value == 1) ? 2e-7 : 2e-15;
RealD tol = precTol * std::sqrt(grid->_Nprocessors); // taken from RQCD chroma code
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
private:
template<class vsimd>
static inline void redistribute(Lattice<iLorentzColourMatrix<vsimd>>& Umu,
Lattice<iDoubleStoredColourMatrix<vsimd>> const& Umu_ds) {
Grid::conformable(Umu.Grid(), Umu_ds.Grid());
Lattice<iColourMatrix<vsimd>> U(Umu.Grid());
U = PeekIndex<LorentzIndex>(Umu_ds, 2) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 3), 0, +1); PokeIndex<LorentzIndex>(Umu, U, 0);
U = PeekIndex<LorentzIndex>(Umu_ds, 4) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 5), 1, +1); PokeIndex<LorentzIndex>(Umu, U, 1);
U = PeekIndex<LorentzIndex>(Umu_ds, 6) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 7), 2, +1); PokeIndex<LorentzIndex>(Umu, U, 2);
U = PeekIndex<LorentzIndex>(Umu_ds, 0) + Cshift(PeekIndex<LorentzIndex>(Umu_ds, 1), 3, +1); PokeIndex<LorentzIndex>(Umu, U, 3);
}
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
std::vector<ColourMatrixD> const& node_buff,
GridBase* grid) {
assert(node_buff.size() == Nd * grid->lSites());
Coordinate const& l = grid->LocalDimensions();
Coordinate coord(Nd);
int& x = coord[0];
int& y = coord[1];
int& z = coord[2];
int& t = coord[3];
int buff_idx = 0;
for(t = 0; t < l[3]; ++t) // IMPORTANT: openQCD file ordering
for(x = 0; x < l[0]; ++x)
for(y = 0; y < l[1]; ++y)
for(z = 0; z < l[2]; ++z) {
if((t + z + y + x) % 2 == 0) continue;
int local_idx;
Lexicographic::IndexFromCoor(coord, local_idx, grid->LocalDimensions());
for(int mu = 0; mu < 2 * Nd; ++mu)
for(int c1 = 0; c1 < Nc; ++c1) {
for(int c2 = 0; c2 < Nc; ++c2) {
u_fb[local_idx](mu)()(c1,c2) = node_buff[mu+buff_idx]()()(c1,c2);
}
}
buff_idx += 2 * Nd;
}
assert(node_buff.size() == buff_idx);
}
};
NAMESPACE_END(Grid);

View File

@@ -132,14 +132,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
  pickCheckerboard(Even, CloverTermEven, CloverTerm);
  pickCheckerboard(Odd,  CloverTermOdd,  CloverTerm);
  pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
  pickCheckerboard(Odd,  CloverTermDagOdd,  closure(adj(CloverTerm)));
  pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
  pickCheckerboard(Odd,  CloverTermInvOdd,  CloverTermInv);
  pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
  pickCheckerboard(Odd,  CloverTermInvDagOdd,  closure(adj(CloverTermInv)));
}
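// Editor's note: closure() forces the adj(...) expression template to be
// evaluated into a concrete Lattice before it is passed on, since
// pickCheckerboard takes a reference to an actual field rather than an
// unevaluated expression.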
template <class Impl>

View File

@@ -444,19 +444,19 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSite);        return;}
#endif
  } else if( interior ) {
    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt);    return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSiteInt);          return;}
#endif
  } else if( exterior ) {
    if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt);    return;}
    if (Opt == WilsonKernelsStatic::OptInlineAsm  ) { ASM_CALL(AsmDhopSiteExt);        return;}
#endif
  }
  assert(0 && " Kernel optimisation case not covered ");

View File

@@ -59,7 +59,7 @@ public:
  }
  static inline GaugeLinkField
  CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
    return Cshift(closure(adj(Link)), mu, -1);
  }
  static inline GaugeLinkField
  CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {

View File

@@ -39,6 +39,10 @@ directory
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h>
#include <Grid/parallelIO/NerscIO.h>
#include <Grid/parallelIO/OpenQcdIO.h>
#if !defined(GRID_COMMS_NONE)
#include <Grid/parallelIO/OpenQcdIOChromaReference.h>
#endif
NAMESPACE_CHECK(Ildg);
#include <Grid/qcd/hmc/checkpointers/CheckPointers.h>

View File

@@ -485,7 +485,7 @@ public:
      //           Up staple    ___ ___
      //                       |       |
      tmp = Cshift(closure(adj(U[nu])), nu, -1);
      tmp = adj(U2[mu]) * tmp;
      tmp = Cshift(tmp, mu, -2);
@@ -519,7 +519,7 @@ public:
      //
      //      |  |
      tmp = Cshift(closure(adj(U2[nu])), nu, -2);
      tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
      tmp = U2[nu] * Cshift(tmp, nu, 2);
      Stap += Cshift(tmp, mu, 1);

View File

@@ -6,6 +6,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Michael Marshall <michael.marshall@ed.ac.au>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -55,6 +56,7 @@ class GridTensorBase {};
  using Complexified = typename Traits::Complexified; \
  using Realified = typename Traits::Realified; \
  using DoublePrecision = typename Traits::DoublePrecision; \
  using DoublePrecision2= typename Traits::DoublePrecision2; \
  static constexpr int TensorLevel = Traits::TensorLevel
template <class vtype>

View File

@@ -8,6 +8,7 @@
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -194,6 +195,79 @@ auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decl
  ret._internal = innerProductD(lhs._internal,rhs._internal);
  return ret;
}
//////////////////////////////////////
// innerProductD2: precision promotion without inner sum
//////////////////////////////////////
accelerator_inline vComplexD2 TensorRemove(const vComplexD2 & x) { return x; };
accelerator_inline vRealD2 TensorRemove(const vRealD2 & x) { return x; };
accelerator_inline ComplexD innerProductD2(const ComplexF &l,const ComplexF &r){ return innerProduct(l,r); }
accelerator_inline ComplexD innerProductD2(const ComplexD &l,const ComplexD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD2(const RealD &l,const RealD &r){ return innerProduct(l,r); }
accelerator_inline RealD innerProductD2(const RealF &l,const RealF &r){ return innerProduct(l,r); }
accelerator_inline vComplexD innerProductD2(const vComplexD &l,const vComplexD &r){ return innerProduct(l,r); }
accelerator_inline vRealD innerProductD2(const vRealD &l,const vRealD &r){ return innerProduct(l,r); }
accelerator_inline vComplexD2 innerProductD2(const vComplexF &l,const vComplexF &r)
{
vComplexD la,lb;
vComplexD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
vComplexD2 ret;
ret._internal[0] = innerProduct(la,ra);
ret._internal[1] = innerProduct(lb,rb);
return ret;
}
accelerator_inline vRealD2 innerProductD2(const vRealF &l,const vRealF &r)
{
vRealD la,lb;
vRealD ra,rb;
Optimization::PrecisionChange::StoD(l.v,la.v,lb.v);
Optimization::PrecisionChange::StoD(r.v,ra.v,rb.v);
vRealD2 ret;
ret._internal[0]=innerProduct(la,ra);
ret._internal[1]=innerProduct(lb,rb);
return ret;
}
// Now do it for vector, matrix, scalar
template<class l,class r,int N> accelerator_inline
auto innerProductD2 (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0],rhs._internal[0]))>
{
typedef decltype(innerProductD2(lhs._internal[0],rhs._internal[0])) ret_t;
iScalar<ret_t> ret;
zeroit(ret);
for(int c1=0;c1<N;c1++){
ret._internal += innerProductD2(lhs._internal[c1],rhs._internal[c1]);
}
return ret;
}
template<class l,class r,int N> accelerator_inline
auto innerProductD2 (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0]))>
{
typedef decltype(innerProductD2(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
iScalar<ret_t> ret;
ret=Zero();
for(int c1=0;c1<N;c1++){
for(int c2=0;c2<N;c2++){
ret._internal+=innerProductD2(lhs._internal[c1][c2],rhs._internal[c1][c2]);
}}
return ret;
}
template<class l,class r> accelerator_inline
auto innerProductD2 (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD2(lhs._internal,rhs._internal))>
{
typedef decltype(innerProductD2(lhs._internal,rhs._internal)) ret_t;
iScalar<ret_t> ret;
ret._internal = innerProductD2(lhs._internal,rhs._internal);
return ret;
}
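// Editor's note: a schematic example, with assumed values. For
// single-precision SIMD operands the product is promoted to double lane by
// lane, but the two resulting double-width registers are kept separate
// instead of being summed:
//
//   vComplexF l, r;                        // filled elsewhere
//   vComplexD2 ip = innerProductD2(l, r);  // a pair of vComplexD, no inner sum
//   vComplexD lo = ip._internal[0], hi = ip._internal[1];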
//////////////////////
// Keep same precision
//////////////////////

View File

@@ -6,6 +6,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@phys.columbia.edu>
Author: Michael Marshall <michael.marshall@ed.ac.au>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@@ -37,6 +38,60 @@ NAMESPACE_BEGIN(Grid);
  template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
  template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type { static constexpr bool notvalue = false; };
// Traits to identify scalars
template<typename T> struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; };
template<class T> struct isGridScalar<iScalar<T>> : public std::true_type { static constexpr bool notvalue = false; };
// Store double-precision data in single-precision grids for precision promoted localInnerProductD
template<typename T>
class TypePair {
public:
T _internal[2];
TypePair<T>& operator=(const Grid::Zero& o) {
_internal[0] = Zero();
_internal[1] = Zero();
return *this;
}
TypePair<T> operator+(const TypePair<T>& o) const {
TypePair<T> r;
r._internal[0] = _internal[0] + o._internal[0];
r._internal[1] = _internal[1] + o._internal[1];
return r;
}
TypePair<T>& operator+=(const TypePair<T>& o) {
_internal[0] += o._internal[0];
_internal[1] += o._internal[1];
return *this;
}
friend accelerator_inline void add(TypePair<T>* ret, const TypePair<T>* a, const TypePair<T>* b) {
add(&ret->_internal[0],&a->_internal[0],&b->_internal[0]);
add(&ret->_internal[1],&a->_internal[1],&b->_internal[1]);
}
};
typedef TypePair<ComplexD> ComplexD2;
typedef TypePair<RealD> RealD2;
typedef TypePair<vComplexD> vComplexD2;
typedef TypePair<vRealD> vRealD2;
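// Editor's note: each *D2 type above is a pair of double-precision
// values/registers, so e.g. one vComplexD2 spans the same number of lanes as
// one vComplexF; this is what lets a single-precision grid hold
// double-precision partial results for the precision-promoted
// localInnerProductD mentioned in the comment above.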
// Traits to identify fundamental data types
template<typename T> struct isGridFundamental : public std::false_type { static constexpr bool notvalue = true; };
template<> struct isGridFundamental<vComplexF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vComplexD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealF> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealD> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<vRealD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<ComplexD2> : public std::true_type { static constexpr bool notvalue = false; };
template<> struct isGridFundamental<RealD2> : public std::true_type { static constexpr bool notvalue = false; };
//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress"
@@ -81,6 +136,7 @@ NAMESPACE_BEGIN(Grid);
    typedef ComplexF Complexified;
    typedef RealF Realified;
    typedef RealD DoublePrecision;
    typedef RealD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<RealD> : public GridTypeMapper_Base {
    typedef RealD scalar_type;
@@ -93,6 +149,20 @@ NAMESPACE_BEGIN(Grid);
    typedef ComplexD Complexified;
    typedef RealD Realified;
    typedef RealD DoublePrecision;
    typedef RealD DoublePrecision2;
  };
template<> struct GridTypeMapper<RealD2> : public GridTypeMapper_Base {
typedef RealD2 scalar_type;
typedef RealD2 scalar_typeD;
typedef RealD2 vector_type;
typedef RealD2 vector_typeD;
typedef RealD2 tensor_reduced;
typedef RealD2 scalar_object;
typedef RealD2 scalar_objectD;
typedef ComplexD2 Complexified;
typedef RealD2 Realified;
typedef RealD2 DoublePrecision;
typedef RealD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<ComplexF> : public GridTypeMapper_Base {
    typedef ComplexF scalar_type;
@@ -105,6 +175,7 @@ NAMESPACE_BEGIN(Grid);
    typedef ComplexF Complexified;
    typedef RealF Realified;
    typedef ComplexD DoublePrecision;
    typedef ComplexD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<ComplexD> : public GridTypeMapper_Base {
    typedef ComplexD scalar_type;
@@ -117,6 +188,20 @@ NAMESPACE_BEGIN(Grid);
    typedef ComplexD Complexified;
    typedef RealD Realified;
    typedef ComplexD DoublePrecision;
typedef ComplexD DoublePrecision2;
};
template<> struct GridTypeMapper<ComplexD2> : public GridTypeMapper_Base {
typedef ComplexD2 scalar_type;
typedef ComplexD2 scalar_typeD;
typedef ComplexD2 vector_type;
typedef ComplexD2 vector_typeD;
typedef ComplexD2 tensor_reduced;
typedef ComplexD2 scalar_object;
typedef ComplexD2 scalar_objectD;
typedef ComplexD2 Complexified;
typedef RealD2 Realified;
typedef ComplexD2 DoublePrecision;
typedef ComplexD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<Integer> : public GridTypeMapper_Base {
    typedef Integer scalar_type;
@@ -129,6 +214,7 @@ NAMESPACE_BEGIN(Grid);
    typedef void Complexified;
    typedef void Realified;
    typedef void DoublePrecision;
    typedef void DoublePrecision2;
  };
  template<> struct GridTypeMapper<vRealF> : public GridTypeMapper_Base {
@@ -142,6 +228,7 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    typedef vRealD DoublePrecision;
typedef vRealD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<vRealD> : public GridTypeMapper_Base {
    typedef RealD scalar_type;
@@ -154,6 +241,20 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    typedef vRealD DoublePrecision;
    typedef vRealD DoublePrecision2;
  };
template<> struct GridTypeMapper<vRealD2> : public GridTypeMapper_Base {
typedef RealD2 scalar_type;
typedef RealD2 scalar_typeD;
typedef vRealD2 vector_type;
typedef vRealD2 vector_typeD;
typedef vRealD2 tensor_reduced;
typedef RealD2 scalar_object;
typedef RealD2 scalar_objectD;
typedef vComplexD2 Complexified;
typedef vRealD2 Realified;
typedef vRealD2 DoublePrecision;
typedef vRealD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<vRealH> : public GridTypeMapper_Base {
    // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@@ -167,6 +268,7 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexH Complexified;
    typedef vRealH Realified;
    typedef vRealD DoublePrecision;
    typedef vRealD DoublePrecision2;
  };
  template<> struct GridTypeMapper<vComplexH> : public GridTypeMapper_Base {
    // Fixme this is incomplete until Grid supports fp16 or bfp16 arithmetic types
@@ -180,6 +282,7 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexH Complexified;
    typedef vRealH Realified;
    typedef vComplexD DoublePrecision;
typedef vComplexD DoublePrecision2;
  };
  template<> struct GridTypeMapper<vComplexF> : public GridTypeMapper_Base {
    typedef ComplexF scalar_type;
@@ -192,6 +295,7 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexF Complexified;
    typedef vRealF Realified;
    typedef vComplexD DoublePrecision;
    typedef vComplexD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<vComplexD> : public GridTypeMapper_Base {
    typedef ComplexD scalar_type;
@@ -204,6 +308,20 @@ NAMESPACE_BEGIN(Grid);
    typedef vComplexD Complexified;
    typedef vRealD Realified;
    typedef vComplexD DoublePrecision;
typedef vComplexD DoublePrecision2;
};
template<> struct GridTypeMapper<vComplexD2> : public GridTypeMapper_Base {
typedef ComplexD2 scalar_type;
typedef ComplexD2 scalar_typeD;
typedef vComplexD2 vector_type;
typedef vComplexD2 vector_typeD;
typedef vComplexD2 tensor_reduced;
typedef ComplexD2 scalar_object;
typedef ComplexD2 scalar_objectD;
typedef vComplexD2 Complexified;
typedef vRealD2 Realified;
typedef vComplexD2 DoublePrecision;
typedef vComplexD2 DoublePrecision2;
  };
  template<> struct GridTypeMapper<vInteger> : public GridTypeMapper_Base {
    typedef Integer scalar_type;
@@ -216,6 +334,7 @@ NAMESPACE_BEGIN(Grid);
    typedef void Complexified;
    typedef void Realified;
    typedef void DoublePrecision;
    typedef void DoublePrecision2;
  };
#define GridTypeMapper_RepeatedTypes \
@@ -234,6 +353,7 @@ NAMESPACE_BEGIN(Grid);
    using Complexified = iScalar<typename BaseTraits::Complexified>;
    using Realified = iScalar<typename BaseTraits::Realified>;
    using DoublePrecision = iScalar<typename BaseTraits::DoublePrecision>;
    using DoublePrecision2= iScalar<typename BaseTraits::DoublePrecision2>;
    static constexpr int Rank = BaseTraits::Rank + 1;
    static constexpr std::size_t count = BaseTraits::count;
    static constexpr int Dimension(int dim) {
@@ -248,6 +368,7 @@ NAMESPACE_BEGIN(Grid);
    using Complexified = iVector<typename BaseTraits::Complexified, N>;
    using Realified = iVector<typename BaseTraits::Realified, N>;
    using DoublePrecision = iVector<typename BaseTraits::DoublePrecision, N>;
    using DoublePrecision2= iVector<typename BaseTraits::DoublePrecision2, N>;
    static constexpr int Rank = BaseTraits::Rank + 1;
    static constexpr std::size_t count = BaseTraits::count * N;
    static constexpr int Dimension(int dim) {
@@ -262,6 +383,7 @@ NAMESPACE_BEGIN(Grid);
    using Complexified = iMatrix<typename BaseTraits::Complexified, N>;
    using Realified = iMatrix<typename BaseTraits::Realified, N>;
    using DoublePrecision = iMatrix<typename BaseTraits::DoublePrecision, N>;
    using DoublePrecision2= iMatrix<typename BaseTraits::DoublePrecision2, N>;
    static constexpr int Rank = BaseTraits::Rank + 2;
    static constexpr std::size_t count = BaseTraits::count * N * N;
    static constexpr int Dimension(int dim) {

View File

@@ -0,0 +1,84 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/io/Test_openqcd_io.cc
Copyright (C) 2015 - 2020
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if defined(GRID_COMMS_NONE)
#error This test requires Grid compiled with MPI
#endif
using namespace Grid;
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
auto simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
auto mpi_layout = GridDefaultMpi();
auto latt_size = GridDefaultLatt();
GridCartesian grid(latt_size, simd_layout, mpi_layout);
GridParallelRNG pRNG(&grid);
pRNG.SeedFixedIntegers(std::vector<int>({45, 12, 81, 9}));
LatticeGaugeField Umu_ref(&grid);
LatticeGaugeField Umu_me(&grid);
LatticeGaugeField Umu_diff(&grid);
FieldMetaData header_ref;
FieldMetaData header_me;
Umu_ref = Zero();
Umu_me = Zero();
std::string file("/home/daniel/configs/openqcd/test_16x8_pbcn6");
if(GridCmdOptionExists(argv, argv + argc, "--config")) {
file = GridCmdOptionPayload(argv, argv + argc, "--config");
std::cout << "file: " << file << std::endl;
assert(!file.empty());
}
OpenQcdIOChromaReference::readConfiguration(Umu_ref, header_ref, file);
OpenQcdIO::readConfiguration(Umu_me, header_me, file);
std::cout << GridLogMessage << header_ref << std::endl;
std::cout << GridLogMessage << header_me << std::endl;
Umu_diff = Umu_ref - Umu_me;
// clang-format off
std::cout << GridLogMessage
<< "norm2(Umu_ref) = " << norm2(Umu_ref)
<< " norm2(Umu_me) = " << norm2(Umu_me)
<< " norm2(Umu_diff) = " << norm2(Umu_diff) << std::endl;
// clang-format on
Grid_finalize();
}

View File

@@ -0,0 +1,126 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_innerproduct_norm.cc
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
const int nIter = 100;
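  // Editor's note: innerProductNorm computes innerProduct(x,y) and norm2(x) in
  // a single pass over the fields; the loops below check that the fused
  // reduction reproduces the two separate reductions exactly and compare
  // timings.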
// clang-format off
GridCartesian *Grid_d = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
GridCartesian *Grid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
// clang-format on
GridParallelRNG pRNG_d(Grid_d);
GridParallelRNG pRNG_f(Grid_f);
std::vector<int> seeds_d({1, 2, 3, 4});
std::vector<int> seeds_f({5, 6, 7, 8});
pRNG_d.SeedFixedIntegers(seeds_d);
pRNG_f.SeedFixedIntegers(seeds_f);
// clang-format off
LatticeFermionD x_d(Grid_d); random(pRNG_d, x_d);
LatticeFermionD y_d(Grid_d); random(pRNG_d, y_d);
LatticeFermionF x_f(Grid_f); random(pRNG_f, x_f);
LatticeFermionF y_f(Grid_f); random(pRNG_f, y_f);
// clang-format on
GridStopWatch sw_ref;
GridStopWatch sw_res;
{ // double precision
ComplexD ip_d_ref, ip_d_res, diff_ip_d;
RealD norm2_d_ref, norm2_d_res, diff_norm2_d;
sw_ref.Reset();
sw_ref.Start();
for(int i = 0; i < nIter; ++i) {
ip_d_ref = innerProduct(x_d, y_d);
norm2_d_ref = norm2(x_d);
}
sw_ref.Stop();
sw_res.Reset();
sw_res.Start();
for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_d_res, norm2_d_res, x_d, y_d); }
sw_res.Stop();
diff_ip_d = ip_d_ref - ip_d_res;
diff_norm2_d = norm2_d_ref - norm2_d_res;
// clang-format off
std::cout << GridLogMessage << "Double: ip_ref = " << ip_d_ref << " ip_res = " << ip_d_res << " diff = " << diff_ip_d << std::endl;
std::cout << GridLogMessage << "Double: norm2_ref = " << norm2_d_ref << " norm2_res = " << norm2_d_res << " diff = " << diff_norm2_d << std::endl;
std::cout << GridLogMessage << "Double: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
// clang-format on
assert(diff_ip_d == 0.);
assert(diff_norm2_d == 0.);
std::cout << GridLogMessage << "Double: all checks passed" << std::endl;
}
{ // single precision
ComplexD ip_f_ref, ip_f_res, diff_ip_f;
RealD norm2_f_ref, norm2_f_res, diff_norm2_f;
sw_ref.Reset();
sw_ref.Start();
for(int i = 0; i < nIter; ++i) {
ip_f_ref = innerProduct(x_f, y_f);
norm2_f_ref = norm2(x_f);
}
sw_ref.Stop();
sw_res.Reset();
sw_res.Start();
for(int i = 0; i < nIter; ++i) { innerProductNorm(ip_f_res, norm2_f_res, x_f, y_f); }
sw_res.Stop();
diff_ip_f = ip_f_ref - ip_f_res;
diff_norm2_f = norm2_f_ref - norm2_f_res;
// clang-format off
std::cout << GridLogMessage << "Single: ip_ref = " << ip_f_ref << " ip_res = " << ip_f_res << " diff = " << diff_ip_f << std::endl;
std::cout << GridLogMessage << "Single: norm2_ref = " << norm2_f_ref << " norm2_res = " << norm2_f_res << " diff = " << diff_norm2_f << std::endl;
std::cout << GridLogMessage << "Single: time_ref = " << sw_ref.Elapsed() << " time_res = " << sw_res.Elapsed() << std::endl;
// clang-format on
assert(diff_ip_f == 0.);
assert(diff_norm2_f == 0.);
std::cout << GridLogMessage << "Single: all checks passed" << std::endl;
}
Grid_finalize();
}