1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Merge branch 'develop' into feature/json-fix

This commit is contained in:
Guido Cossu
2017-07-07 14:17:50 +01:00
102 changed files with 4235 additions and 3759 deletions

View File

@ -41,6 +41,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h>
#include <Grid/qcd/utils/GaugeFix.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/parallelIO/MetaData.h>
#include <Grid/qcd/hmc/HMC_aggregate.h>

View File

@ -1,137 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/DenseMatrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DENSE_MATRIX_H
#define GRID_DENSE_MATRIX_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Matrix untils
/////////////////////////////////////////////////////////////
template<class T> using DenseVector = std::vector<T>;
template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
template<class T> void Size(DenseVector<T> & vec, int &N)
{
N= vec.size();
}
template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M)
{
N= mat.size();
M= mat[0].size();
}
template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N)
{
int M; Size(mat,N,M);
assert(N==M);
}
template<class T> void Resize(DenseVector<T > & mat, int N) {
mat.resize(N);
}
template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) {
mat.resize(N);
for(int i=0;i<N;i++){
mat[i].resize(M);
}
}
template<class T> void Fill(DenseMatrix<T> & mat, T&val) {
int N,M;
Size(mat,N,M);
for(int i=0;i<N;i++){
for(int j=0;j<M;j++){
mat[i][j] = val;
}}
}
/** Transpose of a matrix **/
template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
int N,M;
Size(mat,N,M);
DenseMatrix<T> C; Resize(C,M,N);
for(int i=0;i<M;i++){
for(int j=0;j<N;j++){
C[i][j] = mat[j][i];
}}
return C;
}
/** Set DenseMatrix to unit matrix **/
template<class T> void Unity(DenseMatrix<T> &A){
int N; SizeSquare(A,N);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
if ( i==j ) A[i][j] = 1;
else A[i][j] = 0;
}
}
}
/** Add C * I to matrix **/
template<class T>
void PlusUnit(DenseMatrix<T> & A,T c){
int dim; SizeSquare(A,dim);
for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;}
}
/** return the Hermitian conjugate of matrix **/
template<class T>
DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
int dim; SizeSquare(mat,dim);
DenseMatrix<T> C; Resize(C,dim,dim);
for(int i=0;i<dim;i++){
for(int j=0;j<dim;j++){
C[i][j] = conj(mat[j][i]);
}
}
return C;
}
/**Get a square submatrix**/
template <class T>
DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
{
DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
for(int i = row_st; i<row_end; i++){
for(int j = col_st; j<col_end; j++){
H[i-row_st][j-col_st]=A[i][j];
}}
return H;
}
}
#include "Householder.h"
#include "Francis.h"
#endif

View File

@ -1,525 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Francis.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef FRANCIS_H
#define FRANCIS_H
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
//#include <timer.h>
//#include <lapacke.h>
//#include <Eigen/Dense>
namespace Grid {
template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
template <class T> int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
/**
Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
H =
x x x x x x x x x
x x x x x x x x x
0 x x x x x x x x
0 0 x x x x x x x
0 0 0 x x x x x x
0 0 0 0 x x x x x
0 0 0 0 0 x x x x
0 0 0 0 0 0 x x x
0 0 0 0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
**/
template <class T>
int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
DenseMatrix<T> H = Hin;
int N ; SizeSquare(H,N);
int M = N;
Fill(evals,0);
Fill(evecs,0);
T s,t,x=0,y=0,z=0;
T u,d;
T apd,amd,bc;
DenseVector<T> p(N,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N); Unity(P);
DenseVector<int> trows(N,0);
/// Check if the matrix is really hessenberg, if not abort
RealD sth = 0;
for(int j=0;j<N;j++){
for(int i=j+2;i<N;i++){
sth = abs(H[i][j]);
if(sth > small){
std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
exit(1);
}
}
}
do{
std::cout << "Francis QR Step N = " << N << std::endl;
/** Check for convergence
x x x x x
0 x x x x
0 0 x x x
0 0 x x x
0 0 0 0 x
for this matrix l = 4
**/
do{
l = Chop_subdiag(H,nrm,e,small);
r = 0; ///May have converged on more than one eval
///Single eval
if(l == N-1){
evals[e] = H[l][l];
N--; e++; r++; it = 0;
}
///RealD eval
if(l == N-2){
trows[l+1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l+1][l+1];
amd = H[l][l] - H[l+1][l+1];
bc = (T)4.0*H[l+1][l]*H[l][l+1];
evals[e] = (T)0.5*( apd + sqrt(amd*amd + bc) );
evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
N-=2; e+=2; r++; it = 0;
}
} while(r>0);
if(N ==0) break;
DenseVector<T > ck; Resize(ck,3);
DenseVector<T> v; Resize(v,3);
for(int m = N-3; m >= l; m--){
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0){
s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
///Starting vector implicit Q theorem
else{
s = (H[N-2][N-2] + H[N-1][N-1]);
t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
z = H[m+1][m]*H[m+2][m+1];
}
ck[0] = x; ck[1] = y; ck[2] = z;
if(m == l) break;
/** Some stupid thing from numerical recipies, seems to work**/
// PAB.. for heaven's sake quote page, purpose, evidence it works.
// what sort of comment is that!?!?!?
u=abs(H[m][m-1])*(abs(y)+abs(z));
d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
if ((T)abs(u+d) == (T)abs(d) ){
l = m; break;
}
//if (u < small){l = m; break;}
}
if(it > 100000){
std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, 2, v, beta);
Householder_mult<T >(H,v,beta,0,l,l+2,0);
Householder_mult<T >(H,v,beta,0,l,l+2,1);
///Accumulate eigenvector
Householder_mult<T >(P,v,beta,0,l,l+2,1);
int sw = 0; ///Are we on the last row?
for(int k=l;k<N-2;k++){
x = H[k+1][k];
y = H[k+2][k];
z = (T)0.0;
if(k+3 <= N-1){
z = H[k+3][k];
} else{
sw = 1;
v[2] = (T)0.0;
}
ck[0] = x; ck[1] = y; ck[2] = z;
normalize(ck);
Householder_vector<T >(ck, 0, 2-sw, v, beta);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
///Accumulate eigenvector
Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp; Resize(tmp,N);
for(int i=0;i<N;i++){
tmp[i] = evals[N-i-1];
}
evals = tmp;
UTeigenvectors(H, trows, evals, evecs);
for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
return tot_it;
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
{
/**
Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
H =
x x 0 0 0 0
x x x 0 0 0
0 x x x 0 0
0 0 x x x 0
0 0 0 x x x
0 0 0 0 x x
Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary. **/
return my_Wilkinson(Hin, evals, evecs, small, small);
}
template <class T>
int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
{
int N; SizeSquare(Hin,N);
int M = N;
///I don't want to modify the input but matricies must be passed by reference
//Scale a matrix by its "norm"
//RealD Hnorm = abs( Hin.LargestDiag() ); H = H*(1.0/Hnorm);
DenseMatrix<T> H; H = Hin;
RealD Hnorm = abs(Norm(Hin));
H = H * (1.0 / Hnorm);
// TODO use openmp and memset
Fill(evals,0);
Fill(evecs,0);
T s, t, x = 0, y = 0, z = 0;
T u, d;
T apd, amd, bc;
DenseVector<T> p; Resize(p,N); Fill(p,0);
T nrm = Norm(H); ///DenseMatrix Norm
int n, m;
int e = 0;
int it = 0;
int tot_it = 0;
int l = 0;
int r = 0;
DenseMatrix<T> P; Resize(P,N,N);
Unity(P);
DenseVector<int> trows(N, 0);
/// Check if the matrix is really symm tridiag
RealD sth = 0;
for(int j = 0; j < N; ++j)
{
for(int i = j + 2; i < N; ++i)
{
if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
{
std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
std::cout << "Warning tridiagonalize and call again" << std::endl;
// exit(1); // see what is going on
//return;
}
}
}
do{
do{
//Jasper
//Check if the subdiagonal term is small enough (<small)
//if true then it is converged.
//check start from H.dim - e - 1
//How to deal with more than 2 are converged?
//What if Chop_symm_subdiag return something int the middle?
//--------------
l = Chop_symm_subdiag(H,nrm, e, small);
r = 0; ///May have converged on more than one eval
//Jasper
//In this case
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x x 0
// 0 0 0 x x 0
// 0 0 0 0 0 x <- l
//--------------
///Single eval
if(l == N - 1)
{
evals[e] = H[l][l];
N--;
e++;
r++;
it = 0;
}
//Jasper
// x x 0 0 0 0
// x x x 0 0 0
// 0 x x x 0 0
// 0 0 x x 0 0
// 0 0 0 0 x x <- l
// 0 0 0 0 x x
//--------------
///RealD eval
if(l == N - 2)
{
trows[l + 1] = 1; ///Needed for UTSolve
apd = H[l][l] + H[l + 1][ l + 1];
amd = H[l][l] - H[l + 1][l + 1];
bc = (T) 4.0 * H[l + 1][l] * H[l][l + 1];
evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
N -= 2;
e += 2;
r++;
it = 0;
}
}while(r > 0);
//Jasper
//Already converged
//--------------
if(N == 0) break;
DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
for(int m = N - 3; m >= l; m--)
{
///Starting vector essentially random shift.
if(it%10 == 0 && N >= 3 && it > 0)
{
t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
x = H[m][m] - t;
z = H[m + 1][m];
} else {
///Starting vector implicit Q theorem
d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
t = H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2]
/ (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
x = H[m][m] - t;
z = H[m + 1][m];
}
//Jasper
//why it is here????
//-----------------------
if(m == l)
break;
u = abs(H[m][m - 1]) * (abs(y) + abs(z));
d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
if ((T)abs(u + d) == (T)abs(d))
{
l = m;
break;
}
}
//Jasper
if(it > 1000000)
{
std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
std::cout << "got " << e << " evals " << l << " " << N << std::endl;
exit(1);
}
//
T s, c;
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, l, l + 1, c, -s, 0);
Givens_mult<T>(H, l, l + 1, c, s, 1);
Givens_mult<T>(P, l, l + 1, c, s, 1);
//
for(int k = l; k < N - 2; ++k)
{
x = H.A[k + 1][k];
z = H.A[k + 2][k];
Givens_calc<T>(x, z, c, s);
Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
Givens_mult<T>(H, k + 1, k + 2, c, s, 1);
Givens_mult<T>(P, k + 1, k + 2, c, s, 1);
}
it++;
tot_it++;
}while(N > 1);
N = evals.size();
///Annoying - UT solves in reverse order;
DenseVector<T> tmp(N);
for(int i = 0; i < N; ++i)
tmp[i] = evals[N-i-1];
evals = tmp;
//
UTeigenvectors(H, trows, evals, evecs);
//UTSymmEigenvectors(H, trows, evals, evecs);
for(int i = 0; i < evals.size(); ++i)
{
evecs[i] = P * evecs[i];
normalize(evecs[i]);
evals[i] = evals[i] * Hnorm;
}
// // FIXME this is to test
// Hin.write("evecs3", evecs);
// Hin.write("evals3", evals);
// // check rsd
// for(int i = 0; i < M; i++) {
// vector<T> Aevec = Hin * evecs[i];
// RealD norm2(0.);
// for(int j = 0; j < M; j++) {
// norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
// }
// }
return tot_it;
}
template <class T>
void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
/**
turn a matrix A =
x x x x x
x x x x x
x x x x x
x x x x x
x x x x x
into
x x x x x
x x x x x
0 x x x x
0 0 x x x
0 0 0 x x
with householder rotations
Slow.
*/
int N ; SizeSquare(A,N);
DenseVector<T > p; Resize(p,N); Fill(p,0);
for(int k=start;k<N-2;k++){
//cerr << "hess" << k << std::endl;
DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);} ///kth column
normalize(ck); ///Normalization cancels in PHP anyway
T beta;
Householder_vector<T >(ck, 0, ck.size()-1, v, beta); ///Householder vector
Householder_mult<T>(A,v,beta,start,k+1,N-1,0); ///A -> PA
Householder_mult<T >(A,v,beta,start,k+1,N-1,1); ///PA -> PAP^H
///Accumulate eigenvector
Householder_mult<T >(Q,v,beta,start,k+1,N-1,1); ///Q -> QP^H
}
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,k,l);
}
}*/
}
template <class T>
void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
///Tridiagonalize a matrix
int N; SizeSquare(A,N);
Hess(A,Q,start);
/*for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A(0,l,k);
}
}*/
}
template <class T>
void ForceTridiagonal(DenseMatrix<T> &A){
///Tridiagonalize a matrix
int N ; SizeSquare(A,N);
for(int l=0;l<N-2;l++){
for(int k=l+2;k<N;k++){
A[l][k]=0;
A[k][l]=0;
}
}
}
template <class T>
int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a symmetric eigensystem, not necessarily in tridiagonal form
int N; SizeSquare(Ain,N);
DenseMatrix<T > A; A = Ain;
DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
Tri(A,Q,0);
int it = my_Wilkinson<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
template <class T>
int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_Wilkinson(Ain, evals, evecs, small);
}
template <class T>
int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
return my_SymmEigensystem(Ain, evals, evecs, small);
}
template <class T>
int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
///Solve a general eigensystem, not necessarily in tridiagonal form
int N = Ain.dim;
DenseMatrix<T > A(N); A = Ain;
DenseMatrix<T > Q(N);Q.Unity();
Hess(A,Q,0);
int it = QReigensystem<T>(A, evals, evecs, small);
for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
return it;
}
}
#endif

View File

@ -1,242 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Householder.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef HOUSEHOLDER_H
#define HOUSEHOLDER_H
#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define ENTER() std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#define LEAVE() std::cout << GridLogMessage << "EXIT "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <fstream>
#include <complex>
#include <algorithm>
namespace Grid {
/** Comparison function for finding the max element in a vector **/
template <class T> bool cf(T i, T j) {
return abs(i) < abs(j);
}
/**
Calculate a real Givens angle
**/
template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
RealD mz = (RealD)abs(z);
if(mz==0.0){
c = 1; s = 0;
}
if(mz >= (RealD)abs(y)){
T t = -y/z;
s = (T)1.0 / sqrt ((T)1.0 + t * t);
c = s * t;
} else {
T t = -z/y;
c = (T)1.0 / sqrt ((T)1.0 + t * t);
s = c * t;
}
}
template <class T> inline void Givens_mult(DenseMatrix<T> &A, int i, int k, T c, T s, int dir)
{
int q ; SizeSquare(A,q);
if(dir == 0){
for(int j=0;j<q;j++){
T nu = A[i][j];
T w = A[k][j];
A[i][j] = (c*nu + s*w);
A[k][j] = (-s*nu + c*w);
}
}
if(dir == 1){
for(int j=0;j<q;j++){
T nu = A[j][i];
T w = A[j][k];
A[j][i] = (c*nu - s*w);
A[j][k] = (s*nu + c*w);
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
P | x | | x | k = 0
| x | | 0 |
| x | = | 0 |
| x | | 0 | j = 3
| x | | x |
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
{
int N ; Size(input,N);
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
if(abs(v[k]) > 0.0) v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
else v[k] = -alpha;
} else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
from input = x;
Compute the complex Householder vector, v, such that
P = (I - b v transpose(v) )
b = 2/v.v
Px = alpha*e_dir
These are the "Unreduced" Householder vectors.
**/
template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
{
int N = input.size();
T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
if(abs(m) > 0.0){
T alpha = 0;
for(int i=k; i<j+1; i++){
v[i] = input[i]/m;
alpha = alpha + v[i]*conj(v[i]);
}
alpha = sqrt(alpha);
beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
else v[dir] = -alpha;
}else{
for(int i=k; i<j+1; i++){
v[i] = 0.0;
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
**/
template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
{
int N ; SizeSquare(A,N);
if(abs(beta) > 0.0){
for(int p=l; p<N; p++){
T s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
s *= beta;
for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
} else {
for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
s *= beta;
for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
}
}
}
}
/**
Compute the product PA if trans = 0
AP if trans = 1
P = (I - b v transpose(v) )
b = 2/v.v
start at element l of matrix A
v is of length j - k + 1 of v are nonzero
A is tridiagonal
**/
template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
{
if(abs(beta) > 0.0){
int N ; SizeSquare(A,N);
DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0);
T s;
for(int p=l; p<M; p++){
s = 0;
if(trans==0){
for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
}else{
for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
}
s = beta*s;
if(trans==0){
for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
}else{
for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
}
}
for(int p=l; p<M; p++){
if(trans==0){
for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
}else{
for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
}
}
}
}
}
#endif

View File

@ -33,6 +33,8 @@ directory
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
@ -40,25 +42,273 @@ template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
const int blockDim = 0;
int blockDim ;
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BlockConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Cholesky from Eigen
// There exists a ldlt that is documented as more stable
////////////////////////////////////////////////////////////////////////////////////////////////////
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
// Q_j = R_i Cinv(i,j)
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
// FIXME:: make a sliceMulMatrix to avoid zero vector
sliceMulMatrix(Q,Cinv,R,Orthog);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = 0; // First dimension is block dim
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
conformable(X, B);
Field tmp(B);
Field Q(B);
Field D(B);
Field Z(B);
Field AD(B);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@ -162,8 +412,9 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
RealD rr = real(m_rr(b,b))/ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
@ -173,13 +424,14 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
@ -197,35 +449,13 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
template <class Field>
class MultiRHSConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
const int blockDim = 0;
int Nblock;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
MultiRHSConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = 0; // First dimension is block dim
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
@ -285,12 +515,10 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
MatrixTimer.Stop();
// Alpha
// sliceInnerProductVectorTest(v_pAp_test,P,AP,Orthog);
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
// std::cout << " "<< v_pAp[b]<<" "<< v_pAp_test[b]<<std::endl;
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
@ -332,7 +560,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
@ -358,9 +586,8 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
#endif

View File

@ -1,81 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/EigenSort.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_EIGENSORT_H
#define GRID_EIGENSORT_H
namespace Grid {
/////////////////////////////////////////////////////////////
// Eigen sorter to begin with
/////////////////////////////////////////////////////////////
template<class Field>
class SortEigen {
private:
//hacking for testing for now
private:
static bool less_lmd(RealD left,RealD right){
return left > right;
}
static bool less_pair(std::pair<RealD,Field const*>& left,
std::pair<RealD,Field const*>& right){
return left.first > (right.first);
}
public:
void push(DenseVector<RealD>& lmd,
DenseVector<Field>& evec,int N) {
DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];
DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
for(int i=0;i<lmd.size();++i)
emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);
partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
for(int i=0;i<N;++i){
lmd[i]=it->first;
evec[i]=*(it->second);
++it;
}
}
void push(DenseVector<RealD>& lmd,int N) {
std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
}
bool saturated(RealD lmd, RealD thrs) {
return fabs(lmd) > fabs(thrs);
}
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -98,7 +98,14 @@ public:
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
@ -186,6 +193,12 @@ public:
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp);
uint8_t *cp = (uint8_t *)ptr;
#pragma omp parallel for
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type) {

View File

@ -1,4 +1,4 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
@ -369,71 +369,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
template<class vobj>
static void sliceMaddVectorSlow (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int Orthog,RealD scale=1.0)
{
// FIXME: Implementation is slow
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// If we based this on Cshift it would work for spread out
// but it would be even slower
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
ExtractSlice(Xslice,X,i,Orthog);
Rslice = Rslice + Xslice*(scale*a[i]);
InsertSlice(Rslice,R,i,Orthog);
}
};
template<class vobj>
static void sliceInnerProductVectorSlow( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Look at localInnerProduct implementation,
// and do inside a site loop with block strided iterators
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced scalar;
typedef typename scalar::scalar_object scomplex;
int Nblock = lhs._grid->GlobalDimensions()[Orthog];
vec.resize(Nblock);
std::vector<scomplex> sip(Nblock);
Lattice<scalar> IP(lhs._grid);
IP=localInnerProduct(lhs,rhs);
sliceSum(IP,sip,Orthog);
for(int ss=0;ss<Nblock;ss++){
vec[ss] = TensorRemove(sip[ss]);
}
}
*/
//////////////////////////////////////////////////////////////////////////////////////////
// FIXME: Implementation is slow
// If we based this on Cshift it would work for spread out
// but it would be even slower
//
// Repeated extract slice is inefficient
//
// Best base the linear combination by constructing a
// set of vectors of size grid->_rdimensions[Orthog].
//////////////////////////////////////////////////////////////////////////////////////////
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
@ -453,7 +388,6 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
@ -462,28 +396,103 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
for(int i=0;i<Nblock;i++){
ExtractSlice(Rslice,Y,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Xslice,X,j,Orthog);
Rslice = Rslice + Xslice*(scale*aa(j,i));
}
InsertSlice(Rslice,R,i,Orthog);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
// FIXME: Implementation is slow
// Not sure of best solution.. think about it
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@ -497,22 +506,49 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
for(int i=0;i<Nblock;i++){
ExtractSlice(Lslice,lhs,i,Orthog);
for(int j=0;j<Nblock;j++){
ExtractSlice(Rslice,rhs,j,Orthog);
mat(i,j) = innerProduct(Lslice,Rslice);
}
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride];
Right[i] = rhs[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
vector_typeD rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
#pragma omp critical
{
mat += mat_thread;
}
}
#undef FORCE_DIAG
#ifdef FORCE_DIAG
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
if ( i != j ) mat(i,j)=0.0;
}
}
#endif
return;
}

View File

@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },

View File

@ -237,4 +237,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
}}
////////////////////
// Scalar QED actions
// TODO: this needs to move to another header after rename to Fermion.h
////////////////////
#include <Grid/qcd/action/scalar/Scalar.h>
#include <Grid/qcd/action/gauge/Photon.h>
#endif

View File

@ -0,0 +1,286 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/Photon.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_PHOTON_ACTION_H
#define QCD_PHOTON_ACTION_H
namespace Grid{
namespace QCD{
template <class S>
class QedGimpl
{
public:
typedef S Simd;
template <typename vtype>
using iImplGaugeLink = iScalar<iScalar<iScalar<vtype>>>;
template <typename vtype>
using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
typedef iImplGaugeLink<Simd> SiteLink;
typedef iImplGaugeField<Simd> SiteField;
typedef SiteField SiteComplex;
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
};
typedef QedGimpl<vComplex> QedGimplR;
template<class Gimpl>
class Photon
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
public:
Photon(Gauge gauge, ZmScheme zmScheme);
virtual ~Photon(void) = default;
void FreePropagator(const GaugeField &in, GaugeField &out);
void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
void StochasticWeight(GaugeLinkField &weight);
void StochasticField(GaugeField &out, GridParallelRNG &rng);
void StochasticField(GaugeField &out, GridParallelRNG &rng,
const GaugeLinkField &weight);
private:
void invKHatSquared(GaugeLinkField &out);
void zmSub(GaugeLinkField &out);
private:
Gauge gauge_;
ZmScheme zmScheme_;
};
typedef Photon<QedGimplR> PhotonR;
template<class Gimpl>
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
: gauge_(gauge), zmScheme_(zmScheme)
{}
template<class Gimpl>
void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
{
FFT theFFT(in._grid);
GaugeField in_k(in._grid);
GaugeField prop_k(in._grid);
theFFT.FFT_all_dim(in_k,in,FFT::forward);
MomentumSpacePropagator(prop_k,in_k);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
}
template<class Gimpl>
void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
{
GridBase *grid = out._grid;
GaugeLinkField kmu(grid), one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
std::vector<int> zm(nd,0);
TComplex Tone = Complex(1.0,0.0);
TComplex Tzero= Complex(0.0,0.0);
one = Complex(1.0,0.0);
out = zero;
for(int mu = 0; mu < nd; mu++)
{
Real twoPiL = M_PI*2./l[mu];
LatticeCoordinate(kmu,mu);
kmu = 2.*sin(.5*twoPiL*kmu);
out = out + kmu*kmu;
}
pokeSite(Tone, out, zm);
out = one/out;
pokeSite(Tzero, out, zm);
}
template<class Gimpl>
void Photon<Gimpl>::zmSub(GaugeLinkField &out)
{
GridBase *grid = out._grid;
const unsigned int nd = grid->_ndimension;
switch (zmScheme_)
{
case ZmScheme::qedTL:
{
std::vector<int> zm(nd,0);
TComplex Tzero = Complex(0.0,0.0);
pokeSite(Tzero, out, zm);
break;
}
case ZmScheme::qedL:
{
LatticeInteger spNrm(grid), coor(grid);
GaugeLinkField z(grid);
spNrm = zero;
for(int d = 0; d < grid->_ndimension - 1; d++)
{
LatticeCoordinate(coor,d);
spNrm = spNrm + coor*coor;
}
out = where(spNrm == Integer(0), 0.*out, out);
break;
}
default:
break;
}
}
template<class Gimpl>
void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
GaugeField &out)
{
GridBase *grid = out._grid;
LatticeComplex k2Inv(grid);
invKHatSquared(k2Inv);
zmSub(k2Inv);
out = in*k2Inv;
}
template<class Gimpl>
void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
{
auto *grid = dynamic_cast<GridCartesian *>(weight._grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> latt_size = grid->_fdimensions;
Integer vol = 1;
for(int d = 0; d < nd; d++)
{
vol = vol * latt_size[d];
}
invKHatSquared(weight);
weight = sqrt(vol*real(weight));
zmSub(weight);
}
template<class Gimpl>
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
GaugeLinkField weight(grid);
StochasticWeight(weight);
StochasticField(out, rng, weight);
}
template<class Gimpl>
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
const GaugeLinkField &weight)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
const unsigned int nd = grid->_ndimension;
GaugeLinkField r(grid);
GaugeField aTilde(grid);
FFT fft(grid);
for(int mu = 0; mu < nd; mu++)
{
gaussian(rng, r);
r = weight*r;
pokeLorentz(aTilde, r, mu);
}
fft.FFT_all_dim(out, aTilde, FFT::backward);
out = real(out);
}
// template<class Gimpl>
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
// const GaugeField &in)
// {
//
// FeynmanGaugeMomentumSpacePropagator_TL(out,in);
//
// GridBase *grid = out._grid;
// LatticeInteger coor(grid);
// GaugeField zz(grid); zz=zero;
//
// // xyzt
// for(int d = 0; d < grid->_ndimension-1;d++){
// LatticeCoordinate(coor,d);
// out = where(coor==Integer(0),zz,out);
// }
// }
//
// template<class Gimpl>
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
// const GaugeField &in)
// {
//
// // what type LatticeComplex
// GridBase *grid = out._grid;
// int nd = grid->_ndimension;
//
// typedef typename GaugeField::vector_type vector_type;
// typedef typename GaugeField::scalar_type ScalComplex;
// typedef Lattice<iSinglet<vector_type> > LatComplex;
//
// std::vector<int> latt_size = grid->_fdimensions;
//
// LatComplex denom(grid); denom= zero;
// LatComplex one(grid); one = ScalComplex(1.0,0.0);
// LatComplex kmu(grid);
//
// ScalComplex ci(0.0,1.0);
// // momphase = n * 2pi / L
// for(int mu=0;mu<Nd;mu++) {
//
// LatticeCoordinate(kmu,mu);
//
// RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
//
// kmu = TwoPiL * kmu ;
//
// denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
// }
// std::vector<int> zero_mode(nd,0);
// TComplexD Tone = ComplexD(1.0,0.0);
// TComplexD Tzero= ComplexD(0.0,0.0);
//
// pokeSite(Tone,denom,zero_mode);
//
// denom= one/denom;
//
// pokeSite(Tzero,denom,zero_mode);
//
// out = zero;
// out = in*denom;
// };
}}
#endif

View File

@ -31,6 +31,7 @@ directory
#include <Grid/qcd/action/scalar/ScalarImpl.h>
#include <Grid/qcd/action/scalar/ScalarAction.h>
#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
namespace Grid {
namespace QCD {
@ -39,6 +40,10 @@ namespace QCD {
typedef ScalarAction<ScalarImplF> ScalarActionF;
typedef ScalarAction<ScalarImplD> ScalarActionD;
template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
}
}

View File

@ -6,10 +6,10 @@
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -35,50 +35,49 @@ directory
namespace Grid {
// FIXME drop the QCD namespace everywhere here
template <class Impl>
class ScalarAction : public QCD::Action<typename Impl::Field> {
public:
template <class Impl>
class ScalarAction : public QCD::Action<typename Impl::Field> {
public:
INHERIT_FIELD_TYPES(Impl);
private:
private:
RealD mass_square;
RealD lambda;
public:
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
virtual std::string LogParameters(){
public:
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
return sstream.str();
}
virtual std::string action_name(){return "ScalarAction";}
virtual void refresh(const Field &U,
GridParallelRNG &pRNG){}; // noop as no pseudoferms
virtual std::string action_name() {return "ScalarAction";}
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms
virtual RealD S(const Field &p) {
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
ScalarObs<Impl>::sumphider(p);
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
ScalarObs<Impl>::sumphider(p);
};
virtual void deriv(const Field &p,
Field &force) {
Field &force) {
Field tmp(p._grid);
Field p2(p._grid);
ScalarObs<Impl>::phisquared(p2, p);
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
};
};
} // Grid
force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
}
};
} // namespace Grid
#endif // SCALAR_ACTION_H

View File

@ -5,96 +5,158 @@
namespace Grid {
//namespace QCD {
template <class S>
class ScalarImplTypes {
public:
template <class S>
class ScalarImplTypes {
public:
typedef S Simd;
template <typename vtype>
using iImplField = iScalar<iScalar<iScalar<vtype> > >;
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef SiteField SiteComplex;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
gaussian(pRNG, P);
}
static inline Field projectForce(Field& P){return P;}
static inline void update_field(Field& P, Field& U, double ep){
static inline void update_field(Field& P, Field& U, double ep) {
U += P*ep;
}
static inline RealD FieldSquareNorm(Field& U){
static inline RealD FieldSquareNorm(Field& U) {
return (- sum(trace(U*U))/2.0);
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = 1.0;
}
static void MomentumSpacePropagator(Field &out, RealD m)
{
GridBase *grid = out._grid;
Field kmu(grid), one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
one = Complex(1.0,0.0);
out = m*m;
for(int mu = 0; mu < nd; mu++)
{
Real twoPiL = M_PI*2./l[mu];
LatticeCoordinate(kmu,mu);
kmu = 2.*sin(.5*twoPiL*kmu);
out = out + kmu*kmu;
}
out = one/out;
}
static void FreePropagator(const Field &in, Field &out,
const Field &momKernel)
{
FFT fft((GridCartesian *)in._grid);
Field inFT(in._grid);
fft.FFT_all_dim(inFT, in, FFT::forward);
inFT = inFT*momKernel;
fft.FFT_all_dim(out, inFT, FFT::backward);
}
static void FreePropagator(const Field &in, Field &out, RealD m)
{
Field momKernel(in._grid);
MomentumSpacePropagator(momKernel, m);
FreePropagator(in, out, momKernel);
}
};
template <class S, unsigned int N>
class ScalarMatrixImplTypes {
class ScalarAdjMatrixImplTypes {
public:
typedef S Simd;
typedef QCD::SU<N> Group;
template <typename vtype>
using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
template <typename vtype>
using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef iImplComplex<Simd> SiteComplex;
typedef iImplField<Simd> SiteField;
typedef Lattice<SiteField> Field;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
gaussian(pRNG, P);
typedef Lattice<SiteField> Field;
typedef Lattice<SiteComplex> ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
}
static inline Field projectForce(Field& P){return P;}
static inline void update_field(Field& P, Field& U, double ep){
static inline Field projectForce(Field& P) {return P;}
static inline void update_field(Field& P, Field& U, double ep) {
U += P*ep;
}
static inline RealD FieldSquareNorm(Field& U){
return (TensorRemove(- sum(trace(U*U))*0.5).real());
static inline RealD FieldSquareNorm(Field& U) {
return (TensorRemove(sum(trace(U*U))).real());
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = 1.0;
U = zero;
}
};
typedef ScalarImplTypes<vReal> ScalarImplR;
typedef ScalarImplTypes<vRealF> ScalarImplF;
typedef ScalarImplTypes<vRealD> ScalarImplD;
typedef ScalarImplTypes<vComplex> ScalarImplCR;
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
// Hardcoding here the size of the matrices
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
//}
}
//}
}
#endif

View File

@ -6,10 +6,7 @@
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido,cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -30,55 +27,122 @@ directory
*************************************************************************************/
/* END LEGAL */
#ifndef SCALAR_ACTION_H
#define SCALAR_ACTION_H
#ifndef SCALAR_INT_ACTION_H
#define SCALAR_INT_ACTION_H
// Note: this action can completely absorb the ScalarAction for real float fields
// use the scalarObjs to generalise the structure
namespace Grid {
// FIXME drop the QCD namespace everywhere here
template <class Impl>
template <class Impl, int Ndim >
class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
public:
INHERIT_FIELD_TYPES(Impl);
private:
RealD mass_square;
RealD lambda;
public:
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
virtual std::string LogParameters(){
typedef typename Field::vector_object vobj;
typedef CartesianStencil<vobj,vobj> Stencil;
SimpleCompressor<vobj> compressor;
int npoint = 2*Ndim;
std::vector<int> directions;// = {0,1,2,3,0,1,2,3}; // forcing 4 dimensions
std::vector<int> displacements;// = {1,1,1,1, -1,-1,-1,-1};
public:
ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
for (int mu = 0 ; mu < Ndim; mu++){
directions[mu] = mu; directions[mu+Ndim] = mu;
displacements[mu] = 1; displacements[mu+Ndim] = -1;
}
}
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
return sstream.str();
}
virtual std::string action_name(){return "ScalarAction";}
virtual void refresh(const Field &U,
GridParallelRNG &pRNG){}; // noop as no pseudoferms
virtual std::string action_name() {return "ScalarAction";}
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
virtual RealD S(const Field &p) {
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
ScalarObs<Impl>::sumphider(p);
assert(p._grid->Nd() == Ndim);
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p*p;
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
for (int mu = 0; mu < Ndim; mu++) {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
int permute_type;
StencilEntry *SE;
vobj temp2;
const vobj *temp, *t_p;
SE = phiStencil.GetEntry(permute_type, mu, i);
t_p = &p._odata[i];
if ( SE->_is_local ) {
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
} else {
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
}
} else {
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
}
}
// action -= pshift*p + p*pshift;
}
// NB the trace in the algebra is normalised to 1/2
// minus sign coming from the antihermitian fields
return -(TensorRemove(sum(trace(action)))).real();
};
virtual void deriv(const Field &p,
Field &force) {
Field tmp(p._grid);
Field p2(p._grid);
ScalarObs<Impl>::phisquared(p2, p);
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
virtual void deriv(const Field &p, Field &force) {
assert(p._grid->Nd() == Ndim);
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
};
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
for (int point = 0; point < npoint; point++) {
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
const vobj *temp;
vobj temp2;
int permute_type;
StencilEntry *SE;
SE = phiStencil.GetEntry(permute_type, point, i);
if ( SE->_is_local ) {
temp = &p._odata[SE->_offset];
if ( SE->_permute ) {
permute(temp2, *temp, permute_type);
force._odata[i] -= temp2;
} else {
force._odata[i] -= *temp;
}
} else {
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
}
}
}
}
};
} // Grid
} // namespace Grid
#endif // SCALAR_ACTION_H
#endif // SCALAR_INT_ACTION_H

View File

@ -207,6 +207,12 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
ScalarGenericHMCRunner;
typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
ScalarAdjGenericHMCRunner;
template <int Colours>
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
} // namespace QCD
} // namespace Grid

View File

@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
std::cout << "Reading HMC\n";
std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}

View File

@ -253,6 +253,7 @@ class HMCResourceManager {
template<class T, class... Types>
void AddObservable(Types&&... Args){
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
ObservablesList.back()->print_parameters();
}
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@ -297,4 +298,4 @@ private:
}
}
#endif // HMC_RESOURCE_MANAGER_H
#endif // HMC_RESOURCE_MANAGER_H

View File

@ -102,7 +102,7 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
FieldMetaData header;
IldgReader _IldgReader;
_IldgReader.open(config);
_IldgReader.readConfiguration(config,U,header); // format from the header
_IldgReader.readConfiguration(U,header); // format from the header
_IldgReader.close();
std::cout << GridLogMessage << "Read ILDG Configuration from " << config

View File

@ -62,7 +62,10 @@ class Representations {
typedef Representations<FundamentalRepresentation> NoHirep;
typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields;
//typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields;
typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields;
template < int Colours>
using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >;
// Helper classes to access the elements
// Strips the first N parameters from the tuple

View File

@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
if (maxTau - taus < epsilon){
epsilon = maxTau-taus;
}
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U._grid);
GaugeField Zprime(U._grid);
GaugeField tmp(U._grid), Uprime(U._grid);
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
// adjust integration step
taus += epsilon;
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
out = in;
for (unsigned int step = 1; step <= Nstep; step++) {
auto start = std::chrono::high_resolution_clock::now();
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
evolve_step(out);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
unsigned int step = 0;
do{
step++;
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "

188
lib/qcd/utils/GaugeFix.h Normal file
View File

@ -0,0 +1,188 @@
/*************************************************************************************
grid` physics library, www.github.com/paboyle/Grid
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
template <class Gimpl>
class FourierAcceleratedGaugeFixer : public Gimpl {
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
static void GaugeLinkToLieAlgebraField(const std::vector<GaugeMat> &U,std::vector<GaugeMat> &A) {
for(int mu=0;mu<Nd;mu++){
Complex cmi(0.0,-1.0);
A[mu] = Ta(U[mu]) * cmi;
}
}
static void DmuAmu(const std::vector<GaugeMat> &A,GaugeMat &dmuAmu) {
dmuAmu=zero;
for(int mu=0;mu<Nd;mu++){
dmuAmu = dmuAmu + A[mu] - Cshift(A[mu],mu,-1);
}
}
static void SteepestDescentGaugeFix(GaugeLorentz &Umu,Real & alpha,int maxiter,Real Omega_tol, Real Phi_tol,bool Fourier=false) {
GridBase *grid = Umu._grid;
Real org_plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu);
Real org_link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);
Real old_trace = org_link_trace;
Real trG;
std::vector<GaugeMat> U(Nd,grid);
GaugeMat dmuAmu(grid);
for(int i=0;i<maxiter;i++){
for(int mu=0;mu<Nd;mu++) U[mu]= PeekIndex<LorentzIndex>(Umu,mu);
if ( Fourier==false ) {
trG = SteepestDescentStep(U,alpha,dmuAmu);
} else {
trG = FourierAccelSteepestDescentStep(U,alpha,dmuAmu);
}
for(int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(Umu,U[mu],mu);
// Monitor progress and convergence test
// infrequently to minimise cost overhead
if ( i %20 == 0 ) {
Real plaq =WilsonLoops<Gimpl>::avgPlaquette(Umu);
Real link_trace=WilsonLoops<Gimpl>::linkTrace(Umu);
if (Fourier)
std::cout << GridLogMessage << "Fourier Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
else
std::cout << GridLogMessage << " Iteration "<<i<< " plaq= "<<plaq<< " dmuAmu " << norm2(dmuAmu)<< std::endl;
Real Phi = 1.0 - old_trace / link_trace ;
Real Omega= 1.0 - trG;
std::cout << GridLogMessage << " Iteration "<<i<< " Phi= "<<Phi<< " Omega= " << Omega<< " trG " << trG <<std::endl;
if ( (Omega < Omega_tol) && ( ::fabs(Phi) < Phi_tol) ) {
std::cout << GridLogMessage << "Converged ! "<<std::endl;
return;
}
old_trace = link_trace;
}
}
};
static Real SteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
GridBase *grid = U[0]._grid;
std::vector<GaugeMat> A(Nd,grid);
GaugeMat g(grid);
GaugeLinkToLieAlgebraField(U,A);
ExpiAlphaDmuAmu(A,g,alpha,dmuAmu);
Real vol = grid->gSites();
Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
SU<Nc>::GaugeTransform(U,g);
return trG;
}
static Real FourierAccelSteepestDescentStep(std::vector<GaugeMat> &U,Real & alpha, GaugeMat & dmuAmu) {
GridBase *grid = U[0]._grid;
Real vol = grid->gSites();
FFT theFFT((GridCartesian *)grid);
LatticeComplex Fp(grid);
LatticeComplex psq(grid); psq=zero;
LatticeComplex pmu(grid);
LatticeComplex one(grid); one = Complex(1.0,0.0);
GaugeMat g(grid);
GaugeMat dmuAmu_p(grid);
std::vector<GaugeMat> A(Nd,grid);
GaugeLinkToLieAlgebraField(U,A);
DmuAmu(A,dmuAmu);
theFFT.FFT_all_dim(dmuAmu_p,dmuAmu,FFT::forward);
//////////////////////////////////
// Work out Fp = psq_max/ psq...
//////////////////////////////////
std::vector<int> latt_size = grid->GlobalDimensions();
std::vector<int> coor(grid->_ndimension,0);
for(int mu=0;mu<Nd;mu++) {
Real TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(pmu,mu);
pmu = TwoPiL * pmu ;
psq = psq + 4.0*sin(pmu*0.5)*sin(pmu*0.5);
}
Complex psqMax(16.0);
Fp = psqMax*one/psq;
/*
static int once;
if ( once == 0 ) {
std::cout << " Fp " << Fp <<std::endl;
once ++;
}*/
pokeSite(TComplex(1.0),Fp,coor);
dmuAmu_p = dmuAmu_p * Fp;
theFFT.FFT_all_dim(dmuAmu,dmuAmu_p,FFT::backward);
GaugeMat ciadmam(grid);
Complex cialpha(0.0,-alpha);
ciadmam = dmuAmu*cialpha;
SU<Nc>::taExp(ciadmam,g);
Real trG = TensorRemove(sum(trace(g))).real()/vol/Nc;
SU<Nc>::GaugeTransform(U,g);
return trG;
}
static void ExpiAlphaDmuAmu(const std::vector<GaugeMat> &A,GaugeMat &g,Real & alpha, GaugeMat &dmuAmu) {
GridBase *grid = g._grid;
Complex cialpha(0.0,-alpha);
GaugeMat ciadmam(grid);
DmuAmu(A,dmuAmu);
ciadmam = dmuAmu*cialpha;
SU<Nc>::taExp(ciadmam,g);
}
};

View File

@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
for (int a = 0; a < AdjointDimension; a++) {
generator(a, Ta);
auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
pokeColour(h_out, tmp, a);
pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
}
}

View File

@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
Hdf5Type<unsigned int>::type());
}
void Hdf5Reader::push(const std::string &s)
bool Hdf5Reader::push(const std::string &s)
{
group_ = group_.openGroup(s);
path_.push_back(s);
return true;
}
void Hdf5Reader::pop(void)

View File

@ -54,7 +54,7 @@ namespace Grid
public:
Hdf5Reader(const std::string &fileName);
virtual ~Hdf5Reader(void) = default;
void push(const std::string &s);
bool push(const std::string &s);
void pop(void);
template <typename U>
void readDefault(const std::string &s, U &output);

View File

@ -701,9 +701,28 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
__m128i ret;
#if defined (AVX2)
// AVX2 horizontal adds within upper and lower halves of register; use
// SSE to add upper and lower halves for result.
__m256i v1, v2;
__m128i u1, u2;
v1 = _mm256_hadd_epi32(in, in);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2); // upper half
u2 = _mm256_extracti128_si256(v2, 1); // lower half
ret = _mm_add_epi32(u1, u2);
#else
// No AVX horizontal add; extract upper and lower halves of register & use
// SSE intrinsics.
__m128i u1, u2, u3;
u1 = _mm256_extractf128_si256(in, 0); // upper half
u2 = _mm256_extractf128_si256(in, 1); // lower half
u3 = _mm_add_epi32(u1, u2);
u1 = _mm_hadd_epi32(u3, u3);
ret = _mm_hadd_epi32(u1, u1);
#endif
return _mm_cvtsi128_si32(ret);
}
}

View File

@ -543,6 +543,24 @@ namespace Optimization {
u512d conv; conv.v = v1;
return conv.f[0];
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// No full vector reduce, use AVX to add upper and lower halves of register
// and perform AVX reduction.
__m256i v1, v2, v3;
__m128i u1, u2, ret;
v1 = _mm512_castsi512_si256(in); // upper half
v2 = _mm512_extracti32x8_epi32(in, 1); // lower half
v3 = _mm256_add_epi32(v1, v2);
v1 = _mm256_hadd_epi32(v3, v3);
v2 = _mm256_hadd_epi32(v1, v1);
u1 = _mm256_castsi256_si128(v2) // upper half
u2 = _mm256_extracti128_si256(v2, 1); // lower half
ret = _mm_add_epi32(u1, u2);
return _mm_cvtsi128_si32(ret);
}
#else
//Complex float Reduce
template<>
@ -570,9 +588,7 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
return _mm512_reduce_add_epi32(in);
}
#endif

View File

@ -401,9 +401,7 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
return _mm512_reduce_add_epi32(in);
}

View File

@ -1,13 +1,14 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_neon.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
Experimental - Using intrinsics - DEVELOPING!
/*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/
// Time-stamp: <2015-07-10 17:45:09 neo>
//----------------------------------------------------------------------
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h>
// ARMv8 supports double precision
namespace Grid {
namespace Optimization {
template<class vtype>
@ -46,16 +53,20 @@ namespace Optimization {
float32x4_t f;
vtype v;
};
union u128f {
float32x4_t v;
float f[4];
};
union u128d {
float64x2_t v;
double f[4];
double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
};
struct Vsplat{
//Complex float
inline float32x4_t operator()(float a, float b){
@ -64,31 +75,31 @@ namespace Optimization {
}
// Real float
inline float32x4_t operator()(float a){
return vld1q_dup_f32(&a);
return vdupq_n_f32(a);
}
//Complex double
inline float32x4_t operator()(double a, double b){
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
return vld1q_f32(tmp);
inline float64x2_t operator()(double a, double b){
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double
inline float32x4_t operator()(double a){
return vld1q_dup_f32(&a);
//Real double // N:tbc
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer
//Integer // N:tbc
inline uint32x4_t operator()(Integer a){
return vld1q_dup_u32(&a);
return vdupq_n_u32(a);
}
};
struct Vstore{
//Float
//Float
inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a);
}
//Double
inline void operator()(float32x4_t a, double* D){
vst1q_f32((float*)D, a);
inline void operator()(float64x2_t a, double* D){
vst1q_f64(D, a);
}
//Integer
inline void operator()(uint32x4_t a, Integer* I){
@ -97,54 +108,54 @@ namespace Optimization {
};
struct Vstream{
//Float
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic
inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
}
//Double
inline void operator()(double * a, float32x4_t b){
//Double // N:generic
inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
}
};
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float
// Complex float // N:ok
inline float32x4_t operator()(Grid::ComplexF *a){
float32x4_t foo;
return foo;
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double
inline float32x4_t operator()(Grid::ComplexD *a){
float32x4_t foo;
return foo;
// Complex double // N:ok
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float
// Real float // N:ok
inline float32x4_t operator()(float *a){
float32x4_t foo;
return foo;
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double
inline float32x4_t operator()(double *a){
float32x4_t foo;
return foo;
// Real double // N:ok
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer
// Integer // N:ok
inline uint32x4_t operator()(Integer *a){
uint32x4_t foo;
return foo;
return vld1q_dup_u32(a);
}
};
// N:leaving as is
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
@ -184,26 +195,98 @@ namespace Optimization {
}
};
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{
// Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t foo;
return foo;
float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
//float32x4_t r5;
//r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float32x4_t foo;
return foo;
float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
//float64x2_t r5;
//r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5);
}
};
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
return vaddq_f32(vmulq_f32(b,c),a);
//return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
return vaddq_f64(vmulq_f64(b,c),a);
//return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
@ -221,89 +304,275 @@ namespace Optimization {
struct Conj{
// Complex single
inline float32x4_t operator()(float32x4_t in){
return in;
// ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
}
// Complex double
//inline float32x4_t operator()(float32x4_t in){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in){
float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
return in;
// ar ai br bi -> ai -ar ai -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ar -ai br -bi
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return in;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
};
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
//need shuffle
return in;
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// there is no direct conversion from lower float32x4_t to float64x2_t
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
return 0;
float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
float32x2_t high = vget_high_f32(in);
float32x2_t low = vget_low_f32(in);
float32x2_t tmp = vadd_f32(low, high);
float32x2_t sum = vpadd_f32(tmp, tmp);
return vget_lane_f32(sum,0);
return vaddvq_f32(in);
}
//Complex double Reduce
template<>
template<> // N:by Boyle
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
return 0;
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
float64x2_t sum = vpaddq_f64(in, in);
return vgetq_lane_f64(sum,0);
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
// Here assign types
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
@ -312,13 +581,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){};
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
@ -326,16 +588,19 @@ namespace Grid {
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}
}

View File

@ -374,6 +374,84 @@ namespace Optimization {
// Complex float
FLOAT_WRAP_2(operator(), inline)
};
#define USE_FP16
struct PrecisionChange {
static inline vech StoH (const vector4float &a, const vector4float &b) {
vech ret;
std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoS (vech h, vector4float &sa, vector4float &sb) {
std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vector4float DtoS (vector4double a, vector4double b) {
vector4float ret;
std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void StoD (vector4float s, vector4double &a, vector4double &b) {
std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl;
assert(0);
}
static inline vech DtoH (vector4double a, vector4double b,
vector4double c, vector4double d) {
vech ret;
std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl;
assert(0);
return ret;
}
static inline void HtoD (vech h, vector4double &a, vector4double &b,
vector4double &c, vector4double &d) {
std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl;
assert(0);
}
};
//////////////////////////////////////////////
// Exchange support
#define FLOAT_WRAP_EXCHANGE(fn) \
static inline void fn(vector4float &out1, vector4float &out2, \
vector4float in1, vector4float in2) \
{ \
vector4double out1d, out2d, in1d, in2d; \
in1d = Vset()(in1); \
in2d = Vset()(in2); \
fn(out1d, out2d, in1d, in2d); \
Vstore()(out1d, out1); \
Vstore()(out2d, out2); \
}
struct Exchange{
// double precision
static inline void Exchange0(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0145));
out2 = vec_perm(in1, in2, vec_gpci(02367));
}
static inline void Exchange1(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
out1 = vec_perm(in1, in2, vec_gpci(0426));
out2 = vec_perm(in1, in2, vec_gpci(01537));
}
static inline void Exchange2(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
static inline void Exchange3(vector4double &out1, vector4double &out2,
vector4double in1, vector4double in2) {
assert(0);
}
// single precision
FLOAT_WRAP_EXCHANGE(Exchange0);
FLOAT_WRAP_EXCHANGE(Exchange1);
FLOAT_WRAP_EXCHANGE(Exchange2);
FLOAT_WRAP_EXCHANGE(Exchange3);
};
struct Permute{
//Complex double
@ -497,15 +575,19 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, int>::operator()(int in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
inline Integer Reduce<Integer, veci>::operator()(veci in){
Integer a = 0;
for (unsigned int i = 0; i < W<Integer>::r; ++i)
{
a += in.v[i];
}
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
// Here assign types
typedef Optimization::vech SIMD_Htype; // Half precision type
typedef Optimization::vector4float SIMD_Ftype; // Single precision type
typedef vector4double SIMD_Dtype; // Double precision type
typedef Optimization::veci SIMD_Itype; // Integer type

View File

@ -570,9 +570,9 @@ namespace Optimization {
//Integer Reduce
template<>
inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){
// FIXME unimplemented
printf("Reduce : Missing integer implementation -> FIX\n");
assert(0);
__m128i v1 = _mm_hadd_epi32(in, in);
__m128i v2 = _mm_hadd_epi32(v1, v1);
return _mm_cvtsi128_si32(v2);
}
}

View File

@ -53,7 +53,7 @@ directory
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONv8
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX
@ -751,8 +751,8 @@ inline Grid_simd<std::complex<R>, V> toComplex(const Grid_simd<R, V> &in) {
conv.v = in.v;
for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
assert(conv.s[i + 1] ==
conv.s[i]); // trap any cases where real was not duplicated
assert(conv.s[i + 1] == conv.s[i]);
// trap any cases where real was not duplicated
// indicating the SIMD grids of real and imag assignment did not correctly
// match
conv.s[i + 1] = 0.0; // zero imaginary parts

View File

@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
int LebesgueOrder::UseLebesgueOrder;
#ifdef KNL
std::vector<int> LebesgueOrder::Block({8,2,2,2});
#else
std::vector<int> LebesgueOrder::Block({2,2,2,2});
#endif
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
n--; // 1000 0011 --> 1000 0010
n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011
@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
if ( Block[0]==0) ZGraph();
else if ( Block[1]==0) NoBlocking();
else CartesianBlocking();
}
if (0) {
std::cout << "Thread Interleaving"<<std::endl;
ThreadInterleave();
}
}
void LebesgueOrder::ThreadInterleave(void)
{
std::vector<IndexInteger> reorder = _LebesgueReorder;
std::vector<IndexInteger> throrder;
int vol = _LebesgueReorder.size();
int threads = GridThread::GetThreads();
int blockbits=3;
int blocklen = 8;
int msk = 0x7;
for(int t=0;t<threads;t++){
for(int ss=0;ss<vol;ss++){
if ( ( ss >> blockbits) % threads == t ) {
throrder.push_back(reorder[ss]);
}
}
}
_LebesgueReorder = throrder;
}
void LebesgueOrder::NoBlocking(void)
{
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;

View File

@ -70,6 +70,8 @@ namespace Grid {
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims);
void ThreadInterleave(void);
private:
std::vector<IndexInteger> _LebesgueReorder;

View File

@ -285,7 +285,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
{
int dimension = _directions[point];
int displacement = _distances[point];
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];

View File

@ -156,11 +156,18 @@ class iScalar {
// convert from a something to a scalar via constructor of something arg
template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
strong_inline iScalar<vtype> operator=(T arg) {
strong_inline iScalar<vtype> operator=(T arg) {
_internal = arg;
return *this;
}
// Convert elements
template <class ttype>
strong_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
_internal = arg._internal;
return *this;
}
friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
stream << "S {" << o._internal << "}";
return stream;

View File

@ -80,8 +80,11 @@ template<class vtype, int N> inline iVector<vtype, N> Exponentiate(const iVector
mat iQ2 = arg*arg*alpha*alpha;
mat iQ3 = arg*iQ2*alpha;
// sign in c0 from the conventions on the Ta
c0 = -imag( trace(iQ3) ) * one_over_three;
c1 = -real( trace(iQ2) ) * one_over_two;
scalar imQ3, reQ2;
imQ3 = imag( trace(iQ3) );
reQ2 = real( trace(iQ2) );
c0 = -imQ3 * one_over_three;
c1 = -reQ2 * one_over_two;
// Cayley Hamilton checks to machine precision, tested
tmp = c1 * one_over_three;