1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Compare commits

...

20 Commits

Author SHA1 Message Date
70068cff51 Just commenting 2016-12-12 03:49:01 +00:00
85c055fa30 Tests more reductions and stops if failure is found in an iteration 2016-12-12 03:33:21 +00:00
90fedbd2af Report only on failing nodes 2016-12-09 05:57:30 +00:00
ec0c53fa68 Improvement in the CG interface for Repro 2016-12-09 05:20:38 +00:00
6ceee102e8 Moving the check outside the inner product function 2016-12-09 05:08:56 +00:00
6e57bdb6b3 Removing the fake error introduced for testing 2016-12-09 04:19:36 +00:00
4c11e36d3d Fixing a compilation error on some gcc compiler versions 2016-12-09 02:46:40 +00:00
9977c53035 Minor change 2016-12-09 02:34:37 +00:00
3a74fec62f Output reports max number of digits 2016-12-09 02:29:12 +00:00
8fb0a13f39 Cleaning up output of CG repro 2016-12-09 02:17:31 +00:00
14a1406f54 More cleanup 2016-12-08 06:14:20 +00:00
538e64e5b4 Cleaning up the CG reproduciblity test. More info reported 2016-12-08 05:50:40 +00:00
b2dc17e160 Merge branch 'develop' into feature/CG_repro 2016-12-05 05:07:01 +00:00
afbbcd2194 Compilation fix for the non OMP version 2016-11-23 12:47:19 +00:00
d4e0b11bb1 Adding few comments 2016-11-23 11:46:51 +00:00
7144ee7ae8 Reproducibility checks for inner product 2016-11-23 11:42:04 +00:00
f1908c7bc9 Adding reproducibility tests 2016-11-21 09:52:07 +00:00
036ec31c48 Merge branch 'develop' into feature/CG_repro 2016-11-20 17:54:23 +00:00
53f240200e Merge branch 'develop' into feature/CG_repro 2016-11-09 14:44:46 +00:00
9720c9ba3f First implementation of the CG reproducibility test 2016-11-06 11:13:29 +00:00
20 changed files with 674 additions and 101 deletions

49
lib/Bitwise.cc Normal file
View File

@ -0,0 +1,49 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Bitwise.cc
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <iostream>
#include <Bitwise.h>
#include <bitset>
#include <climits>
namespace Grid {
void show_binaryrep(const unsigned char* a, size_t size) {
const unsigned char* beg = a;
const unsigned char* end = a + size;
unsigned int ctr = 0;
while (beg != end) {
std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
ctr++;
if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
}
std::cout << '\n';
}
} // namespace

76
lib/Bitwise.h Normal file
View File

@ -0,0 +1,76 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Bitwise.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BITWISE_H
#define GRID_BITWISE_H
#include <cassert>
#include <cfloat>
#include <bitset>
#include <climits>
#include <Config.h>
#ifdef GRID_DEFAULT_PRECISION_SINGLE
#define GRID_REAL_BYTES 4
#endif
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
#define GRID_REAL_BYTES 8
#endif
namespace Grid {
void show_binaryrep(const unsigned char* a, size_t size);
template <typename T>
void show_binaryrep(const T& a) {
const char* beg = reinterpret_cast<const char*>(&a);
const char* end = beg + sizeof(a);
unsigned int ctr = 0;
while (beg != end) {
std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
ctr++;
if (ctr % GRID_REAL_BYTES == 0) std::cout << '\n';
}
std::cout << '\n';
}
template <typename T>
void bitwise_xor(T& l, T& r, unsigned char* xors) {
assert(sizeof(l) == sizeof(r));
unsigned char* org = reinterpret_cast<unsigned char*>(&l);
unsigned char* cur = reinterpret_cast<unsigned char*>(&r);
int words = sizeof(l) / sizeof(*org);
unsigned char result = 0;
for (int w = 0; w < words; w++) xors[w] = (org[w] ^ cur[w]);
}
}; // namespace
#endif

View File

@ -62,6 +62,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/serialisation/Serialisation.h>
#include "Config.h"
#include <Grid/Timer.h>
#include <Grid/Bitwise.h>
#include <Grid/PerfCount.h>
#include <Grid/Log.h>
#include <Grid/AlignedAllocator.h>

View File

@ -100,7 +100,7 @@ void Grid_quiesce_nodes(void) {
me = shmem_my_pe();
#endif
if (me) {
std::cout.setstate(std::ios::badbit);
std::cout.setstate(std::ios::badbit);// mute all nodes except 0
}
}

View File

@ -38,19 +38,21 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_OMP
#include <omp.h>
#ifdef GRID_NUMA
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(runtime)")
#endif
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#define PARALLEL_REGION _Pragma("omp parallel")
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#define PARALLEL_REGION _Pragma("omp parallel")
#define PARALLEL_FOR_LOOP_STATIC _Pragma("omp parallel for schedule(static)")
#else
#define PARALLEL_FOR_LOOP
#define PARALLEL_FOR_LOOP_INTERN
#define PARALLEL_NESTED_LOOP2
#define PARALLEL_REGION
#define PARALLEL_FOR_LOOP_STATIC
#endif
namespace Grid {

View File

@ -9,6 +9,7 @@ Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -33,6 +34,21 @@ directory
namespace Grid {
struct CG_state {
bool do_repro;
std::vector<RealD> residuals;
CG_state() {reset();}
void reset(){
do_repro = false;
residuals.clear();
}
};
enum CGexec_mode{ Default, ReproducibilityTest };
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
@ -45,10 +61,30 @@ class ConjugateGradient : public OperatorFunction<Field> {
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
// Reproducibility controls
bool ReproTest;
CG_state CGState; //to check reproducibility by repeating the CG
ReproducibilityState<typename Field::vector_object> ReprTest; // for the inner proucts
// Constructor
ConjugateGradient(RealD tol, Integer maxit, CGexec_mode Mode = Default)
: Tolerance(tol),MaxIterations(maxit){
switch(Mode)
{
case Default :
ErrorOnNoConverge = true;
ReproTest = false;
case ReproducibilityTest :
ErrorOnNoConverge = false;
ReproTest = true;
}
};
void set_reproducibility_interval(unsigned int interval){
ReprTest.interval = interval;
}
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
@ -60,34 +96,37 @@ class ConjugateGradient : public OperatorFunction<Field> {
Field p(src);
Field mmp(src);
Field r(src);
Field psi_start(psi);// save for the repro test
if (CGState.do_repro && ReproTest)
std::cout << GridLogMessage << "Starting reproducibility test, full check every "
<< ReprTest.interval << " calls" << std::endl;
if(!ReprTest.do_check)
ReprTest.reset();
ReprTest.enable_reprocheck=ReproTest;
// Initial residual computation & set up
RealD guess = norm2(psi);
RealD guess = norm2(psi, ReprTest);
assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b);// eventually split this for the norm check
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
a = norm2(p, ReprTest);
cp = a;
ssq = norm2(src);
ssq = norm2(src, ReprTest);
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
@ -107,10 +146,10 @@ class ConjugateGradient : public OperatorFunction<Field> {
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
c = cp;// old residual
MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
Linop.HermOpAndNorm(p, mmp, d, qq);// mmp = Ap, d=pAp
MatrixTimer.Stop();
LinalgTimer.Start();
@ -118,14 +157,31 @@ class ConjugateGradient : public OperatorFunction<Field> {
// ComplexD dck = innerProduct(p,mmp);
a = c / d;
b_pred = a * (a * qq - d) / c;
b_pred = a * (a * qq - d) / c;// a check
cp = axpy_norm(r, -a, mmp, r);
axpy(r, -a, mmp, r);// new residual r = r_old - a * Ap
cp = norm2(r, ReprTest); // bookkeeping this norm
if (ReproTest && !CGState.do_repro) {
CGState.residuals.push_back(cp); // save residuals state
std::cout << GridLogIterative << "ReproTest: Saving state" << std::endl;
}
if (ReproTest && CGState.do_repro){
// check that the residual agrees with the previous run
std::cout << GridLogIterative << "ReproTest: Checking state k=" << k << std::endl;
if (cp != CGState.residuals[k-1]){
std::cout << GridLogMessage << "Failing reproducibility test";
std::cout << GridLogMessage << " at k=" << k << std::endl;
std::cout << GridLogMessage << "saved residual = " << CGState.residuals[k-1]
<< " cp = " << cp << std::endl;
exit(1); // exit after the first failure
}
}
b = cp / c;
// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;
psi = a * p + psi; // update solution
p = p * b + r; // update search direction
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
@ -156,6 +212,22 @@ class ConjugateGradient : public OperatorFunction<Field> {
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
if (! (CGState.do_repro && ReproTest)){
CGState.do_repro = true;
ReprTest.do_check = true;
ReprTest.reset_counter();
this->operator()(Linop, src, psi_start);// run the repro test
if (ReprTest.success)
std::cout << GridLogMessage << "Reproducibility test passed" << std::endl;
else{
std::cout << GridLogMessage << "Reproducibility test failed" << std::endl;
exit(1);
}
}
// Clear state
CGState.reset();
ReprTest.reset();
return;
}
}

View File

@ -59,7 +59,7 @@ public:
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
int Np; // Np -- Number of spare vecs in kryloc space
int Np; // Np -- Number of spare vecs in krylov space
int Nm; // Nm -- total number of vectors
RealD eresid;

View File

@ -65,6 +65,7 @@ const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
@ -89,11 +90,11 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes)
{
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
}

View File

@ -68,6 +68,8 @@ class CartesianCommunicator {
static MPI_Comm communicator_world;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
static char name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
static int length;
#else
typedef int CommsRequest_t;
#endif
@ -149,6 +151,7 @@ class CartesianCommunicator {
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
void PrintRankInfo(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////

View File

@ -35,6 +35,8 @@ namespace Grid {
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
char CartesianCommunicator::name[MPI_MAX_PROCESSOR_NAME]; // processing node physical name
int CartesianCommunicator::length;
// Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) {
@ -44,6 +46,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
MPI_Init(argc,argv);
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Get_processor_name(name, &length);
ShmInitGeneric();
}
@ -206,5 +210,10 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0);
}
void CartesianCommunicator::PrintRankInfo(){
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
}
}// end of namespace

View File

@ -576,5 +576,10 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
assert(ierr==0);
}
void CartesianCommunicator::PrintRankInfo(){
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
}
}

View File

@ -869,6 +869,9 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
return NULL;
}
void CartesianCommunicator::PrintRankInfo(){
std::cout << "Grid: Rank "<< _processor << " - Physical node name: " << name << std::endl;
}
};

View File

@ -37,6 +37,11 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
ShmInitGeneric();
}
void CartesianCommunicator::PrintRankInfo(){
std::cout << GridLogMessage << "No Rank Info available" << std::endl;
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
_processors = processors;
@ -60,10 +65,10 @@ void CartesianCommunicator::GlobalSum(uint64_t &){}
void CartesianCommunicator::GlobalSumVector(double *,int N){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
@ -71,19 +76,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
// Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}

View File

@ -333,5 +333,9 @@ void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
}
}
void CartesianCommunicator::PrintRankInfo(){
std::cout << GridLogMessage << "SHMEM: Rank Info not implemented yet" << std::endl;
}
}

View File

@ -31,6 +31,87 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define GRID_LATTICE_REDUCTION_H
namespace Grid {
template <class T>
class ReproducibilityState {
public:
typedef typename T::vector_type vector_type;
typedef std::vector<vector_type, alignedAllocator<vector_type> > sum_type;
unsigned int n_call;
bool do_check;
bool enable_reprocheck;
bool success;
unsigned int interval;
std::vector<sum_type> th_states;
void reset_counter() { n_call = 0; }
void reset() {
th_states.clear();
do_check = false;
enable_reprocheck = false;
success = true;
n_call = 0;
};
ReproducibilityState():interval(1) {
reset();
}
void check(GridBase* grid, sum_type &sumarray){
/////////////////////// Reproducibility section, not threaded on purpouse
if (enable_reprocheck) {
if (do_check && (n_call % interval) == 0) {
for (int thread = 0; thread < sumarray.size(); thread++) {
int words = sizeof(sumarray[thread])/sizeof(unsigned char);
unsigned char xors[words];
bitwise_xor(sumarray[thread], th_states[n_call][thread],xors);
// OR all words
unsigned char res = 0;
for (int w = 0; w < words; w++) res = res | xors[w];
Grid_unquiesce_nodes();
int rank = 0;
while (rank < grid->_Nprocessors){
if (rank == grid->ThisRank() ){
if ( res ) {
std::cout << "Reproducibility failure report" << std::endl;
grid->PrintRankInfo();
std::cout << "Call: "<< n_call << " Thread: " << thread << std::endl;
std::cout << "Size of states: " << th_states.size() << std::endl;
std::cout << std::setprecision(GRID_REAL_DIGITS+1) << std::scientific;
std::cout << "Saved partial sum : " << th_states[n_call][thread] << std::endl;
std::cout << "Current partial sum: " << sumarray[thread] << std::endl;
std::cout << "Saved state " << std::endl; show_binaryrep(th_states[n_call][thread]);
std::cout << "Current state" << std::endl; show_binaryrep(sumarray[thread]);
std::cout << "XOR result" << std::endl; show_binaryrep(xors, words);
//std::cout << std::defaultfloat; //not supported by some compilers
std::cout << std::setprecision(6);
success = false;
}
}
rank++;
grid->Barrier();
}
Grid_quiesce_nodes();
}
} else if (!do_check)
{
std::cout << GridLogDebug << "Saving thread state for inner product. Call n. "
<< n_call << std::endl;
th_states.resize(n_call+1);
th_states[n_call].resize(grid->SumArraySize());
th_states[n_call] = sumarray; // save threads state
//n_call++;
}
n_call++;
}
}
};
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
@ -39,12 +120,25 @@ namespace Grid {
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return std::real(nrm);
}
ReproducibilityState<vobj> repr;
ComplexD nrm = innerProduct(arg, arg, repr);
return std::real(nrm);
}
template <class vobj>
inline RealD norm2(const Lattice<vobj> &arg, ReproducibilityState<vobj>& rep) {
ComplexD nrm = innerProduct(arg, arg, rep);
return std::real(nrm);
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right){
ReproducibilityState<vobj> repr;
return innerProduct(left, right, repr);
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right, ReproducibilityState<vobj>& repr)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
@ -54,24 +148,28 @@ namespace Grid {
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP_STATIC //request statically scheduled threads for reproducibility
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProduct(left._odata[0],right._odata[0])) vnrm = zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
}
/////////// Reproducibility
repr.check(grid, sumarray);
///////////////////////////
vector_type vvnrm; vvnrm=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
vvnrm = vvnrm+sumarray[i];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
@ -79,26 +177,26 @@ PARALLEL_FOR_LOOP
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
@ -111,24 +209,24 @@ PARALLEL_FOR_LOOP
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
sumarray[i]=zero;
}
PARALLEL_FOR_LOOP
PARALLEL_FOR_LOOP
for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
@ -138,7 +236,7 @@ PARALLEL_FOR_LOOP
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
arg._grid->GlobalSum(ssum);
return ssum;
}
@ -146,23 +244,23 @@ PARALLEL_FOR_LOOP
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
// FIXME
// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars

View File

@ -66,6 +66,9 @@ namespace Optimization {
double f[4];
};
struct Vsplat{
//Complex float
inline __m256 operator()(float a, float b){

View File

@ -60,6 +60,13 @@ directory
#include "Grid_neon.h"
#endif
#ifdef GRID_DEFAULT_PRECISION_SINGLE
#define GRID_REAL_DIGITS FLT_DIG
#endif
#ifdef GRID_DEFAULT_PRECISION_DOUBLE
#define GRID_REAL_DIGITS DBL_DIG
#endif
namespace Grid {
//////////////////////////////////////
@ -146,6 +153,23 @@ class Grid_simd {
Grid_simd(const Grid_simd &rhs) : v(rhs.v){}; // compiles in movaps
Grid_simd(const Grid_simd &&rhs) : v(rhs.v){};
/////////////////////////////
// Comparisons
/////////////////////////////
inline bool operator==(const Grid_simd& lhs){
Grid_simd::conv_t conv1;
Grid_simd::conv_t conv2;
conv1.v = v;
conv2.v = lhs.v;
return std::equal(std::begin(conv1.s), std::end(conv1.s), std::begin(conv2.s));
}
inline bool operator!=(const Grid_simd& lhs){
return !(*this == lhs);
}
/////////////////////////////
// Constructors
/////////////////////////////

View File

@ -0,0 +1,215 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_cayley_cg_reproducibility.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
#define REPRODUCIBILITY_INTERVAL 1
template<class d>
struct scal {
d internal;
};
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
template<class What>
void TestCGinversions(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5);
template<class What>
void TestCGschur(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5);
template<class What>
void TestCGunprec(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5);
template<class What>
void TestCGprec(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout << GridLogMessage << "###########################################" << std::endl;
std::cout << GridLogMessage << "This test disables the convergence alert" << std::endl;
std::cout << GridLogMessage << "and checks whether in consecutive runs of the CG" << std::endl;
std::cout << GridLogMessage << "the internal reductions are bit reproducible." << std::endl;
std::cout << GridLogMessage << "A warning and a report is produced for every failure" << std::endl;
std::cout << GridLogMessage << "###########################################" << std::endl;
const int Ls=8;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
SU3::HotConfiguration(RNG4,Umu);
std::vector<LatticeColourMatrix> U(4,UGrid);
RealD mass=0.1;
RealD M5 =1.8;
std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
TestCGinversions<DomainWallFermionR>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
RealD b=1.5;// Scale factor b+c=2, b-c=1
RealD c=0.5;
std::cout<<GridLogMessage <<"MobiusFermion test"<<std::endl;
MobiusFermionR Dmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
TestCGinversions<MobiusFermionR>(Dmob,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
std::cout<<GridLogMessage <<"MobiusZolotarevFermion test"<<std::endl;
MobiusZolotarevFermionR Dzolo(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,0.1,2.0);
TestCGinversions<MobiusZolotarevFermionR>(Dzolo,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
std::cout<<GridLogMessage <<"ScaledShamirFermion test"<<std::endl;
ScaledShamirFermionR Dsham(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,2.0);
TestCGinversions<ScaledShamirFermionR>(Dsham,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
std::cout<<GridLogMessage <<"ShamirZolotarevFermion test"<<std::endl;
ShamirZolotarevFermionR Dshamz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
TestCGinversions<ShamirZolotarevFermionR>(Dshamz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
std::cout<<GridLogMessage <<"OverlapWilsonCayleyTanhFermion test"<<std::endl;
OverlapWilsonCayleyTanhFermionR Dov(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,1.0);
TestCGinversions<OverlapWilsonCayleyTanhFermionR>(Dov,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
std::cout<<GridLogMessage <<"OverlapWilsonCayleyZolotarevFermion test"<<std::endl;
OverlapWilsonCayleyZolotarevFermionR Dovz(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,0.1,2.0);
TestCGinversions<OverlapWilsonCayleyZolotarevFermionR>(Dovz,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5);
Grid_finalize();
}
template<class What>
void TestCGinversions(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5)
{
std::cout<<GridLogMessage << "Testing unpreconditioned inverter"<<std::endl;
TestCGunprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
std::cout<<GridLogMessage << "Testing red black preconditioned inverter"<<std::endl;
TestCGprec<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
std::cout<<GridLogMessage << "Testing red black Schur inverter"<<std::endl;
TestCGschur<What>(Ddwf,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,RNG4,RNG5);
}
template<class What>
void TestCGunprec(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5)
{
LatticeFermion src (FGrid); random(*RNG5,src);
LatticeFermion result(FGrid); result=zero;
MdagMLinearOperator<What,LatticeFermion> HermOp(Ddwf);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
CG(HermOp,src,result);
}
template<class What>
void TestCGprec(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5)
{
LatticeFermion src (FGrid); random(*RNG5,src);
LatticeFermion src_o(FrbGrid);
LatticeFermion result_o(FrbGrid);
pickCheckerboard(Odd,src_o,src);
result_o=zero;
SchurDiagMooeeOperator<What,LatticeFermion> HermOpEO(Ddwf);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
CG(HermOpEO,src_o,result_o);
}
template<class What>
void TestCGschur(What & Ddwf,
GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid,
GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid,
RealD mass, RealD M5,
GridParallelRNG *RNG4,
GridParallelRNG *RNG5)
{
LatticeFermion src (FGrid); random(*RNG5,src);
LatticeFermion result(FGrid); result=zero;
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000, ReproducibilityTest);
CG.set_reproducibility_interval(REPRODUCIBILITY_INTERVAL);
SchurRedBlackDiagMooeeSolve<LatticeFermion> SchurSolver(CG);
SchurSolver(Ddwf,src,result);
}

View File

@ -63,6 +63,9 @@ class HmcRunner : public NerscHmcRunner {
Real mass = -0.77;
FermionAction FermOp(U, *FGrid, *FrbGrid, mass);
// To enable the CG reproducibility tests use
//ConjugateGradient<FermionField> CG(1.0e-8, 10000, ReproducibilityTest);
// This is the plain version
ConjugateGradient<FermionField> CG(1.0e-8, 10000);
TwoFlavourPseudoFermionAction<ImplPolicy> Nf2(FermOp, CG, CG);

View File

@ -71,7 +71,7 @@ int main (int argc, char ** argv)
WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
MdagMLinearOperator<WilsonFermionR,LatticeFermion> HermOp(Dw);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
ConjugateGradient<LatticeFermion> CG(1.0e-8,10000,true, true);
CG(HermOp,src,result);
Grid_finalize();