Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-09 16:19:32 +00:00)

Compare commits: 27 commits, feature/S2...73af020f98

Commit SHA1s:
73af020f98, bffb83c46e, 7031f37350, 829dd74cb2, 66e671985d, 5afcbcf0f3,
9730579312, bfae14d035, b78fc73d19, 709f8ae76c, 7aa06329d0, 9d6a38c44c,
6ec5cee368, f2e9a68825, d88750e6b6, 821358eda7, fce6e1f135, 8f0bb3e676,
262c70d967, da43ef7c2d, 7b60ab5df1, f6b961a64e, f1ed988aa3, eea51bb604,
9203126aa5, f90ba4712a, 3737a24096
@@ -51,11 +51,13 @@ directory
 #pragma nv_diag_suppress cast_to_qualified_type
 //disables nvcc specific warning in many files
 #pragma nv_diag_suppress esa_on_defaulted_function_ignored
+#pragma nv_diag_suppress declared_but_not_referenced
 #pragma nv_diag_suppress extra_semicolon
 #else
 //disables nvcc specific warning in json.hpp
 #pragma diag_suppress unsigned_compare_with_zero
 #pragma diag_suppress cast_to_qualified_type
+#pragma diag_suppress declared_but_not_referenced
 //disables nvcc specific warning in many files
 #pragma diag_suppress esa_on_defaulted_function_ignored
 #pragma diag_suppress extra_semicolon
@@ -37,7 +37,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <Grid/qcd/QCD.h>
 #include <Grid/qcd/spin/Spin.h>
 #include <Grid/qcd/gparity/Gparity.h>
-#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
 #include <Grid/qcd/utils/Utils.h>
 #include <Grid/qcd/representations/Representations.h>
 NAMESPACE_CHECK(GridQCDCore);
@@ -31,6 +31,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cartesian/Cartesian_base.h>
 #include <Grid/cartesian/Cartesian_full.h>
 #include <Grid/cartesian/Cartesian_red_black.h>
-#include <Grid/cartesian/CartesianCrossIcosahedron.h>
 
 #endif
@@ -1,241 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
-
-    Copyright (C) 2025
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-/////////////////////////////////////////////////////////////////////////////////////////
-// Grid Support.
-/////////////////////////////////////////////////////////////////////////////////////////
-
-enum IcosahedralMeshType {
-  IcosahedralVertices,
-  IcosahedralEdges
-} ;
-enum NorthSouth {
-  North = 1,
-  South = 0
-};
-enum IcoshedralDirections {
-  IcosahedronPatchX = 0,
-  IcosahedronPatchY = 1,
-  IcosahedronPatchDiagonal=2,
-  NumIcosahedralPolarizations
-};
-
-const int IcosahedralPatches = 10;
-const int HemiPatches=IcosahedralPatches/2;
-const int NorthernHemisphere = HemiPatches;
-const int SouthernHemisphere = 0;
-
-class GridCartesianCrossIcosahedron: public GridCartesian {
-
-public:
-
-  IcosahedralMeshType meshType;
-
-  IcosahedralMeshType MeshType(void) { return meshType; };
-
-  /////////////////////////////////////////////////////////////////////////
-  // Constructor takes a parent grid and possibly subdivides communicator.
-  /////////////////////////////////////////////////////////////////////////
-  /*
-  GridCartesian(const Coordinate &dimensions,
-                const Coordinate &simd_layout,
-                const Coordinate &processor_grid,
-                const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
-  {
-    assert(0); // No subdivision
-  }
-  GridCartesian(const Coordinate &dimensions,
-                const Coordinate &simd_layout,
-                const Coordinate &processor_grid,
-                const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
-  {
-    assert(0); // No subdivision
-  }
-  */
-  /////////////////////////////////////////////////////////////////////////
-  // Construct from comm world
-  /////////////////////////////////////////////////////////////////////////
-  GridCartesianCrossIcosahedron(const Coordinate &dimensions,
-                                const Coordinate &simd_layout,
-                                const Coordinate &processor_grid,
-                                IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
-  {
-    meshType = _meshType;
-    Coordinate S2dimensions=dimensions;
-    Coordinate S2simd      =simd_layout;
-    Coordinate S2procs     =processor_grid;
-
-    assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
-    assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
-    assert(dimensions[_ndimension-1]==IcosahedralPatches);
-    assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
-
-    // Save a copy of the basic cartesian initialisation volume
-    cartesianOsites = this->_osites;
-
-    // allocate the pole storage if we are seeking vertex domain data
-    if ( meshType == IcosahedralVertices ) {
-      InitPoles();
-    }
-  }
-
-  virtual ~GridCartesianCrossIcosahedron() = default;
-
-  ////////////////////////////////////////////////
-  // Use to decide if a given grid is icosahedral
-  ////////////////////////////////////////////////
-  int hasNorthPole;
-  int hasSouthPole;
-  int northPoleOsite;
-  int southPoleOsite;
-  int northPoleOsites;
-  int southPoleOsites;
-  int cartesianOsites;
-
-  virtual int isIcosahedral(void) override { return 1;}
-  virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
-  virtual int isIcosahedralEdge (void) override { return meshType==IcosahedralEdges;}
-  virtual int NorthPoleOsite(void) const override { return northPoleOsite; };
-  virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
-  virtual int SouthPoleOsite(void) const override { return southPoleOsite; };
-  virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
-  virtual int ownsNorthPole(void) const override { return hasNorthPole; };
-  virtual int ownsSouthPole(void) const override { return hasSouthPole; };
-  virtual int CartesianOsites(void) const override { return cartesianOsites; };
-  virtual int64_t PoleIdxForOcoor(Coordinate &Coor) override
-  {
-    // Work out the pole_osite. Pick the higher dims
-    Coordinate rdims;
-    Coordinate ocoor;
-    int64_t pole_idx;
-    int Ndm1 = this->Nd()-1;
-    for(int d=2;d<Ndm1;d++){
-      int dd=d-2;
-      rdims.push_back(this->_rdimensions[d]);
-      ocoor.push_back(Coor[d]%this->_rdimensions[d]);
-    }
-    Lexicographic::IndexFromCoor(ocoor,pole_idx,rdims);
-    return pole_idx;
-  }
-  virtual int64_t PoleSiteForOcoor(Coordinate &Coor) override
-  {
-    int Ndm1 = this->Nd()-1;
-    int64_t pole_idx = this->PoleIdxForOcoor(Coor);
-    int64_t pole_osite;
-    if ( Coor[Ndm1] >= HemiPatches ) {
-      pole_osite = pole_idx + this->NorthPoleOsite();
-    } else {
-      pole_osite = pole_idx + this->SouthPoleOsite();
-    }
-    return pole_osite;
-  }
-
-
-  void InitPoles(void)
-  {
-    int Ndm1 = _ndimension-1;
-    ///////////////////////
-    // Add the extra pole storage
-    ///////////////////////
-    // Vertices = 1x LxLx D1...Dn + 2.D1...Dn
-    // Start after the LxL and don't include the 10 patch dim
-    int OrthogSize = 1;
-    for (int d = 2; d < Ndm1; d++) {
-      OrthogSize *= _gdimensions[d];
-    }
-    _fsites += OrthogSize*2;
-    _gsites += OrthogSize*2;
-
-    // Simd reduced sizes are multiplied up.
-    // If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
-    // which should contain identical north (south) pole data
-    OrthogSize = 1;
-    for (int d = 2; d < Ndm1; d++) {
-      OrthogSize *= _rdimensions[d];
-    }
-
-    // Grow the local volume to hold pole data
-    // on rank (0,0) in the LxL planes
-    // since SIMD must be placed in the orthogonal directions
-    Coordinate pcoor = this->ThisProcessorCoor();
-    Coordinate pgrid = this->ProcessorGrid();
-
-    const int xdim=0;
-    const int ydim=1;
-    /*
-     *
-     *  /\/\/\/\/\
-     * /\/\/\/\/\/
-     * \/\/\/\/\/
-     *
-     *   y
-     *  /
-     *  \x
-     *
-     * Labelling patches as 5 6 7 8 9
-     *                       0 1 2 3 4
-     *
-     * Will ban distribution of the patch dimension by more than 2.
-     *
-     * Hence all 5 patches associated with the pole must have the
-     * appropriate "corner" of the patch L^2 located on the SAME rank.
-     */
-
-    if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
-      hasSouthPole =1;
-      southPoleOsite=this->_osites;
-      southPoleOsites=OrthogSize;
-      this->_osites += OrthogSize;
-    } else {
-      hasSouthPole =0;
-      southPoleOsites=0;
-      southPoleOsite=0;
-    }
-    if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
-      hasNorthPole =1;
-      northPoleOsite=this->_osites;
-      northPoleOsites=OrthogSize;
-      this->_osites += OrthogSize;
-    } else {
-      hasNorthPole =0;
-      northPoleOsites=0;
-      northPoleOsite=0;
-    }
-    std::cout << GridLogDebug<<"Icosahedral vertex field volume " << this->_osites<<std::endl;
-    std::cout << GridLogDebug<<"Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
-    std::cout << GridLogDebug<<"Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
-    std::cout << GridLogDebug<<"Icosahedral south pole size " << this->southPoleOsites<<std::endl;
-    std::cout << GridLogDebug<<"Icosahedral north pole size " << this->northPoleOsites<<std::endl;
-  };
-
-};
-
-NAMESPACE_END(Grid);
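For orientation on the deleted header above: pole storage is an appendix to the normal lexicographic site order. `InitPoles()` grows `_osites` by one slab of orthogonal-dimension sites per owned pole, and `PoleSiteForOcoor` addresses a pole site as the lexicographic index of the orthogonal coordinate plus the owning pole's base offset. A minimal standalone sketch of that arithmetic; `Coordinate` and `IndexFromCoor` below are simplified stand-ins for Grid's types, not the library API:

```cpp
#include <cstdint>
#include <vector>

using Coordinate = std::vector<int>;  // stand-in for Grid::Coordinate

// Stand-in for Lexicographic::IndexFromCoor: first dimension runs fastest.
static int64_t IndexFromCoor(const Coordinate &coor, const Coordinate &dims) {
  int64_t idx = 0, stride = 1;
  for (size_t d = 0; d < dims.size(); d++) { idx += coor[d]*stride; stride *= dims[d]; }
  return idx;
}

// Mirrors PoleIdxForOcoor/PoleSiteForOcoor: dims 0,1 are the LxL patch plane,
// the last dim indexes the 10 patches, everything in between is "orthogonal".
int64_t PoleSiteForOcoor(const Coordinate &Coor, const Coordinate &rdimensions,
                         int64_t northPoleOsite, int64_t southPoleOsite) {
  int Ndm1 = (int)Coor.size() - 1;
  Coordinate rdims, ocoor;
  for (int d = 2; d < Ndm1; d++) {
    rdims.push_back(rdimensions[d]);
    ocoor.push_back(Coor[d] % rdimensions[d]);
  }
  const int HemiPatches = 5;  // IcosahedralPatches/2
  int64_t pole_idx = IndexFromCoor(ocoor, rdims);
  return pole_idx + ((Coor[Ndm1] >= HemiPatches) ? northPoleOsite : southPoleOsite);
}
```

Patches 5-9 (the northern row in the ASCII sketch) resolve against the north pole offset; patches 0-4 against the south.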
@@ -86,25 +86,10 @@ public:
 
 public:
 
-  // Icosahedral decisions
-  virtual int isIcosahedral(void) { return 0;}
-  virtual int isIcosahedralVertex(void) { return 0;}
-  virtual int isIcosahedralEdge (void) { return 0;}
-  virtual int ownsNorthPole(void) const { return 0; };
-  virtual int ownsSouthPole(void) const { return 0; };
-  virtual int NorthPoleOsite(void) const { return 0; };
-  virtual int SouthPoleOsite(void) const { return 0; };
-  virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
-  virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
-  virtual int CartesianOsites(void) const { return this->oSites(); };
-  virtual int64_t PoleIdxForOcoor(Coordinate &Coor) { return 0;};
-  virtual int64_t PoleSiteForOcoor(Coordinate &Coor){ return 0;}
-
   ////////////////////////////////////////////////////////////////
   // Checkerboarding interface is virtual and overridden by
   // GridCartesian / GridRedBlackCartesian
   ////////////////////////////////////////////////////////////////
 
   virtual int CheckerBoarded(int dim) =0;
   virtual int CheckerBoard(const Coordinate &site)=0;
   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -191,8 +176,6 @@ public:
   }
   return permute_type;
 }
-
-
 ////////////////////////////////////////////////////////////////
 // Array sizing queries
 ////////////////////////////////////////////////////////////////
@@ -183,6 +183,7 @@ public:
                      int recv_from_rank,
                      int bytes);
 
+  int IsOffNode(int rank);
   double StencilSendToRecvFrom(void *xmit,
                                int xmit_to_rank,int do_xmit,
                                void *recv,
@@ -201,9 +202,9 @@ public:
   void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
 
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                    void *xmit,
+                                    void *xmit,void *xmit_comp,
                                     int xmit_to_rank,int do_xmit,
-                                    void *recv,
+                                    void *recv,void *recv_comp,
                                     int recv_from_rank,int do_recv,
                                     int xbytes,int rbytes,int dir);
 
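The widened `StencilSendToRecvFromBegin` signature separates the logical halo buffer (`xmit`/`recv`, still used for the intra-node shared-memory path) from the buffer MPI actually touches (`xmit_comp`/`recv_comp`). A sketch of the calling convention this implies; everything except the Grid call itself is a hypothetical illustration:

```cpp
// When MPI is accelerator-aware the two pointers coincide (the call sites in
// this diff pass xmit,xmit and recv,recv); otherwise a host staging buffer
// can be supplied as the "_comp" pointer that MPI_Isend/MPI_Irecv will see.
void *xmit_dev = device_send_buf;   // hypothetical: gathered halo on the GPU
void *recv_dev = device_recv_buf;
#ifdef ACCELERATOR_AWARE_MPI
void *xmit_comp = xmit_dev;
void *recv_comp = recv_dev;
#else
void *xmit_comp = host_send_buf;    // hypothetical host bounce buffers
void *recv_comp = host_recv_buf;
#endif
grid.StencilSendToRecvFromBegin(list,
                                xmit_dev, xmit_comp, dest, dox,
                                recv_dev, recv_comp, from, dor,
                                bytes, bytes, dir);
```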
@@ -270,24 +270,24 @@ void CartesianCommunicator::GlobalSum(double &d)
 }
 #else
 void CartesianCommunicator::GlobalSum(float &f){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce float");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce double");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
 #endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce uint32_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
-  FlightRecorder::StepLog("AllReduce");
+  FlightRecorder::StepLog("AllReduce uint64_t");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
   assert(ierr==0);
 }
@@ -301,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
+  FlightRecorder::StepLog("GlobalXOR");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(float &f)
 {
+  FlightRecorder::StepLog("GlobalMax");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalMax(double &d)
 {
+  FlightRecorder::StepLog("GlobalMax");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(float *)");
   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
+  FlightRecorder::StepLog("GlobalSumVector(double *)");
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
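Giving each collective a distinct label matters because the step log is read post-mortem: when a job hangs, the rank whose last recorded step differs from its peers identifies the mismatched collective. A minimal standalone sketch of such a step logger (an illustration of the idea, not Grid's FlightRecorder):

```cpp
#include <cstdio>
#include <string>

// Each rank remembers the last labelled step it entered; with typed labels
// such as "AllReduce double" vs "AllReduce uint64_t", a divergent rank's
// trace pinpoints exactly which reduction it reached.
struct StepLogSketch {
  std::string last;
  void Step(const char *label) {
    last = label;
    std::fprintf(stderr, "step: %s\n", label);
  }
};
```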
@@ -395,11 +400,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   std::vector<CommsRequest_t> list;
   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }
+int CartesianCommunicator::IsOffNode(int rank)
+{
+  int grank = ShmRanks[rank];
+  if ( grank == MPI_UNDEFINED ) return true;
+  else return false;
+}
 
 #ifdef ACCELERATOR_AWARE_MPI
 void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
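`IsOffNode` exposes the `ShmRanks[rank]==MPI_UNDEFINED` test that the send/receive branches in the following hunks already use internally: a peer without a shared-memory window must be reached through MPI, while an on-node peer can be served by a direct device copy. A hedged sketch of the intended use (branch bodies elided):

```cpp
// Transport selection with the new query; comments describe the two code
// paths visible in the StencilSendToRecvFromBegin hunks below.
if ( grid.IsOffNode(dest) ) {
  // off-node neighbour: halo goes through MPI_Isend/MPI_Irecv on the comms
  // buffers, as in the (gdest==MPI_UNDEFINED)||Stencil_force_mpi branch.
} else {
  // on-node neighbour: write directly into the peer's segment via
  // ShmBufferTranslate + acceleratorCopyDeviceToDeviceAsynch.
}
```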
@@ -414,9 +424,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
   return 0.0; // Do nothing -- no preparation required
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                         void *xmit,
+                                                         void *xmit,void *xmit_comp,
                                                          int dest,int dox,
-                                                         void *recv,
+                                                         void *recv,void *recv_comp,
                                                          int from,int dor,
                                                          int xbytes,int rbytes,int dir)
 {
@@ -440,7 +450,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
+      ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
       assert(ierr==0);
       list.push_back(rrq);
       off_node_bytes+=rbytes;
@@ -449,6 +460,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
     else {
       void *shm = (void *) this->ShmBufferTranslate(from,xmit);
       assert(shm!=NULL);
+      //      std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
     }
 #endif
@@ -457,7 +469,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
   if (dox) {
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+      ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       assert(ierr==0);
       list.push_back(xrq);
       off_node_bytes+=xbytes;
@@ -676,9 +688,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
 }
 
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                         void *xmit,
+                                                         void *xmit,void *xmit_comp,
                                                          int dest,int dox,
-                                                         void *recv,
+                                                         void *recv,void *recv_comp,
                                                          int from,int dor,
                                                          int xbytes,int rbytes,int dir)
 {
@@ -829,6 +841,7 @@ int CartesianCommunicator::RankWorld(void){
   return r;
 }
 void CartesianCommunicator::BarrierWorld(void){
+  FlightRecorder::StepLog("BarrierWorld");
   int ierr = MPI_Barrier(communicator_world);
   assert(ierr==0);
 }
@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
   dest=0;
 }
 
+int CartesianCommunicator::IsOffNode(int rank) { return false; }
+
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int xmit_to_rank,int dox,
                                                      void *recv,
@@ -543,49 +543,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
   // printf("Host buffer allocate for GPU non-aware MPI\n");
-#if 0
-  HostCommBuf= acceleratorAllocHost(bytes);
-#else
   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#if 0
-#warning "Moving host buffers to specific NUMA domain"
-  int numa;
-  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-  if(numa_name) {
-    unsigned long page_size = sysconf(_SC_PAGESIZE);
-    numa = atoi(numa_name);
-    unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(page_count);
-    std::vector<int> nodes(page_count,numa);
-    std::vector<int> status(page_count,-1);
-    for(unsigned long p=0;p<page_count;p++){
-      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-    }
-    int ret = move_pages(0,
-                         page_count,
-                         &pages[0],
-                         &nodes[0],
-                         &status[0],
-                         MPOL_MF_MOVE);
-    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-    if (ret) perror(" move_pages failed for reason:");
-  }
-#endif
-  acceleratorPin(HostCommBuf,bytes);
-#endif
-
 #endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);
   }
   if ( WorldRank == 0 ){
-    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
+    std::cout << Mheader " acceleratorAllocDevice "<< bytes
               << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
+  if ( WorldRank == 0 ){
+    std::cout<< Mheader "Setting up IPC"<<std::endl;
+  }
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Loop over ranks/gpu's on our node
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
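The deleted `#if 0` branch carried a complete NUMA-placement idiom for the host bounce buffer. For reference, the same technique in self-contained form (Linux-specific; `move_pages` comes from libnuma's `<numaif.h>`, link with -lnuma); this is a sketch of the removed idea, not code in the tree:

```cpp
#include <numaif.h>    // move_pages, MPOL_MF_MOVE (libnuma)
#include <unistd.h>    // sysconf
#include <cstdint>
#include <cstdio>
#include <vector>

// Migrate an already-touched buffer, page by page, to one NUMA domain.
void move_buffer_to_numa(void *buf, size_t bytes, int numa) {
  unsigned long page_size  = (unsigned long)sysconf(_SC_PAGESIZE);
  unsigned long page_count = bytes / page_size;
  std::vector<void *> pages(page_count);
  std::vector<int>    nodes(page_count, numa);  // target node per page
  std::vector<int>    status(page_count, -1);   // per-page result/errno
  for (unsigned long p = 0; p < page_count; p++)
    pages[p] = (void *)((uint64_t)buf + p*page_size);
  // pid 0 means "the calling process".
  long ret = move_pages(0, page_count, &pages[0], &nodes[0], &status[0], MPOL_MF_MOVE);
  if (ret) perror("move_pages");
}
```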
@@ -616,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
       if ( err != ZE_RESULT_SUCCESS ) {
         std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
         exit(EXIT_FAILURE);
-      } else {
-        std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
       }
       memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
       handle.pid = getpid();
@@ -676,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef SHM_SOCKETS
       myfd=UnixSockets::RecvFileDescriptor();
 #else
-      std::cout<<"mapping seeking remote pid/fd "
-               <<handle.pid<<"/"
-               <<handle.fd<<std::endl;
+      //      std::cout<<"mapping seeking remote pid/fd "
+      //               <<handle.pid<<"/"
+      //               <<handle.fd<<std::endl;
 
       int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
+      //      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
       //      int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
       myfd = syscall(438,pidfd,handle.fd,0);
       int err_t = errno;
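The retained `#else` path imports the peer's device-memory file descriptor without a Unix socket: `pidfd_open` obtains a handle on the peer process, and `pidfd_getfd` (raw syscall 438 on x86-64/aarch64, invoked directly above because older glibc lacks a wrapper) duplicates the peer's fd into this process. A self-contained sketch of the pattern; it requires ptrace-level permission over the peer:

```cpp
#include <sys/syscall.h>
#include <unistd.h>
#include <cstdio>

// Duplicate a file descriptor owned by another process into this one.
int import_fd(pid_t peer_pid, int peer_fd) {
  int pidfd = (int)syscall(SYS_pidfd_open, peer_pid, 0);
  if (pidfd < 0) { perror("pidfd_open"); return -1; }
  int myfd = (int)syscall(438 /* pidfd_getfd */, pidfd, peer_fd, 0);
  if (myfd < 0) perror("pidfd_getfd");
  close(pidfd);
  return myfd;  // refers to the same open description as the peer's fd
}
```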
@@ -691,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
         assert(0);
       }
 #endif
-      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
+      //      std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
       memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
       memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
 
@@ -700,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
         std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
         std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
         exit(EXIT_FAILURE);
-      } else {
-        std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-        std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
       }
       assert(thisBuf!=nullptr);
     }
@@ -783,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     WorldShmCommBufs[r] =ptr;
     //    std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
   }
+  std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
   _ShmAlloc=1;
   _ShmAllocBytes = bytes;
 };
@@ -1039,11 +1007,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
 {
   int gpeer = ShmRanks[rank];
   assert(gpeer!=ShmRank); // never send to self
+  //  std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
   if (gpeer == MPI_UNDEFINED){
     return NULL;
   } else {
     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
+    //    std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
     return (void *) remote;
   }
 }
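The translation itself is pure base-plus-offset arithmetic over identically sized per-rank segments; isolated, it is just:

```cpp
#include <cstdint>

// Sketch: rebase a pointer from this rank's mapped segment onto a peer's.
static inline void *translate(void *local_p, void *my_base, void *peer_base) {
  uint64_t offset = (uint64_t)local_p - (uint64_t)my_base;
  return (void *)((uint64_t)peer_base + offset);
}
```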
@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
-  assert(!rhs.Grid()->isIcosahedral());
-
   typedef typename vobj::vector_type vector_type;
   typedef typename vobj::scalar_type scalar_type;
 
@@ -145,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
     int comm_proc = ((x+sshift)/rd)%pd;
 
     if (comm_proc==0) {
+      FlightRecorder::StepLog("Cshift_Copy_plane");
       tcopy-=usecond();
       Copy_plane(ret,rhs,dimension,x,sx,cbmask);
       tcopy+=usecond();
+      FlightRecorder::StepLog("Cshift_Copy_plane_complete");
     } else {
 
       int words = buffer_size;
@@ -155,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 
       int bytes = words * sizeof(vobj);
 
+      FlightRecorder::StepLog("Cshift_Gather_plane");
       tgather-=usecond();
       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
       tgather+=usecond();
+      FlightRecorder::StepLog("Cshift_Gather_plane_complete");
 
       //      int rank = grid->_processor;
       int recv_from_rank;
@@ -168,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
       tcomms-=usecond();
       grid->Barrier();
 
+      FlightRecorder::StepLog("Cshift_SendRecv");
 #ifdef ACCELERATOR_AWARE_MPI
       grid->SendToRecvFrom((void *)&send_buf[0],
                            xmit_to_rank,
@@ -184,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
                            bytes);
       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
 #endif
+      FlightRecorder::StepLog("Cshift_SendRecv_complete");
 
       xbytes+=bytes;
       grid->Barrier();
       tcomms+=usecond();
+      FlightRecorder::StepLog("Cshift_barrier_complete");
 
       tscatter-=usecond();
       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -30,7 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
-  assert(!rhs.Grid()->isIcosahedral());
   Lattice<vobj> ret(rhs.Grid());
   ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
   Cshift_local(ret,rhs,dimension,shift);
@@ -373,17 +373,14 @@ public:
 
 template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
   typedef typename vobj::scalar_object sobj;
-  uint64_t gsites=1;
-  uint64_t polesites=0;
-  for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
-  for(int64_t g=0;g<gsites;g++){
+  for(int64_t g=0;g<o.Grid()->_gsites;g++){
 
     Coordinate gcoor;
     o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
 
     sobj ss;
     peekSite(ss,o,gcoor);
-    stream<<"["<< g<<" : ";
+    stream<<"[";
     for(int d=0;d<gcoor.size();d++){
       stream<<gcoor[d];
       if(d!=gcoor.size()-1) stream<<",";
@@ -391,41 +388,6 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
     stream<<"]\t";
     stream<<ss<<std::endl;
   }
-  if ( o.Grid()->isIcosahedralVertex() ) {
-    uint64_t psites=1;
-    Coordinate perpdims;
-    for(int d=2;d<o.Grid()->_ndimension-1;d++){
-      int pd=o.Grid()->_gdimensions[d];
-      psites*=pd;
-      perpdims.push_back(pd);
-    }
-    for(uint64_t p=0;p<psites;p++){
-      sobj ss;
-      Coordinate orthog;
-      Lexicographic::CoorFromIndex(orthog,p,perpdims);
-      peekPole(ss,o,orthog,South);
-      stream<<"[ SouthPole : ";
-      for(int d=0;d<orthog.size();d++){
-        stream<<orthog[d];
-        if(d!=orthog.size()-1) stream<<",";
-      }
-      stream<<"]\t";
-      stream<<ss<<std::endl;
-    }
-    for(uint64_t p=0;p<psites;p++){
-      sobj ss;
-      Coordinate orthog;
-      Lexicographic::CoorFromIndex(orthog,p,perpdims);
-      peekPole(ss,o,orthog,North);
-      stream<<"[ NorthPole : ";
-      for(int d=0;d<orthog.size();d++){
-        stream<<orthog[d];
-        if(d!=orthog.size()-1) stream<<",";
-      }
-      stream<<"]\t";
-      stream<<ss<<std::endl;
-    }
-  }
   return stream;
 }
 
|||||||
@@ -34,86 +34,22 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
typedef typename iobj::scalar_type scalar_type;
|
typedef typename iobj::scalar_type scalar_type;
|
||||||
typedef typename iobj::vector_type vector_type;
|
typedef typename iobj::vector_type vector_type;
|
||||||
|
|
||||||
l=Zero();
|
|
||||||
|
|
||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
int Nsimd = grid->iSites();
|
int Nsimd = grid->iSites();
|
||||||
|
|
||||||
int cartesian_vol = grid->oSites();
|
autoView(l_v, l, CpuWrite);
|
||||||
if ( grid->isIcosahedral() ) {
|
thread_for( o, grid->oSites(), {
|
||||||
cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
|
vector_type vI;
|
||||||
}
|
Coordinate gcoor;
|
||||||
{
|
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
||||||
autoView(l_v, l, CpuWrite);
|
for(int i=0;i<grid->iSites();i++){
|
||||||
thread_for( o, cartesian_vol, {
|
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
|
||||||
vector_type vI;
|
mergebuf[i]=(Integer)gcoor[mu];
|
||||||
Coordinate gcoor;
|
|
||||||
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
|
||||||
for(int i=0;i<grid->iSites();i++){
|
|
||||||
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
|
|
||||||
mergebuf[i]=(Integer)gcoor[mu];
|
|
||||||
}
|
|
||||||
merge<vector_type,scalar_type>(vI,mergebuf);
|
|
||||||
l_v[o]=vI;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (grid->isIcosahedralVertex()) {
|
|
||||||
uint64_t psites=1;
|
|
||||||
Coordinate perpdims;
|
|
||||||
typename iobj::scalar_object ss;
|
|
||||||
for(int d=2;d<grid->_ndimension-1;d++){
|
|
||||||
int pd=grid->_gdimensions[d];
|
|
||||||
psites*=pd;
|
|
||||||
perpdims.push_back(pd);
|
|
||||||
}
|
}
|
||||||
for(uint64_t p=0;p<psites;p++){
|
merge<vector_type,scalar_type>(vI,mergebuf);
|
||||||
Coordinate orthog;
|
l_v[o]=vI;
|
||||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
});
|
||||||
|
|
||||||
int icoor;
|
|
||||||
if ( mu>=2 && mu < grid->_ndimension-1) {
|
|
||||||
icoor = orthog[mu-2];
|
|
||||||
} else {
|
|
||||||
icoor = -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
ss=scalar_type(icoor);
|
|
||||||
|
|
||||||
pokePole(ss,l,orthog,South);
|
|
||||||
pokePole(ss,l,orthog,North);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
|
|
||||||
{
|
|
||||||
typedef typename iobj::scalar_object sobj;
|
|
||||||
typedef typename iobj::scalar_type scalar_type;
|
|
||||||
typedef typename iobj::vector_type vector_type;
|
|
||||||
|
|
||||||
GridBase *grid = l.Grid();
|
|
||||||
|
|
||||||
l=Zero();
|
|
||||||
|
|
||||||
assert(grid->isIcosahedralVertex());
|
|
||||||
|
|
||||||
if (grid->isIcosahedralVertex()) {
|
|
||||||
uint64_t psites=1;
|
|
||||||
Coordinate perpdims;
|
|
||||||
sobj ss;
|
|
||||||
scalar_type one(1.0);
|
|
||||||
ss=one;
|
|
||||||
for(int d=2;d<l.Grid()->_ndimension-1;d++){
|
|
||||||
int pd=l.Grid()->_gdimensions[d];
|
|
||||||
psites*=pd;
|
|
||||||
perpdims.push_back(pd);
|
|
||||||
}
|
|
||||||
for(uint64_t p=0;p<psites;p++){
|
|
||||||
Coordinate orthog;
|
|
||||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
|
||||||
pokePole(ss,l,orthog,pole);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|||||||
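The restored `LatticeCoordinate` body fills, for each outer site `o`, one scalar per SIMD lane (the `mu` coordinate of the global site that lane represents) and then packs the buffer into a single vector word with `merge`. A toy, self-contained model of that extract-buffer/merge step (the types here are stand-ins, not Grid's `vector_type`):

```cpp
#include <array>
#include <cstdio>

constexpr int Nsimd = 4;
struct vInteger { std::array<int,Nsimd> lane; };   // stand-in vector word

// Pack one scalar per lane, as merge<vector_type,scalar_type>(vI,mergebuf) does.
void merge(vInteger &v, const std::array<int,Nsimd> &buf) {
  for (int i = 0; i < Nsimd; i++) v.lane[i] = buf[i];
}

int main() {
  std::array<int,Nsimd> mergebuf;
  // In the real loop, RankIndexToGlobalCoor(rank,o,i,gcoor) yields lane i's
  // global coordinate; here we fake a coordinate that strides by 2 per lane.
  for (int i = 0; i < Nsimd; i++) mergebuf[i] = 2*i;
  vInteger vI;
  merge(vI, mergebuf);
  for (int i = 0; i < Nsimd; i++) std::printf("lane %d -> %d\n", i, vI.lane[i]);
  return 0;
}
```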
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
   grid->GlobalCoorToRankIndex(rank,odx,idx,site);
 
   ExtractBuffer<sobj> buf(Nsimd);
-  autoView( l_v , l, CpuRead);
+  autoView( l_v , l, CpuWrite);
   extract(l_v[odx],buf);
 
   s = buf[idx];
@@ -151,261 +151,6 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
   return;
 };
 
-// zero for south pole, one for north pole
-template<class vobj,class sobj>
-void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
-{
-  s=Zero();
-
-  GridBase *grid=l.Grid();
-
-  assert(grid->isIcosahedral());
-  assert(grid->isIcosahedralVertex());
-
-  int Nsimd = grid->Nsimd();
-
-  int rank;
-
-  int Ndm1 = grid->_ndimension-1;
-  Coordinate pgrid = grid->ProcessorGrid();
-  const int xdim=0;
-  const int ydim=1;
-  const int pdim=Ndm1;
-
-  int64_t pole_osite;
-  int64_t pole_isite;
-  Coordinate rdims;
-  Coordinate idims;
-  Coordinate ocoor;
-  Coordinate icoor;
-  Coordinate pcoor(grid->_ndimension);
-  for(int d=2;d<Ndm1;d++){
-    int dd=d-2;
-    rdims.push_back(grid->_rdimensions[d]);
-    idims.push_back(grid->_simd_layout[d]);
-    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
-    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
-    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
-  }
-  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
-  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
-
-  int64_t osite;
-  if(isNorth == North){
-    pcoor[xdim] = 0;
-    pcoor[ydim] = pgrid[ydim]-1;
-    pcoor[Ndm1] = pgrid[Ndm1]-1;
-    osite = pole_osite + grid->NorthPoleOsite();
-  } else {
-    pcoor[xdim] = pgrid[xdim]-1;
-    pcoor[ydim] = 0;
-    pcoor[Ndm1] = 0;
-    osite = pole_osite + grid->SouthPoleOsite();
-  }
-
-  rank = grid->RankFromProcessorCoor(pcoor);
-
-  if ( rank == grid->ThisRank() ) {
-    ExtractBuffer<sobj> buf(Nsimd);
-    autoView( l_v , l, CpuWrite);
-    extract(l_v[osite],buf);
-    s = buf[pole_isite];
-  }
-  grid->Broadcast(rank,s);
-
-  return;
-};
-template<class vobj,class sobj>
-void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
-{
-  GridBase *grid=l.Grid();
-
-  assert(grid->isIcosahedral());
-  assert(grid->isIcosahedralVertex());
-
-  grid->Broadcast(grid->BossRank(),s);
-
-  int Nsimd = grid->Nsimd();
-  int rank;
-  int Ndm1 = grid->_ndimension-1;
-  Coordinate pgrid = grid->ProcessorGrid();
-  const int xdim=0;
-  const int ydim=1;
-  const int pdim=Ndm1;
-
-  int64_t pole_osite;
-  int64_t pole_isite;
-  Coordinate rdims;
-  Coordinate idims;
-  Coordinate ocoor;
-  Coordinate icoor;
-  Coordinate pcoor(grid->_ndimension,0);
-  for(int d=2;d<Ndm1;d++){
-    int dd = d-2;
-    rdims.push_back(grid->_rdimensions[d]);
-    idims.push_back(grid->_simd_layout[d]);
-    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
-    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
-    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
-
-    int o = orthog[dd];
-    int r = grid->_rdimensions[d];
-    int omr = o % r;
-  }
-  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
-  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
-
-  int64_t osite;
-  if(isNorth ==North){
-    pcoor[xdim] = 0;
-    pcoor[ydim] = pgrid[ydim]-1;
-    pcoor[Ndm1] = pgrid[Ndm1]-1;
-    osite = pole_osite + grid->NorthPoleOsite();
-  } else {
-    pcoor[xdim] = pgrid[xdim]-1;
-    pcoor[ydim] = 0;
-    pcoor[Ndm1] = 0;
-    osite = pole_osite + grid->SouthPoleOsite();
-  }
-
-  rank = grid->RankFromProcessorCoor(pcoor);
-
-  // extract-modify-merge cycle is easiest way and this is not perf critical
-  if ( rank == grid->ThisRank() ) {
-    ExtractBuffer<sobj> buf(Nsimd);
-    autoView( l_v , l, CpuWrite);
-    extract(l_v[osite],buf);
-    buf[pole_isite] = s;
-    merge(l_v[osite],buf);
-  }
-  return;
-};
-
-
-template<class vobj,class sobj>
-void peekLocalPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
-{
-  s=Zero();
-
-  GridBase *grid=l.Grid();
-
-  assert(grid->isIcosahedral());
-  assert(grid->isIcosahedralVertex());
-
-  int Nsimd = grid->Nsimd();
-
-  int rank;
-
-  int Ndm1 = grid->_ndimension-1;
-  Coordinate pgrid = grid->ProcessorGrid();
-  const int xdim=0;
-  const int ydim=1;
-  const int pdim=Ndm1;
-
-  int64_t pole_osite;
-  int64_t pole_isite;
-  Coordinate rdims;
-  Coordinate idims;
-  Coordinate ocoor;
-  Coordinate icoor;
-  //  Coordinate pcoor(grid->_ndimension);
-  for(int d=2;d<Ndm1;d++){
-    int dd=d-2;
-    rdims.push_back(grid->_rdimensions[d]);
-    idims.push_back(grid->_simd_layout[d]);
-    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
-    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
-    //    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
-  }
-  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
-  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
-
-  int64_t osite;
-  if(isNorth == North){
-    //    pcoor[xdim] = 0;
-    //    pcoor[ydim] = pgrid[ydim]-1;
-    //    pcoor[Ndm1] = pgrid[Ndm1]-1;
-    osite = pole_osite + grid->NorthPoleOsite();
-    assert(grid->ownsNorthPole());
-  } else {
-    //    pcoor[xdim] = pgrid[xdim]-1;
-    //    pcoor[ydim] = 0;
-    //    pcoor[Ndm1] = 0;
-    osite = pole_osite + grid->SouthPoleOsite();
-    assert(grid->ownsSouthPole());
-  }
-
-  ExtractBuffer<sobj> buf(Nsimd);
-  autoView( l_v , l, CpuWrite);
-  extract(l_v[osite],buf);
-  s = buf[pole_isite];
-
-  return;
-};
-template<class vobj,class sobj>
-void pokeLocalPole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
-{
-  GridBase *grid=l.Grid();
-
-  assert(grid->isIcosahedral());
-  assert(grid->isIcosahedralVertex());
-
-  int Nsimd = grid->Nsimd();
-  int rank;
-  int Ndm1 = grid->_ndimension-1;
-
-  const int xdim=0;
-  const int ydim=1;
-  const int pdim=Ndm1;
-
-  int64_t pole_osite;
-  int64_t pole_isite;
-  Coordinate rdims;
-  Coordinate idims;
-  Coordinate ocoor;
-  Coordinate icoor;
-  //  Coordinate pcoor(grid->_ndimension,0);
-  for(int d=2;d<Ndm1;d++){
-    int dd = d-2;
-    rdims.push_back(grid->_rdimensions[d]);
-    idims.push_back(grid->_simd_layout[d]);
-    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
-    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
-    //    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
-
-    int o = orthog[dd];
-    int r = grid->_rdimensions[d];
-    int omr = o % r;
-  }
-  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
-  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
-
-  int64_t osite;
-  int insert=0;
-  if(isNorth ==North){
-    //    pcoor[xdim] = 0;
-    //    pcoor[ydim] = pgrid[ydim]-1;
-    //    pcoor[Ndm1] = pgrid[Ndm1]-1;
-    osite = pole_osite + grid->NorthPoleOsite();
-    assert(grid->ownsNorthPole());
-  } else {
-    //    pcoor[xdim] = pgrid[xdim]-1;
-    //    pcoor[ydim] = 0;
-    //    pcoor[Ndm1] = 0;
-    osite = pole_osite + grid->SouthPoleOsite();
-    assert(grid->ownsSouthPole());
-  }
-
-  // extract-modify-merge cycle is easiest way and this is not perf critical
-  ExtractBuffer<sobj> buf(Nsimd);
-  autoView( l_v , l, CpuWrite);
-  extract(l_v[osite],buf);
-  buf[pole_isite] = s;
-  merge(l_v[osite],buf);
-
-  return;
-};
-
 //////////////////////////////////////////////////////////
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
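Both deleted poke routines rely on the extract-modify-merge cycle named in their comment: pull every SIMD lane of the target vector word out to scalars, overwrite the one lane addressed by `pole_isite`, and repack. In isolation the cycle looks like this (toy types, not Grid's `vobj`):

```cpp
#include <array>

constexpr int Nlanes = 4;
struct VecWord { std::array<double,Nlanes> lane; };  // stand-in vector word

void poke_lane(VecWord &w, int lane, double value) {
  std::array<double,Nlanes> buf;                        // ExtractBuffer<sobj>
  for (int i = 0; i < Nlanes; i++) buf[i] = w.lane[i];  // extract(l_v[osite],buf)
  buf[lane] = value;                                    // buf[pole_isite] = s
  for (int i = 0; i < Nlanes; i++) w.lane[i] = buf[i];  // merge(l_v[osite],buf)
}
```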
@@ -434,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
   for(int w=0;w<words;w++){
     pt[w] = getlane(vp[w],idx);
   }
-
+  //  std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
   return;
 };
 template<class vobj,class sobj>
|
|||||||
@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
assert(ok);
|
assert(ok);
|
||||||
}
|
}
|
||||||
FlightRecorder::StepLog("Start global sum");
|
FlightRecorder::StepLog("Start global sum");
|
||||||
// grid->GlobalSumP2P(nrm);
|
grid->GlobalSumP2P(nrm);
|
||||||
grid->GlobalSum(nrm);
|
// grid->GlobalSum(nrm);
|
||||||
FlightRecorder::StepLog("Finished global sum");
|
FlightRecorder::StepLog("Finished global sum");
|
||||||
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||||
FlightRecorder::ReductionLog(local,real(nrm));
|
FlightRecorder::ReductionLog(local,real(nrm));
|
||||||
|
|||||||
@@ -48,45 +48,31 @@ NAMESPACE_BEGIN(Grid);
 //////////////////////////////////////////////////////////////
 inline int RNGfillable(GridBase *coarse,GridBase *fine)
 {
-  if ( coarse == fine ) return 1;
-
-  if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
-
-  if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
-    assert(fine->Nd()==coarse->Nd());
-    for(int d=0;d<fine->Nd();d++){
-      assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
-    }
-    return 1;
-  }
-
-  {
-    int rngdims = coarse->_ndimension;
-
-    // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-    int lowerdims = fine->_ndimension - coarse->_ndimension;
-    assert(lowerdims >= 0);
-    for(int d=0;d<lowerdims;d++){
-      assert(fine->_simd_layout[d]==1);
-      assert(fine->_processors[d]==1);
-    }
-
-    int multiplicity=1;
-    for(int d=0;d<lowerdims;d++){
-      multiplicity=multiplicity*fine->_rdimensions[d];
-    }
-    // local and global volumes subdivide cleanly after SIMDization
-    for(int d=0;d<rngdims;d++){
-      int fd= d+lowerdims;
-      assert(coarse->_processors[d] == fine->_processors[fd]);
-      assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-      assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
-
-      multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
-    }
-    return multiplicity;
-  }
+  int rngdims = coarse->_ndimension;
+
+  // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
+  int lowerdims = fine->_ndimension - coarse->_ndimension;
+  assert(lowerdims >= 0);
+  for(int d=0;d<lowerdims;d++){
+    assert(fine->_simd_layout[d]==1);
+    assert(fine->_processors[d]==1);
+  }
+
+  int multiplicity=1;
+  for(int d=0;d<lowerdims;d++){
+    multiplicity=multiplicity*fine->_rdimensions[d];
+  }
+  // local and global volumes subdivide cleanly after SIMDization
+  for(int d=0;d<rngdims;d++){
+    int fd= d+lowerdims;
+    assert(coarse->_processors[d] == fine->_processors[fd]);
+    assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+    assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
+
+    multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
+  }
+  return multiplicity;
 }
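A worked example of the multiplicity arithmetic kept on the new side (the numbers are illustrative, not from the source): take a coarse 4d RNG grid with local rdimensions 8^4 under a fine 5d grid of Ls=16 x 8^4, with no SIMD or MPI decomposition in the s direction, so lowerdims = 1.

    // multiplicity accumulates the fine sites served per coarse RNG site:
    //   lowerdims loop:       multiplicity = 16          (Ls = _rdimensions[0])
    //   matched dims d=0..3:  ratio 8/8 = 1 each, multiplicity stays 16
    //   => RNGfillable(coarse,fine) == 16

Each coarse generator then fills 16 fine sites, and the _rdimensions divisibility asserts guarantee the ratio is exact.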
@@ -94,19 +80,6 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
 // this function is necessary for the LS vectorised field
 inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
 {
-  if ( coarse == fine ) return 1;
-
-  if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
-
-  if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
-    assert(fine->Nd()==coarse->Nd());
-    for(int d=0;d<fine->Nd();d++){
-      assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
-    }
-    return 1;
-  }
-
   int rngdims = coarse->_ndimension;

   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
@@ -379,12 +352,12 @@ private:
 public:
   GridBase *Grid(void) const { return _grid; }
   int generator_idx(int os,int is) {
-    return (is*_grid->CartesianOsites()+os)%_grid->lSites(); // On the pole sites wrap back to normal generators; Icosahedral hack
+    return is*_grid->oSites()+os;
   }

   GridParallelRNG(GridBase *grid) : GridRNGbase() {
     _grid = grid;
-    _vol =_grid->lSites();
+    _vol =_grid->iSites()*_grid->oSites();

     _generators.resize(_vol);
     _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
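The new generator_idx layout is a plain row-major pairing of (lane, outer site), matched by the _vol change in the constructor. With illustrative numbers (assumed, not from the source):

    // oSites() = 4096, iSites() = Nsimd = 8
    //   _vol = iSites()*oSites() = 32768 generators
    //   generator_idx(os=5, is=3) = 3*4096 + 5 = 12293   // unique per (os,is)

Every (os,is) pair owns exactly one generator, so the icosahedral-pole wrap-around of the old mapping is no longer needed.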
@@ -408,7 +381,7 @@ public:

   int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
   int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
-  int osites = _grid->CartesianOsites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity, except on Icosahedral
+  int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
   int words = sizeof(scalar_object) / sizeof(scalar_type);

   autoView(l_v, l, CpuWrite);
@@ -429,27 +402,8 @@ public:
       // merge into SIMD lanes, FIXME suboptimal implementation
       merge(l_v[sm], buf);
     }
   });
+  // });
-  /*
-   * Fill in the poles for an Icosahedral vertex mesh
-   */
-  if (l.Grid()->isIcosahedralVertex()) {
-    int64_t pole_sites=l.Grid()->NorthPoleOsites()+l.Grid()->SouthPoleOsites();
-    int64_t pole_base =l.Grid()->CartesianOsites();
-
-    ExtractBuffer<scalar_object> buf(Nsimd);
-    for (int m = 0; m < pole_sites; m++) { // Draw from same generator multiplicity times
-      for (int si = 0; si < Nsimd; si++) {
-        int gdx = 0;
-        scalar_type *pointer = (scalar_type *)&buf[si];
-        dist[gdx].reset();
-        for (int idx = 0; idx < words; idx++)
-          fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
-      }
-      merge(l_v[pole_base+m], buf);
-    }
-  }
-
   _time_counter += usecond()- inner_time_counter;
 }
@@ -49,7 +49,7 @@ static constexpr int Tm = 7;

 static constexpr int Nc=Config_Nc;
 static constexpr int Ns=4;
-static constexpr int Nd=Config_Nd;
+static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
 static constexpr int Nds=8; // double stored gauge field
 static constexpr int Ngp=2; // gparity index range
@@ -75,7 +75,6 @@ static constexpr int InverseYes=1;
 //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;

 const int SpinorIndex = 2;
-const int PauliIndex = 2; //TensorLevel counts from the bottom!
 template<typename T> struct isSpinor {
   static constexpr bool value = (SpinorIndex==T::TensorLevel);
 };
@@ -123,10 +123,10 @@ public:
     GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);

     peekLocalSite(ScalarUmu, Umu_v, lcoor);
-    for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
+    for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);

     peekLocalSite(ScalarUmu, Uadj_v, lcoor);
-    for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
+    for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);

     pokeLocalSite(ScalarUds, Uds_v, lcoor);
   });
@@ -85,15 +85,6 @@ NAMESPACE_CHECK(DomainWall);
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
 #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
 NAMESPACE_CHECK(Overlap);

-///////////////////////////////////////////////////////////////////////////////
-// Two spin wilson fermion based
-///////////////////////////////////////////////////////////////////////////////
-
-#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
-NAMESPACE_CHECK(TwoSpinWilson);
-
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
 ///////////////////////////////////////////////////////////////////////////////
@@ -41,9 +41,8 @@ NAMESPACE_CHECK(Compressor);
 NAMESPACE_CHECK(FermionOperatorImpl);
 #include <Grid/qcd/action/fermion/FermionOperator.h>
 NAMESPACE_CHECK(FermionOperator);
 #include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
 #include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
-#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
 NAMESPACE_CHECK(Kernels);

 #endif
@@ -180,12 +180,6 @@ NAMESPACE_CHECK(ImplGparityWilson);
 #include <Grid/qcd/action/fermion/StaggeredImpl.h>
 NAMESPACE_CHECK(ImplStaggered);

-/////////////////////////////////////////////////////////////////////////////
-// Two component spinor Wilson action for 3d / Boston
-/////////////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
-NAMESPACE_CHECK(ImplTwoSpinWilson);
-
 /////////////////////////////////////////////////////////////////////////////
 // Single flavour one component spinors with colour index. 5d vec
 /////////////////////////////////////////////////////////////////////////////
@@ -274,7 +274,7 @@ public:
       autoView( Uds_v , Uds, CpuWrite);
       autoView( Utmp_v, Utmp, CpuWrite);
       thread_foreach(ss,Utmp_v,{
-        Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
+        Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
       });
     }
     Utmp = Uconj;
@@ -286,7 +286,7 @@ public:
       autoView( Uds_v , Uds, CpuWrite);
       autoView( Utmp_v, Utmp, CpuWrite);
       thread_foreach(ss,Utmp_v,{
-        Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
+        Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
       });
     }
   }
@@ -320,7 +320,7 @@ public:
     }

     Uconj = conjugate(*Upoke);
-    pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
+    pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
   }
 }

@@ -36,8 +36,6 @@ public:
   static const std::vector<int> directions;
   static const std::vector<int> displacements;
   static const int npoint = 16;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
 };

 template <class Impl>
@@ -156,6 +154,12 @@ public:
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }

   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
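Each of the stencil-bearing operator classes in this range gains the same six-line SloppyComms hook. A hypothetical usage sketch follows (the operator instance and the solver context are illustrative assumptions; only SloppyComms(int) and SetSloppyComms(int) come from this diff):

    // Dop: any fermion operator carrying Stencil/StencilEven/StencilOdd.
    Dop.SloppyComms(1);   // all three stencils take the sloppy comms path
    // ... run a tolerant inner solve here ...
    Dop.SloppyComms(0);   // restore exact comms for the outer correction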
@@ -40,8 +40,6 @@ public:
   static const std::vector<int> directions;
   static const std::vector<int> displacements;
   const int npoint = 16;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
 };

 template<class Impl>
@@ -181,6 +179,12 @@ public:
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }

   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
@@ -36,8 +36,6 @@ public:
   static const std::vector<int> directions;
   static const std::vector<int> displacements;
   static const int npoint = 8;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
 };

 template <class Impl>
@@ -148,6 +146,12 @@ public:
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }

   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
@@ -141,9 +141,9 @@ public:
     Udag = Udag *phases;

     InsertGaugeField(Uds,U,mu);
-    InsertGaugeField(Uds,Udag,mu+Nd);
+    InsertGaugeField(Uds,Udag,mu+4);
     // PokeIndex<LorentzIndex>(Uds, U, mu);
-    // PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
+    // PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);

     // 3 hop based on thin links. Crazy huh ?
     U = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -156,7 +156,7 @@ public:
     UUUdag = UUUdag *phases;

     InsertGaugeField(UUUds,UUU,mu);
-    InsertGaugeField(UUUds,UUUdag,mu+Nd);
+    InsertGaugeField(UUUds,UUUdag,mu+4);

   }
 }
@@ -1,175 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-class TwoSpinWilsonFermion3plus1DStatic {
-public:
-  // S-direction is INNERMOST and takes no part in the parity.
-  static const std::vector<int> directions;
-  static const std::vector<int> displacements;
-  static constexpr int npoint = 6;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
-};
-
-template<class Impl>
-class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
-{
-public:
-  INHERIT_IMPL_TYPES(Impl);
-  typedef TwoSpinWilsonKernels<Impl> Kernels;
-
-  FermionField _tmp;
-  FermionField &tmp(void) { return _tmp; }
-
-  int Dirichlet;
-  Coordinate Block;
-
-  ///////////////////////////////////////////////////////////////
-  // Implement the abstract base
-  ///////////////////////////////////////////////////////////////
-  GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
-  GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
-  GridBase *FermionGrid(void) { return _FourDimGrid;}
-  GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
-
-  // full checkerboard operations; leave unimplemented as abstract for now
-  virtual void M (const FermionField &in, FermionField &out){assert(0);};
-  virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
-
-  // half checkerboard operations; leave unimplemented as abstract for now
-  virtual void Meooe (const FermionField &in, FermionField &out);
-  virtual void Mooee (const FermionField &in, FermionField &out);
-  virtual void MooeeInv (const FermionField &in, FermionField &out);
-
-  virtual void MeooeDag (const FermionField &in, FermionField &out);
-  virtual void MooeeDag (const FermionField &in, FermionField &out);
-  virtual void MooeeInvDag (const FermionField &in, FermionField &out);
-  virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-  virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
-
-  // These can be overridden by fancy 5d chiral action
-  virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-  virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-  virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
-
-  // void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-  void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-  void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-
-  // Implement hopping term non-hermitian hopping term; half cb or both
-  // Implement s-diagonal DW
-  void DW (const FermionField &in, FermionField &out,int dag);
-  void Dhop (const FermionField &in, FermionField &out,int dag);
-  void DhopOE(const FermionField &in, FermionField &out,int dag);
-  void DhopEO(const FermionField &in, FermionField &out,int dag);
-
-  void DhopComms (const FermionField &in, FermionField &out);
-  void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
-
-  // add a DhopComm
-  // -- suboptimal interface will presently trigger multiple comms.
-  void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
-  void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
-  void DhopDirComms(const FermionField &in);
-  void DhopDirCalc(const FermionField &in, FermionField &out,int point);
-
-  ///////////////////////////////////////////////////////////////
-  // New methods added
-  ///////////////////////////////////////////////////////////////
-  void DerivInternal(StencilImpl & st,
-                     DoubledGaugeField & U,
-                     GaugeField &mat,
-                     const FermionField &A,
-                     const FermionField &B,
-                     int dag);
-
-  void DhopInternal(StencilImpl & st,
-                    DoubledGaugeField &U,
-                    const FermionField &in,
-                    FermionField &out,
-                    int dag);
-
-  void DhopInternalOverlappedComms(StencilImpl & st,
-                                   DoubledGaugeField &U,
-                                   const FermionField &in,
-                                   FermionField &out,
-                                   int dag);
-
-  void DhopInternalSerialComms(StencilImpl & st,
-                               DoubledGaugeField &U,
-                               const FermionField &in,
-                               FermionField &out,
-                               int dag);
-
-  // Constructors
-  TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
-                              GridCartesian &FourDimGrid,
-                              GridRedBlackCartesian &FourDimRedBlackGrid,
-                              GridCartesian &ThreeDimGrid,
-                              GridRedBlackCartesian &ThreeDimRedBlackGrid,
-                              double _M5,const ImplParams &p= ImplParams());
-
-  virtual void DirichletBlock(const Coordinate & block)
-  {
-  }
-
-  // DoubleStore
-  void ImportGauge(const GaugeField &_Umu);
-
-  ///////////////////////////////////////////////////////////////
-  // Data members require to support the functionality
-  ///////////////////////////////////////////////////////////////
-public:
-
-  // Add these to the support from Wilson
-  GridBase *_ThreeDimGrid;
-  GridBase *_ThreeDimRedBlackGrid;
-  GridBase *_FourDimGrid;
-  GridBase *_FourDimRedBlackGrid;
-
-  double M5;
-  int Ls;
-
-  //Defines the stencils for even and odd
-  StencilImpl Stencil;
-  StencilImpl StencilEven;
-  StencilImpl StencilOdd;
-
-  // Copy of the gauge field , with even and odd subsets
-  DoubledGaugeField Umu;
-  DoubledGaugeField UmuEven;
-  DoubledGaugeField UmuOdd;
-
-};
-
-NAMESPACE_END(Grid);
@@ -1,222 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-
-/////////////////////////////////////////////////////////////////////////////
-// Single flavour four spinors with colour index
-/////////////////////////////////////////////////////////////////////////////
-template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
-class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
-public:
-
-  static const int Dimension = Representation::Dimension;
-  static const bool isFundamental = Representation::isFundamental;
-
-  typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
-  INHERIT_GIMPL_TYPES(Gimpl);
-
-  //Necessary?
-  constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
-
-  typedef typename Options::_Coeff_t Coeff_t;
-
-  template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
-  template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
-  template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
-  template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
-  template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
-
-  typedef iImplSpinor<Simd> SiteSpinor;
-  typedef iImplPropagator<Simd> SitePropagator;
-  typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
-  typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
-  typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
-
-  typedef Lattice<SiteSpinor> FermionField;
-  typedef Lattice<SitePropagator> PropagatorField;
-  typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
-
-  typedef SimpleCompressor<SiteSpinor> Compressor;
-  typedef WilsonImplParams ImplParams;
-  typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
-  typedef const typename StencilImpl::View_type StencilView;
-
-  ImplParams Params;
-
-  TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
-  };
-
-  template<class _Spinor>
-  static accelerator_inline void multLink(_Spinor &phi,
-                                          const SiteDoubledGaugeField &U,
-                                          const _Spinor &chi,
-                                          int mu)
-  {
-    auto UU = coalescedRead(U(mu));
-    mult(&phi(), &UU, &chi());
-  }
-  template<class _Spinor>
-  static accelerator_inline void multLink(_Spinor &phi,
-                                          const SiteDoubledGaugeField &U,
-                                          const _Spinor &chi,
-                                          int mu,
-                                          StencilEntry *SE,
-                                          StencilView &St)
-  {
-    multLink(phi,U,chi,mu);
-  }
-
-  template<class _SpinorField>
-  inline void multLinkField(_SpinorField & out,
-                            const DoubledGaugeField &Umu,
-                            const _SpinorField & phi,
-                            int mu)
-  {
-    const int Nsimd = SiteHalfSpinor::Nsimd();
-    autoView( out_v, out, AcceleratorWrite);
-    autoView( phi_v, phi, AcceleratorRead);
-    autoView( Umu_v, Umu, AcceleratorRead);
-    typedef decltype(coalescedRead(out_v[0])) calcSpinor;
-    accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
-      calcSpinor tmp;
-      multLink(tmp,Umu_v[sss],phi_v(sss),mu);
-      coalescedWrite(out_v[sss],tmp);
-    });
-  }
-
-  template <class ref>
-  static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
-  {
-    reg = memory;
-  }
-
-  inline void DoubleStore(GridBase *GaugeGrid,
-                          DoubledGaugeField &Uds,
-                          const GaugeField &Umu)
-  {
-    typedef typename Simd::scalar_type scalar_type;
-
-    conformable(Uds.Grid(), GaugeGrid);
-    conformable(Umu.Grid(), GaugeGrid);
-
-    GaugeLinkField U(GaugeGrid);
-    GaugeLinkField tmp(GaugeGrid);
-
-    Lattice<iScalar<vInteger> > coor(GaugeGrid);
-    ////////////////////////////////////////////////////
-    // apply any boundary phase or twists
-    ////////////////////////////////////////////////////
-    for (int mu = 0; mu < Nd; mu++) {
-
-      ////////// boundary phase /////////////
-      auto pha = Params.boundary_phases[mu];
-      scalar_type phase( real(pha),imag(pha) );
-
-      int L = GaugeGrid->GlobalDimensions()[mu];
-      int Lmu = L - 1;
-
-      LatticeCoordinate(coor, mu);
-
-      U = PeekIndex<LorentzIndex>(Umu, mu);
-
-      // apply any twists
-      RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
-      if ( theta != 0.0) {
-        scalar_type twphase(::cos(theta),::sin(theta));
-        U = twphase*U;
-        std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
-      }
-
-      tmp = where(coor == Lmu, phase * U, U);
-      PokeIndex<LorentzIndex>(Uds, tmp, mu);
-
-      U = adj(Cshift(U, mu, -1));
-      U = where(coor == 0, conjugate(phase) * U, U);
-      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
-    }
-  }
-
-  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
-    GaugeLinkField link(mat.Grid());
-    link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
-    PokeIndex<LorentzIndex>(mat,link,mu);
-  }
-
-  inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
-    mat = outerProduct(B,A);
-  }
-
-  inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
-    mat = TraceIndex<SpinIndex>(P);
-  }
-
-  inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
-  {
-    for (int mu = 0; mu < Nd; mu++)
-      mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
-  }
-
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
-  {
-    int Ls=Btilde.Grid()->_fdimensions[0];
-    autoView( mat_v , mat, AcceleratorWrite);
-    {
-      const int Nsimd = SiteSpinor::Nsimd();
-      autoView( Btilde_v , Btilde, AcceleratorRead);
-      autoView( Atilde_v , Atilde, AcceleratorRead);
-      accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
-        int sU=sss;
-        typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
-        ColorMatrixType sum;
-        zeroit(sum);
-        for(int s=0;s<Ls;s++){
-          int sF = s+Ls*sU;
-          for(int spn=0;spn<Ns;spn++){ //sum over spin
-            auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
-            auto aa = coalescedRead(Atilde_v[sF]()(spn) );
-            auto op = outerProduct(bb,aa);
-            sum = sum + op;
-          }
-        }
-        coalescedWrite(mat_v[sU](mu)(), sum);
-      });
-    }
-  }
-};
-
-
-typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
-typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
-typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
-typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
-
-NAMESPACE_END(Grid);
@@ -1,84 +0,0 @@
-/*************************************************************************************
-
-Grid physics library, www.github.com/paboyle/Grid
-
-Source file: ./lib/qcd/action/fermion/WilsonKernels.h
-
-Copyright (C) 2015
-
-Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-See the full license in the file "LICENSE" in the top level distribution
-directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-NAMESPACE_BEGIN(Grid);
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Helper routines that implement Wilson stencil for a single site.
-// Common to both the WilsonFermion and WilsonFermion5D
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
-public:
-
-  INHERIT_IMPL_TYPES(Impl);
-  typedef FermionOperator<Impl> Base;
-  typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
-public:
-
-  static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
-                         int Ls, int Nsite, const FermionField &in, FermionField &out,
-                         int interior=1,int exterior=1) ;
-
-  static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
-                         int Ls, int Nsite, const FermionField &in, FermionField &out,
-                         uint64_t *ids);
-
-  static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
-                            int Ls, int Nsite, const FermionField &in, FermionField &out,
-                            int interior=1,int exterior=1) ;
-
-  static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
-                          int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
-
-  static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
-                            int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
-
-private:
-
-  static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
-                                          int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
-
-  static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-  static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
-
-public:
-  TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
-};
-
-NAMESPACE_END(Grid);
@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-///////////////////////////////////////////////////////////////
-// Wilson compressor will need FaceGather policies for:
-// Periodic, Dirichlet, and partial Dirichlet for DWF
-///////////////////////////////////////////////////////////////
-const int dwf_compressor_depth=2;
-#define DWF_COMPRESS
-class FaceGatherPartialDWF
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else
-  static int PartialCompressionFactor(GridBase *grid) { return 1;}
-#endif
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-                                   const Lattice<vobj> &rhs,
-                                   cobj *buffer,
-                                   compressor &compress,
-                                   int off,int so,int partial)
-  {
-    //DWF only hack: If a direction that is OFF node we use Partial Dirichlet
-    // Shrinks local and remote comms buffers
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth=Ls/2;
-#endif
-    std::pair<int,int> *table_v = & table[0];
-    auto rhs_v = rhs.View(AcceleratorRead);
-    int vol=table.size()/Ls;
-    accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
-      Integer i=idx/Ls;
-      Integer s=idx%Ls;
-      Integer sc=depth+s-(Ls-depth);
-      if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
-      if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
-    });
-    rhs_v.ViewClose();
-  }
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    auto Ls = dd.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth=Ls/2;
-#endif
-    // Just pass in the Grid
-    auto kp = dd.kernel_p;
-    auto mp = dd.mpi_p;
-    int size= dd.buffer_size;
-    int vol= size/Ls;
-    accelerator_forNB(o,size,1,{
-      int idx=o/Ls;
-      int s=o%Ls;
-      if ( s < depth ) {
-        int oo=s*vol+idx;
-        kp[o]=mp[oo];
-      } else if ( s >= Ls-depth ) {
-        int sc = depth + s - (Ls-depth);
-        int oo=sc*vol+idx;
-        kp[o]=mp[oo];
-      } else {
-        kp[o] = Zero();//fill rest with zero if partial dirichlet
-      }
-    });
-  }
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  // Need to gather *interior portions* for ALL s-slices in simd directions
-  // Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
-  // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
-  ////////////////////////////////////////////////////////////////////////////////////////////
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-                                    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-                                    compressor &compress,int type,int partial)
-  {
-    GridBase *Grid = rhs.Grid();
-    int Ls = Grid->_rdimensions[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-
-    // insertion of zeroes...
-    assert( (table.size()&0x1)==0);
-    int num=table.size()/2;
-    int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-
-    auto rhs_v = rhs.View(AcceleratorRead);
-    auto p0=&pointers[0][0];
-    auto p1=&pointers[1][0];
-    auto tp=&table[0];
-    int nnum=num/Ls;
-    accelerator_forNB(j, num, vobj::Nsimd(), {
-      // Reorders both local and remote comms buffers
-      //
-      int s = j % Ls;
-      int sp1 = (s+depth)%Ls; // peri incremented s slice
-
-      int hxyz= j/Ls;
-
-      int xyz0= hxyz*2; // xyzt part of coor
-      int xyz1= hxyz*2+1;
-
-      int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-
-      int kk0= xyz0*Ls + s ; // s=0 goes to s=1
-      int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
-      compress.CompressExchange(p0[jj],p1[jj],
-                                rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
-                                rhs_v[so+tp[kk1 ].second],
-                                type);
-    });
-    rhs_v.ViewClose();
-  }
-  // Merge routine is for SIMD faces
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    auto Ls = mm.dims[0];
-#ifdef DWF_COMPRESS
-    int depth=dwf_compressor_depth;
-#else
-    int depth = Ls/2;
-#endif
-    int num= mm.buffer_size/2; // relate vol and Ls to buffer size
-    auto mp = &mm.mpointer[0];
-    auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
-    auto vp1= &mm.vpointers[1][0];
-    auto type= mm.type;
-    int nnum = num/Ls;
-    accelerator_forNB(o,num,Merger::Nsimd,{
-
-      int s=o%Ls;
-      int hxyz=o/Ls; // xyzt related component
-      int xyz0=hxyz*2;
-      int xyz1=hxyz*2+1;
-
-      int sp = (s+depth)%Ls;
-      int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
-
-      int oo0= s+xyz0*Ls;
-      int oo1= s+xyz1*Ls;
-
-      // same ss0, ss1 pair goes to new layout
-      decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
-    });
-  }
-};
-class FaceGatherDWFMixedBCs
-{
-public:
-#ifdef DWF_COMPRESS
-  static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
-#else
-  static int PartialCompressionFactor(GridBase *grid) {return 1;}
-#endif
-
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
-                                   const Lattice<vobj> &rhs,
-                                   cobj *buffer,
-                                   compressor &compress,
-                                   int off,int so,int partial)
-  {
-    // std::cout << " face gather simple DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-    else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
-  }
-  template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-                                    std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
-                                    compressor &compress,int type,int partial)
-  {
-    // std::cout << " face gather exch DWF partial "<<partial <<std::endl;
-    if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-    else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
-  }
-  template<class decompressor,class Merger>
-  static void MergeFace(decompressor decompress,Merger &mm)
-  {
-    int partial = mm.partial;
-    // std::cout << " merge DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
-    else FaceGatherSimple::MergeFace(decompress,mm);
-  }
-
-  template<class decompressor,class Decompression>
-  static void DecompressFace(decompressor decompress,Decompression &dd)
-  {
-    int partial = dd.partial;
-    // std::cout << " decompress DWF partial "<<partial <<std::endl;
-    if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
-    else FaceGatherSimple::DecompressFace(decompress,dd);
-  }
-};
-
 /////////////////////////////////////////////////////////////////////////////////////////////
 // optimised versions supporting half precision too??? Deprecate
 /////////////////////////////////////////////////////////////////////////////////////////////
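A worked example of the deleted partial-Dirichlet packing, taking Ls=16 with depth=dwf_compressor_depth=2 (the Ls value is illustrative; depth=2 is the constant from the deleted code):

    // kept s-slices per face: s=0,1 stored at sc=s, and s=14,15 stored at
    //   sc = depth + s - (Ls-depth) = 2,3
    // the comms buffer holds 2*depth = 4 slices instead of Ls = 16, i.e.
    //   PartialCompressionFactor = Ls/(2*depth) = 16/4 = 4
    // DecompressFace zero-fills the dropped interior slices on the receive side.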
@@ -242,8 +39,7 @@ public:

 //Could make FaceGather a template param, but then behaviour is runtime not compile time
 template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
-class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
-// : public FaceGatherSimple
+class WilsonCompressorTemplate : public FaceGatherSimple
 {
 public:

@@ -38,8 +38,6 @@ public:
   static int MortonOrder;
   static const std::vector<int> directions;
   static const std::vector<int> displacements;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
   static const int npoint = 8;
 };

@@ -167,6 +165,12 @@ public:
   StencilImpl Stencil;
   StencilImpl StencilEven;
   StencilImpl StencilOdd;
+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }

   // Copy of the gauge field , with even and odd subsets
   DoubledGaugeField Umu;
@@ -62,8 +62,6 @@ public:
   static const std::vector<int> directions;
   static const std::vector<int> displacements;
   static constexpr int npoint = 8;
-  static std::vector<int> MakeDirections(void);
-  static std::vector<int> MakeDisplacements(void);
 };

 template<class Impl>
@@ -206,7 +204,14 @@ public:
   DoubledGaugeField Umu;
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;

+  void SloppyComms(int sloppy)
+  {
+    Stencil.SetSloppyComms(sloppy);
+    StencilEven.SetSloppyComms(sloppy);
+    StencilOdd.SetSloppyComms(sloppy);
+  }
   // Comms buffer
   // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

@@ -166,7 +166,7 @@ public:

       U = adj(Cshift(U, mu, -1));
       U = where(coor == 0, conjugate(phase) * U, U);
-      PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
+      PokeIndex<LorentzIndex>(Uds, U, mu + 4);
     }
   }

@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
                  Frbgrid,
                  Ugrid,
                  Urbgrid,
-                 Nd*1.0,p)
+                 4.0,p)

   {
     update(_mass,_mu);
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
     out.Checkerboard() = in.Checkerboard();
     //axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
     for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = Nd*1.0+this->mass[s];
+      ComplexD a = 4.0+this->mass[s];
       ComplexD b(0.0,this->mu[s]);
       axpbg5y_ssp(out,a,in,b,in,s,s);
     }
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
   virtual void MooeeDag(const FermionField &in, FermionField &out) {
     out.Checkerboard() = in.Checkerboard();
     for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = Nd*1.0+this->mass[s];
+      ComplexD a = 4.0+this->mass[s];
       ComplexD b(0.0,-this->mu[s]);
       axpbg5y_ssp(out,a,in,b,in,s,s);
     }
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
     for (int s=0;s<(int)this->mass.size();s++) {
       RealD m = this->mass[s];
       RealD tm = this->mu[s];
-      RealD mtil = Nd*1.0+this->mass[s];
+      RealD mtil = 4.0+this->mass[s];
       RealD sq = mtil*mtil+tm*tm;
       ComplexD a = mtil/sq;
       ComplexD b(0.0, -tm /sq);
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
     for (int s=0;s<(int)this->mass.size();s++) {
       RealD m = this->mass[s];
       RealD tm = this->mu[s];
-      RealD mtil = Nd*1.0+this->mass[s];
+      RealD mtil = 4.0+this->mass[s];
       RealD sq = mtil*mtil+tm*tm;
       ComplexD a = mtil/sq;
       ComplexD b(0.0,tm /sq);
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
     this->Dhop(in, out, DaggerNo);
     FermionField tmp(out.Grid());
     for (int s=0;s<(int)this->mass.size();s++) {
-      ComplexD a = Nd*1.0+this->mass[s];
+      ComplexD a = 4.0+this->mass[s];
      ComplexD b(0.0,this->mu[s]);
       axpbg5y_ssp(tmp,a,in,b,in,s,s);
     }
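The recurring Nd*1.0+mass to 4.0+mass change hardcodes the Wilson diagonal term for four dimensions. For reference, the twisted-mass diagonal being applied and inverted in these hunks is, schematically:

    // Mooee    = (4 + m) + i*mu*g5  =  mtil + i*tm*g5
    // MooeeInv = (mtil - i*tm*g5) / (mtil^2 + tm^2)
    // because (mtil + i*tm*g5)(mtil - i*tm*g5) = mtil^2 + tm^2   (g5^2 = 1),
    // matching a = mtil/sq and b = -i*tm/sq in the code above.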
@@ -240,7 +240,7 @@ void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::ve
   this->ceo.resize(Ls);

   for(int i=0; i<Ls; ++i){
-    this->bee[i] = Nd*1.0 - this->M5 + 1.0;
+    this->bee[i] = 4.0 - this->M5 + 1.0;
     this->cee[i] = 1.0;
   }

@@ -1,486 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
-
-    Copyright (C) 2025
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
-#include <Grid/perfmon/PerfCount.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// 5d lattice for DWF.
-template<class Impl>
-TwoSpinWilsonFermion3plus1D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
-                                   GridCartesian         &FourDimGrid,
-                                   GridRedBlackCartesian &FourDimRedBlackGrid,
-                                   GridCartesian         &ThreeDimGrid,
-                                   GridRedBlackCartesian &ThreeDimRedBlackGrid,
-                                   RealD _M5,const ImplParams &p) :
-  Kernels(p),
-  _FourDimGrid        (&FourDimGrid),
-  _FourDimRedBlackGrid(&FourDimRedBlackGrid),
-  _ThreeDimGrid       (&ThreeDimGrid),
-  _ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
-  Stencil    (_FourDimGrid,npoint,Even,directions,displacements,p),
-  StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
-  StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
-  M5(_M5),
-  Umu(_ThreeDimGrid),
-  UmuEven(_ThreeDimRedBlackGrid),
-  UmuOdd (_ThreeDimRedBlackGrid),
-  _tmp(&FourDimRedBlackGrid),
-  Dirichlet(0)
-{
-  // some assertions
-  assert(FourDimGrid._ndimension==Nd+1);
-  assert(ThreeDimGrid._ndimension==Nd);
-  assert(ThreeDimRedBlackGrid._ndimension==Nd);
-  assert(FourDimRedBlackGrid._ndimension==Nd+1);
-  assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
-
-  // extent of fifth dim and not spread out
-  Ls=FourDimGrid._fdimensions[0];
-  assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
-  assert(FourDimGrid._processors[0]         ==1);
-  assert(FourDimRedBlackGrid._processors[0] ==1);
-
-  // Other dimensions must match the decomposition of the four-D fields
-  for(int d=0;d<Nd;d++){
-
-    assert(FourDimGrid._processors[d+1]         ==ThreeDimGrid._processors[d]);
-    assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
-    assert(ThreeDimRedBlackGrid._processors[d]  ==ThreeDimGrid._processors[d]);
-
-    assert(FourDimGrid._fdimensions[d+1]        ==ThreeDimGrid._fdimensions[d]);
-    assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
-    assert(ThreeDimRedBlackGrid._fdimensions[d] ==ThreeDimGrid._fdimensions[d]);
-
-    assert(FourDimGrid._simd_layout[d+1]        ==ThreeDimGrid._simd_layout[d]);
-    assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
-    assert(ThreeDimRedBlackGrid._simd_layout[d] ==ThreeDimGrid._simd_layout[d]);
-  }
-
-  if ( p.dirichlet.size() == Nd+1) {
-    Coordinate block = p.dirichlet;
-    for(int d=0;d<Nd+1;d++) {
-      if ( block[d] ){
-        Dirichlet = 1;
-        std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
-        std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
-        Block = block;
-      }
-    }
-  } else {
-    Coordinate block(Nd+1,0);
-    Block = block;
-  }
-
-  // Dimension zero of the five-d is the Ls direction
-  assert(FourDimRedBlackGrid._simd_layout[0]==1);
-  assert(FourDimGrid._simd_layout[0] ==1);
-
-  // Allocate the required comms buffer
-  ImportGauge(_Umu);
-  // Build lists of exterior only nodes
-  int LLs = FourDimGrid._rdimensions[0];
-  int vol3;
-  vol3=ThreeDimGrid.oSites();
-  Stencil.BuildSurfaceList(LLs,vol3);
-
-  vol3=ThreeDimRedBlackGrid.oSites();
-  StencilEven.BuildSurfaceList(LLs,vol3);
-  StencilOdd.BuildSurfaceList(LLs,vol3);
-
-}
-
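
The assertion block in the constructor above pins the (3+1)-d fermion grid
to the 3-d gauge grid: the s-extent occupies dimension zero, undecomposed
and un-SIMDed, and every other extent, processor split and SIMD layout must
match dimension-for-dimension with an offset of one. A standalone sketch of
that extent bookkeeping (illustrative values only, not Grid API):

    #include <cassert>
    #include <vector>

    int main() {
      const int Nd = 3;                    // spatial dimensions (assumption)
      std::vector<int> three{8, 8, 8};     // gauge-grid extents
      std::vector<int> four{12};           // Ls first, innermost
      four.insert(four.end(), three.begin(), three.end());

      assert((int)four.size() == Nd + 1);  // mirrors _ndimension==Nd+1
      for (int d = 0; d < Nd; d++)
        assert(four[d + 1] == three[d]);   // mirrors the _fdimensions checks
      return 0;
    }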
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
-{
-  GaugeField HUmu(_Umu.Grid());
-  HUmu = _Umu*(-0.5);
-  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
-  pickCheckerboard(Even,UmuEven,Umu);
-  pickCheckerboard(Odd ,UmuOdd,Umu);
-}
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
-{
-  int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
-                    // we drop off the innermost fifth dimension
-  //  assert( (disp==1)||(disp==-1) );
-  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
-
-  int skip = (disp==1) ? 0 : 1;
-  int dirdisp = dir+skip*Nd;
-  int gamma   = dir+(1-skip)*Nd;
-
-  Compressor compressor(DaggerNo);
-  Stencil.HaloExchange(in,compressor);
-
-  uint64_t Nsite = Umu.Grid()->oSites();
-  Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
-};
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
-{
-  Compressor compressor(DaggerNo);
-  Stencil.HaloExchange(in,compressor);
-  uint64_t Nsite = Umu.Grid()->oSites();
-  Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
-};
-
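
DhopDir above turns a (direction, displacement) request into an index into
the stencil tables: forward legs occupy slots [0,Nd) and backward legs
[Nd,2Nd), while gamma selects the projector from the opposite half. A
minimal sketch of the mapping (the StencilIndex helper is hypothetical):

    #include <cassert>

    int StencilIndex(int dir, int disp, int Nd) {
      int skip = (disp == 1) ? 0 : 1;  // forward legs come first
      return dir + skip * Nd;          // backward legs are offset by Nd
    }

    int main() {
      assert(StencilIndex(2, +1, 3) == 2); // +z hop in an Nd=3 stencil
      assert(StencilIndex(2, -1, 3) == 5); // -z hop lands in the back half
      return 0;
    }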
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
-                                                      DoubledGaugeField & U,
-                                                      GaugeField &mat,
-                                                      const FermionField &A,
-                                                      const FermionField &B,
-                                                      int dag)
-{
-  assert((dag==DaggerNo) ||(dag==DaggerYes));
-
-  conformable(st.Grid(),A.Grid());
-  conformable(st.Grid(),B.Grid());
-
-  Compressor compressor(dag);
-
-  FermionField Btilde(B.Grid());
-  FermionField Atilde(B.Grid());
-
-  st.HaloExchange(B,compressor);
-
-  Atilde=A;
-  int LLs = B.Grid()->_rdimensions[0];
-
-  for (int mu = 0; mu < Nd; mu++) {
-    ////////////////////////////////////////////////////////////////////////
-    // Flip gamma if dag
-    ////////////////////////////////////////////////////////////////////////
-    int gamma = mu;
-    if (!dag) gamma += Nd;
-
-    ////////////////////////
-    // Call the single hop
-    ////////////////////////
-
-    int Usites = U.Grid()->oSites();
-
-    Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
-
-    ////////////////////////////
-    // spin trace outer product
-    ////////////////////////////
-    Impl::InsertForce5D(mat, Btilde, Atilde, mu);
-  }
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
-                                                  const FermionField &A,
-                                                  const FermionField &B,
-                                                  int dag)
-{
-  conformable(A.Grid(),FermionGrid());
-  conformable(A.Grid(),B.Grid());
-
-  //conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
-
-  mat.Checkerboard() = A.Checkerboard();
-  //  mat.checkerboard = A.checkerboard;
-
-  DerivInternal(Stencil,Umu,mat,A,B,dag);
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
-                                                    const FermionField &A,
-                                                    const FermionField &B,
-                                                    int dag)
-{
-  conformable(A.Grid(),FermionRedBlackGrid());
-  conformable(A.Grid(),B.Grid());
-
-  assert(B.Checkerboard()==Odd);
-  assert(A.Checkerboard()==Even);
-  mat.Checkerboard() = Even;
-
-  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
-}
-
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
-                                                    const FermionField &A,
-                                                    const FermionField &B,
-                                                    int dag)
-{
-  conformable(A.Grid(),FermionRedBlackGrid());
-  conformable(A.Grid(),B.Grid());
-
-  assert(B.Checkerboard()==Even);
-  assert(A.Checkerboard()==Odd);
-  mat.Checkerboard() = Odd;
-
-  DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
-                                                     DoubledGaugeField & U,
-                                                     const FermionField &in, FermionField &out,int dag)
-{
-  DhopInternalSerialComms(st,U,in,out,dag);
-}
-
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
-                                                                    DoubledGaugeField & U,
-                                                                    const FermionField &in, FermionField &out,int dag)
-{
-  GRID_TRACE("DhopInternalOverlappedComms");
-  Compressor compressor(dag);
-
-  int LLs = in.Grid()->_rdimensions[0];
-  int len =  U.Grid()->oSites();
-
-  /////////////////////////////
-  // Start comms  // Gather intranode and extra node differentiated??
-  /////////////////////////////
-  {
-    //    std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
-    GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
-  }
-
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
-  std::vector<std::vector<CommsRequest_t> > requests;
-
-#if 1
-  /////////////////////////////
-  // Overlap with comms
-  /////////////////////////////
-  st.CommunicateBegin(requests);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-#endif
-
-  /////////////////////////////
-  // do the compute interior
-  /////////////////////////////
-  if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagInterior");
-    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
-  } else {
-    GRID_TRACE("DhopInterior");
-    Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
-  }
-
-  //ifdef GRID_ACCELERATED
-#if 0
-  /////////////////////////////
-  // Overlap with comms -- on GPU the interior kernel call is nonblocking
-  /////////////////////////////
-  st.CommunicateBegin(requests);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-#endif
-
-
-  /////////////////////////////
-  // Complete comms
-  /////////////////////////////
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
-  st.CommunicateComplete(requests);
-  //  traceStop(id);
-
-  /////////////////////////////
-  // do the compute exterior
-  /////////////////////////////
-  {
-    //    std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
-    GRID_TRACE("Merge");
-    st.CommsMerge(compressor);
-  }
-
-
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
-  if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagExterior");
-    Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
-  } else {
-    GRID_TRACE("DhopExterior");
-    Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
-  }
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
-}
-
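
The overlapped path above follows a fixed gather/begin/interior/complete/
merge/exterior schedule, while the serial path that follows simply does the
whole halo exchange up front. A schematic of the overlapped ordering, with
stub functions standing in for the stencil and kernel calls (not Grid API):

    #include <cstdio>

    static void Gather()              { std::puts("pack surface sites"); }
    static void CommunicateBegin()    { std::puts("start non-blocking halo exchange"); }
    static void InteriorKernel()      { std::puts("compute sites needing no remote data"); }
    static void CommunicateComplete() { std::puts("wait on the halo"); }
    static void Merge()               { std::puts("unpack received surface data"); }
    static void ExteriorKernel()      { std::puts("finish halo-dependent sites"); }

    int main() {
      Gather();              // st.HaloExchangeOptGather
      CommunicateBegin();    // st.CommunicateBegin(requests)
      InteriorKernel();      // DhopKernel(...,1,0): interior only
      CommunicateComplete(); // st.CommunicateComplete(requests)
      Merge();               // st.CommsMerge(compressor)
      ExteriorKernel();      // DhopKernel(...,0,1): exterior only
      return 0;
    }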
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st,
-                                                                DoubledGaugeField & U,
-                                                                const FermionField &in,
-                                                                FermionField &out,int dag)
-{
-  GRID_TRACE("DhopInternalSerialComms");
-  Compressor compressor(dag);
-
-  int LLs = in.Grid()->_rdimensions[0];
-
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
-  {
-    GRID_TRACE("HaloExchange");
-    st.HaloExchangeOpt(in,compressor);
-  }
-
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
-  if (dag == DaggerYes) {
-    GRID_TRACE("DhopDag");
-    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
-  } else {
-    GRID_TRACE("Dhop");
-    Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
-  }
-  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
-}
-
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
-{
-  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
-  conformable(in.Grid(),out.Grid()); // drops the cb check
-
-  assert(in.Checkerboard()==Even);
-  out.Checkerboard() = Odd;
-
-  DhopInternal(StencilEven,UmuOdd,in,out,dag);
-}
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
-{
-  conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
-  conformable(in.Grid(),out.Grid()); // drops the cb check
-
-  assert(in.Checkerboard()==Odd);
-  out.Checkerboard() = Even;
-
-  DhopInternal(StencilOdd,UmuEven,in,out,dag);
-}
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
-{
-  int dag =0 ;
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-  out.Checkerboard() = in.Checkerboard();
-  Compressor compressor(dag);
-  Stencil.HaloExchangeOpt(in,compressor);
-}
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
-{
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  int LLs = in.Grid()->_rdimensions[0];
-  Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
-{
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  DhopInternal(Stencil,Umu,in,out,dag);
-}
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
-{
-  out.Checkerboard()=in.Checkerboard();
-  Dhop(in,out,dag); // -0.5 is included
-  axpy(out,Nd*1.0-M5,in,out);
-}
-template <class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
-{
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerNo);
-  } else {
-    DhopOE(in, out, DaggerNo);
-  }
-}
-
-template <class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
-{
-  if (in.Checkerboard() == Odd) {
-    DhopEO(in, out, DaggerYes);
-  } else {
-    DhopOE(in, out, DaggerYes);
-  }
-}
-
-template <class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(Nd*1.0 + M5);
-  out = scal * in;
-}
-
-template <class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  Mooee(in, out);
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(Nd*1.0 + M5))*in;
-}
-
-template<class Impl>
-void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
-{
-  out.Checkerboard() = in.Checkerboard();
-  MooeeInv(in,out);
-}
-
-NAMESPACE_END(Grid);
-
@@ -1,441 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
-
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution
-    directory
-*************************************************************************************/
-/* END LEGAL */
-#pragma once
-
-#include <Grid/qcd/action/fermion/FermionCore.h>
-
-NAMESPACE_BEGIN(Grid);
-
-
-////////////////////////////////////////////
-// Generic implementation; move to different file?
-////////////////////////////////////////////
-
-#define GENERIC_STENCIL_LEG(Dir,spProj,Recon)                           \
-  SE = st.GetEntry(ptype, Dir, sF);                                     \
-  if (SE->_is_local) {                                                  \
-    int perm= SE->_permute;                                             \
-    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);   \
-    spProj(chi,tmp);                                                    \
-  } else {                                                              \
-    chi = coalescedRead(buf[SE->_offset],lane);                         \
-  }                                                                     \
-  acceleratorSynchronise();                                             \
-  Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);                        \
-  Recon(result, Uchi);
-
-#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon)                       \
-  SE = st.GetEntry(ptype, Dir, sF);                                     \
-  if (SE->_is_local) {                                                  \
-    int perm= SE->_permute;                                             \
-    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);   \
-    spProj(chi,tmp);                                                    \
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);                      \
-    Recon(result, Uchi);                                                \
-  }                                                                     \
-  acceleratorSynchronise();
-
-#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)                       \
-  SE = st.GetEntry(ptype, Dir, sF);                                     \
-  if (!SE->_is_local ) {                                                \
-    auto chi = coalescedRead(buf[SE->_offset],lane);                    \
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);                      \
-    Recon(result, Uchi);                                                \
-    nmu++;                                                              \
-  }                                                                     \
-  acceleratorSynchronise();
-
-#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon)                      \
-  if (SE->_is_local ) {                                                 \
-    int perm= SE->_permute;                                             \
-    auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);   \
-    spProj(chi,tmp);                                                    \
-  } else {                                                              \
-    chi = coalescedRead(buf[SE->_offset],lane);                         \
-  }                                                                     \
-  acceleratorSynchronise();                                             \
-  Impl::multLink(Uchi, U[sU], chi, dir, SE, st);                        \
-  Recon(result, Uchi);
-
-#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon)                           \
-  if (gamma == Dir) {                                                   \
-    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon);                         \
-  }
-
-////////////////////////////////////////////////////////////////////
-// All legs kernels ; comms then compute
-////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::DhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
-                                             SiteSpinor *buf, int sF,
-                                             int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  calcSpinor chi;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-  GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
-  GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
-  coalescedWrite(out[sF],result,lane);
-};
-
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
-                                                 SiteSpinor *buf, int sF,
-                                                 int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  calcSpinor chi;
-  //  calcSpinor *chi_p;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-  GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
-  GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
-  coalescedWrite(out[sF], result,lane);
-};
-////////////////////////////////////////////////////////////////////
-// Interior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
-                                                       SiteSpinor *buf, int sF,
-                                                       int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  calcSpinor chi;
-  //  calcSpinor *chi_p;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
-  coalescedWrite(out[sF], result,lane);
-};
-
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
-                                                    SiteSpinor *buf, int sF,
-                                                    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-
-  calcSpinor chi;
-  //  calcSpinor *chi_p;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  result=Zero();
-  GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
-  coalescedWrite(out[sF], result,lane);
-};
-////////////////////////////////////////////////////////////////////
-// Exterior kernels
-////////////////////////////////////////////////////////////////////
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
-                                                       SiteSpinor *buf, int sF,
-                                                       int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  //  calcSpinor *chi_p;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
-  if ( nmu ) {
-    auto out_t = coalescedRead(out[sF],lane);
-    out_t = out_t + result;
-    coalescedWrite(out[sF],out_t,lane);
-  }
-};
-
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
-                                                    SiteSpinor *buf, int sF,
-                                                    int sU, const FermionFieldView &in, FermionFieldView &out)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  //  calcSpinor *chi_p;
-  calcSpinor Uchi;
-  calcSpinor result;
-  StencilEntry *SE;
-  int ptype;
-  int nmu=0;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-  result=Zero();
-  GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
-  GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
-  if ( nmu ) {
-    auto out_t = coalescedRead(out[sF],lane);
-    out_t = out_t + result;
-    coalescedWrite(out[sF],out_t,lane);
-  }
-};
-
-#define DhopDirMacro(Dir,spProj,spRecon)                                \
-  template <class Impl> accelerator_inline                              \
-  void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
-                                                int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
-  {                                                                     \
-    typedef decltype(coalescedRead(in[0])) calcSpinor;                  \
-    calcSpinor chi;                                                     \
-    calcSpinor result;                                                  \
-    calcSpinor Uchi;                                                    \
-    StencilEntry *SE;                                                   \
-    int ptype;                                                          \
-    const int Nsimd = SiteSpinor::Nsimd();                              \
-    const int lane=acceleratorSIMTlane(Nsimd);                          \
-                                                                        \
-    SE = st.GetEntry(ptype, dir, sF);                                   \
-    GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon);                       \
-    coalescedWrite(out[sF], result,lane);                               \
-  }
-
-DhopDirMacro(Xp,pauliProjXp,pauliAssign);
-DhopDirMacro(Yp,pauliProjYp,pauliAssign);
-DhopDirMacro(Zp,pauliProjZp,pauliAssign);
-DhopDirMacro(Xm,pauliProjXm,pauliAssign);
-DhopDirMacro(Ym,pauliProjYm,pauliAssign);
-DhopDirMacro(Zm,pauliProjZm,pauliAssign);
-
-template <class Impl> accelerator_inline
-void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
-                                           int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
-{
-  typedef decltype(coalescedRead(in[0])) calcSpinor;
-  calcSpinor chi;
-  calcSpinor result;
-  calcSpinor Uchi;
-  StencilEntry *SE;
-  int ptype;
-  const int Nsimd = SiteSpinor::Nsimd();
-  const int lane=acceleratorSIMTlane(Nsimd);
-
-  SE = st.GetEntry(ptype, dir, sF);
-  GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
-  GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
-  GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
-  GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
-  GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
-  GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
-  coalescedWrite(out[sF], result,lane);
-}
-
-template <class Impl>
-void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
-                                             int Nsite, const FermionField &in, std::vector<FermionField> &out)
-{
-  autoView(U_v  ,U,AcceleratorRead);
-  autoView(in_v ,in,AcceleratorRead);
-  autoView(st_v ,st,AcceleratorRead);
-
-  autoView(out_Xm,out[0],AcceleratorWrite);
-  autoView(out_Ym,out[1],AcceleratorWrite);
-  autoView(out_Zm,out[2],AcceleratorWrite);
-  autoView(out_Xp,out[4],AcceleratorWrite);
-  autoView(out_Yp,out[5],AcceleratorWrite);
-  autoView(out_Zp,out[6],AcceleratorWrite);
-  auto CBp=st.CommBuf();
-  accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
-      int sU=sss/Ls;
-      int sF =sss;
-      DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
-      DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
-      DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
-      DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
-      DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
-      DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
-    });
-}
-
-
-template <class Impl>
-void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
-                                                int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
-{
-  assert(dirdisp<=5);
-  assert(dirdisp>=0);
-
-  autoView(U_v  ,U  ,AcceleratorRead);
-  autoView(in_v ,in ,AcceleratorRead);
-  autoView(out_v,out,AcceleratorWrite);
-  autoView(st_v ,st ,AcceleratorRead);
-  auto CBp=st.CommBuf();
-#define LoopBody(Dir)                                         \
-  case Dir :                                                  \
-    accelerator_for(ss,Nsite,Simd::Nsimd(),{                  \
-        for(int s=0;s<Ls;s++){                                \
-          int sU=ss;                                          \
-          int sF = s+Ls*sU;                                   \
-          DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
-        }                                                     \
-      });                                                     \
-    break;
-
-  switch(gamma){
-    LoopBody(Xp);
-    LoopBody(Yp);
-    LoopBody(Zp);
-
-    LoopBody(Xm);
-    LoopBody(Ym);
-    LoopBody(Zm);
-  default:
-    assert(0);
-    break;
-  }
-#undef LoopBody
-}
-
-#define KERNEL_CALLNB(A)                                                \
-  const uint64_t    NN = Nsite*Ls;                                      \
-  accelerator_forNB( ss, NN, Simd::Nsimd(), {                           \
-      int sF = ss;                                                      \
-      int sU = ss/Ls;                                                   \
-      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);     \
-    });
-
-#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
-
-#define KERNEL_CALL_EXT(A)                                              \
-  const uint64_t    sz = st.surface_list.size();                        \
-  auto ptr = &st.surface_list[0];                                       \
-  accelerator_forNB( ss, sz, Simd::Nsimd(), {                           \
-      int sF = ptr[ss];                                                 \
-      int sU = sF/Ls;                                                   \
-      TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);     \
-    });                                                                 \
-  accelerator_barrier();
-
-
-template <class Impl>
-void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
-                                            int Ls, int Nsite, const FermionField &in, FermionField &out,
-                                            int interior,int exterior)
-{
-  autoView(U_v  , U,AcceleratorRead);
-  autoView(in_v , in,AcceleratorRead);
-  autoView(out_v,out,AcceleratorWrite);
-  autoView(st_v , st,AcceleratorRead);
-
-  if( interior && exterior ) {
-    acceleratorFenceComputeStream();
-    KERNEL_CALL(GenericDhopSite);
-    return;
-  } else if( interior ) {
-    KERNEL_CALLNB(GenericDhopSiteInt);
-    return;
-  } else if( exterior ) {
-    // // dependent on result of merge
-    acceleratorFenceComputeStream();
-    KERNEL_CALL_EXT(GenericDhopSiteExt);
-    return;
-  }
-  assert(0 && " Kernel optimisation case not covered ");
-}
-
-template <class Impl>
-void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
-                                               int Ls, int Nsite, const FermionField &in, FermionField &out,
-                                               int interior,int exterior)
-{
-  autoView(U_v  ,U,AcceleratorRead);
-  autoView(in_v ,in,AcceleratorRead);
-  autoView(out_v,out,AcceleratorWrite);
-  autoView(st_v ,st,AcceleratorRead);
-
-  if( interior && exterior ) {
-    acceleratorFenceComputeStream();
-    KERNEL_CALL(GenericDhopSiteDag);
-    return;
-  } else if( interior ) {
-    KERNEL_CALLNB(GenericDhopSiteDagInt); return;
-  } else if( exterior ) {
-    // Dependent on result of merge
-    acceleratorFenceComputeStream();
-    KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;
-  }
-  assert(0 && " Kernel optimisation case not covered ");
-}
-
-#undef KERNEL_CALLNB
-#undef KERNEL_CALL
-
-NAMESPACE_END(Grid);
-
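
KERNEL_CALL_EXT above walks the stencil's surface list rather than the full
volume: each entry is an s-resolved site index sF, and the owning gauge
site is recovered as sF/Ls. A small consistency sketch of that indexing
(the Ls value and list entries are illustrative):

    #include <cassert>
    #include <vector>

    int main() {
      const int Ls = 8;                                // fifth-dimension extent
      std::vector<int> surface_list{0, 7, 8, 15, 400}; // hypothetical entries
      for (int sF : surface_list) {
        int sU = sF / Ls;                              // gauge site owning sF
        assert(sF >= sU*Ls && sF < (sU+1)*Ls);         // s runs innermost in sU
      }
      return 0;
    }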
@@ -61,7 +61,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
     diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
   } else {
     csw_r = _csw_r * 0.5;
-    diag_mass = Nd*1.0 + _mass;
+    diag_mass = 4.0 + _mass;
   }
   csw_t = _csw_t * 0.5;

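For reference, the anisotropic branch kept as context above sets

    m_{\rm diag} = m + 1 + (N_d - 1)\,\frac{\nu}{\xi_0},

which reduces to m_{\rm diag} = m + N_d in the isotropic limit nu = xi_0;
the changed line simply pins that limit back to the hardwired 4.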
@@ -297,9 +297,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
       {
         if (mu == nu)
           continue;

         RealD factor;
-        if (nu == (Nd-1) || mu == (Nd-1)) // This was a bug - surely mu/nu is NEVER 4 but rather (Nd-1)=3 ??
+        if (nu == 4 || mu == 4)
         {
           factor = 2.0 * csw_t;
         }

@@ -307,11 +307,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
         {
           factor = 2.0 * csw_r;
         }
-        if ( mu < Nd && nu < Nd ) { // Allow to restrict range to Nd=3, but preserve orders of SigmaMuNu in table by counting ALL
-          PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
-          Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
-          force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
-        }
+        PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
+        Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
+        force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
         count++;
       }

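The guard removed in the second hunk restricted the force accumulation to
mu,nu < Nd while still advancing count over every pair, so that
sigma[count] stayed aligned with the full SigmaMuNu table. A standalone
sketch of that counting convention (NdMax and Nd are illustrative):

    #include <cassert>

    int main() {
      const int NdMax = 4, Nd = 3;
      int count = 0, accumulated = 0;
      for (int mu = 0; mu < NdMax; mu++) {
        for (int nu = 0; nu < NdMax; nu++) {
          if (mu == nu) continue;
          if (mu < Nd && nu < Nd) accumulated++; // force term uses sigma[count]
          count++;                               // counts ALL pairs regardless
        }
      }
      assert(count == NdMax*(NdMax-1));          // 12: full table traversed
      assert(accumulated == Nd*(Nd-1));          // 6: restricted accumulation
      return 0;
    }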

@@ -63,10 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   Dirichlet(0)
 {
   // some assertions
-  assert(FiveDimGrid._ndimension==Nd+1);
-  assert(FourDimGrid._ndimension==Nd);
-  assert(FourDimRedBlackGrid._ndimension==Nd);
-  assert(FiveDimRedBlackGrid._ndimension==Nd+1);
+  assert(FiveDimGrid._ndimension==5);
+  assert(FourDimGrid._ndimension==4);
+  assert(FourDimRedBlackGrid._ndimension==4);
+  assert(FiveDimRedBlackGrid._ndimension==5);
   assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction

   // extent of fifth dim and not spread out
@@ -76,7 +76,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   assert(FiveDimRedBlackGrid._processors[0] ==1);

   // Other dimensions must match the decomposition of the four-D fields
-  for(int d=0;d<Nd;d++){
+  for(int d=0;d<4;d++){

     assert(FiveDimGrid._processors[d+1]         ==FourDimGrid._processors[d]);
     assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
@@ -93,13 +93,11 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

   if ( p.dirichlet.size() == Nd+1) {
     Coordinate block = p.dirichlet;
-    for(int d=0;d<Nd+1;d++) {
-      if ( block[d] ){
-        Dirichlet = 1;
-        std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
-        std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
-        Block = block;
-      }
+    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
+      Dirichlet = 1;
+      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
+      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
+      Block = block;
     }
   } else {
     Coordinate block(Nd+1,0);
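The loop form removed here and the unrolled test that replaces it agree
whenever the Dirichlet block has Nd+1 = 5 entries; a minimal equivalence
check (block contents illustrative):

    #include <array>
    #include <cassert>

    int main() {
      const int Nd = 4;
      std::array<int,5> block{0, 0, 1, 0, 0};  // hypothetical Dirichlet block
      bool any_loop = false;
      for (int d = 0; d < Nd+1; d++) any_loop |= (block[d] != 0);
      bool any_unrolled = block[0]||block[1]||block[2]||block[3]||block[4];
      assert(any_loop == any_unrolled);
      return 0;
    }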
@@ -114,7 +112,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   assert(FiveDimGrid._simd_layout[0]        ==nsimd);
   assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

-  for(int d=0;d<Nd;d++){
+  for(int d=0;d<4;d++){
     assert(FourDimGrid._simd_layout[d]==1);
     assert(FourDimRedBlackGrid._simd_layout[d]==1);
     assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
@@ -185,8 +183,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
   // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;

   int skip = (disp==1) ? 0 : 1;
-  int dirdisp = dir+skip*Nd;
-  int gamma   = dir+(1-skip)*Nd;
+  int dirdisp = dir+skip*4;
+  int gamma   = dir+(1-skip)*4;

   Compressor compressor(DaggerNo);
   Stencil.HaloExchange(in,compressor);
@@ -485,7 +483,7 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
 {
   out.Checkerboard()=in.Checkerboard();
   Dhop(in,out,dag); // -0.5 is included
-  axpy(out,Nd*1.0-M5,in,out);
+  axpy(out,4.0-M5,in,out);
 }
 template <class Impl>
 void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
@@ -511,7 +509,7 @@ template <class Impl>
 void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();
-  typename FermionField::scalar_type scal(Nd*1.0 + M5);
+  typename FermionField::scalar_type scal(4.0 + M5);
   out = scal * in;
 }

@@ -526,7 +524,7 @@ template<class Impl>
 void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
 {
   out.Checkerboard() = in.Checkerboard();
-  out = (1.0/(Nd*1.0 + M5))*in;
+  out = (1.0/(4.0 + M5))*in;
 }

 template<class Impl>
@@ -637,7 +635,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
   A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
   F = eaLs * (one - Wea + (Wema - one) * mass*mass);
   F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
-  F = F - abs(W) * sinha * (Nd* 1.0) * mass;
+  F = F - abs(W) * sinha * 4.0 * mass;

   Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
   Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);

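Transcribing the four context lines above, the term this hunk specialises
from Nd back to 4 sits in the normalisation of the free 5-d propagator:

    A = \frac{1}{2|W|\sinh a}\cdot\frac{1}{2\sinh(aL_s)},
    \qquad
    F = e^{aL_s}\left(1 - We^{a} + (We^{-a}-1)m^2\right)
      + e^{-aL_s}\left(We^{-a} - 1 + (1 - We^{a})m^2\right)
      - N_d\,|W|\sinh(a)\,m,

with the B_{\pm\pm} coefficients proportional to A/F as in the two lines
that follow.
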
@@ -63,7 +63,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
   if (anisotropyCoeff.isAnisotropic){
     diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
   } else {
-    diag_mass = Nd*1.0 + mass;
+    diag_mass = 4.0 + mass;
   }

   int vol4;
@@ -354,8 +354,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
   Stencil.HaloExchange(in, compressor);

   int skip = (disp == 1) ? 0 : 1;
-  int dirdisp = dir + skip * Nd;
-  int gamma   = dir + (1 - skip) * Nd;
+  int dirdisp = dir + skip * 4;
+  int gamma   = dir + (1 - skip) * 4;

   DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
 };
@@ -370,8 +370,8 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
   for(int disp=-1;disp<=1;disp+=2){

     int skip = (disp == 1) ? 0 : 1;
-    int dirdisp = dir + skip * Nd;
-    int gamma   = dir + (1 - skip) * Nd;
+    int dirdisp = dir + skip * 4;
+    int gamma   = dir + (1 - skip) * 4;

     DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
   }

@@ -97,7 +97,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   distance = st._distances[DIR];                               \
   sl = st._simd_layout[direction];                             \
   inplace_twist = 0;                                           \
-  if(SE->_around_the_world && st.parameters.twists[DIR % Nd]){ \
+  if(SE->_around_the_world && st.parameters.twists[DIR % 4]){  \
    if(sl == 1){                                                \
      g = (F+1) % 2;                                            \
    }else{                                                      \
@@ -32,30 +32,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 // S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> ImprovedStaggeredFermion5DStatic::directions(ImprovedStaggeredFermion5DStatic::MakeDirections());
-const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements(ImprovedStaggeredFermion5DStatic::MakeDisplacements());
-std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDirections(void)
-{
-  std::vector<int> directions(4*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d+Nd*0] = d+1;
-    directions[d+Nd*1] = d+1;
-    directions[d+Nd*2] = d+1;
-    directions[d+Nd*3] = d+1;
-  }
-  return directions;
-}
-std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(4*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d+Nd*0] =+1;
-    displacements[d+Nd*1] =-1;
-    displacements[d+Nd*2] =+3;
-    displacements[d+Nd*3] =-3;
-  }
-  return displacements;
-}
+const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
+const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});

 NAMESPACE_END(Grid);

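For Nd = 4 the removed generator functions reproduce exactly the literal
tables retained on the + side: four legs per dimension, one-hop and
three-hop (Naik) in both senses. A standalone consistency check (Nd is
illustrative here; Grid defines it globally):

    #include <cassert>
    #include <vector>

    int main() {
      const int Nd = 4;
      std::vector<int> dirs(4*Nd), disp(4*Nd);
      for (int d = 0; d < Nd; d++) {
        dirs[d+Nd*0] = dirs[d+Nd*1] = dirs[d+Nd*2] = dirs[d+Nd*3] = d+1;
        disp[d+Nd*0] = +1; disp[d+Nd*1] = -1;  // one-hop legs
        disp[d+Nd*2] = +3; disp[d+Nd*3] = -3;  // Naik three-hop legs
      }
      assert((dirs == std::vector<int>{1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4}));
      assert((disp == std::vector<int>{1,1,1,1,-1,-1,-1,-1,3,3,3,3,-3,-3,-3,-3}));
      return 0;
    }
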
@@ -32,26 +32,5 @@ NAMESPACE_BEGIN(Grid);

 const std::vector<int> ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
 const std::vector<int> ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
-std::vector<int> ImprovedStaggeredFermionStatic::MakeDirections(void)
-{
-  std::vector<int> directions(4*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d+Nd*0] = d;
-    directions[d+Nd*1] = d;
-    directions[d+Nd*2] = d;
-    directions[d+Nd*3] = d;
-  }
-  return directions;
-}
-std::vector<int> ImprovedStaggeredFermionStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(4*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d+Nd*0] =+1;
-    displacements[d+Nd*1] =-1;
-    displacements[d+Nd*2] =+3;
-    displacements[d+Nd*3] =-3;
-  }
-  return displacements;
-}
 NAMESPACE_END(Grid);

@@ -30,27 +30,7 @@ directory

 NAMESPACE_BEGIN(Grid);

-//const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
-//const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
-const std::vector<int> NaiveStaggeredFermionStatic::directions(NaiveStaggeredFermionStatic::MakeDirections());
-const std::vector<int> NaiveStaggeredFermionStatic::displacements(NaiveStaggeredFermionStatic::MakeDisplacements());
-std::vector<int> NaiveStaggeredFermionStatic::MakeDirections(void)
-{
-  std::vector<int> directions(4*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d+Nd*0] = d;
-    directions[d+Nd*1] = d;
-  }
-  return directions;
-}
-std::vector<int> NaiveStaggeredFermionStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(4*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d+Nd*0] =+1;
-    displacements[d+Nd*1] =-1;
-  }
-  return displacements;
-}
+const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});

 NAMESPACE_END(Grid);

@@ -1,61 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution
-    directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
-
-NAMESPACE_BEGIN(Grid);
-
-// S-direction is INNERMOST and takes no part in the parity.
-
-const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions   (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
-const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
-
-std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
-{
-  std::vector<int> directions(2*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d]    = d+1;
-    directions[d+Nd] = d+1;
-  }
-  return directions;
-}
-std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(2*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d]    = +1;
-    displacements[d+Nd] = -1;
-  }
-  return displacements;
-}
-
-NAMESPACE_END(Grid);
-
@@ -1,40 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
-
-    Copyright (C) 2015
-
-    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-    Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
-    Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution
-    directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/qcd/action/fermion/FermionCore.h>
-#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
-
-NAMESPACE_BEGIN(Grid);
-
-#include "impl.h"
-template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>;
-
-NAMESPACE_END(Grid);
-
@@ -1,40 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015, 2020
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
|
||||||
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution
|
|
||||||
directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/qcd/action/fermion/FermionCore.h>
|
|
||||||
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
#include "impl.h"
|
|
||||||
template class TwoSpinWilsonKernels<IMPLEMENTATION>;
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
@@ -34,28 +34,8 @@ directory
 NAMESPACE_BEGIN(Grid);

 // S-direction is INNERMOST and takes no part in the parity.
-const std::vector<int> WilsonFermion5DStatic::directions   (WilsonFermion5DStatic::MakeDirections());
-const std::vector<int> WilsonFermion5DStatic::displacements(WilsonFermion5DStatic::MakeDisplacements());
-
-std::vector<int> WilsonFermion5DStatic::MakeDirections (void)
-{
-  std::vector<int> directions(2*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d]    = d+1;
-    directions[d+Nd] = d+1;
-  }
-  return directions;
-}
-std::vector<int> WilsonFermion5DStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(2*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d]    = +1;
-    displacements[d+Nd] = -1;
-  }
-  return displacements;
-}
+const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
+const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});

 NAMESPACE_END(Grid);
@@ -33,27 +33,9 @@ directory

 NAMESPACE_BEGIN(Grid);

-const std::vector<int> WilsonFermionStatic::directions(WilsonFermionStatic::MakeDirections());
-const std::vector<int> WilsonFermionStatic::displacements(WilsonFermionStatic::MakeDisplacements());
+const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
+const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
 int WilsonFermionStatic::HandOptDslash;
-std::vector<int> WilsonFermionStatic::MakeDirections (void)
-{
-  std::vector<int> directions(2*Nd);
-  for(int d=0;d<Nd;d++){
-    directions[d]    = d;
-    directions[d+Nd] = d;
-  }
-  return directions;
-}
-std::vector<int> WilsonFermionStatic::MakeDisplacements(void)
-{
-  std::vector<int> displacements(2*Nd);
-  for(int d=0;d<Nd;d++){
-    displacements[d]    = +1;
-    displacements[d+Nd] = -1;
-  }
-  return displacements;
-}

 NAMESPACE_END(Grid);
@@ -36,16 +36,11 @@ DWF_IMPL_LIST=" \
 ZWilsonImplF \
 ZWilsonImplD2 "

-TWOSPIN_WILSON_IMPL_LIST=" \
-TwoSpinWilsonImplF \
-TwoSpinWilsonImplD "
-
 GDWF_IMPL_LIST=" \
 GparityWilsonImplF \
 GparityWilsonImplD "

-IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST $TWOSPIN_WILSON_IMPL_LIST"
+IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"

 for impl in $IMPL_LIST
 do
@@ -115,12 +110,7 @@ do
 done
 done

-CC_LIST="TwoSpinWilsonFermion3plus1DInstantiation.cc.master TwoSpinWilsonKernelsInstantiation.cc.master"
-
-for impl in $TWOSPIN_WILSON_IMPL_LIST
-do
-for f in $CC_LIST
-do
-ln -f -s ../$f.cc.master $impl/$f$impl.cc
-done
-done
+CC_LIST=" \
+	ImprovedStaggeredFermion5DInstantiation \
+	StaggeredKernelsInstantiation "
@@ -158,8 +158,8 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
   LatticeComplexD R(U.Grid());
   R = Zero();

-  for(int mu=0;mu<Nd-1;mu++){
-    for(int nu=mu+1;nu<Nd;nu++){
+  for(int mu=0;mu<3;mu++){
+    for(int nu=mu+1;nu<4;nu++){
       WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
       R = R + trace(F*F);
     }
   }
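For orientation, a sketch of the quantity this loop accumulates per site (normalisation is applied elsewhere in energyDensityCloverleaf); since F_numu = -F_munu, the ordered sum over distinct planes is half the full symmetric sum:

R(x) \;=\; \sum_{\mu<\nu} \mathrm{tr}\!\left[ F_{\mu\nu}(x)\,F_{\mu\nu}(x) \right]
      \;=\; \frac{1}{2}\sum_{\mu,\nu} \mathrm{tr}\!\left[ F_{\mu\nu}(x)^{2} \right]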
@@ -1,220 +0,0 @@
#ifndef GRID_QCD_PAULI_H
#define GRID_QCD_PAULI_H

#include <array>

NAMESPACE_BEGIN(Grid);
//
/*
 * Pauli basis
 *    sx        sy       sz    ident
 *   (0 1)  ,  (0 -i) , ( 1  0 )
 *   (1 0)     (i  0)   ( 0 -1)
 *
 * These are hermitian.
 *
 * Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
 *
 * spPauliProjXm
 * spPauliProjYm etc...
 */
class Pauli {
public:
  GRID_SERIALIZABLE_ENUM(Algebra, undef,
                         SigmaX        , 0,
                         MinusSigmaX   , 1,
                         SigmaY        , 2,
                         MinusSigmaY   , 3,
                         SigmaZ        , 4,
                         MinusSigmaZ   , 5,
                         Identity      , 6,
                         MinusIdentity , 7);

  static constexpr unsigned int nPauli = 8;
  static const std::array<const char *, nPauli> name;
  static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
  static const std::array<Algebra, nPauli> adj;
  static const std::array<const Pauli, 4> gmu;
  static const std::array<const Pauli, 16> gall;
  Algebra g;
public:
  accelerator Pauli(Algebra initg): g(initg) {}
};

#define CopyImplementation(iTemplate,multPauli,multFlavour)  \
  template<class vtype>                                      \
  accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) { \
    multFlavour(ret,rhs);                                    \
  }

CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);

CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);

CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);

CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);

CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);

CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);

CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);

CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);

/*
 *    sx        sy       sz    ident
 *   (0 1)  ,  (0 -i) , ( 1  0 )
 *   (1 0)     (i  0)   ( 0 -1)
 */
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=fspin(0)+fspin(1);
  hspin(1)=fspin(1)+fspin(0);
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=fspin(0)-fspin(1);
  hspin(1)=fspin(1)-fspin(0);
}

template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=fspin(0)-timesI(fspin(1));
  hspin(1)=fspin(1)+timesI(fspin(0));
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=fspin(0)+timesI(fspin(1));
  hspin(1)=fspin(1)-timesI(fspin(0));
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=fspin(0)+fspin(0);
  hspin(1)=Zero();
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
  hspin(0)=Zero();
  hspin(1)=fspin(1)+fspin(1);
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
  fspin = hspin;
}
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd   (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
  fspin = fspin + hspin;
}

template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
  ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
{
  iVector<vtype, Nhs> ret;

  switch (G.g)
  {
    case Pauli::Algebra::SigmaX:
      multPauliSigmaX(ret, arg); break;
    case Pauli::Algebra::MinusSigmaX:
      multPauliMinusSigmaX(ret, arg); break;
    case Pauli::Algebra::SigmaY:
      multPauliSigmaY(ret, arg); break;
    case Pauli::Algebra::MinusSigmaY:
      multPauliMinusSigmaY(ret, arg); break;
    case Pauli::Algebra::SigmaZ:
      multPauliSigmaZ(ret, arg); break;
    case Pauli::Algebra::MinusSigmaZ:
      multPauliMinusSigmaZ(ret, arg); break;
    case Pauli::Algebra::Identity:
      multPauliIdentity(ret, arg); break;
    case Pauli::Algebra::MinusIdentity:
      multPauliMinusIdentity(ret, arg); break;
    default: assert(0);
  }

  return ret;
}

template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
  ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
  iMatrix<vtype, Nhs> ret;

  switch (G.g)
  {
    case Pauli::Algebra::SigmaX:
      lmultPauliSigmaX(ret, arg); break;
    case Pauli::Algebra::MinusSigmaX:
      lmultPauliMinusSigmaX(ret, arg); break;
    case Pauli::Algebra::SigmaY:
      lmultPauliSigmaY(ret, arg); break;
    case Pauli::Algebra::MinusSigmaY:
      lmultPauliMinusSigmaY(ret, arg); break;
    case Pauli::Algebra::SigmaZ:
      lmultPauliSigmaZ(ret, arg); break;
    case Pauli::Algebra::MinusSigmaZ:
      lmultPauliMinusSigmaZ(ret, arg); break;
    case Pauli::Algebra::Identity:
      lmultPauliIdentity(ret, arg); break;
    case Pauli::Algebra::MinusIdentity:
      lmultPauliMinusIdentity(ret, arg); break;
    default: assert(0);
  }

  return ret;
}

template<class vtype>
accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
  ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
  iMatrix<vtype, Nhs> ret;

  switch (G.g)
  {
    case Pauli::Algebra::SigmaX:
      rmultPauliSigmaX(ret, arg); break;
    case Pauli::Algebra::MinusSigmaX:
      rmultPauliMinusSigmaX(ret, arg); break;
    case Pauli::Algebra::SigmaY:
      rmultPauliSigmaY(ret, arg); break;
    case Pauli::Algebra::MinusSigmaY:
      rmultPauliMinusSigmaY(ret, arg); break;
    case Pauli::Algebra::SigmaZ:
      rmultPauliSigmaZ(ret, arg); break;
    case Pauli::Algebra::MinusSigmaZ:
      rmultPauliMinusSigmaZ(ret, arg); break;
    case Pauli::Algebra::Identity:
      rmultPauliIdentity(ret, arg); break;
    case Pauli::Algebra::MinusIdentity:
      rmultPauliMinusIdentity(ret, arg); break;
    default: assert(0);
  }

  return ret;
}

NAMESPACE_END(Grid);

#endif // GRID_QCD_GAMMA_H
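For orientation, the matrices named in the header comment of the deleted file, and the projector algebra that the pauliProj* routines implement:

\sigma_x = \begin{pmatrix} 0 & 1 \\ 1 & 0 \end{pmatrix},\qquad
\sigma_y = \begin{pmatrix} 0 & -i \\ i & 0 \end{pmatrix},\qquad
\sigma_z = \begin{pmatrix} 1 & 0 \\ 0 & -1 \end{pmatrix}

For example, pauliProjZp and pauliProjZm correspond to (1 + sigma_z) and (1 - sigma_z) acting on the two-spinor:

(1+\sigma_z)\begin{pmatrix}\psi_0\\ \psi_1\end{pmatrix} = \begin{pmatrix}2\psi_0\\ 0\end{pmatrix},\qquad
(1-\sigma_z)\begin{pmatrix}\psi_0\\ \psi_1\end{pmatrix} = \begin{pmatrix}0\\ 2\psi_1\end{pmatrix}

matching hspin(0)=fspin(0)+fspin(0), hspin(1)=Zero() and its mirror image above.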
@@ -179,17 +179,20 @@ public:
   //////////////////////////////////////////////////
   // average over all x,y,z the temporal loop
   //////////////////////////////////////////////////
-  static ComplexD avgPolyakovLoop(const GaugeField &Umu) {
+  static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
     GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
     ComplexD out;
-    uint64_t vol = Umu.Grid()->gSites();
-    int T = Umu.Grid()->GlobalDimensions()[Nd-1];
-    Ut = peekLorentz(Umu,Nd-1); //Select temporal direction
+    int T = Umu.Grid()->GlobalDimensions()[3];
+    int X = Umu.Grid()->GlobalDimensions()[0];
+    int Y = Umu.Grid()->GlobalDimensions()[1];
+    int Z = Umu.Grid()->GlobalDimensions()[2];
+
+    Ut = peekLorentz(Umu,3); //Select temporal direction
     P = Ut;
     for (int t=1;t<T;t++){
-      P = Gimpl::CovShiftForward(Ut,Nd-1,P);
+      P = Gimpl::CovShiftForward(Ut,3,P);
     }
-    RealD norm = 1.0/(Nc*vol);
+    RealD norm = 1.0/(Nc*X*Y*Z*T);
     out = sum(trace(P))*norm;
     return out;
   }
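Both sides of this hunk compute the same number, since gSites() = X*Y*Z*T and the 4D site sum counts each temporal loop T times (once per starting timeslice), cancelling the extra factor of T in the normalisation. In conventional notation the observable is, as a sketch:

\langle P \rangle \;=\; \frac{1}{X\,Y\,Z} \sum_{\vec{x}} \frac{1}{N_c}\,
  \mathrm{tr} \prod_{t=0}^{T-1} U_{4}(\vec{x},\,t)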
@@ -212,7 +215,7 @@ public:

    double vol = Umu.Grid()->gSites();

-    return p.real() / vol / (Nd * Nc ) ;
+    return p.real() / vol / (4.0 * Nc ) ;
  };

  //////////////////////////////////////////////////
@@ -737,7 +740,6 @@ public:
  //cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6
  //output is the charge by timeslice: sum over timeslices to obtain the total
  static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
-    // Audit: 4D epsilon is hard coded
    assert(Nd == 4);
    std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
    //Note F_numu = - F_munu
@@ -827,25 +829,6 @@ public:
    return out;
  }

-  //Compute the 5Li topological charge density
-  static std::vector<Real> TopologicalChargeDensity5Li(const GaugeLorentz &U){
-
-    static const int exts[5][2] = { {1,1}, {2,2}, {1,2}, {1,3}, {3,3} };
-    std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);
-
-    double c5=1./20.;
-    double c4=1./5.-2.*c5;
-    double c3=(-64.+640.*c5)/45.;
-    double c2=(1-64.*c5)/9.;
-    double c1=(19.-55.*c5)/9.;
-
-    int Lt = loops[0].size();
-    std::vector<Real> out(Lt,0.);
-    for(int t=0;t<Lt;t++)
-      out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
-    return out;
-  }
-
  static Real TopologicalCharge5Li(const GaugeLorentz &U){
    std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
    Real Q = 0.;
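The removed routine combined the five (M,N) Wilson-loop extents exts = {(1,1),(2,2),(1,2),(1,3),(3,3)} with improvement coefficients; per timeslice it evaluated, transcribing the arithmetic above:

Q(t) \;=\; c_1 Q_{1\times1}(t) + c_2 Q_{2\times2}(t) + c_3 Q_{1\times2}(t)
        + c_4 Q_{1\times3}(t) + c_5 Q_{3\times3}(t),

c_5 = \tfrac{1}{20},\quad
c_4 = \tfrac{1}{5} - 2c_5,\quad
c_3 = \tfrac{-64 + 640\,c_5}{45},\quad
c_2 = \tfrac{1 - 64\,c_5}{9},\quad
c_1 = \tfrac{19 - 55\,c_5}{9},

where the Q_i(t) are the per-timeslice contributions returned by TimesliceTopologicalCharge5LiContributions.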
@@ -1472,7 +1455,7 @@ public:
  //////////////////////////////////////////////////
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-    std::vector<GaugeMat> U(Nd, Umu.Grid());
+    std::vector<GaugeMat> U(4, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1491,7 +1474,7 @@ public:
  //////////////////////////////////////////////////
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
                                    const int R1, const int R2) {
-    std::vector<GaugeMat> U(Nd, Umu.Grid());
+    std::vector<GaugeMat> U(4, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1509,8 +1492,8 @@ public:
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
  //////////////////////////////////////////////////
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
                                   const int R1, const int R2) {
-    std::vector<GaugeMat> U(Nd, Umu.Grid());
+    std::vector<GaugeMat> U(4, Umu.Grid());

    for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -252,7 +252,7 @@ inline std::ostream& operator<< (std::ostream& stream, const vComplexF &o){

inline std::ostream& operator<< (std::ostream& stream, const vComplexD &o){
  int nn=vComplexD::Nsimd();
-  std::vector<ComplexD> buf(nn);
+  std::vector<ComplexD,alignedAllocator<ComplexD> > buf(nn);
  vstore(o,&buf[0]);
  stream<<"<";
  for(int i=0;i<nn;i++){
@@ -272,7 +272,7 @@ inline std::ostream& operator<< (std::ostream& stream, const vComplexD2 &o){

inline std::ostream& operator<< (std::ostream& stream, const vRealF &o){
  int nn=vRealF::Nsimd();
-  std::vector<RealF> buf(nn);
+  std::vector<RealF,alignedAllocator<RealF> > buf(nn);
  vstore(o,&buf[0]);
  stream<<"<";
  for(int i=0;i<nn;i++){
File diff suppressed because it is too large
@@ -30,25 +30,26 @@
NAMESPACE_BEGIN(Grid);

uint64_t DslashFullCount;
-uint64_t DslashPartialCount;
+//uint64_t DslashPartialCount;
uint64_t DslashDirichletCount;

void DslashResetCounts(void)
{
  DslashFullCount=0;
-  DslashPartialCount=0;
+  //  DslashPartialCount=0;
  DslashDirichletCount=0;
}
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{
  dirichlet = DslashDirichletCount;
-  partial   = DslashPartialCount;
+  partial   = 0;
  full      = DslashFullCount;
}
void DslashLogFull(void)     { DslashFullCount++;}
-void DslashLogPartial(void)  { DslashPartialCount++;}
+//void DslashLogPartial(void)  { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;}

+deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;

void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
				 int off,std::vector<std::pair<int,int> > & table)
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
  Coordinate dirichlet; // Blocksize of dirichlet BCs
-  int partialDirichlet;
+  //  int partialDirichlet;
  DefaultImplParams() {
    dirichlet.resize(0);
-    partialDirichlet=0;
+    //    partialDirichlet=0;
  };
};

@@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                 int off,std::vector<std::pair<int,int> > & table);

+class StencilBuffer
+{
+public:
+  static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
+};
+
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
  ///////////////////////////////////////////////////
  // If true, this is partially communicated per face
  ///////////////////////////////////////////////////
-  StencilVector _comms_partial_send;
-  StencilVector _comms_partial_recv;
+  //  StencilVector _comms_partial_send;
+  //  StencilVector _comms_partial_recv;
  //
  StencilVector _comm_buf_size;
  StencilVector _permute_type;
@@ -205,16 +211,16 @@ public:
  struct Packet {
    void * send_buf;
    void * recv_buf;
-#ifndef ACCELERATOR_AWARE_MPI
-    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
-    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
-#endif
+    void * compressed_send_buf;
+    void * compressed_recv_buf;
    Integer to_rank;
    Integer from_rank;
    Integer do_send;
    Integer do_recv;
    Integer xbytes;
    Integer rbytes;
+    Integer xbytes_compressed;
+    Integer rbytes_compressed;
  };
  struct Merge {
    static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +229,7 @@ public:
    std::vector<cobj *> vpointers;
    Integer buffer_size;
    Integer type;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct Decompress {
@@ -231,7 +237,7 @@ public:
    cobj * kernel_p;
    cobj * mpi_p;
    Integer buffer_size;
-    Integer partial; // partial dirichlet BCs
+    //    Integer partial; // partial dirichlet BCs
    Coordinate dims;
  };
  struct CopyReceiveBuffer {
@@ -252,9 +258,45 @@ public:

protected:
  GridBase * _grid;

+  ///////////////////////////////////////////////////
+  // Sloppy comms will make a second buffer upon comms
+  ///////////////////////////////////////////////////
+  size_t device_heap_top;  //
+  size_t device_heap_bytes;//
+  size_t device_heap_size; //
+  void *DeviceBufferMalloc(size_t bytes)
+  {
+    void *ptr = (void *)device_heap_top;
+    device_heap_top  += bytes;
+    device_heap_bytes+= bytes;
+    if ( device_heap_bytes > device_heap_size ) {
+      std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
+      assert (device_heap_bytes <= device_heap_size);
+    }
+    return ptr;
+  }
+  void DeviceBufferFreeAll(void)
+  {
+    device_heap_size = _unified_buffer_size*sizeof(cobj);
+    // Resize up if necessary, never down
+    if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
+      StencilBuffer::DeviceCommBuf.resize(device_heap_size);
+    }
+    device_heap_top  =(size_t) &StencilBuffer::DeviceCommBuf[0];
+    device_heap_size = StencilBuffer::DeviceCommBuf.size();
+    device_heap_bytes=0;
+  }
+
public:
  GridBase *Grid(void) const { return _grid; }

+  /////////////////////////////////////////////////////////
+  // Control reduced precision comms
+  /////////////////////////////////////////////////////////
+  int SloppyComms;
+  void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
+
  ////////////////////////////////////////////////////////////////////////
  // Needed to conveniently communicate gparity parameters into GPU memory
  // without adding parameters. Perhaps a template parameter to StenciView is
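DeviceBufferMalloc/DeviceBufferFreeAll above implement a classic bump allocator over a single persistent device arena: allocation is a pointer increment, and "free" resets the whole arena once per communication round. A minimal host-side sketch of the same pattern (hypothetical names; std::vector stands in for the device-resident deviceVector):

// Bump-allocator sketch; illustrative only, not Grid API.
#include <cassert>
#include <cstddef>
#include <vector>

struct BumpArena {
  std::vector<unsigned char> buf;  // persistent backing store, grown but never shrunk
  size_t top = 0;
  void reset(size_t bytes_needed) {           // analogue of DeviceBufferFreeAll
    if (buf.size() < bytes_needed) buf.resize(bytes_needed);
    top = 0;
  }
  void *alloc(size_t bytes) {                 // analogue of DeviceBufferMalloc
    assert(top + bytes <= buf.size());        // overflow check, as above
    void *p = buf.data() + top;
    top += bytes;
    return p;
  }
};

The caller-facing switch is SetSloppyComms(1); CompressPacket (below) then draws its halved-precision staging buffers from this arena on every round, so nothing is allocated on the communication critical path.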
@@ -268,7 +310,7 @@ public:
  }

  int face_table_computed;
-  int partialDirichlet;
+  //  int partialDirichlet;
  int fullDirichlet;
  std::vector<deviceVector<std::pair<int,int> > > face_table ;
  deviceVector<int> surface_list;
@@ -361,24 +403,145 @@ public:
  ////////////////////////////////////////////////////////////////////////
  // Non blocking send and receive. Necessarily parallel.
  ////////////////////////////////////////////////////////////////////////
+  void DecompressPacket(Packet &packet)
+  {
+    if ( !SloppyComms ) return;
+
+    if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
+
+      typedef typename getPrecision<cobj>::real_scalar_type word;
+      uint64_t words = packet.rbytes/sizeof(word);
+      const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+      const uint64_t outer = words/nsimd;
+
+      if(sizeof(word)==8) {
+
+        // Can either choose to represent as float vs double and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+        double *dbuf =(double *) packet.recv_buf;
+        float  *fbuf =(float  *) packet.compressed_recv_buf;
+
+        accelerator_forNB(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
+        });
+
+      } else if ( sizeof(word)==4){
+        // Can either choose to represent as half vs float and prec change
+        // OR
+        // truncate the mantissa bfp16 style
+
+        uint32_t *fbuf =(uint32_t *) packet.recv_buf;
+        uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
+
+        accelerator_forNB(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+    }
+  }
+  void CompressPacket(Packet &packet)
+  {
+    packet.xbytes_compressed   = packet.xbytes;
+    packet.compressed_send_buf = packet.send_buf;
+
+    packet.rbytes_compressed   = packet.rbytes;
+    packet.compressed_recv_buf = packet.recv_buf;
+
+    if ( !SloppyComms ) {
+      return;
+    }
+
+    typedef typename getPrecision<cobj>::real_scalar_type word;
+    uint64_t words = packet.xbytes/sizeof(word);
+    const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
+    const uint64_t outer = words/nsimd;
+
+    if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
+
+      packet.rbytes_compressed = packet.rbytes/2;
+      packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
+      //  std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
+
+    }
+    //else {
+    //  std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
+    //  }
+
+    if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
+
+      packet.xbytes_compressed = packet.xbytes/2;
+      packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
+      //  std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
+
+      if(sizeof(word)==8) {
+
+        double *dbuf =(double *) packet.send_buf;
+        float  *fbuf =(float  *) packet.compressed_send_buf;
+
+        accelerator_forNB(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
+        });
+
+      } else if ( sizeof(word)==4){
+
+        uint32_t *fbuf =(uint32_t *) packet.send_buf;
+        uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
+
+        accelerator_forNB(ss,outer,nsimd,{
+          int lane = acceleratorSIMTlane(nsimd);
+          hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
+        });
+
+      } else {
+        assert(0 && "unknown floating point precision");
+      }
+
+    }
+    // else {
+    //  std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
+    //  }
+
+    return;
+  }
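The 4-byte branch above is bfloat16-style truncation: keep the sign bit, the full 8-bit exponent and the top 7 mantissa bits of an IEEE fp32 word, i.e. simply drop the low 16 bits; decompression pads them back with zeroes. A standalone host-side sketch of the same round trip (hypothetical helper names; the kernels above do this lane-parallel on the device):

// Illustration of the bfloat16 truncation used by CompressPacket/DecompressPacket.
// bf16_pack/bf16_unpack are hypothetical helper names, not Grid API.
#include <cstdint>
#include <cstring>
#include <cstdio>

static uint16_t bf16_pack(float f) {      // keep sign, 8-bit exponent, 7 mantissa bits
  uint32_t u; std::memcpy(&u, &f, 4);
  return (uint16_t)(u >> 16);              // drop the low 16 mantissa bits
}
static float bf16_unpack(uint16_t h) {     // pad the dropped bits with zeroes
  uint32_t u = ((uint32_t)h) << 16;
  float f; std::memcpy(&f, &u, 4);
  return f;
}

int main(void) {
  float x = 3.14159265f;
  float y = bf16_unpack(bf16_pack(x));
  // Relative error is bounded by the 7-bit mantissa, roughly 2^-8.
  std::printf("%f -> %f (rel err %g)\n", x, y, (x - y) / x);
  return 0;
}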
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
-    // std::cout << "Communicate Begin "<<std::endl;
-    // _grid->Barrier();
    FlightRecorder::StepLog("Communicate begin");
+    ///////////////////////////////////////////////
    // All GPU kernel tasks must complete
-    // accelerator_barrier(); // All kernels should ALREADY be complete
-    // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
-    // But the HaloGather had a barrier too.
+    // accelerator_barrier(); All kernels should ALREADY be complete
+    //Everyone is here, so noone running slow and still using receive buffer
+    _grid->StencilBarrier();
+    // But the HaloGather had a barrier too.
+    ///////////////////////////////////////////////
+    if (SloppyComms) {
+      DeviceBufferFreeAll();
+    }
+    for(int i=0;i<Packets.size();i++){
+      this->CompressPacket(Packets[i]);
+    }
+    if (SloppyComms) {
+      accelerator_barrier();
+#ifdef NVLINK_GET
+      _grid->StencilBarrier();
+#endif
+    }
+
    for(int i=0;i<Packets.size();i++){
      // std::cout << "Communicate prepare "<<i<<std::endl;
      // _grid->Barrier();
      _grid->StencilSendToRecvFromPrepare(MpiReqs,
-                                          Packets[i].send_buf,
+                                          Packets[i].compressed_send_buf,
                                          Packets[i].to_rank,Packets[i].do_send,
-                                          Packets[i].recv_buf,
+                                          Packets[i].compressed_recv_buf,
                                          Packets[i].from_rank,Packets[i].do_recv,
-                                          Packets[i].xbytes,Packets[i].rbytes,i);
+                                          Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
    }
    // std::cout << "Communicate PollDtoH "<<std::endl;
    // _grid->Barrier();
@@ -389,19 +552,22 @@ public:
    // Starts intranode
    for(int i=0;i<Packets.size();i++){
      // std::cout << "Communicate Begin "<<i<<std::endl;
+      // _grid->Barrier();
      _grid->StencilSendToRecvFromBegin(MpiReqs,
-                                        Packets[i].send_buf,
+                                        Packets[i].send_buf,Packets[i].compressed_send_buf,
                                        Packets[i].to_rank,Packets[i].do_send,
-                                        Packets[i].recv_buf,
+                                        Packets[i].recv_buf,Packets[i].compressed_recv_buf,
                                        Packets[i].from_rank,Packets[i].do_recv,
-                                        Packets[i].xbytes,Packets[i].rbytes,i);
+                                        Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
+      // std::cout << "Communicate Begin started "<<i<<std::endl;
+      // _grid->Barrier();
    }
    FlightRecorder::StepLog("Communicate begin has finished");
    // Get comms started then run checksums
    // Having this PRIOR to the dslash seems to make Sunspot work... (!)
    for(int i=0;i<Packets.size();i++){
      if ( Packets[i].do_send )
-        FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
+        FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
    }
  }
@@ -416,14 +582,15 @@ public:
    // std::cout << "Communicate Complete Complete "<<std::endl;
    // _grid->Barrier();
    _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
-    if ( this->partialDirichlet ) DslashLogPartial();
-    else if ( this->fullDirichlet ) DslashLogDirichlet();
+    //    if ( this->partialDirichlet ) DslashLogPartial();
+    if ( this->fullDirichlet ) DslashLogDirichlet();
    else DslashLogFull();
    // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
    // accelerator_barrier();
    for(int i=0;i<Packets.size();i++){
+      this->DecompressPacket(Packets[i]);
      if ( Packets[i].do_recv )
-        FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
+        FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
    }
    FlightRecorder::StepLog("Finish communicate complete");
  }
@@ -618,7 +785,7 @@ public:
  }
  void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
    Decompress d;
-    d.partial  = this->partialDirichlet;
+    //    d.partial  = this->partialDirichlet;
    d.dims     = _grid->_fdimensions;
    d.kernel_p = k_p;
    d.mpi_p    = m_p;
@@ -627,7 +794,7 @@ public:
  }
  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
    Merge m;
-    m.partial  = this->partialDirichlet;
+    //    m.partial  = this->partialDirichlet;
    m.dims     = _grid->_fdimensions;
    m.type     = type;
    m.mpointer = merge_p;
@@ -732,8 +899,8 @@ public:
        int block = dirichlet_block[dimension];
        this->_comms_send[ii] = comm_dim;
        this->_comms_recv[ii] = comm_dim;
-        this->_comms_partial_send[ii] = 0;
-        this->_comms_partial_recv[ii] = 0;
+        //  this->_comms_partial_send[ii] = 0;
+        //  this->_comms_partial_recv[ii] = 0;
        if ( block && comm_dim ) {
          assert(abs(displacement) < ld );
          // Quiesce communication across block boundaries
@@ -754,10 +921,10 @@ public:
          if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
          if ( ( (ld*pc     ) % block ) == 0 ) this->_comms_recv[ii] = 0;
        }
-        if ( partialDirichlet ) {
-          this->_comms_partial_send[ii] = !this->_comms_send[ii];
-          this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
-        }
+        //  if ( partialDirichlet ) {
+        //    this->_comms_partial_send[ii] = !this->_comms_send[ii];
+        //    this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
+        //  }
      }
    }
  }
@@ -769,6 +936,7 @@ public:
               Parameters p=Parameters(),
               bool preserve_shm=false)
  {
+    SloppyComms = 0;
    face_table_computed=0;
    _grid    = grid;
    this->parameters=p;
@@ -786,7 +954,7 @@ public:
    this->same_node.resize(npoints);

    if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
-    partialDirichlet = p.partialDirichlet;
+    //    partialDirichlet = p.partialDirichlet;
    DirichletBlock(p.dirichlet); // comms send/recv set up
    fullDirichlet=0;
    for(int d=0;d<p.dirichlet.size();d++){
@@ -867,7 +1035,7 @@ public:
    /////////////////////////////////////////////////////////////////////////////////
    const int Nsimd = grid->Nsimd();

-    // Allow for multiple stencils to exist simultaneously
+    // Allow for multiple stencils to be communicated simultaneously
    if (!preserve_shm)
      _grid->ShmBufferFreeAll();
@@ -935,7 +1103,8 @@ public:
    GridBase *grid=_grid;
    const int Nsimd = grid->Nsimd();

-    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    //    int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
+    int comms_recv = this->_comms_recv[point];
    int fd = _grid->_fdimensions[dimension];
    int ld = _grid->_ldimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
@@ -1124,8 +1293,8 @@ public:

    int comms_send = this->_comms_send[point];
    int comms_recv = this->_comms_recv[point];
-    int comms_partial_send = this->_comms_partial_send[point] ;
-    int comms_partial_recv = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

    assert(rhs.Grid()==_grid);
    //  conformable(_grid,rhs.Grid());
@@ -1160,11 +1329,11 @@ public:
      int rbytes;

      if ( comms_send ) xbytes = bytes; // Full send
-      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
      else xbytes = 0; // full dirichlet

      if ( comms_recv ) rbytes = bytes;
-      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
      else rbytes = 0;

      int so  = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1191,7 +1360,8 @@ public:
      }

-      if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+      //      if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
+      if ( compress.DecompressionStep()&&comms_recv) {
        recv_buf=u_simd_recv_buf[0];
      } else {
        recv_buf=this->u_recv_buf_p;
@@ -1225,7 +1395,8 @@ public:
#endif

      // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
-      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+      //      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
+      compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);

      int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
      if ( !duplicate ) { // Force comms for now
@@ -1234,8 +1405,8 @@ public:
        // Build a list of things to do after we synchronise GPUs
        // Start comms now???
        ///////////////////////////////////////////////////////////
-        int do_send = (comms_send|comms_partial_send) && (!shm_send );
-        int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
+        int do_send = (comms_send) && (!shm_send );
+        int do_recv = (comms_send) && (!shm_recv );
        AddPacket((void *)&send_buf[comm_off],
                  (void *)&recv_buf[comm_off],
                  xmit_to_rank, do_send,
@@ -1243,7 +1414,7 @@ public:
                  xbytes,rbytes);
      }

-      if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
+      if ( (compress.DecompressionStep() && comms_recv) ) {
        AddDecompress(&this->u_recv_buf_p[comm_off],
                      &recv_buf[comm_off],
                      words,Decompressions);
@@ -1265,8 +1436,8 @@ public:

    int comms_send = this->_comms_send[point];
    int comms_recv = this->_comms_recv[point];
-    int comms_partial_send = this->_comms_partial_send[point] ;
-    int comms_partial_recv = this->_comms_partial_recv[point] ;
+    //    int comms_partial_send = this->_comms_partial_send[point] ;
+    //    int comms_partial_recv = this->_comms_partial_recv[point] ;

    int fd = _grid->_fdimensions[dimension];
    int rd = _grid->_rdimensions[dimension];
@@ -1341,18 +1512,20 @@ public:


      if ( comms_send ) xbytes = bytes;
-      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
      else xbytes = 0;

      if ( comms_recv ) rbytes = bytes;
-      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
+      //      else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
      else rbytes = 0;

      // Gathers SIMD lanes for send and merge
      // Different faces can be full comms or partial comms with multiple ranks per node
-      if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+      //      if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
+      if ( comms_send || comms_recv ) {

-        int partial = partialDirichlet;
+        //        int partial = partialDirichlet;
+        int partial = 0;
        compressor::Gather_plane_exchange(face_table[face_idx],rhs,
                                          spointers,dimension,sx,cbmask,
                                          compress,permute_type,partial );
@@ -1418,7 +1591,8 @@ public:
        if ( (bytes != rbytes) && (rbytes!=0) ){
          acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
        }
-        int do_send = (comms_send|comms_partial_send) && (!shm_send );
+        //        int do_send = (comms_send|comms_partial_send) && (!shm_send );
+        int do_send = (comms_send) && (!shm_send );
        AddPacket((void *)sp,(void *)rp,
                  xmit_to_rank,do_send,
                  recv_from_rank,do_send,
@@ -1432,7 +1606,8 @@ public:
        }
      }
      // rpointer may be doing a remote read in the gather over SHM
-      if ( comms_recv|comms_partial_recv ) {
+      //      if ( comms_recv|comms_partial_recv ) {
+      if ( comms_recv ) {
        AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
      }
@@ -67,7 +67,7 @@ void acceleratorInit(void)
  printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);


-  GPU_PROP_FMT(totalGlobalMem,"%lld");
+  GPU_PROP_FMT(totalGlobalMem,"%zu");
  GPU_PROP(managedMemory);
  GPU_PROP(isMultiGpuBoard);
  GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)

  char hostname[HOST_NAME_MAX+1];
  gethostname(hostname, HOST_NAME_MAX+1);
-  if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
+  if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);

  auto devices = sycl::device::get_devices();
  for(int d = 0;d<devices.size();d++){
@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
   auto err = cudaMallocHost((void **)&ptr,bytes);
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
-    printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
     assert(0);
   }
   return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
   auto err = cudaMallocManaged((void **)&ptr,bytes);
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
-    printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
     assert(0);
   }
   return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
   auto err = cudaMalloc((void **)&ptr,bytes);
   if( err != cudaSuccess ) {
     ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
+    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
   }
   return ptr;
 };
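The three allocator hunks above are one and the same portability fix: `bytes` is a `size_t`, which `%d` mis-reads on LP64 platforms where `size_t` is 64-bit but `int` is 32-bit. A standalone sketch of the corrected idiom (illustrative only, not Grid code):

#include <cstdio>
#include <cstddef>

int main(void)
{
  size_t bytes = (size_t)1 << 33;   // an 8 GiB request: does not fit in a 32-bit int
  // %zu is the C99/C++11 conversion for size_t, portable across 32- and 64-bit targets
  printf("requesting %zu bytes\n", bytes);
  return 0;
}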
@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <cstdlib>
 #include <memory>
 
 #include <Grid/Grid.h>
 
 #include <Grid/util/CompilerCompatible.h>
 
+#ifdef HAVE_UNWIND
+#include <libunwind.h>
+#endif
+
 #include <fenv.h>
 #ifdef __APPLE__
@@ -187,9 +191,8 @@ void GridParseLayout(char **argv,int argc,
                     Coordinate &latt_c,
                     Coordinate &mpi_c)
 {
-  auto mpi =std::vector<int>(Nd,1);
-  auto latt=std::vector<int>(Nd,8);
+  auto mpi =std::vector<int>({1,1,1,1});
+  auto latt=std::vector<int>({8,8,8,8});
 
 
   GridThread::SetMaxThreads();
 
@@ -229,9 +232,6 @@ void GridParseLayout(char **argv,int argc,
   }
   // Copy back into coordinate format
   int nd = mpi.size();
-  // std::cout << "mpi.size() "<<nd<<std::endl;
-  // std::cout << "latt.size() "<<latt.size()<<std::endl;
-  // std::cout << "Nd "<<Nd<<std::endl;
   assert(latt.size()==nd);
   latt_c.resize(nd);
   mpi_c.resize(nd);
@@ -299,6 +299,20 @@ void GridBanner(void)
   std::cout << std::setprecision(9);
 }
 
+//Some file local variables
+static int fileno_stdout;
+static int fileno_stderr;
+static int signal_delay;
+class dlRegion {
+public:
+  uint64_t start;
+  uint64_t end;
+  uint64_t size;
+  uint64_t offset;
+  std::string name;
+};
+std::vector<dlRegion> dlMap;
+
 void Grid_init(int *argc,char ***argv)
 {
 
@@ -351,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
     Grid_debug_handler_init();
   }
+  // Sleep n-seconds at end of handler
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
+    GridCmdOptionInt(arg,signal_delay);
+  }
+  // periodic wakeup with stack trace printed
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
+    Grid_debug_heartbeat();
+  }
+  // periodic wakeup with empty handler (interrupts some system calls)
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
+    Grid_heartbeat();
+  }
 
 #if defined(A64FX)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -400,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
     fp=freopen(ename.str().c_str(),"w",stderr);
     assert(fp!=(FILE *)NULL);
   }
+  fileno_stdout = fileno(stdout);
+  fileno_stderr = fileno(stderr) ;
+
   ////////////////////////////////////////////////////
   // OK to use GridLogMessage etc from here on
   ////////////////////////////////////////////////////
   std::cout << GridLogMessage << "================================================ "<<std::endl;
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
+  {
     gethostname(hostname, HOST_NAME_MAX+1);
-  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
+    time_t mytime;
+    struct tm *info;
+    char buffer[80];
+    time(&mytime);
+    info = localtime(&mytime);
+    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
+    std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
+  }
 
   /////////////////////////////////////////////////////////
   // Reporting
@@ -425,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
     MemoryProfiler::stats = &dbgMemStats;
   }
 
+  /////////////////////////////////////////////////////////
+  // LD.so space
+  /////////////////////////////////////////////////////////
+#ifndef __APPLE__
+  {
+    // Provides mapping of .so files
+    FILE *f = fopen("/proc/self/maps", "r");
+    if (f) {
+      char line[256];
+      while (fgets(line, sizeof(line), f)) {
+        if (strstr(line, "r-xp")) {
+          dlRegion region;
+          uint32_t major, minor, inode;
+          uint64_t start,end,offset;
+          char path[PATH_MAX];
+          sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
+                 &start,&end,&offset,
+                 &major,&minor,&inode,path);
+          region.start=start;
+          region.end  =end;
+          region.offset=offset;
+          region.name = std::string(path);
+          region.size = region.end-region.start;
+          dlMap.push_back(region);
+          // std::cout << GridLogMessage<< line;
+        }
+      }
+    }
+    fclose(f);
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
+    std::cout << GridLogMessage << "================================================ "<<std::endl;
+    std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
+    std::cout << GridLogMessage << "================================================ "<<std::endl;
+    for(int r=0;r<dlMap.size();r++){
+      auto region = dlMap[r];
+      std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
+    }
+    std::cout << GridLogMessage << "================================================ "<<std::endl;
+  }
+#endif
 ////////////////////////////////////
 // Logging
 ////////////////////////////////////
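The hunk above snapshots the executable (`r-xp`) regions of /proc/self/maps at start-up so that a signal handler can later translate an instruction pointer into library+offset without calling anything unsafe. Outside a handler the same translation can be had from `dladdr()`; a minimal sketch under that assumption (illustrative, not Grid code — `where_am_i` is a hypothetical helper):

// Resolve a code address to library+offset; the printed offset is what one
// would feed to e.g. addr2line -e <library> <offset> to recover file/line.
// dladdr() is not on the POSIX async-signal-safe list, which is why the
// Grid handler prefers the dlMap table built once at Grid_init time.
#include <dlfcn.h>
#include <cstdio>
#include <cstdint>

void where_am_i(void *ip)
{
  Dl_info info;
  if (dladdr(ip, &info) && info.dli_fname) {
    uint64_t off = (uint64_t)ip - (uint64_t)info.dli_fbase;
    printf("%s+0x%llx (%s)\n", info.dli_fname,
           (unsigned long long)off,
           info.dli_sname ? info.dli_sname : "?");
  }
}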
@@ -457,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
   std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
   std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
-  std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
+  std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
-  std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
-  std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
-  std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
-  std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
   std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
+  std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
+  std::cout<<GridLogMessage<<"Debug:"<<std::endl;
+  std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
+  std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
+  std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
+  std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
+  std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
+  std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
+  std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
   std::cout<<GridLogMessage<<"Performance:"<<std::endl;
   std::cout<<GridLogMessage<<std::endl;
@@ -559,17 +642,56 @@ void GridLogLayout() {
 }
 
 void * Grid_backtrace_buffer[_NBACKTRACE];
+#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
 
-void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
+void sig_print_dig(uint32_t dig)
 {
-  fprintf(stderr,"Signal handler on host %s\n",hostname);
-  fprintf(stderr,"FlightRecorder step %d stage %s \n",
-          FlightRecorder::StepLoggingCounter,
-          FlightRecorder::StepName);
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr,"         code %d\n",si->si_code);
-  // x86 64bit
+  const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
+  if ( dig>=0 && dig< 16){
+    SIGLOG(digits[dig]);
+  }
+}
+void sig_print_uint(uint32_t A)
+{
+  int dig;
+  int nz=0;
+#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
+  DIGIT(1000000000); // Catches 4BN = 2^32
+  DIGIT(100000000);
+  DIGIT(10000000);
+  DIGIT(1000000);
+  DIGIT(100000);
+  DIGIT(10000);
+  DIGIT(1000);
+  DIGIT(100);
+  DIGIT(10);
+  DIGIT(1);
+  if (nz==0) SIGLOG("0");
+}
+void sig_print_hex(uint64_t A)
+{
+  int nz=0;
+  int dig;
+#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
+  SIGLOG("0x");
+  NIBBLE((A>>(15*4))&0xF);
+  NIBBLE((A>>(14*4))&0xF);
+  NIBBLE((A>>(13*4))&0xF);
+  NIBBLE((A>>(12*4))&0xF);
+  NIBBLE((A>>(11*4))&0xF);
+  NIBBLE((A>>(10*4))&0xF);
+  NIBBLE((A>>(9*4))&0xF);
+  NIBBLE((A>>(8*4))&0xF);
+  NIBBLE((A>>(7*4))&0xF);
+  NIBBLE((A>>(6*4))&0xF);
+  NIBBLE((A>>(5*4))&0xF);
+  NIBBLE((A>>(4*4))&0xF);
+  NIBBLE((A>>(3*4))&0xF);
+  NIBBLE((A>>(2*4))&0xF);
+  NIBBLE((A>>4)&0xF);
+  sig_print_dig(A&0xF);
+}
+/*
 #ifdef __linux__
 #ifdef __x86_64__
   ucontext_t * uc= (ucontext_t *)ptr;
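The `SIGLOG`/`sig_print_*` machinery above exists because `fprintf` and `std::cout` are not async-signal-safe: they may take locks or allocate, and can deadlock if the signal arrives mid-I/O. Only `write(2)` and the other functions on the POSIX async-signal-safe list may be called from a handler, hence the hand-rolled digit printers. A minimal sketch of the idiom (illustrative, not Grid code):

#include <unistd.h>
#include <string.h>
#include <signal.h>

static void handler(int sig)
{
  // write(2) is async-signal-safe; formatted I/O is not
  const char msg[] = "caught a signal\n";
  (void)!write(STDERR_FILENO, msg, sizeof(msg)-1);
}

int main(void)
{
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_handler = handler;
  sigaction(SIGINT, &sa, NULL);
  pause();   // send SIGINT (Ctrl-C) to see the message
  return 0;
}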
@@ -577,80 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
   fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
 #endif
 #endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+*/
+void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
+{
+  SIGLOG("Signal handler on host ");
+  SIGLOG(hostname);
+  SIGLOG(" process id ");
+  sig_print_uint((uint32_t)getpid());
+  SIGLOG("\n");
+  SIGLOG("FlightRecorder step ");
+  sig_print_uint(FlightRecorder::StepLoggingCounter);
+  SIGLOG(" stage ");
+  SIGLOG(FlightRecorder::StepName);
+  SIGLOG("\n");
+  SIGLOG("Caught signal ");
+  sig_print_uint(si->si_signo);
+  SIGLOG("\n");
+  SIGLOG("  mem address ");
+  sig_print_hex((uint64_t)si->si_addr);
+  SIGLOG("\n");
+  SIGLOG("         code ");
+  sig_print_uint(si->si_code);
+  SIGLOG("\n");
+
+  ucontext_t *uc= (ucontext_t *)ptr;
+
+  SIGLOG("Backtrace:\n");
+#ifdef HAVE_UNWIND
+  // Debug cross check on offsets
+  // int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+  // backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
+  unw_cursor_t cursor;
+  unw_word_t ip, off;
+  if (!unw_init_local(&cursor, uc) ) {
+
+    SIGLOG(" frame IP function\n");
+    int level = 0;
+    int ret = 0;
+    while(1) {
+      char name[128];
+      if (level >= _NBACKTRACE) return;
+
+      unw_get_reg(&cursor, UNW_REG_IP, &ip);
+
+      sig_print_uint(level); SIGLOG(" ");
+      sig_print_hex(ip); SIGLOG(" ");
+      for(int r=0;r<dlMap.size();r++){
+        if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
+          SIGLOG(dlMap[r].name.c_str());
+          SIGLOG("+");
+          sig_print_hex((ip-dlMap[r].start));
+          break;
+        }
+      }
+      SIGLOG("\n");
+      Grid_backtrace_buffer[level]=(void *)ip;
+      level++;
+      ret = unw_step(&cursor);
+      if (ret <= 0) {
+        return;
+      }
+    }
+  }
+#else
+  // Known Asynch-Signal unsafe
+  int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+  backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
+#endif
+}
+
+void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
+{
+  Grid_generic_handler(sig,si,ptr);
+  SIGLOG("\n");
+}
+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
+{
+  Grid_generic_handler(sig,si,ptr);
+  if (signal_delay) {
+    SIGLOG("Adding extra signal delay ");
+    sig_print_uint(signal_delay);
+    SIGLOG(" s\n");
+    usleep( (uint64_t) signal_delay*1000LL*1000LL);
+  }
+  SIGLOG("\n");
   return;
 }
 
-void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
+void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
-  fprintf(stderr,"Signal handler on host %s\n",hostname);
-  fprintf(stderr,"Caught signal %d\n",si->si_signo);
-  fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr);
-  fprintf(stderr,"         code %d\n",si->si_code);
-  // Linux/Posix
-#ifdef __linux__
-  // And x86 64bit
-#ifdef __x86_64__
-  ucontext_t * uc= (ucontext_t *)ptr;
-  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
-  fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip);
-#define REG(A) fprintf(stderr,"  %s %lx\n",#A,sc-> A);
-  REG(rdi);
-  REG(rsi);
-  REG(rbp);
-  REG(rbx);
-  REG(rdx);
-  REG(rax);
-  REG(rcx);
-  REG(rsp);
-  REG(rip);
-
-
-  REG(r8);
-  REG(r9);
-  REG(r10);
-  REG(r11);
-  REG(r12);
-  REG(r13);
-  REG(r14);
-  REG(r15);
-#endif
-#endif
-  fflush(stderr);
-  BACKTRACEFP(stderr);
-  fprintf(stderr,"Called backtrace\n");
-  fflush(stdout);
-  fflush(stderr);
+  Grid_generic_handler(sig,si,ptr);
+  SIGLOG("\n");
   exit(0);
   return;
 };
+void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
+{
+  // SIGLOG("heartbeat signal handled\n");
+  return;
+}
+void Grid_debug_heartbeat(void)
+{
+  struct sigaction sa_ping;
+
+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_usr_signal_handler;
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGALRM,&sa_ping,NULL);
+
+  // repeating 10s heartbeat
+  struct itimerval it_val;
+  it_val.it_value.tv_sec = 10;
+  it_val.it_value.tv_usec = 0;
+  it_val.it_interval = it_val.it_value;
+  setitimer(ITIMER_REAL, &it_val, NULL);
+}
+void Grid_heartbeat(void)
+{
+  struct sigaction sa_ping;
+
+  sigemptyset (&sa_ping.sa_mask);
+  sa_ping.sa_sigaction= Grid_empty_signal_handler;
+  sa_ping.sa_flags    = SA_SIGINFO;
+  sigaction(SIGALRM,&sa_ping,NULL);
+
+  // repeating 10s heartbeat
+  struct itimerval it_val;
+  it_val.it_value.tv_sec = 10;
+  it_val.it_value.tv_usec = 1000;
+  it_val.it_interval = it_val.it_value;
+  setitimer(ITIMER_REAL, &it_val, NULL);
+}
 void Grid_exit_handler(void)
 {
-  // BACKTRACEFP(stdout);
-  // fflush(stdout);
+  BACKTRACEFP(stdout);
+  fflush(stdout);
 }
 void Grid_debug_handler_init(void)
 {
   struct sigaction sa;
   sigemptyset (&sa.sa_mask);
-  sa.sa_sigaction= Grid_sa_signal_handler;
+  sa.sa_sigaction= Grid_fatal_signal_handler;
   sa.sa_flags    = SA_SIGINFO;
-  // sigaction(SIGSEGV,&sa,NULL);
   sigaction(SIGTRAP,&sa,NULL);
-  // sigaction(SIGBUS,&sa,NULL);
-  // sigaction(SIGUSR2,&sa,NULL);
 
-  // feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
-  // sigaction(SIGFPE,&sa,NULL);
-  sigaction(SIGKILL,&sa,NULL);
   sigaction(SIGILL,&sa,NULL);
+#ifndef GRID_SYCL
+  sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
+  sigaction(SIGBUS,&sa,NULL);
+  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  sigaction(SIGFPE,&sa,NULL);
+#endif
 
-  // Non terminating SIGUSR1/2 handler
+  // Non terminating SIGHUP handler
   struct sigaction sa_ping;
   sigemptyset (&sa_ping.sa_mask);
   sa_ping.sa_sigaction= Grid_usr_signal_handler;

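The backtrace in `Grid_generic_handler` above walks the stack with libunwind's local-unwinding API, seeded from the `ucontext_t` the kernel hands to the handler; symbol names are deliberately avoided in favour of the precomputed `dlMap` offsets, since `unw_get_proc_name` is not guaranteed async-signal-safe. Outside a handler the same walk looks like this minimal sketch (illustrative, not Grid code; build with -lunwind):

#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <cstdio>

void print_backtrace(void)
{
  unw_context_t context;
  unw_cursor_t  cursor;
  unw_getcontext(&context);            // capture current register state
  unw_init_local(&cursor, &context);   // start unwinding from this frame
  while (unw_step(&cursor) > 0) {
    unw_word_t ip, off;
    char name[128];
    unw_get_reg(&cursor, UNW_REG_IP, &ip);
    // symbol lookup may fail for stripped frames; fall back to the raw IP
    if (unw_get_proc_name(&cursor, name, sizeof(name), &off) == 0)
      printf("0x%llx %s+0x%llx\n", (unsigned long long)ip, name, (unsigned long long)off);
    else
      printf("0x%llx ?\n", (unsigned long long)ip);
  }
}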
@@ -38,7 +38,11 @@ char * GridHostname(void);
 
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
+void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
+void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
+void Grid_debug_heartbeat(void);
+void Grid_heartbeat(void);
 void Grid_quiesce_nodes(void);
 void Grid_unquiesce_nodes(void);

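The `Grid_debug_heartbeat()` and `Grid_heartbeat()` entry points declared above (defined in the Init.cc hunks earlier) both arm a repeating `ITIMER_REAL` timer: on each expiry the kernel delivers SIGALRM, which either prints a backtrace (--debug-heartbeat) or runs an empty handler (--heartbeat) whose only effect is to interrupt blocking system calls with EINTR, often enough to dislodge a hung collective. A standalone sketch of the pattern (illustrative, not Grid code):

#include <signal.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int sig, siginfo_t *si, void *uc)
{
  const char msg[] = "tick\n";
  (void)!write(STDERR_FILENO, msg, sizeof(msg)-1);  // async-signal-safe
}

int main(void)
{
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = on_alarm;
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGALRM, &sa, NULL);

  struct itimerval it;
  it.it_value.tv_sec = 10;  it.it_value.tv_usec = 0;  // first expiry
  it.it_interval     = it.it_value;                   // then every 10 s
  setitimer(ITIMER_REAL, &it, NULL);

  for (;;) pause();   // each tick interrupts pause() and prints
}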
@@ -201,8 +201,7 @@ int main(int argc, char **argv) {
 
   Params.dirichlet=NonDirichlet;
   ParamsDir.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=0;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+  //  ParamsDir.partialDirichlet=0;
 
   //  double StoppingCondition = 1e-14;
   //  double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
     if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
     else ParamsDen.dirichlet = NonDirichlet;
 
-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else ParamsNum.partialDirichlet = 0;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    else ParamsNum.partialDirichlet = 0;
 
-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else ParamsDen.partialDirichlet = 0;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    else ParamsDen.partialDirichlet = 0;
 
     Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
   ParamsF.dirichlet=NonDirichlet;
   ParamsDir.dirichlet=Dirichlet;
   ParamsDirF.dirichlet=Dirichlet;
-  ParamsDir.partialDirichlet=1;
-  ParamsDirF.partialDirichlet=1;
-  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
+  //  ParamsDir.partialDirichlet=1;
+  //  ParamsDirF.partialDirichlet=1;
+  //  std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
 
   //  double StoppingCondition = 1e-14;
   //  double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
     if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
     else ParamsDen.dirichlet = NonDirichlet;
 
-    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
-    else ParamsNum.partialDirichlet = 0;
+    //    if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
+    //    else ParamsNum.partialDirichlet = 0;
 
-    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
-    else ParamsDen.partialDirichlet = 0;
+    //    if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
+    //    else ParamsDen.partialDirichlet = 0;
 
     Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
 
     ParamsDenF.dirichlet = ParamsDen.dirichlet;
-    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
+    //    ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
     DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
 
     ParamsNumF.dirichlet = ParamsNum.dirichlet;
-    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
+    //    ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
     NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
 
     LinOpD.push_back(new LinearOperatorD(*Denominators[h]));

TODO
@@ -1,8 +1,3 @@
-
-* Clean up the extract merge and replace with insertLane/extractLane
-
------
-
 i)  Refine subspace with HDCG & recompute
 ii) Block Lanczos in coarse space
 iii) Batched block project in the operator computation

@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
   }
 
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   header();
 
   for(int lat=8;lat<=maxlat;lat+=4){
     for(int Ls=8;Ls<=8;Ls*=2){
 
       Coordinate latt_size ({lat*mpi_layout[0],
                              lat*mpi_layout[1],
                              lat*mpi_layout[2],
                              lat*mpi_layout[3]});
 
       GridCartesian Grid(latt_size,simd_layout,mpi_layout);
       RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
         rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
       }
 
-      int ncomm;
-      double dbytes;
-      for(int i=0;i<Nloop;i++){
-        double start=usecond();
-
-        dbytes=0;
-        ncomm=0;
-
-        std::vector<CommsRequest_t> requests;
-
-        for(int mu=0;mu<4;mu++){
-
-          if (mpi_layout[mu]>1 ) {
-
-            ncomm++;
-            int comm_proc=1;
-            int xmit_to_rank;
-            int recv_from_rank;
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
-                                              (void *)&xbuf[mu][0],
-                                              xmit_to_rank,1,
-                                              (void *)&rbuf[mu][0],
-                                              recv_from_rank,1,
-                                              bytes,bytes,mu);
-
-            comm_proc = mpi_layout[mu]-1;
-
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
                                              (void *)&xbuf[mu+4][0],
-                                              xmit_to_rank,1,
-                                              (void *)&rbuf[mu+4][0],
-                                              recv_from_rank,1,
-                                              bytes,bytes,mu+4);
-
-          }
-        }
-        Grid.StencilSendToRecvFromComplete(requests,0);
-        Grid.Barrier();
-        double stop=usecond();
-        t_time[i] = stop-start; // microseconds
-
-      }
-
-      timestat.statistics(t_time);
-
-      dbytes=dbytes*ppn;
-      double xbytes    = dbytes*0.5;
-      //      double rbytes    = dbytes*0.5;
-      double bidibytes = dbytes;
-
-      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
-               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
-               <<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
-               <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-
-    }
-  }
-
-
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
-  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  header();
-
-  for(int lat=8;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=8;Ls*=2){
-
-      Coordinate latt_size ({lat*mpi_layout[0],
-                             lat*mpi_layout[1],
-                             lat*mpi_layout[2],
-                             lat*mpi_layout[3]});
-
-      GridCartesian Grid(latt_size,simd_layout,mpi_layout);
-      RealD Nrank = Grid._Nprocessors;
-      RealD Nnode = Grid.NodeCount();
-      RealD ppn = Nrank/Nnode;
-
-      std::vector<HalfSpinColourVectorD *> xbuf(8);
-      std::vector<HalfSpinColourVectorD *> rbuf(8);
-      Grid.ShmBufferFreeAll();
-      uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
-      for(int d=0;d<8;d++){
-        xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-        rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
-      }
-
       int ncomm;
       double dbytes;
       for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
         std::vector<CommsRequest_t> requests;
         dbytes=0;
         ncomm=0;
-        for(int mu=0;mu<4;mu++){
+        for(int dir=0;dir<8;dir++) {
+
+          double tbytes;
+          int mu =dir % 4;
 
           if (mpi_layout[mu]>1 ) {
 
             ncomm++;
-            int comm_proc=1;
             int xmit_to_rank;
             int recv_from_rank;
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
-                                              (void *)&xbuf[mu][0],
-                                              xmit_to_rank,1,
-                                              (void *)&rbuf[mu][0],
-                                              recv_from_rank,1,
-                                              bytes,bytes,mu);
-            Grid.StencilSendToRecvFromComplete(requests,mu);
-            requests.resize(0);
-
-            comm_proc = mpi_layout[mu]-1;
-
-            Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-            dbytes+=
-              Grid.StencilSendToRecvFromBegin(requests,
-                                              (void *)&xbuf[mu+4][0],
-                                              xmit_to_rank,1,
-                                              (void *)&rbuf[mu+4][0],
-                                              recv_from_rank,1,
-                                              bytes,bytes,mu+4);
-            Grid.StencilSendToRecvFromComplete(requests,mu+4);
-            requests.resize(0);
+            if ( dir == mu ) {
+              int comm_proc=1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            } else {
+              int comm_proc = mpi_layout[mu]-1;
+              Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+            }
+            int tid = omp_get_thread_num();
+            tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
                                               (void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
+
+            dbytes+=tbytes;
           }
         }
         Grid.Barrier();
        double stop=usecond();
         t_time[i] = stop-start; // microseconds
 
       }
 
       timestat.statistics(t_time);

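The rates this benchmark prints follow directly from the loop above: `t_time[i]` is the wall time of one iteration in microseconds and `dbytes` the bytes moved per iteration, so bytes/µs is numerically MB/s. A worked sketch of the arithmetic with hypothetical numbers (not measured values):

#include <cstdio>

int main(void)
{
  double dbytes  = 8.0*1024*1024;  // hypothetical bytes moved per iteration
  double mean_us = 100.0;          // hypothetical mean iteration time in microseconds
  double xbytes  = dbytes*0.5;     // the send direction is half the bidirectional total
  printf("unidirectional %.1f MB/s, bidirectional %.1f MB/s\n",
         xbytes/mean_us, dbytes/mean_us);
  return 0;
}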
@@ -32,18 +32,18 @@
 using namespace std;
 using namespace Grid;
 
-template<class d>
-struct scal {
-  d internal;
+////////////////////////
+/// Move to domains ////
+////////////////////////
+
+Gamma::Algebra Gmu [] = {
+  Gamma::Algebra::GammaX,
+  Gamma::Algebra::GammaY,
+  Gamma::Algebra::GammaZ,
+  Gamma::Algebra::GammaT
 };
 
-Gamma::Algebra Gmu [] = {
-  Gamma::Algebra::GammaX,
-  Gamma::Algebra::GammaY,
-  Gamma::Algebra::GammaZ,
-  Gamma::Algebra::GammaT
-};
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
 
 
 int main (int argc, char ** argv)
 {
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
 
   int threads = GridThread::GetThreads();
 
-  Coordinate latt4 = GridDefaultLatt();
-  int Ls=8;
-  for(int i=0;i<argc;i++)
+  int Ls=16;
+  for(int i=0;i<argc;i++) {
     if(std::string(argv[i]) == "-Ls"){
       std::stringstream ss(argv[i+1]); ss >> Ls;
     }
+  }
+
+  //////////////////
+  // With comms
+  //////////////////
+  Coordinate Dirichlet(Nd+1,0);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  Benchmark(Ls,Dirichlet,true);
+
+  //////////////////
+  // Domain decomposed
+  //////////////////
+  /*
+  Coordinate latt4 = GridDefaultLatt();
+  Coordinate mpi   = GridDefaultMpi();
+  Coordinate CommDim(Nd);
+  Coordinate shm;
+  GlobalSharedMemory::GetShmDims(mpi,shm);
+
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  //  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+
+  Benchmark(Ls,Dirichlet,false);
+
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
+
+  Benchmark(Ls,Dirichlet,true);
+  */
+
+  Grid_finalize();
+  exit(0);
+}
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
+{
+  Coordinate latt4 = GridDefaultLatt();
   GridLogLayout();
 
   long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
 
-  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+#undef SINGLE
+#ifdef SINGLE
+  typedef vComplexF          Simd;
+  typedef LatticeFermionF    FermionField;
+  typedef LatticeGaugeFieldF GaugeField;
+  typedef LatticeColourMatrixF ColourMatrixField;
+  typedef DomainWallFermionF FermionAction;
+#else
+  typedef vComplexD          Simd;
+  typedef LatticeFermionD    FermionField;
+  typedef LatticeGaugeFieldD GaugeField;
+  typedef LatticeColourMatrixD ColourMatrixField;
+  typedef DomainWallFermionD FermionAction;
+#endif
+
+  GridCartesian * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
 GridRedBlackCartesian * UrbGrid   = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
-  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
-  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
-  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
-  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
-  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
 
   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
   GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
-  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 
-  LatticeFermion src   (FGrid); random(RNG5,src);
+  FermionField src   (FGrid); random(RNG5,src);
 #if 0
   src = Zero();
   {
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
   src = src*N2;
 #endif
 
-  LatticeFermion result(FGrid); result=Zero();
-  LatticeFermion    ref(FGrid);    ref=Zero();
-  LatticeFermion    tmp(FGrid);
-  LatticeFermion    err(FGrid);
+  FermionField result(FGrid); result=Zero();
+  FermionField ref(FGrid);    ref=Zero();
+  FermionField tmp(FGrid);
+  FermionField err(FGrid);
 
   std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
-  LatticeGaugeField Umu(UGrid);
+  GaugeField Umu(UGrid);
+  GaugeField UmuCopy(UGrid);
   SU<Nc>::HotConfiguration(RNG4,Umu);
+  //  SU<Nc>::ColdConfiguration(Umu);
+  UmuCopy=Umu;
   std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-#if 0
-  Umu=1.0;
-  for(int mu=0;mu<Nd;mu++){
-    LatticeColourMatrix ttmp(UGrid);
-    ttmp = PeekIndex<LorentzIndex>(Umu,mu);
-    //      if (mu !=2 ) ttmp = 0;
-    //      ttmp = ttmp* pow(10.0,mu);
-    PokeIndex<LorentzIndex>(Umu,ttmp,mu);
-  }
-  std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
-#endif
 
+  ////////////////////////////////////
+  // Apply BCs
+  ////////////////////////////////////
+  Coordinate Block(4);
+  for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
+
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
+  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
+
+  DirichletFilter<GaugeField> Filter(Block);
+  Filter.applyFilter(Umu);
 
   ////////////////////////////////////
   // Naive wilson implementation
   ////////////////////////////////////
-  // replicate across fifth dimension
-  LatticeGaugeField Umu5d(FGrid);
-  std::vector<LatticeColourMatrix> U(4,FGrid);
-  {
-    autoView( Umu5d_v, Umu5d, CpuWrite);
-    autoView( Umu_v  , Umu  , CpuRead);
-    for(int ss=0;ss<Umu.Grid()->oSites();ss++){
-      for(int s=0;s<Ls;s++){
-        Umu5d_v[Ls*ss+s] = Umu_v[ss];
-      }
-    }
-  }
+  std::vector<ColourMatrixField> U(4,UGrid);
   for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
   }
 
   std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
 
   if (1)
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
     ref = Zero();
     for(int mu=0;mu<Nd;mu++){
 
-      tmp = U[mu]*Cshift(src,mu+1,1);
+      tmp = Cshift(src,mu+1,1);
+      {
+        autoView( tmp_v , tmp , CpuWrite);
+        autoView( U_v , U[mu] , CpuRead);
+        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+          for(int s=0;s<Ls;s++){
+            tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
+          }
+        }
+      }
       ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 
-      tmp =adj(U[mu])*src;
+      {
+        autoView( tmp_v , tmp , CpuWrite);
+        autoView( U_v , U[mu] , CpuRead);
+        autoView( src_v, src , CpuRead);
+        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+          for(int s=0;s<Ls;s++){
+            tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+          }
+        }
+      }
       tmp =Cshift(tmp,mu+1,-1);
       ref=ref + tmp + Gamma(Gmu[mu])*tmp;
     }
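The per-slice loops above rely on the s-coordinate being innermost on the five-dimensional grid: element (ss,s) of a 5D fermion field lives at linear index Ls*ss+s, so one 4D gauge link U_v[ss] multiplies Ls consecutive fermion entries and the Umu5d replica field of the old code becomes unnecessary. A plain-array illustration of the index rule (not Grid code):

#include <cstdio>
#include <vector>

int main(void)
{
  const int sites4d = 4, Ls = 3;                 // tiny hypothetical volume
  std::vector<double> U(sites4d, 2.0);           // stand-in for U_v[ss]
  std::vector<double> psi(sites4d*Ls, 1.0);      // stand-in for tmp_v
  for (int ss = 0; ss < sites4d; ss++)
    for (int s = 0; s < Ls; s++)
      psi[Ls*ss + s] = U[ss]*psi[Ls*ss + s];     // same rule as the hunk above
  printf("psi[Ls*2+1] = %g\n", psi[Ls*2 + 1]);   // prints 2
  return 0;
}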
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
-  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
-  if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
-  if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
+  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
 #ifdef GRID_OMP
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
-  DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-  int ncall =1000;
+  FermionAction::ImplParams p;
+  p.dirichlet=Dirichlet;
+  FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.SloppyComms(sloppy);
+  Dw.ImportGauge(Umu);
+
+  int ncall =300;
+  RealD n2e;
 
   if (1) {
     FGrid->Barrier();
     Dw.Dhop(src,result,0);
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
   double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
   double flops=single_site_flops*volume*ncall;
 
-  auto nsimd = vComplex::Nsimd();
-  auto simdwidth = sizeof(vComplex);
+  auto nsimd = Simd::Nsimd();
+  auto simdwidth = sizeof(Simd);
 
   // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
   double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
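The flop rate reported below rests on `single_site_flops = 8*Nc*(7+16*Nc)`; for Nc=3 this is 8*3*(7+48) = 1320, the standard Wilson-dslash count of 1320 flops per lattice site, and with times kept in microseconds `flops/(t1-t0)` comes out directly in Mflop/s. A one-line check of the arithmetic (illustrative):

#include <cassert>

int main(void)
{
  const unsigned long Nc = 3;
  const unsigned long single_site_flops = 8*Nc*(7+16*Nc);
  assert(single_site_flops == 1320);   // 8*3*55
  return 0;
}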
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
   double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
 
   std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-  //  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-  //  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
   std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
   std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
-  std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl;
-  std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl;
   err = ref-result;
-  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-  //exit(0);
+  n2e = norm2(err);
+  std::cout<<GridLogMessage << "norm diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;
 
-  if(( norm2(err)>1.0e-4) ) {
-    /*
-    std::cout << "RESULT\n " << result<<std::endl;
-    std::cout << "REF   \n " << ref <<std::endl;
-    std::cout << "ERR   \n " << err <<std::endl;
-    */
+  if(( n2e>1.0e-4) ) {
     std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
     FGrid->Barrier();
+    std::cout<<GridLogMessage << "RESULT" << std::endl;
+    //    std::cout << result<<std::endl;
+    std::cout << norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "REF" << std::endl;
+    std::cout << norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "ERR" << std::endl;
+    std::cout << norm2(err)<<std::endl;
+    FGrid->Barrier();
     exit(-1);
   }
-  assert (norm2(err)< 1.0e-4 );
+  assert (n2e< 1.0e-4 );
 }
 
 if (1)
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<Nd;mu++){
 
     //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
-    tmp = U[mu]*Cshift(src,mu+1,1);
+    tmp = Cshift(src,mu+1,1);
     {
       autoView( ref_v, ref, CpuWrite);
       autoView( tmp_v, tmp, CpuRead);
-      for(int i=0;i<ref_v.size();i++){
-        ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
+      autoView( U_v , U[mu] , CpuRead);
+      for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+        for(int s=0;s<Ls;s++){
+          int i=s+Ls*ss;
+          ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
+        }
       }
     }
 
-    tmp =adj(U[mu])*src;
+    {
+      autoView( tmp_v , tmp , CpuWrite);
+      autoView( U_v , U[mu] , CpuRead);
+      autoView( src_v, src , CpuRead);
+      for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
+        for(int s=0;s<Ls;s++){
+          tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
+        }
+      }
+    }
+    //    tmp =adj(U[mu])*src;
     tmp =Cshift(tmp,mu+1,-1);
     {
       autoView( ref_v, ref, CpuWrite);
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
     }
     ref = -0.5*ref;
   }
-  // dump=1;
-  Dw.Dhop(src,result,1);
+  Dw.Dhop(src,result,DaggerYes);
 
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
+  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
 
   std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
   std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
   err = ref-result;
-  std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
-  if((norm2(err)>1.0e-4)){
-    /*
-    std::cout<< "DAG RESULT\n " <<ref << std::endl;
-    std::cout<< "DAG sRESULT\n " <<result << std::endl;
-    std::cout<< "DAG ERR   \n " << err <<std::endl;
-    */
-  }
-  LatticeFermion src_e (FrbGrid);
-  LatticeFermion src_o (FrbGrid);
-  LatticeFermion r_e   (FrbGrid);
-  LatticeFermion r_o   (FrbGrid);
-  LatticeFermion r_eo  (FGrid);
+  n2e= norm2(err);
+  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;
+
+  assert((n2e)<1.0e-4);
+
+  FermionField src_e (FrbGrid);
+  FermionField src_o (FrbGrid);
+  FermionField r_e   (FrbGrid);
+  FermionField r_o   (FrbGrid);
+  FermionField r_eo  (FGrid);
 
   std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
   pickCheckerboard(Even,src_e,src);
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
// S-direction is INNERMOST and takes no part in the parity.
|
||||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
|
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
|
||||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||||
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
|
|||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if(i==10) cudaProfilerStart();
|
|
||||||
#endif
|
|
||||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||||
#ifdef CUDA_PROFILE
|
|
||||||
if(i==20) cudaProfilerStop();
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
FGrid->Barrier();
|
FGrid->Barrier();
|
||||||
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
|
|||||||
setCheckerboard(r_eo,r_e);
|
setCheckerboard(r_eo,r_e);
|
||||||
|
|
||||||
err = r_eo-result;
|
err = r_eo-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
n2e= norm2(err);
|
||||||
if((norm2(err)>1.0e-4)){
|
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||||
/*
|
assert(n2e<1.0e-4);
|
||||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
|
||||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
|
||||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
pickCheckerboard(Even,src_e,err);
|
pickCheckerboard(Even,src_e,err);
|
||||||
pickCheckerboard(Odd,src_o,err);
|
pickCheckerboard(Odd,src_o,err);
|
||||||
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
assert(norm2(src_e)<1.0e-4);
|
assert(norm2(src_e)<1.0e-4);
|
||||||
assert(norm2(src_o)<1.0e-4);
|
assert(norm2(src_o)<1.0e-4);
|
||||||
Grid_finalize();
|
|
||||||
exit(0);
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
   Gamma::Algebra::GammaT
 };
 
-void Benchmark(int Ls, Coordinate Dirichlet);
+void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
 
 int main (int argc, char ** argv)
 {
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
   std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);
 
+  std::cout << "\n\n\n\n\n\n" <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
+  std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
+
+  Benchmark(Ls,Dirichlet,true);
 
   //////////////////
   // Domain decomposed
   //////////////////
+  /*
   Coordinate latt4 = GridDefaultLatt();
   Coordinate mpi = GridDefaultMpi();
   Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
   GlobalSharedMemory::GetShmDims(mpi,shm);
 
 
-  //////////////////////
-  // Node level
-  //////////////////////
   std::cout << "\n\n\n\n\n\n" <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
+  // std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  // Dirichlet[0] = 0;
-  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
+  Dirichlet[0] = 0;
+  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
+  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
+  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
+  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
 
-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,false);
 
   std::cout << "\n\n\n\n\n\n" <<std::endl;
 
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-  std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
+  std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
 
   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  // Dirichlet[0] = 0;
-  // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
 
-  Benchmark(Ls,Dirichlet);
+  Benchmark(Ls,Dirichlet,true);
+  */
 
   Grid_finalize();
   exit(0);
 }
-void Benchmark(int Ls, Coordinate Dirichlet)
+void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
 {
   Coordinate latt4 = GridDefaultLatt();
   GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
   typedef LatticeGaugeFieldF GaugeField;
   typedef LatticeColourMatrixF ColourMatrixField;
   typedef DomainWallFermionF FermionAction;
-#endif
-#ifdef DOUBLE
+#else
   typedef vComplexD Simd;
   typedef LatticeFermionD FermionField;
   typedef LatticeGaugeFieldD GaugeField;
   typedef LatticeColourMatrixD ColourMatrixField;
   typedef DomainWallFermionD FermionAction;
 #endif
-#ifdef DOUBLE2
-  typedef vComplexD2 Simd;
-  typedef LatticeFermionD2 FermionField;
-  typedef LatticeGaugeFieldD2 GaugeField;
-  typedef LatticeColourMatrixD2 ColourMatrixField;
-  typedef DomainWallFermionD2 FermionAction;
-#endif
 
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
   FermionAction::ImplParams p;
   p.dirichlet=Dirichlet;
   FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
+  Dw.SloppyComms(sloppy);
   Dw.ImportGauge(Umu);
 
   int ncall =300;
benchmarks/Benchmark_dwf.cc (deleted file)
@@ -1,465 +0,0 @@
-/*************************************************************************************
-    Grid physics library, www.github.com/paboyle/Grid
-    Source file: ./benchmarks/Benchmark_dwf.cc
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-#ifdef GRID_CUDA
-#define CUDA_PROFILE
-#endif
-
-#ifdef CUDA_PROFILE
-#include <cuda_profiler_api.h>
-#endif
-
-using namespace std;
-using namespace Grid;
-
-////////////////////////
-/// Move to domains ////
-////////////////////////
-
-Gamma::Algebra Gmu [] = {
-  Gamma::Algebra::GammaX,
-  Gamma::Algebra::GammaY,
-  Gamma::Algebra::GammaZ,
-  Gamma::Algebra::GammaT
-};
-
-void Benchmark(int Ls, Coordinate Dirichlet, int partial);
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-
-  int threads = GridThread::GetThreads();
-
-  int Ls=8;
-  for(int i=0;i<argc;i++) {
-    if(std::string(argv[i]) == "-Ls"){
-      std::stringstream ss(argv[i+1]); ss >> Ls;
-    }
-  }
-
-  //////////////////
-  // With comms
-  //////////////////
-  Coordinate Dirichlet(Nd+1,0);
-
-  for(auto partial : {0}) {
-    std::cout << "\n\n\n\n\n\n" <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-
-  //////////////////
-  // Domain decomposed
-  //////////////////
-  Coordinate latt4 = GridDefaultLatt();
-  Coordinate mpi = GridDefaultMpi();
-  Coordinate CommDim(Nd);
-  //Coordinate shm({2,1,1,1});
-  Coordinate shm;
-  GlobalSharedMemory::GetShmDims(mpi,shm);
-
-  std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
-
-  //////////////////////
-  // Node level
-  //////////////////////
-  for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
-  // for(int d=0;d<Nd;d++) CommDim[d]= 1;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
-
-  for(auto partial : {0,1}) {
-    std::cout << "\n\n\n\n\n\n" <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-
-  for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
-  Dirichlet[0] = 0;
-  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
-  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
-  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
-  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
-
-  for(auto partial : {0,1}) {
-    std::cout << "\n\n\n\n\n\n" <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
-    std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
-    Benchmark(Ls,Dirichlet,partial);
-  }
-  Grid_finalize();
-  exit(0);
-}
-void Benchmark(int Ls, Coordinate Dirichlet, int partial)
-{
-  Coordinate latt4 = GridDefaultLatt();
-  GridLogLayout();
-
-  long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-#define SINGLE
-#ifdef SINGLE
-  typedef vComplexF Simd;
-  typedef LatticeFermionF FermionField;
-  typedef LatticeGaugeFieldF GaugeField;
-  typedef LatticeColourMatrixF ColourMatrixField;
-  typedef DomainWallFermionF FermionAction;
-#endif
-#ifdef DOUBLE
-  typedef vComplexD Simd;
-  typedef LatticeFermionD FermionField;
-  typedef LatticeGaugeFieldD GaugeField;
-  typedef LatticeColourMatrixD ColourMatrixField;
-  typedef DomainWallFermionD FermionAction;
-#endif
-#ifdef DOUBLE2
-  typedef vComplexD2 Simd;
-  typedef LatticeFermionD2 FermionField;
-  typedef LatticeGaugeFieldD2 GaugeField;
-  typedef LatticeColourMatrixD2 ColourMatrixField;
-  typedef DomainWallFermionD2 FermionAction;
-#endif
-
-  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
-  GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
-
-  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
-  GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
-
-
-  FermionField src (FGrid); random(RNG5,src);
-#if 0
-  src = Zero();
-  {
-    Coordinate origin({0,0,0,latt4[2]-1,0});
-    SpinColourVectorF tmp;
-    tmp=Zero();
-    tmp()(0)(0)=Complex(-2.0,0.0);
-    std::cout << " source site 0 " << tmp<<std::endl;
-    pokeSite(tmp,src,origin);
-  }
-#else
-  RealD N2 = 1.0/::sqrt(norm2(src));
-  src = src*N2;
-#endif
-
-  FermionField result(FGrid); result=Zero();
-  FermionField ref(FGrid); ref=Zero();
-  FermionField tmp(FGrid);
-  FermionField err(FGrid);
-
-  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
-  GaugeField Umu(UGrid);
-  GaugeField UmuFull(UGrid);
-  GaugeField UmuCopy(UGrid);
-  SU<Nc>::HotConfiguration(RNG4,Umu);
-  UmuCopy=Umu;
-  UmuFull=Umu;
-  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
-
-  ////////////////////////////////////
-  // Apply BCs
-  ////////////////////////////////////
-  Coordinate Block(4);
-  for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
-
-  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
-  std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
-
-  DirichletFilter<GaugeField> Filter(Block);
-  Filter.applyFilter(Umu);
-  if(!partial) Filter.applyFilter(UmuCopy);
-
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  std::vector<ColourMatrixField> U(4,UGrid);
-  std::vector<ColourMatrixField> Ucopy(4,UGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-    Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
-  }
-
-  std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
-
-  if (1)
-  {
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-      int depth=dwf_compressor_depth;
-      tmp = Cshift(src,mu+1,1);
-      {
-        autoView( tmp_v , tmp , CpuWrite);
-        autoView( U_v , U[mu] , CpuRead);
-        autoView( Ucopy_v, Ucopy[mu] , CpuRead);
-        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-          for(int s=0;s<Ls;s++){
-            if ( (s<depth) || (s>=Ls-depth)){
-              tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
-            } else {
-              tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
-            }
-          }
-        }
-      }
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-      {
-        autoView( tmp_v , tmp , CpuWrite);
-        autoView( U_v , U[mu] , CpuRead);
-        autoView( Ucopy_v, Ucopy[mu] , CpuRead);
-        autoView( src_v, src , CpuRead);
-        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-          for(int s=0;s<Ls;s++){
-            if ( (s<depth) || (s>=Ls-depth)){
-              tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
-            } else {
-              tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
-            }
-          }
-        }
-      }
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  RealD mass=0.1;
-  RealD M5  =1.8;
-
-  RealD NP = UGrid->_Nprocessors;
-  RealD NN = UGrid->NodeCount();
-
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
-  std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
-  std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
-  std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
-
-  FermionAction::ImplParams p;
-  p.dirichlet=Dirichlet;
-  p.partialDirichlet=partial;
-  FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
-
-  int ncall =1;
-  RealD n2e;
-
-  if (1) {
-    FGrid->Barrier();
-    Dw.Dhop(src,result,0);
-    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    double t1=usecond();
-    FGrid->Barrier();
-
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=single_site_flops*volume*ncall;
-
-    auto nsimd = Simd::Nsimd();
-    auto simdwidth = sizeof(Simd);
-
-    // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
-    double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
-
-    // mem: Nd Wilson * Ls, Nd gauge, Nc colors
-    double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
-
-    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
-    err = ref-result;
-    n2e = norm2(err);
-
-    std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
-
-    if(( n2e>1.0e-4) ) {
-      std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
-      FGrid->Barrier();
-
-      DumpSliceNorm("s-slice ref ",ref,1);
-      DumpSliceNorm("s-slice res ",result,1);
-      DumpSliceNorm("s-slice error ",err,1);
-      exit(-1);
-    }
-    assert (n2e< 1.0e-4 );
-  }
-
-  if (1)
-  { // Naive wilson dag implementation
-
-    ref = Zero();
-    for(int mu=0;mu<Nd;mu++){
-
-      int depth=dwf_compressor_depth;
-      tmp = Cshift(src,mu+1,1);
-      {
-        autoView( tmp_v , tmp , CpuWrite);
-        autoView( U_v , U[mu] , CpuRead);
-        autoView( Ucopy_v, Ucopy[mu] , CpuRead);
-        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-          for(int s=0;s<Ls;s++){
-            if ( (s<depth) || (s>=Ls-depth)){
-              tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
-            } else {
-              tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
-            }
-          }
-        }
-      }
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-      {
-        autoView( tmp_v , tmp , CpuWrite);
-        autoView( U_v , U[mu] , CpuRead);
-        autoView( Ucopy_v, Ucopy[mu] , CpuRead);
-        autoView( src_v, src , CpuRead);
-        for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
-          for(int s=0;s<Ls;s++){
-            if ( (s<depth) || (s>=Ls-depth)){
-              tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
-            } else {
-              tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
-            }
-          }
-        }
-      }
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  Dw.Dhop(src,result,DaggerYes);
-
-  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
-  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
-  std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
-
-  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
-  std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
-  std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
-  err = ref-result;
-  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm dag diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;
-
-  assert((n2e)<1.0e-4);
-
-  FermionField src_e (FrbGrid);
-  FermionField src_o (FrbGrid);
-  FermionField r_e   (FrbGrid);
-  FermionField r_o   (FrbGrid);
-  FermionField r_eo  (FGrid);
-
-  std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
-  pickCheckerboard(Even,src_e,src);
-  pickCheckerboard(Odd,src_o,src);
-
-  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
-  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
-
-
-  // S-direction is INNERMOST and takes no part in the parity.
-  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
-  std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
-#ifdef GRID_OMP
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
-  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
-#endif
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
-  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
-  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
-  {
-    FGrid->Barrier();
-    Dw.DhopEO(src_o,r_e,DaggerNo);
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.DhopEO(src_o,r_e,DaggerNo);
-    }
-    double t1=usecond();
-    FGrid->Barrier();
-
-    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=(single_site_flops*volume*ncall)/2.0;
-
-    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl;
-  }
-  Dw.DhopEO(src_o,r_e,DaggerNo);
-  Dw.DhopOE(src_e,r_o,DaggerNo);
-  Dw.Dhop  (src  ,result,DaggerNo);
-
-  std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
-  std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
-  std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
-
-  setCheckerboard(r_eo,r_o);
-  setCheckerboard(r_eo,r_e);
-
-  err = r_eo-result;
-  n2e= norm2(err);
-  std::cout<<GridLogMessage << "norm diff   "<< n2e<< " Line "<<__LINE__ <<std::endl;
-  assert(n2e<1.0e-4);
-
-  pickCheckerboard(Even,src_e,err);
-  pickCheckerboard(Odd,src_o,err);
-  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
-  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-
-  assert(norm2(src_e)<1.0e-4);
-  assert(norm2(src_o)<1.0e-4);
-}
configure.ac (36 changed lines)
@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
     [try this for a non-standard install prefix of the GMP library])],
     [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
+
 AC_ARG_WITH([mpfr],
     [AS_HELP_STRING([--with-mpfr=prefix],
     [try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
     [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
     [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
 
+############### LIBUNWIND
+AC_ARG_WITH([unwind],
+    [AS_HELP_STRING([--with-unwind=prefix],
+    [try this for a non-standard install prefix of the libunwind library])],
+    [AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
+    [AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
+
 ############### OpenSSL
 AC_ARG_WITH([openssl],
     [AS_HELP_STRING([--with-openssl=prefix],
@@ -198,8 +206,6 @@ AC_ARG_ENABLE([Nc],
     [ac_Nc=${enable_Nc}], [ac_Nc=3])
 
 case ${ac_Nc} in
-  1)
-    AC_DEFINE([Config_Nc],[1],[Gauge group Nc]);;
   2)
     AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
   3)
@@ -213,21 +219,6 @@ case ${ac_Nc} in
   *)
     AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
 esac
-############### Nd
-AC_ARG_ENABLE([Nd],
-    [AS_HELP_STRING([--enable-Nd=2|3|4],[enable default LGT dimension])],
-    [ac_Nd=${enable_Nd}], [ac_Nd=4])
-
-case ${ac_Nd} in
-  2)
-    AC_DEFINE([Config_Nd],[2],[Gauge field dimension Nd]);;
-  3)
-    AC_DEFINE([Config_Nd],[3],[Gauge field dimension Nd]);;
-  4)
-    AC_DEFINE([Config_Nd],[4],[Gauge field dimension Nd]);;
-  *)
-    AC_MSG_ERROR(["Unsupport dimension Nd = ${ac_Nd}"]);;
-esac
 
 ############### Symplectic group
 AC_ARG_ENABLE([Sp],
@@ -390,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
               [have_lime=true],
               [AC_MSG_WARN(LIME library was not found in your system.)])
 
+AC_SEARCH_LIBS([unw_backtrace], [unwind],
+              [AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
+              [have_unwind=true],
+              [AC_MSG_WARN(libunwind library was not found in your system.)])
+
+AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
+              [AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
+              [have_unwind_x86_64=true],
+              [AC_MSG_WARN(libunwind library was not found in your system.)])
+
 AC_SEARCH_LIBS([SHA256_Init], [crypto],
               [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
               [have_crypto=true],
@@ -835,7 +836,6 @@ os (target)          : $target_os
 compiler vendor      : ${ax_cv_cxx_compiler_vendor}
 compiler version     : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
-Nd                   : ${ac_Nd}
 Nc                   : ${ac_Nc}
 SIMD                 : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
 Threading            : ${ac_openmp}
systems/Jupiter/benchmarks/dwf.1node.perf (new file, 273 lines)
@@ -0,0 +1,273 @@
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
+AcceleratorCudaInit[0]: managedMemory: 1
+AcceleratorCudaInit[0]: isMultiGpuBoard: 0
+AcceleratorCudaInit[0]: warpSize: 32
+AcceleratorCudaInit[0]: pciBusID: 1
+AcceleratorCudaInit[0]: pciDeviceID: 0
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
+AcceleratorCudaInit: Configure options --enable-setdevice=no
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 4
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ | | | | | | | | | | | | _|__
+__|_ _|__
+__|_ GGGG RRRR III DDDD _|__
+__|_ G R R I D D _|__
+__|_ G R R I D D _|__
+__|_ G GG RRRR I D D _|__
+__|_ G G R R I D D _|__
+__|_ GGGG R R III DDDD _|__
+__|_ _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+| | | | | | | | | | | | | |
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================
+Grid : Message : MPI is initialised and logging filters activated
+Grid : Message : ================================================
+Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers
+Grid : Message : MemoryManager Cache 81604378624 bytes
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
+
+
+
+
+
+Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.309000 s : Testing with full communication
+Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 0.313000 s : Grid Layout
+Grid : Message : 0.313000 s : Global lattice size  : 32 32 64 64
+Grid : Message : 0.319000 s : OpenMP threads       : 4
+Grid : Message : 0.320000 s : MPI tasks            : 1 1 2 2
+Grid : Message : 0.129590 s : Initialising 4d RNG
+Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 0.942440 s : Initialising 5d RNG
+Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+local rank 1 device 0 bus id: 0019:01:00.0
+local rank 2 device 0 bus id: 0029:01:00.0
+local rank 3 device 0 bus id: 0039:01:00.0
+Grid : Message : 43.893114 s : Drawing gauge field
+Grid : Message : 54.574150 s : Random gauge initialised
+Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 54.580032 s : Setting up Cshift based reference
+Grid : Message : 60.407451 s : *****************************************************************
+Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 60.407470 s : *****************************************************************
+Grid : Message : 60.407471 s : *****************************************************************
+Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
+Grid : Message : 60.407473 s : * Vectorising space-time by 8
+Grid : Message : 60.407475 s : * VComplex size is 64 B
+Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
+Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 60.407480 s : *****************************************************************
+Grid : Message : 61.102178 s : Called warmup
+Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
+Grid : Message : 62.177198 s : mflop/s = 24721998.6
+Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
+Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
+Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
+Grid : Message : 71.328862 s : ----------------------------------------------------------------
+Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 71.328885 s : ----------------------------------------------------------------
+Grid : Message : 71.328886 s : Called DwDag
+Grid : Message : 71.328887 s : norm dag result 4.12810493
+Grid : Message : 71.329493 s : norm dag ref    4.12810493
+Grid : Message : 71.331967 s : norm dag diff   3.40632318e-14 Line 377
+Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 71.803650 s : src_e0.500003185
+Grid : Message : 71.819727 s : src_o0.499996882
+Grid : Message : 71.821991 s : *********************************************************
+Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
+Grid : Message : 71.821995 s : * Vectorising space-time by 8
+Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
+Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 71.822003 s : *********************************************************
+Grid : Message : 72.377054 s : Deo mflop/s =   24065467
+Grid : Message : 72.377071 s : Deo mflop/s per rank   6016366.75
+Grid : Message : 72.377074 s : Deo mflop/s per node   24065467
+Grid : Message : 72.624877 s : r_e2.06377678
+Grid : Message : 72.625198 s : r_o2.06381058
+Grid : Message : 72.625507 s : res4.12758736
+Grid : Message : 73.759140 s : norm diff   0
+Grid : Message : 73.868204 s : norm diff even  0
+Grid : Message : 73.907201 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414582 s : Testing without internode communication
+Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 74.414586 s : Grid Layout
+Grid : Message : 74.414586 s : Global lattice size  : 32 32 64 64
+Grid : Message : 74.414594 s : OpenMP threads       : 4
+Grid : Message : 74.414595 s : MPI tasks            : 1 1 2 2
+Grid : Message : 74.679364 s : Initialising 4d RNG
+Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 74.759525 s : Initialising 5d RNG
+Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 119.252016 s : Drawing gauge field
+Grid : Message : 129.919846 s : Random gauge initialised
+Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 129.923611 s : Setting up Cshift based reference
+Grid : Message : 135.522878 s : *****************************************************************
+Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522899 s : *****************************************************************
+Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
+Grid : Message : 135.522901 s : * Vectorising space-time by 8
+Grid : Message : 135.522903 s : * VComplex size is 64 B
+Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
+Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 135.522908 s : *****************************************************************
+Grid : Message : 136.151202 s : Called warmup
+Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
+Grid : Message : 137.224748 s : mflop/s = 24755806
+Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
+Grid : Message : 137.224753 s : mflop/s per node = 24755806
+Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
+Grid : Message : 146.451686 s : ----------------------------------------------------------------
+Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 146.451710 s : ----------------------------------------------------------------
+Grid : Message : 146.451712 s : Called DwDag
+Grid : Message : 146.451714 s : norm dag result 4.12810493
+Grid : Message : 146.452323 s : norm dag ref    4.12810493
+Grid : Message : 146.454799 s : norm dag diff   3.40632318e-14 Line 377
+Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 146.940894 s : src_e0.500003185
+Grid : Message : 146.953676 s : src_o0.499996882
+Grid : Message : 146.955927 s : *********************************************************
+Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
+Grid : Message : 146.955932 s : * Vectorising space-time by 8
+Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
+Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 146.955941 s : *********************************************************
+Grid : Message : 147.511975 s : Deo mflop/s =   24036256.5
+Grid : Message : 147.511989 s : Deo mflop/s per rank   6009064.13
+Grid : Message : 147.511991 s : Deo mflop/s per node   24036256.5
+Grid : Message : 147.522100 s : r_e2.06377678
+Grid : Message : 147.522433 s : r_o2.06381058
+Grid : Message : 147.522745 s : res4.12758736
+Grid : Message : 148.229848 s : norm diff   0
+Grid : Message : 149.233474 s : norm diff even  0
+Grid : Message : 149.235815 s : norm diff odd   0
+
+
+
+
+
+
+
+Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960990 s : Testing without intranode communication
+Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
+Grid : Message : 149.960995 s : Grid Layout
+Grid : Message : 149.960995 s : Global lattice size  : 32 32 64 64
+Grid : Message : 149.961003 s : OpenMP threads       : 4
+Grid : Message : 149.961004 s : MPI tasks            : 1 1 2 2
+Grid : Message : 150.155810 s : Initialising 4d RNG
+Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
+Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
+Grid : Message : 150.973420 s : Initialising 5d RNG
+Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
+Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
+Grid : Message : 193.933765 s : Drawing gauge field
+Grid : Message : 204.611551 s : Random gauge initialised
+Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
+Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
+Grid : Message : 204.615265 s : Setting up Cshift based reference
+Grid : Message : 210.117788 s : *****************************************************************
+Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
+Grid : Message : 210.117809 s : *****************************************************************
+Grid : Message : 210.117810 s : *****************************************************************
+Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
+Grid : Message : 210.117813 s : * Vectorising space-time by 8
+Grid : Message : 210.117814 s : * VComplex size is 64 B
+Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
+Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 210.117819 s : *****************************************************************
+Grid : Message : 210.714641 s : Called warmup
+Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
+Grid : Message : 211.892252 s : mflop/s = 22568003.2
+Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
+Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
+Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
+Grid : Message : 220.751375 s : ----------------------------------------------------------------
+Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
+Grid : Message : 220.751409 s : ----------------------------------------------------------------
+Grid : Message : 220.751411 s : Called DwDag
+Grid : Message : 220.751412 s : norm dag result 4.12810493
+Grid : Message : 220.753307 s : norm dag ref    4.12810493
+Grid : Message : 220.755796 s : norm dag diff   3.40632318e-14 Line 377
+Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
+Grid : Message : 221.697800 s : src_e0.500003185
+Grid : Message : 221.890920 s : src_o0.499996882
+Grid : Message : 221.913430 s : *********************************************************
+Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
+Grid : Message : 221.913480 s : * Vectorising space-time by 8
+Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
+Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 221.913550 s : *********************************************************
+Grid : Message : 221.645213 s : Deo mflop/s =   24114032
+Grid : Message : 221.645228 s : Deo mflop/s per rank   6028508.01
+Grid : Message : 221.645231 s : Deo mflop/s per node   24114032
+Grid : Message : 221.656021 s : r_e2.06377678
+Grid : Message : 221.656389 s : r_o2.06381058
+Grid : Message : 221.656698 s : res4.12758736
+Grid : Message : 222.110075 s : norm diff   0
+Grid : Message : 222.857692 s : norm diff even  0
+Grid : Message : 222.875763 s : norm diff odd   0
+Grid : Message : 223.598127 s : *******************************************
+Grid : Message : 223.598145 s : ******* Grid Finalize ******
+Grid : Message : 223.598146 s : *******************************************
systems/Jupiter/benchmarks/dwf.4node.perf (new file, 286 lines)
@@ -0,0 +1,286 @@
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
+RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
+RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
+RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
+SLURM detected
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device Number    : 0
+AcceleratorCudaInit[0]: ========================
+AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
+AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
+AcceleratorCudaInit[0]: managedMemory: 1
+AcceleratorCudaInit[0]: isMultiGpuBoard: 0
+AcceleratorCudaInit[0]: warpSize: 32
+AcceleratorCudaInit[0]: pciBusID: 1
+AcceleratorCudaInit[0]: pciDeviceID: 0
+AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
+AcceleratorCudaInit: using default device
+AcceleratorCudaInit: assume user either uses
+AcceleratorCudaInit: a) IBM jsrun, or
+AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
+AcceleratorCudaInit: Configure options --enable-setdevice=no
+local rank 0 device 0 bus id: 0009:01:00.0
+AcceleratorCudaInit: ================================================
+SharedMemoryMpi:  World communicator of size 16
+SharedMemoryMpi:  Node  communicator of size 4
+0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
+Setting up IPC
+
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|_ | | | | | | | | | | | | _|__
+__|_ _|__
+__|_ GGGG RRRR III DDDD _|__
+__|_ G R R I D D _|__
+__|_ G R R I D D _|__
+__|_ G GG RRRR I D D _|__
+__|_ G G R R I D D _|__
+__|_ GGGG R R III DDDD _|__
+__|_ _|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
+| | | | | | | | | | | | | |
+
+
+Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
+
+Grid : Message : ================================================
+Grid : Message : MPI is initialised and logging filters activated
+Grid : Message : ================================================
+Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
+Grid : Message : Requested 2147483648 byte stencil comms buffers
+Grid : Message : MemoryManager Cache 81604378624 bytes
+Grid : Message : MemoryManager::Init() setting up
+Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
+Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
+Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
+Grid : Message : MemoryManager::Init() Using cudaMalloc
+
+
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 0.838000 s : Testing with full communication
|
||||||
|
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 0.840000 s : Grid Layout
|
||||||
|
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
|
||||||
|
Grid : Message : 0.846000 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
|
||||||
|
Grid : Message : 0.165970 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 0.960410 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
local rank 2 device 0 bus id: 0029:01:00.0
|
||||||
|
local rank 3 device 0 bus id: 0039:01:00.0
|
||||||
|
local rank 1 device 0 bus id: 0019:01:00.0
|
||||||
|
Grid : Message : 44.657270 s : Drawing gauge field
|
||||||
|
Grid : Message : 55.247733 s : Random gauge initialised
|
||||||
|
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
|
||||||
|
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
|
||||||
|
Grid : Message : 55.253053 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 62.191747 s : *****************************************************************
|
||||||
|
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 62.191768 s : *****************************************************************
|
||||||
|
Grid : Message : 62.191769 s : *****************************************************************
|
||||||
|
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 62.191769 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 62.191770 s : * VComplex size is 64 B
|
||||||
|
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 62.191772 s : *****************************************************************
|
||||||
|
Grid : Message : 62.857568 s : Called warmup
|
||||||
|
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
|
||||||
|
Grid : Message : 65.582120 s : mflop/s = 48306525
|
||||||
|
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
|
||||||
|
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
|
||||||
|
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
|
||||||
|
Grid : Message : 75.122153 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 75.122167 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 75.122167 s : Called DwDag
|
||||||
|
Grid : Message : 75.122167 s : norm dag result 4.12801829
|
||||||
|
Grid : Message : 75.123295 s : norm dag ref 4.12801829
|
||||||
|
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
|
||||||
|
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 75.605683 s : src_e0.500004005
|
||||||
|
Grid : Message : 75.617824 s : src_o0.499996067
|
||||||
|
Grid : Message : 75.620089 s : *********************************************************
|
||||||
|
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
|
||||||
|
Grid : Message : 75.620093 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 75.620096 s : *********************************************************
|
||||||
|
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
|
||||||
|
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
|
||||||
|
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
|
||||||
|
Grid : Message : 76.749317 s : r_e2.06443136
|
||||||
|
Grid : Message : 76.749652 s : r_o2.06378451
|
||||||
|
Grid : Message : 76.749955 s : res4.12821587
|
||||||
|
Grid : Message : 77.198827 s : norm diff 0
|
||||||
|
Grid : Message : 77.981760 s : norm diff even 0
|
||||||
|
Grid : Message : 78.455900 s : norm diff odd 0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 78.539337 s : Testing without internode communication
|
||||||
|
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 78.539339 s : Grid Layout
|
||||||
|
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
|
||||||
|
Grid : Message : 78.539347 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
|
||||||
|
Grid : Message : 78.798501 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 78.879916 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 124.586264 s : Drawing gauge field
|
||||||
|
Grid : Message : 135.338090 s : Random gauge initialised
|
||||||
|
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
|
||||||
|
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
|
||||||
|
Grid : Message : 135.341266 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 142.604280 s : *****************************************************************
|
||||||
|
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 142.604460 s : *****************************************************************
|
||||||
|
Grid : Message : 142.604470 s : *****************************************************************
|
||||||
|
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 142.604480 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 142.604500 s : * VComplex size is 64 B
|
||||||
|
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 142.604520 s : *****************************************************************
|
||||||
|
Grid : Message : 142.686034 s : Called warmup
|
||||||
|
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
|
||||||
|
Grid : Message : 144.868559 s : mflop/s = 48706194.1
|
||||||
|
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
|
||||||
|
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
|
||||||
|
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
|
||||||
|
Grid : Message : 153.622978 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 153.622995 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 153.622995 s : Called DwDag
|
||||||
|
Grid : Message : 153.622996 s : norm dag result 4.12801829
|
||||||
|
Grid : Message : 153.623604 s : norm dag ref 4.12801829
|
||||||
|
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
|
||||||
|
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 154.148319 s : src_e0.500004005
|
||||||
|
Grid : Message : 154.151454 s : src_o0.499996067
|
||||||
|
Grid : Message : 154.153722 s : *********************************************************
|
||||||
|
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
|
||||||
|
Grid : Message : 154.153725 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 154.153728 s : *********************************************************
|
||||||
|
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
|
||||||
|
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
|
||||||
|
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
|
||||||
|
Grid : Message : 155.217204 s : r_e2.06443136
|
||||||
|
Grid : Message : 155.217550 s : r_o2.06378451
|
||||||
|
Grid : Message : 155.217869 s : res4.12821587
|
||||||
|
Grid : Message : 155.673744 s : norm diff 0
|
||||||
|
Grid : Message : 156.463329 s : norm diff even 0
|
||||||
|
Grid : Message : 156.878866 s : norm diff odd 0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 157.620764 s : Testing without intranode communication
|
||||||
|
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
|
||||||
|
Grid : Message : 157.620766 s : Grid Layout
|
||||||
|
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
|
||||||
|
Grid : Message : 157.620773 s : OpenMP threads : 4
|
||||||
|
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
|
||||||
|
Grid : Message : 157.671479 s : Initialising 4d RNG
|
||||||
|
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
|
||||||
|
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
|
||||||
|
Grid : Message : 157.755651 s : Initialising 5d RNG
|
||||||
|
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
|
||||||
|
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
|
||||||
|
Grid : Message : 202.465158 s : Drawing gauge field
|
||||||
|
Grid : Message : 213.214546 s : Random gauge initialised
|
||||||
|
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
|
||||||
|
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
|
||||||
|
Grid : Message : 213.217711 s : Setting up Cshift based reference
|
||||||
|
Grid : Message : 219.662772 s : *****************************************************************
|
||||||
|
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
|
||||||
|
Grid : Message : 219.662787 s : *****************************************************************
|
||||||
|
Grid : Message : 219.662788 s : *****************************************************************
|
||||||
|
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
|
||||||
|
Grid : Message : 219.662789 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 219.662790 s : * VComplex size is 64 B
|
||||||
|
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 219.662791 s : *****************************************************************
|
||||||
|
Grid : Message : 220.425592 s : Called warmup
|
||||||
|
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
|
||||||
|
Grid : Message : 222.536267 s : mflop/s = 50365105.5
|
||||||
|
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
|
||||||
|
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
|
||||||
|
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
|
||||||
|
Grid : Message : 232.135901 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
|
||||||
|
Grid : Message : 232.135916 s : ----------------------------------------------------------------
|
||||||
|
Grid : Message : 232.135917 s : Called DwDag
|
||||||
|
Grid : Message : 232.135918 s : norm dag result 4.12801829
|
||||||
|
Grid : Message : 232.151938 s : norm dag ref 4.12801829
|
||||||
|
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
|
||||||
|
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
|
||||||
|
Grid : Message : 232.630529 s : src_e0.500004005
|
||||||
|
Grid : Message : 232.643197 s : src_o0.499996067
|
||||||
|
Grid : Message : 232.645527 s : *********************************************************
|
||||||
|
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
|
||||||
|
Grid : Message : 232.645532 s : * Vectorising space-time by 8
|
||||||
|
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
|
||||||
|
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
|
||||||
|
Grid : Message : 232.645535 s : *********************************************************
|
||||||
|
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
|
||||||
|
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
|
||||||
|
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
|
||||||
|
Grid : Message : 233.791552 s : r_e2.06443136
|
||||||
|
Grid : Message : 233.791899 s : r_o2.06378451
|
||||||
|
Grid : Message : 233.792204 s : res4.12821587
|
||||||
|
Grid : Message : 234.230783 s : norm diff 0
|
||||||
|
Grid : Message : 235.162780 s : norm diff even 0
|
||||||
|
Grid : Message : 235.291950 s : norm diff odd 0
|
||||||
|
Grid : Message : 235.765411 s : *******************************************
|
||||||
|
Grid : Message : 235.765424 s : ******* Grid Finalize ******
|
||||||
|
Grid : Message : 235.765425 s : *******************************************
|
||||||
|
| ||||||
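For reference, the throughput lines in this log can be reproduced by hand.
Assuming the conventional 1320 flop per 4d site per s-slice for the Wilson
dslash and Ls = 16 (neither number is printed above, but both are consistent
with every mflop/s line in the file), the first "Testing with full
communication" measurement works out as follows; the per-rank and per-node
figures are simply divisions by the communicator sizes that SharedMemoryMpi
reports (world size 16, 4 ranks per node, hence 4 nodes). A back-of-envelope
sketch in C++, not Grid's benchmark code:

// Reconstructs "mflop/s = 48306525" from the logged call count and time.
#include <cstdio>

int main(void) {
  const double flops_per_site = 1320.0;                    // Wilson dslash flop count (assumed)
  const double Ls             = 16.0;                      // fifth-dimension extent (inferred)
  const double volume         = 64.0 * 64.0 * 64.0 * 64.0; // "Global lattice size : 64 64 64 64"
  const double ncall          = 300.0;                     // "Called Dw 300 times ..."
  const double time_us        = 2200540.0;                 // "... in 2200540 us"

  const double mflops = flops_per_site * Ls * volume * ncall / time_us;
  std::printf("mflop/s          = %.0f\n", mflops);        // ~48306525
  std::printf("mflop/s per rank = %.2f\n", mflops / 16.0); // ~3019157.81
  std::printf("mflop/s per node = %.1f\n", mflops / 4.0);  // ~12076631.3
  return 0;
}

The three phases (full communication, without internode, without intranode)
differ only in the measured time, so the same arithmetic reproduces their
figures as well.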