1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 22:07:05 +01:00

Compare commits

...

120 Commits

Author SHA1 Message Date
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
938c47480f Updated compile on frontier.
Unsatisfactory hacsk
2025-04-04 18:35:06 -04:00
3811d19298 Fence 2025-04-04 18:35:06 -04:00
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
9eae8fca5d Size outut 2025-04-04 18:35:05 -04:00
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora nowb 2025-03-11 13:50:43 +00:00
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
3d014864e2 Makinig LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce throuhgh Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
8cf809e231 Best results on Aurora so far 2025-01-31 16:14:45 +00:00
94019a922e Significantly better performance on Aurora without using pipeline mode 2025-01-30 16:36:46 +00:00
d6b2727f86 Pipeline mode getting better -- 2 nodes @ 10TF/s per node on Aurora 2025-01-29 09:22:21 +00:00
74a4f43946 Optional host buffer bounce for no CUDA aware MPI 2025-01-28 15:22:46 +00:00
1caf8b0f86 Rename 2025-01-28 15:22:37 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
8fe429346f Dslash testing for reproduce 2024-11-11 23:11:11 +00:00
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
b91fc1b6b4 Merge branch 'feature/boosted' into feature/deprecate-uvm
Fixed boosted free field test
2024-10-28 16:53:09 -04:00
eafc150034 Test fft asserts 2024-10-23 16:46:26 -04:00
2877f1a268 Verbose reduce 2024-10-23 15:14:16 -04:00
1e893af775 GPU happy 2024-10-23 14:52:15 -04:00
d9f430a575 Happy GPU 2024-10-23 14:51:16 -04:00
63abe87f36 Memory manager verbose improvements that were useful to track an error 2024-10-23 14:49:13 -04:00
368d649c8a feature/deprecate-uvm happier -- preallocate device resident neigbour table 2024-10-23 14:47:55 -04:00
5603464f39 Fix in partial fraction import/export physical and
make the GPU happier on the deprecate-uvm -- don't use static vectors, make member of class
2024-10-23 14:45:58 -04:00
655c79f39e Suppress warning on partial override 2024-10-23 14:44:41 -04:00
565b231c03 Nvcc happy 2024-10-23 14:44:17 -04:00
62a9f180fa NVCC happy 2024-10-23 14:44:04 -04:00
5ae77876a8 Meson field and Aslash field on GPU; some compiler warning removed 2024-10-18 19:08:06 -04:00
4ed2c2c74f Config command 2024-10-18 13:58:33 -04:00
955da582b6 Working on NVCC 2024-10-18 13:58:03 -04:00
11b07b950d Vanilla linux compile, assuming spack prerequisites 2024-10-18 13:57:40 -04:00
8f70cfeda9 Clean up 2024-10-18 13:56:53 -04:00
ce64271048 Remove the copying version 2024-10-18 13:56:24 -04:00
5cc4f3241d Meson field test 2024-10-18 15:42:30 +00:00
6815e138b4 Boosted fermion attempt 2024-10-17 18:37:33 +01:00
a78a61d76f Update configure 2024-10-15 14:38:45 +00:00
2eff3f34ed Alternate reduction; default to grids own but make a configure flag
--enable-reduction=grid|mpi
2024-10-15 14:36:06 +00:00
03687c1d62 Final version of test, closer to original again 2024-10-15 14:35:17 +00:00
febfe4e77f Make my own reduction a configure flag 2024-10-15 14:32:35 +00:00
4d1aa134b5 Use normal reduction, configure flag to force deterministic 2024-10-15 14:32:11 +00:00
5ec879860a Odd rounding issue - bears looking into 2024-10-15 14:30:54 +00:00
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
b728af903c Fast axpy norm under CFLAG 2024-10-11 03:23:09 +00:00
54f1999030 axpy_norm_fast -- wasn't using the determinstic MPI sum causing issues 2024-10-11 03:22:18 +00:00
fd58f0b669 Return ok 2024-10-11 03:21:21 +00:00
c5c67b706e cl::sycl -> SYCL 2024-10-10 22:04:12 +00:00
be7a543e2c Revert barriers -- these were not the problem 2024-10-10 22:03:29 +00:00
68f112d576 New software moves cl::sycl 2024-10-10 22:03:04 +00:00
ec1395a304 Better flight logging 2024-10-10 22:01:57 +00:00
beb0e474ee Use deterministic own brand reduction 2024-10-10 22:01:24 +00:00
2b5fdcbbc5 New software version 2024-10-10 21:59:02 +00:00
295127d456 Deterministic homebrew reduction 2024-10-10 21:58:26 +00:00
7dcfb13694 New software stack 2024-10-10 21:57:35 +00:00
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
9fa8bd6438 Configure for AOT on Aurora latest software 2024-09-23 11:25:44 +00:00
02c8178f16 Almost working on Aurora 2024-09-23 09:43:50 +00:00
e637fbacae Verbose remove 2024-09-23 09:42:43 +00:00
e29b97b3ea Qslash term added 2023-09-14 16:14:03 -04:00
ad2b699d2b Better macos 2023-09-14 16:12:21 -04:00
107 changed files with 6799 additions and 3063 deletions

View File

@ -191,7 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g); Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite); autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl; //std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan; typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0; else if ( sign == forward ) div = 1.0;
else assert(0); else assert(0);
std::cout << "Making FFTW plan" << std::endl; //std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p; FFTW_plan p;
{ {
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
std::cout << "Making pencil" << std::endl; //std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd); Coordinate lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
} }
} }
std::cout << "Looping orthog" << std::endl; //std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords // Loop over orthog coords
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
@ -274,7 +274,7 @@ public:
usec += timer.useconds(); usec += timer.useconds();
flops+= flops_call*NN; flops+= flops_call*NN;
std::cout << "Writing back results " << std::endl; //std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result // writing out result
{ {
autoView(pgbuf_v,pgbuf,CpuRead); autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
} }
result = result*div; result = result*div;
std::cout << "Destroying plan " << std::endl; //std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan // destroying plan
FFTW<scalar>::fftw_destroy_plan(p); FFTW<scalar>::fftw_destroy_plan(p);
#endif #endif

View File

@ -277,6 +277,38 @@ public:
assert(0); assert(0);
} }
}; };
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several // Even Odd Schur decomp operators; there are several

View File

@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t; typedef cublasHandle_t gridblasHandle_t;
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t; typedef sycl::queue *gridblasHandle_t;
#endif #endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t; typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
gridblasHandle = theGridAccelerator; gridblasHandle = theGridAccelerator;
#endif #endif
#ifdef GRID_ONE_MKL #ifdef GRID_ONE_MKL
cl::sycl::gpu_selector selector; sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector }; sycl::device selectedDevice { selector };
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; sycl::property_list q_prop{sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop); gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif #endif
gridblasInit=1; gridblasInit=1;
@ -208,8 +208,8 @@ public:
assert(Bkn.size()==batchCount); assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount); assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); //assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@ -367,28 +367,67 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} ); } );
} else { } else {
assert(0); assert(0);
@ -414,8 +453,8 @@ public:
RealD t2=usecond(); RealD t2=usecond();
int32_t batchCount = Amk.size(); int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose //assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T); //assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major int lda = m; // m x k column major
int ldb = k; // k x n column major int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) { } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} ); } );
} else { } else {
assert(0); assert(0);
@ -661,29 +742,41 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; if (std::abs(beta) != 0.0)
} ); eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else { } else {
assert(0); assert(0);
} }
@ -809,28 +902,40 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
}); });
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
}); });
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) { } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, { thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m); Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k); Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n); Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ; if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
}); });
} else { } else {
assert(0); assert(0);

View File

@ -144,11 +144,11 @@ public:
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol); acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
} }
RealD t4 = usecond(); RealD t4 = usecond();
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl; std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl; std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl; std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl; std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl; std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
} }
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y) void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
@ -242,16 +242,16 @@ public:
RealD flops = 8.0*M*N*K; RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3; flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3; bytes = bytes/(t4-t3)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else #else
int nrhs; int nrhs;
GridBase *grid; GridBase *grid;
@ -358,17 +358,17 @@ public:
flops = flops/(t4-t3)/1.e3; flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3; bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3; xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl; std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif #endif
} }
}; };

View File

@ -63,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer; GridStopWatch SmoothTimer;
GridStopWatch InsertTimer; GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions // more most opertor functions
TwoLevelCGmrhs(RealD tol, TwoLevelCGmrhs(RealD tol,
Integer maxit, Integer maxit,
@ -74,6 +79,12 @@ class TwoLevelCGmrhs
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _FineLinop(FineLinop),
_Smoother(Smoother) _Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{ {
grid = fine; grid = fine;
}; };
@ -81,8 +92,8 @@ class TwoLevelCGmrhs
// Vector case // Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x) virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{ {
SolveSingleSystem(src,x); // SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x); SolvePrecBlockCG(src,x);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
@ -657,6 +668,8 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs); CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs); CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE #undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE #if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start(); this->SmoothTimer.Start();
@ -669,6 +682,7 @@ public:
this->SmoothTimer.Stop(); this->SmoothTimer.Stop();
} }
#endif #endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
@ -705,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min] this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop(); this->PromoteTimer.Stop();
this->FineTimer.Start(); this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) { for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
} }
// this->zzz=out[0];
this->FineTimer.Stop(); this->FineTimer.Stop();
} }
}; };

View File

@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){ if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break; break;
} }
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0); // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. // RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp; src_n = tmp;
} }
} }
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm); std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm); std::vector<RealD> lme2(Nm);

View File

@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi=Zero(); // psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;

View File

@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x) inline RealD AggregatePowerLaw(RealD x)
@ -95,7 +97,7 @@ public:
RealD scale; RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false); ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid); FineField noise(FineGrid);
FineField Mn(FineGrid); FineField Mn(FineGrid);
@ -108,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl; hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){ for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]); CG(hermop,noise,subspace[b]);
@ -124,6 +126,53 @@ public:
} }
} }
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
@ -160,14 +209,21 @@ public:
int b =0; int b =0;
{ {
ComplexD ip;
// Filter // Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter); Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn); Cheb(hermop,noise,Mn);
// normalise // normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@ -213,8 +269,18 @@ public:
Mn=*Tnp; Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@ -228,6 +294,70 @@ public:
} }
assert(b==nn); assert(b==nn);
} }
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,

View File

@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl; std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
} }
#else #else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace) Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{ {
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl; std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid(); GridBase *grid = FineGrid();
@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis // Orthogonalise the subblocks over the basis
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid()); CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace); blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint; const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions(); Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl; std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond(); tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i]; phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond(); tphaseBZ+=usecond();
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl; // std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond(); tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace); blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner; coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner; ComputeProj[p] = coarseInner;

View File

@ -69,7 +69,7 @@ public:
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);}; void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView template<class T> class vecView
{ {
protected: protected:
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
#define autoVecView(v_v,v,mode) \ #define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \ auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v); ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,16 +1,15 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UVM #ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define MAXLINE 512 #define MAXLINE 512
static char print_buffer [ MAXLINE ]; static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...) //#define dprintf(...)
//#define mprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0); assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL; AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n", mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return; if (AccCache.accLock!=0) return;
@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL; AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
} }
// uint64_t CpuPtr = AccCache.CpuPtr; // uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++; DeviceEvictions++;
@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode) void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{ {
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr); AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr); CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{ {
uint64_t CpuPtr = (uint64_t)_CpuPtr; uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){ } else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
} }
void MemoryManager::EvictVictims(uint64_t bytes) void MemoryManager::EvictVictims(uint64_t bytes)
{ {
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes); assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){ while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){ if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr, (uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr, (uint64_t)CpuPtr,
(uint64_t)AccCache.bytes, (uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent AccCache.state = Consistent; // Empty + AccRead => Consistent
} }
AccCache.accLock= 1; AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){ } else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) { if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache); CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else { } else {
assert(0); assert(0);
} }
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU // If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){ if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU // must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n"); dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache); LRUremove(AccCache);
} }
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--; AccCache.accLock--;
// Move to LRU queue if not locked and close on device // Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) { if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache); LRUinsert(AccCache);
} else { } else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
} }
} }
void MemoryManager::CpuViewClose(uint64_t CpuPtr) void MemoryManager::CpuViewClose(uint64_t CpuPtr)

View File

@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid // very VERY rarely (Log, serial RNG) we need world without a grid
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c) void CartesianCommunicator::GlobalSum(ComplexF &c)
{ {
GlobalSumVector((float *)&c,2); GlobalSumVector((float *)&c,2);
} }
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c) void CartesianCommunicator::GlobalSum(ComplexD &c)
{ {
GlobalSumVector((double *)&c,2); GlobalSumVector((double *)&c,2);
} }
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{ {
GlobalSumVector((double *)c,2*N); GlobalSumVector((double *)c,2*N);

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@ -127,7 +129,36 @@ public:
void GlobalSumVector(ComplexD *c,int N); void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);
@ -138,8 +169,8 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<CommsRequest_t> &list); void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@ -158,6 +189,17 @@ public:
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int bytes,int dir); int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,

View File

@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
//////////////////////////////////////////// ////////////////////////////////////////////
@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N) void CartesianCommunicator::GlobalSumVector(float *f,int N)
{ {
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from, int from,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
assert(ierr==0); assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -381,12 +387,287 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir) int bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
#undef NVLINK_GET // Define to use get instead of put DMA
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
@ -411,54 +692,106 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag; int tag;
if ( dor ) { void * host_xmit = NULL;
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32; ////////////////////////////////
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); // Receives already posted
assert(ierr==0); // Copies already started
list.push_back(rrq); ////////////////////////////////
off_node_bytes+=rbytes; /*
} * PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET #ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif CommsRequest_t srq;
}
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
tag= dir+_processor*32; // Intranode
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
} }
} }
#endif
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
acceleratorCopySynchronise(); std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
if (nreq==0) return; int nreq=MpiRequests.size();
std::vector<MPI_Status> status(nreq); if (nreq>0) {
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); status.resize(MpiRequests.size());
assert(ierr==0); int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
list.resize(0); assert(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
} }
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);

View File

@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{ {
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,

View File

@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else #else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
@ -105,7 +137,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes); // static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif #endif
#include <syscall.h> #include <syscall.h>
#endif #endif
@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
HostCommBuf= malloc(bytes); // printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{ //{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) //#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes); // acceleratorCopyToDevice(src,dest,bytes);
#else //#else
bcopy(src,dest,bytes); // bcopy(src,dest,bytes);
#endif //#endif
} //}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
} }
#endif #endif
//SharedMemoryTest(); SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier
@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t)); acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier(); acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode); assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r); assert(check[1]==r);
assert(check[2]==magic); assert(check[2]==magic);
ShmBarrier();
} }
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)

View File

@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
RealD t1,t0; RealD t1,t0;
t0=usecond(); t0=usecond();
if ( !comm_dim ) { if ( !comm_dim ) {
std::cout << "CSHIFT: Cshift_local" <<std::endl; // std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) { } else if ( splice_dim ) {
std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl; // std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift); Cshift_comms_simd(ret,rhs,dimension,shift);
} else { } else {
std::cout << "CSHIFT: Cshift_comms" <<std::endl; // std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift); Cshift_comms(ret,rhs,dimension,shift);
} }
t1=usecond(); t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret; return ret;
} }
@ -76,12 +76,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3); Cshift_comms(ret,rhs,dimension,shift,0x3);
} else { } else {
std::cout << "Two pass Cshift_comms" <<std::endl; // std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
@ -94,12 +94,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; // std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
std::cout << "Single pass Cshift_comms" <<std::endl; // std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3); Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else { } else {
std::cout << "Two pass Cshift_comms" <<std::endl; // std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size); static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size); static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0; RealD tcopy=0.0;
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor; // int rank = grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond(); tcomms-=usecond();
grid->Barrier(); grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); grid->Barrier();
tcomms+=usecond(); tcomms+=usecond();
@ -175,11 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond(); tscatter+=usecond();
} }
} }
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl; if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
} }
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -197,9 +216,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension]; int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ; int comm_dim = grid->_processors[dimension] >1 ;
std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd // std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
<< " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout // << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
<< " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; // << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1); assert(comm_dim==1);
assert(simd_layout==2); assert(simd_layout==2);
@ -224,12 +243,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi; scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi; scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){ for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size); send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size); recv_buf_extract[s].resize(buffer_size);
} }
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); // ExtractPointerArray<scalar_object> pointers(Nsimd); //
@ -281,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi, grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)recv_buf_extract_mpi,
recv_from_rank, recv_from_rank,
bytes); bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes; xbytes+=bytes;
grid->Barrier(); grid->Barrier();
@ -301,12 +335,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond(); tscatter+=usecond();
} }
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl; if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl; std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
}); });
} }
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpy_norm"); GRID_TRACE("axpy_norm");
return axpy_norm_fast(ret,a,x,y); #ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
} }
template<class sobj,class vobj> inline template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{ {
GRID_TRACE("axpby_norm"); GRID_TRACE("axpby_norm");
return axpby_norm_fast(ret,a,b,x,y); #ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
} }
/// Trace product /// Trace product

View File

@ -290,8 +290,10 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t csum=0; uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{ {
// Hack // Hack
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0]; uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words); csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
} }
FlightRecorder::CsumLog(csum);
#endif #endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right); ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm); RealD local = real(nrm);
FlightRecorder::NormLog(real(nrm)); ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm)); FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp); coalescedWrite(z_v[ss],tmp);
}); });
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm; return nrm;
} }
@ -498,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
scalar_type * ptr = (scalar_type *) &result[0]; scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type); int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words); grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
} }
template<class vobj> inline template<class vobj> inline
std::vector<typename vobj::scalar_object> std::vector<typename vobj::scalar_object>

View File

@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
Integer nsimd= vobj::Nsimd(); Integer nsimd= vobj::Nsimd();
{ {
sycl::buffer<sobj, 1> abuff(&ret, {1}); sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>()); auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites}, cgh.parallel_for(sycl::range<1>{osites},
Reduction, Reduction,
[=] (cl::sycl::id<1> item, auto &sum) { [=] (sycl::id<1> item, auto &sum) {
auto osite = item[0]; auto osite = item[0];
sum +=Reduce(lat[osite]); sum +=Reduce(lat[osite]);
}); });
@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
Word ret = 0; Word ret = 0;
{ {
sycl::buffer<Word, 1> abuff(&ret, {1}); sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(cl::sycl::range<1>{L}, cgh.parallel_for(sycl::range<1>{L},
Reduction, Reduction,
[=] (cl::sycl::id<1> index, auto &sum) { [=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index]; sum ^=vec[index];
}); });
}); });

View File

@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device //copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy //sync after copy
accelerator_barrier(); accelerator_barrier();
@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
}); });
for (int r = 0; r < rd; r++) { for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size}, cgh.parallel_for(sycl::range<1>{subvol_size},
Reduction, Reduction,
[=](cl::sycl::id<1> item, auto &sum) { [=](sycl::id<1> item, auto &sum) {
auto s = item[0]; auto s = item[0];
sum += rb_p[r*subvol_size+s]; sum += rb_p[r*subvol_size+s];
}); });

View File

@ -466,9 +466,15 @@ public:
static deviceVector<vobj> recv_buf; static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth); send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<CommsRequest_t> fwd_req; std::vector<MpiCommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req; std::vector<MpiCommsRequest_t> bwd_req;
int words = buffer_size; int words = buffer_size;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@ -495,9 +501,17 @@ public:
t_gather+=usecond()-t; t_gather+=usecond()-t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req, grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank, (void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }
for ( int d=0;d < depth ; d ++ ) { for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +522,17 @@ public:
t_gather+= usecond() - t; t_gather+= usecond() - t;
t=usecond(); t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req, grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t; t_comms+=usecond()-t;
} }

View File

@ -98,7 +98,7 @@ public:
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container // virtual smeared interface through configuration container
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
@ -132,6 +132,10 @@ public:
template <class GaugeField > template <class GaugeField >
class EmptyAction : public Action <GaugeField> class EmptyAction : public Action <GaugeField>
{ {
using Action<GaugeField>::refresh;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative

View File

@ -55,6 +55,11 @@ public:
RealD alpha; // Mobius scale RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant RealD k; // EOFA normalization constant
// Device resident
deviceVector<Coeff_t> d_shift_coefficients;
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
virtual void Instantiatable(void) = 0; virtual void Instantiatable(void) = 0;
// EOFA-specific operations // EOFA-specific operations
@ -92,6 +97,11 @@ public:
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
d_shift_coefficients.resize(Ls);
d_MooeeInv_shift_lc.resize(Ls);
d_MooeeInv_shift_norm.resize(Ls);
}; };
}; };

View File

@ -124,6 +124,11 @@ public:
RealD _b; RealD _b;
RealD _c; RealD _c;
// possible boost
std::vector<ComplexD> qmu;
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
// Cayley form Moebius (tanh and zolotarev) // Cayley form Moebius (tanh and zolotarev)
std::vector<Coeff_t> omega; std::vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs std::vector<Coeff_t> bs; // S dependent coeffs
@ -143,6 +148,17 @@ public:
std::vector<Coeff_t> ueem; std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee; std::vector<Coeff_t> dee;
// Device memory
deviceVector<Coeff_t> d_diag;
deviceVector<Coeff_t> d_upper;
deviceVector<Coeff_t> d_lower;
deviceVector<Coeff_t> d_lee;
deviceVector<Coeff_t> d_dee;
deviceVector<Coeff_t> d_uee;
deviceVector<Coeff_t> d_leem;
deviceVector<Coeff_t> d_ueem;
// Matrices of 5d ee inverse params // Matrices of 5d ee inverse params
// std::vector<iSinglet<Simd> > MatpInv; // std::vector<iSinglet<Simd> > MatpInv;
// std::vector<iSinglet<Simd> > MatmInv; // std::vector<iSinglet<Simd> > MatmInv;

View File

@ -60,6 +60,50 @@ public:
// virtual void Instantiatable(void)=0; // virtual void Instantiatable(void)=0;
virtual void Instantiatable(void) =0; virtual void Instantiatable(void) =0;
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
// Efficient support for multigrid coarsening // Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp); virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out); virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);

View File

@ -42,7 +42,7 @@ public:
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist); this->MomentumSpacePropagatorHw(out,in,_m,twist);
}; };
// Constructors // Constructors
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,

View File

@ -41,6 +41,10 @@ public:
public: public:
// Constructors // Constructors
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracTanhFermion(GaugeField &_Umu, OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -41,6 +41,9 @@ public:
public: public:
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -40,6 +40,11 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors // Constructors
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,

View File

@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
const int part_frac_chroma_convention=1; const int part_frac_chroma_convention=0;
void Meooe_internal(const FermionField &in, FermionField &out,int dag); void Meooe_internal(const FermionField &in, FermionField &out,int dag);
void Mooee_internal(const FermionField &in, FermionField &out,int dag); void Mooee_internal(const FermionField &in, FermionField &out,int dag);
@ -83,11 +83,70 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,const ImplParams &p= ImplParams()); RealD _mass,RealD M5,const ImplParams &p= ImplParams());
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
if ( this->qmu.size() ){
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
} else {
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
}
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
protected: protected:
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata); virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
std::vector<RealD> qmu;
// Part frac // Part frac
RealD mass; RealD mass;
RealD dw_diag; RealD dw_diag;

View File

@ -414,29 +414,6 @@ public:
// surface_list.resize(0); // surface_list.resize(0);
this->same_node.resize(npoints); this->same_node.resize(npoints);
}; };
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
*/
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@ -507,6 +484,11 @@ public:
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier(); accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
} }
}; };

View File

@ -109,6 +109,8 @@ public:
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
std::vector<double> qmu) ;
// Implement hopping term non-hermitian hopping term; half cb or both // Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW // Implement s-diagonal DW
@ -117,6 +119,9 @@ public:
void DhopOE(const FermionField &in, FermionField &out,int dag); void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag); void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm // add a DhopComm
// -- suboptimal interface will presently trigger multiple comms. // -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

View File

@ -57,6 +57,10 @@ public:
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;

View File

@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FourDimGrid, FourDimGrid,
FourDimRedBlackGrid,_M5,p), FourDimRedBlackGrid,_M5,p),
mass_plus(_mass), mass_minus(_mass) mass_plus(_mass), mass_minus(_mass)
{ {
// qmu defaults to zero size;
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
M5Ddag(psi,psi,Din,lower,diag,upper); M5Ddag(psi,psi,Din,lower,diag,upper);
} }
template<class Impl>
void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
{
if ( qmu.size() ) {
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
std::vector<ComplexD> coeff(Nd);
ComplexD ci(0,1);
assert(qmu.size()==Nd);
for(int mu=0;mu<Nd;mu++){
coeff[mu] = ci*qmu[mu];
if ( dag ) coeff[mu] = conjugate(coeff[mu]);
}
chi = chi + Gamma(Gmu[0])*psi*coeff[0];
for(int mu=1;mu<Nd;mu++){
chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
}
}
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
// Assemble Din // Assemble Din
Meooe5D(psi,Din); Meooe5D(psi,Din);
this->DW(Din,chi,DaggerNo); this->DW(Din,chi,DaggerNo);
// add i q_mu gamma_mu here
addQmu(Din,chi,DaggerNo);
// ((b D_W + D_w hop terms +1) on s-diag // ((b D_W + D_w hop terms +1) on s-diag
axpby(chi,1.0,1.0,chi,psi); axpby(chi,1.0,1.0,chi,psi);
@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
FermionField Din(psi.Grid()); FermionField Din(psi.Grid());
// Apply Dw // Apply Dw
this->DW(psi,Din,DaggerYes); this->DW(psi,Din,DaggerYes);
// add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
addQmu(psi,Din,DaggerYes);
MeooeDag5D(Din,chi); MeooeDag5D(Din,chi);
@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
leem.resize(Ls); leem.resize(Ls);
uee.resize(Ls); uee.resize(Ls);
ueem.resize(Ls); ueem.resize(Ls);
for(int i=0;i<Ls;i++){ for(int i=0;i<Ls;i++){
dee[i] = bee[i]; dee[i] = bee[i];
@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
} }
//////////////////////////////////////////
// Device buffers
//////////////////////////////////////////
d_diag.resize(Ls);
d_upper.resize(Ls);
d_lower.resize(Ls);
d_dee.resize(Ls);
d_lee.resize(Ls);
d_uee.resize(Ls);
d_leem.resize(Ls);
d_ueem.resize(Ls);
// int inv=1; // int inv=1;
// this->MooeeInternalCompute(0,inv,MatpInv,MatmInv); // this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); // this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);

View File

@ -57,9 +57,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
int Ls =this->Ls; int Ls =this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &d_upper[0];
@ -99,9 +99,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &d_upper[0];
@ -134,11 +134,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
int Ls=this->Ls; int Ls=this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0]; auto plee = & d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & d_dee [0];
@ -196,11 +196,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
autoView(psi , psi_i,AcceleratorRead); autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite); autoView(chi , chi_i,AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0]; auto plee = & d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & d_dee [0];

View File

@ -42,13 +42,13 @@ template<class Impl>
void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata) void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
{ {
// How to check Ls matches?? // How to check Ls matches??
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl; std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl; std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl; std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl; std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl; std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
int Ls = this->Ls; int Ls = this->Ls;
std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
assert(zdata->db==Ls);// Beta has Ls coeffs assert(zdata->db==Ls);// Beta has Ls coeffs
R=(1+this->mass)/(1-this->mass); R=(1+this->mass)/(1-this->mass);
@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
int Ls = this->Ls; int Ls = this->Ls;
conformable(solution5d.Grid(),this->FermionGrid()); conformable(solution5d.Grid(),this->FermionGrid());
conformable(exported4d.Grid(),this->GaugeGrid()); conformable(exported4d.Grid(),this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); ExtractSlice(exported4d, solution5d, Ls-1, 0);
} }
template<class Impl> template<class Impl>
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
conformable(input4d.Grid() ,this->GaugeGrid()); conformable(input4d.Grid() ,this->GaugeGrid());
FermionField tmp(this->FermionGrid()); FermionField tmp(this->FermionGrid());
tmp=Zero(); tmp=Zero();
InsertSlice(input4d, tmp, Ls-1, Ls-1); InsertSlice(input4d, tmp, Ls-1, 0);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d); this->Dminus(tmp,imported5d);
} }

View File

@ -51,13 +51,13 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
auto pdiag = &d_diag[0]; acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
auto pupper = &d_upper[0]; acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
auto plower = &d_lower[0]; acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -89,14 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
autoView( phi , phi_i, AcceleratorRead); autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite); autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &this->d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &this->d_upper[0];
auto plower = &d_lower[0]; auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
@ -125,18 +125,18 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
autoView( chi, chi_i, AcceleratorWrite); autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & this->d_ueem[0];
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;

View File

@ -50,14 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -93,15 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &this->d_shift_coefficients[0];
auto pdiag = &d_diag[0]; acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
auto pupper = &d_upper[0]; acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
auto plower = &d_lower[0]; acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
auto pshift_coeffs = &d_shift_coeffs[0]; acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -138,14 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0]; auto pdiag = &this->d_diag[0];
auto pupper = &d_upper[0]; auto pupper = &this->d_upper[0];
auto plower = &d_lower[0]; auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
@ -180,16 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
assert(phi.Checkerboard() == psi.Checkerboard()); assert(phi.Checkerboard() == psi.Checkerboard());
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); auto pdiag = &this->d_diag[0];
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); auto pupper = &this->d_upper[0];
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); auto plower = &this->d_lower[0];
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); auto pshift_coeffs = &this->d_shift_coefficients[0];
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
auto pshift_coeffs = &d_shift_coeffs[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
auto pm = this->pm; auto pm = this->pm;
@ -230,17 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = & this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = & this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = & this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = & this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = & this->d_ueem[0];
auto plee = & d_lee [0]; acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pdee = & d_dee [0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto puee = & d_uee [0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pleem = & d_leem[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
auto pueem = & d_ueem[0]; acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
@ -293,23 +293,22 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
// Move into object and constructor // Move into object and constructor
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->d_dee [0];
auto puee = & d_uee [0]; auto puee = & this->d_uee [0];
auto pleem = & d_leem[0]; auto pleem = & this->d_leem[0];
auto pueem = & d_ueem[0]; auto pueem = & this->d_ueem[0];
auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -367,17 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
autoView(psi , psi_i, AcceleratorRead); autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); auto plee = &this->d_lee [0];
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); auto pdee = &this->d_dee [0];
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); auto puee = &this->d_uee [0];
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); auto pleem = &this->d_leem[0];
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); auto pueem = &this->d_ueem[0];
auto plee = & d_lee [0]; acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pdee = & d_dee [0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto puee = & d_uee [0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
auto pleem = & d_leem[0]; acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
auto pueem = & d_ueem[0]; acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
int nloop = grid->oSites()/Ls; int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -426,25 +425,23 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
autoView(chi , chi_i, AcceleratorWrite); autoView(chi , chi_i, AcceleratorWrite);
int Ls = this->Ls; int Ls = this->Ls;
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto pm = this->pm; auto pm = this->pm;
auto plee = & d_lee [0]; auto plee = & this->d_lee [0];
auto pdee = & d_dee [0]; auto pdee = & this->d_dee [0];
auto puee = & d_uee [0]; auto puee = & this->d_uee [0];
auto pleem = & d_leem[0]; auto pleem = & this->d_leem[0];
auto pueem = & d_ueem[0]; auto pueem = & this->d_ueem[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls); auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls); auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0]; acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0]; acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

View File

@ -237,7 +237,32 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H // ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
// //
this->DW(psi,D,DaggerNo); this->DW(psi,D,DaggerNo);
// DW - DW+iqslash
// (g5 Dw)^dag = g5 Dw
// (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
if ( qmu.size() ) {
std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
assert(qmu.size()==Nd);
FermionField qslash_psi(psi.Grid());
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
for(int mu=1;mu<Nd;mu++){
qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
}
ComplexD ci(0.0,1.0);
qslash_psi = ci*qslash_psi ; // i qslash
D = D + qslash_psi;
}
int nblock=(Ls-1)/2; int nblock=(Ls-1)/2;
for(int b=0;b<nblock;b++){ for(int b=0;b<nblock;b++){
@ -255,15 +280,55 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
} }
{ {
// The 'conventional' Cayley overlap operator is
//
// Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
//
//
// With massless limit 1/2(1+g5 sgnHw)
//
// Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
//
// However, the conventional normalisation has both a leading order factor of 2 in Zq
// at tree level AND a mass dependent (1-m) that are convenient to absorb.
//
// In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
//
// num = -i sin kmu gmu
//
// denom ( sqrt(sk^2 + (2shk^2 - 1)^2
// b_k = sk2 - M5;
//
// w_k = sqrt(sk + b_k*b_k);
//
// denom= ( w_k + b_k + mass*mass) ;
//
// denom= one/denom;
// out = num*denom;
//
// Chroma, and Grid define partial fraction via 4d operator
//
// Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
//
// Now since:
//
// (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
//
// This corresponds to a modified mass parameter
//
// It has an annoying
//
//
double R=(1+this->mass)/(1-this->mass); double R=(1+this->mass)/(1-this->mass);
//R g5 psi[Ls] + p[0] H //R g5 psi[Ls] + p[0] Hw
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1); ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
for(int b=0;b<nblock;b++){ for(int b=0;b<nblock;b++){
int s = 2*b+1; int s = 2*b+1;
double pp = p[nblock-1-b]; double pp = p[nblock-1-b];
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s); axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
} }
} }
} }
@ -411,17 +476,18 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
int Ls = this->Ls; int Ls = this->Ls;
conformable(solution5d.Grid(),this->FermionGrid()); conformable(solution5d.Grid(),this->FermionGrid());
conformable(exported4d.Grid(),this->GaugeGrid()); conformable(exported4d.Grid(),this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); ExtractSlice(exported4d, solution5d, Ls-1, 0);
} }
template<class Impl> template<class Impl>
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{ {
//void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
int Ls = this->Ls; int Ls = this->Ls;
conformable(imported5d.Grid(),this->FermionGrid()); conformable(imported5d.Grid(),this->FermionGrid());
conformable(input4d.Grid() ,this->GaugeGrid()); conformable(input4d.Grid() ,this->GaugeGrid());
FermionField tmp(this->FermionGrid()); FermionField tmp(this->FermionGrid());
tmp=Zero(); tmp=Zero();
InsertSlice(input4d, tmp, Ls-1, Ls-1); InsertSlice(input4d, tmp, Ls-1, 0);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d); this->Dminus(tmp,imported5d);
} }
@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
{ {
int Ls = this->Ls; int Ls = this->Ls;
qmu.resize(0);
assert((Ls&0x1)==1); // Odd Ls required assert((Ls&0x1)==1); // Odd Ls required
int nrational=Ls-1; int nrational=Ls-1;
@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
Approx::zolotarev_free(zdata); Approx::zolotarev_free(zdata);
} }
template<class Impl>
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,
std::vector<RealD> &_qmu,
const ImplParams &p)
: PartialFractionFermion5D<Impl>(_Umu,
FiveDimGrid,FiveDimRedBlackGrid,
FourDimGrid,FourDimRedBlackGrid,
_mass,M5,p)
{
qmu=_qmu;
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -325,29 +325,25 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// Start comms // Gather intranode and extra node differentiated?? // Start comms // Gather intranode and extra node differentiated??
///////////////////////////// /////////////////////////////
{ {
std::cout << " WilsonFermion5D gather " <<std::endl; // std::cout << " WilsonFermion5D gather " <<std::endl;
GRID_TRACE("Gather"); GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
} }
std::cout << " WilsonFermion5D Communicate Begin " <<std::endl; // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped");
st.CommunicateBegin(requests);
#if 1
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
{ st.CommunicateBegin(requests);
std::cout << " WilsonFermion5D Comms merge " <<std::endl; st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
GRID_TRACE("MergeSHM"); #endif
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
}
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
std::cout << " WilsonFermion5D Interior " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior"); GRID_TRACE("DhopDagInterior");
@ -356,25 +352,35 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopInterior"); GRID_TRACE("DhopInterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
} }
//ifdef GRID_ACCELERATED
#if 0
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
std::cout << " WilsonFermion5D Comms Complete " <<std::endl; // std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
traceStop(id); // traceStop(id);
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
{ {
std::cout << " WilsonFermion5D Comms Merge " <<std::endl; // std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
GRID_TRACE("Merge"); GRID_TRACE("Merge");
st.CommsMerge(compressor); st.CommsMerge(compressor);
} }
std::cout << " WilsonFermion5D Exterior " <<std::endl; // std::cout << " WilsonFermion5D Exterior " <<std::endl;
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior"); GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@ -382,7 +388,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopExterior"); GRID_TRACE("DhopExterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
} }
std::cout << " WilsonFermion5D Done " <<std::endl; // std::cout << " WilsonFermion5D Done " <<std::endl;
} }
@ -397,13 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
std::cout << " WilsonFermion5D Halo exch " <<std::endl; // std::cout << " WilsonFermion5D Halo exch " <<std::endl;
{ {
GRID_TRACE("HaloExchange"); GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor); st.HaloExchangeOpt(in,compressor);
} }
std::cout << " WilsonFermion5D Dhop " <<std::endl; // std::cout << " WilsonFermion5D Dhop " <<std::endl;
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDag"); GRID_TRACE("DhopDag");
@ -412,7 +418,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
GRID_TRACE("Dhop"); GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} }
std::cout << " WilsonFermion5D Done " <<std::endl; // std::cout << " WilsonFermion5D Done " <<std::endl;
} }
@ -438,6 +444,29 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
DhopInternal(StencilOdd,UmuEven,in,out,dag); DhopInternal(StencilOdd,UmuEven,in,out,dag);
} }
template<class Impl>
void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
{
int dag =0 ;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
Compressor compressor(dag);
Stencil.HaloExchangeOpt(in,compressor);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
int LLs = in.Grid()->_rdimensions[0];
int Opt = WilsonKernelsStatic::Opt;
Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
}
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{ {
@ -740,6 +769,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
{
std::vector<double> empty_q(Nd,0.0);
MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
}
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
RealD mass,
std::vector<double> twist,
std::vector<double> qmu)
{ {
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
@ -755,6 +793,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
typedef typename FermionField::scalar_type ScalComplex; typedef typename FermionField::scalar_type ScalComplex;
typedef Lattice<iSinglet<vector_type> > LatComplex; typedef Lattice<iSinglet<vector_type> > LatComplex;
typedef iSpinMatrix<ScalComplex> SpinMat;
Coordinate latt_size = _grid->_fdimensions; Coordinate latt_size = _grid->_fdimensions;
@ -772,8 +811,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
LatComplex kmu(_grid); LatComplex kmu(_grid);
ScalComplex ci(0.0,1.0); ScalComplex ci(0.0,1.0);
std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
for(int mu=0;mu<Nd;mu++) { for(int mu=0;mu<Nd;mu++) {
LatticeCoordinate(kmu,mu); LatticeCoordinate(kmu,mu);
RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
@ -782,9 +823,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
sk = sk + sin(kmu)*sin(kmu);
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);
// Terms for boosted Fermion
// 1/2 [ -i gamma.(sin p + q ) ]
// [ --------------------- + 1 ]
// [ wq + b ]
//
// wq = sqrt( (sinp+q)^2 + b^2 )
//
num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
} }
num = num + mass * in ; num = num + mass * in ;

View File

@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \ } else { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi = coalescedRead(buf[SE->_offset],lane); \
} \ } \
acceleratorSynchronise(); \ acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \ Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); Recon(result, Uchi);
@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody #undef LoopBody
} }
#ifdef GRID_SYCL
extern "C" {
ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
}
#ifdef GRID_SIMT
#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())
#else
#define MAKE_ID(A) (0)
#endif
#else
#define MAKE_ID(A) (0)
#endif
#define KERNEL_CALL_ID(A) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=acceleratorSIMTlane(Nsimd); \
int idx=sF*Nsimd+lane; \
uint64_t id = MAKE_ID(); \
ids[idx]=id; \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \ #define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \ const uint64_t NN = Nsite*Ls; \
@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ss; \ int sF = ss; \
int sU = ss/Ls; \ int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \ WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); });
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});} });}
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
// dependent on result of merge // // dependent on result of merge
acceleratorFenceComputeStream(); acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
} }
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
} }
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids)
{
autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead);
KERNEL_CALL_ID(GenericDhopSite);
}
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,

View File

@ -40,6 +40,11 @@ public:
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
using Action<GaugeField>::S;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
using Action<GaugeField>::refresh;
private: private:
RealD c_plaq; RealD c_plaq;
RealD c_rect; RealD c_rect;

View File

@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
public: public:
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
using Action<GaugeField>::S;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
using Action<GaugeField>::refresh;
/////////////////////////// constructors /////////////////////////// constructors
explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; explicit WilsonGaugeAction(RealD beta_):beta(beta_){};

File diff suppressed because it is too large Load Diff

View File

@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Map a su2 subgroup number to the pair of rows that are non zero // Map a su2 subgroup number to the pair of rows that are non zero
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
int spare = su2_index; int spare = su2_index;

View File

@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
// Map a su2 subgroup number to the pair of rows that are non zero // Map a su2 subgroup number to the pair of rows that are non zero
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
template <ONLY_IF_Sp> template <ONLY_IF_Sp>
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
const int nsp=ncolour/2; const int nsp=ncolour/2;
assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2)); assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));

View File

@ -121,17 +121,22 @@ class CartesianStencilAccelerator {
StencilVector same_node; StencilVector same_node;
Coordinate _simd_layout; Coordinate _simd_layout;
Parameters parameters; Parameters parameters;
ViewMode mode;
StencilEntry* _entries_p; StencilEntry* _entries_p;
StencilEntry* _entries_host_p;
cobj* u_recv_buf_p; cobj* u_recv_buf_p;
cobj* u_send_buf_p; cobj* u_send_buf_p;
accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
accelerator_inline int GetNodeLocal(int osite,int point) const { // Not a device function
return this->_entries_p[point+this->_npoints*osite]._is_local; inline int GetNodeLocal(int osite,int point) const {
StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
return SE._is_local;
} }
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; ptype = this->_permute_type[point];
return & this->_entries_p[point+this->_npoints*osite];
} }
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@ -164,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
{ {
public: public:
int *closed; int *closed;
StencilEntry *cpu_ptr; // StencilEntry *cpu_ptr;
ViewMode mode;
public: public:
// default copy constructor // default copy constructor
CartesianStencilView (const CartesianStencilView &refer_to_me) = default; CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode) CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
cpu_ptr(this->_entries_p),
mode(_mode)
{ {
this->_entries_p =(StencilEntry *) this->ViewOpen(_mode);
MemoryManager::ViewOpen(this->_entries_p, }
this->_npoints*this->_osites*sizeof(StencilEntry), void ViewOpen(ViewMode _mode)
mode, {
AdviseDefault); this->mode = _mode;
} }
void ViewClose(void) void ViewClose(void) { }
{
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
}; };
@ -274,8 +273,8 @@ public:
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; deviceVector<int> surface_list;
std::vector<StencilEntry> _entries; // Resident in host memory std::vector<StencilEntry> _entries; // Resident in host memory
deviceVector<StencilEntry> _entries_device; // Resident in device memory deviceVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets; std::vector<Packet> Packets;
std::vector<Merge> Mergers; std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM; std::vector<Merge> MergersSHM;
@ -364,11 +363,32 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too. // But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf, Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
@ -386,18 +406,25 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial(); if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet(); else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier(); // accelerator_barrier();
_grid->StencilBarrier();
// run any checksums
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
} }
FlightRecorder::StepLog("Finish communicate complete");
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel. // Blocking send and receive. Either sequential or parallel.
@ -419,6 +446,7 @@ public:
Communicate(); Communicate();
CommsMergeSHM(compress); CommsMergeSHM(compress);
CommsMerge(compress); CommsMerge(compress);
accelerator_barrier();
} }
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx) template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -474,6 +502,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress) void HaloGather(const Lattice<vobj> &source,compressor &compress)
{ {
// accelerator_barrier(); // accelerator_barrier();
//////////////////////////////////
// I will overwrite my send buffers
//////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single nodes _grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid); assert(source.Grid()==_grid);
@ -487,6 +518,11 @@ public:
HaloGatherDir(source,compress,point,face_idx); HaloGatherDir(source,compress,point,face_idx);
} }
accelerator_barrier(); // All my local gathers are complete accelerator_barrier(); // All my local gathers are complete
#ifdef NVLINK_GET
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
face_table_computed=1; face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
} }
@ -525,6 +561,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j])); coalescedWrite(to[j] ,coalescedRead(from [j]));
}); });
acceleratorFenceComputeStream(); acceleratorFenceComputeStream();
// Also fenced in WilsonKernels
} }
} }
@ -622,10 +659,10 @@ public:
//////////////////////////////////////// ////////////////////////////////////////
void PrecomputeByteOffsets(void){ void PrecomputeByteOffsets(void){
for(int i=0;i<_entries.size();i++){ for(int i=0;i<_entries.size();i++){
if( _entries[i]._is_local ) { if( this->_entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
} else { } else {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
} }
} }
}; };
@ -653,7 +690,9 @@ public:
} }
} }
} }
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size); surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0; int32_t ss=0;
for(int site = 0 ;site< vol4;site++){ for(int site = 0 ;site< vol4;site++){
int local = 1; int local = 1;
@ -665,12 +704,13 @@ public:
if(local == 0) { if(local == 0) {
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int idx=site*Ls+s; int idx=site*Ls+s;
acceleratorPut(surface_list[ss],idx); surface_list_host[ss]= idx;
ss++; ss++;
} }
} }
} }
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl; acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
} }
/// Introduce a block structure and switch off comms on boundaries /// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block) void DirichletBlock(const Coordinate &dirichlet_block)
@ -758,7 +798,13 @@ public:
this->_osites = _grid->oSites(); this->_osites = _grid->oSites();
_entries.resize(this->_npoints* this->_osites); _entries.resize(this->_npoints* this->_osites);
this->_entries_p = &_entries[0]; _entries_device.resize(this->_npoints* this->_osites);
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){ for(int ii=0;ii<npoints;ii++){
int i = ii; // reverse direction to get SIMD comms done first int i = ii; // reverse direction to get SIMD comms done first
@ -835,6 +881,7 @@ public:
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
} }
PrecomputeByteOffsets(); PrecomputeByteOffsets();
acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
} }
void Local (int point, int dimension,int shiftpm,int cbmask) void Local (int point, int dimension,int shiftpm,int cbmask)
@ -990,10 +1037,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(lo+o+b)*this->_npoints; int idx=point+(lo+o+b)*this->_npoints;
_entries[idx]._offset =ro+o+b; this->_entries[idx]._offset =ro+o+b;
_entries[idx]._permute=permute; this->_entries[idx]._permute=permute;
_entries[idx]._is_local=1; this->_entries[idx]._is_local=1;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -1011,10 +1058,10 @@ public:
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
int idx = point+(lo+o+b)*this->_npoints; int idx = point+(lo+o+b)*this->_npoints;
_entries[idx]._offset =ro+o+b; this->_entries[idx]._offset =ro+o+b;
_entries[idx]._is_local=1; this->_entries[idx]._is_local=1;
_entries[idx]._permute=permute; this->_entries[idx]._permute=permute;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
} }
@ -1038,10 +1085,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(so+o+b)*this->_npoints; int idx=point+(so+o+b)*this->_npoints;
_entries[idx]._offset =offset+(bo++); this->_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0; this->_entries[idx]._is_local=0;
_entries[idx]._permute=0; this->_entries[idx]._permute=0;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -1058,10 +1105,10 @@ public:
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
int idx = point+(so+o+b)*this->_npoints; int idx = point+(so+o+b)*this->_npoints;
_entries[idx]._offset =offset+(bo++); this->_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0; this->_entries[idx]._is_local=0;
_entries[idx]._permute =0; this->_entries[idx]._permute =0;
_entries[idx]._around_the_world=wrap; this->_entries[idx]._around_the_world=wrap;
} }
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];

View File

@ -202,13 +202,13 @@ void acceleratorInit(void)
#ifdef GRID_SYCL #ifdef GRID_SYCL
cl::sycl::queue *theGridAccelerator; sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator; sycl::queue *theCopyAccelerator;
void acceleratorInit(void) void acceleratorInit(void)
{ {
int nDevices = 1; int nDevices = 1;
// cl::sycl::gpu_selector selector; // sycl::gpu_selector selector;
// cl::sycl::device selectedDevice { selector }; // sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
@ -242,14 +242,14 @@ void acceleratorInit(void)
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = cl::sycl::device::get_devices(); auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){ for(int d = 0;d<devices.size();d++){
#define GPU_PROP_STR(prop) \ #define GPU_PROP_STR(prop) \
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str()); printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
#define GPU_PROP_FMT(prop,FMT) \ #define GPU_PROP_FMT(prop,FMT) \
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>()); printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
if ( world_rank == 0) { if ( world_rank == 0) {

View File

@ -132,27 +132,17 @@ inline void cuda_mem(void)
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \ { \
int nt=acceleratorThreads(); \ if ( num1*num2 ) { \
typedef uint64_t Iterator; \ int nt=acceleratorThreads(); \
auto lambda = [=] accelerator \ typedef uint64_t Iterator; \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \ auto lambda = [=] accelerator \
__VA_ARGS__; \ (Iterator iter1,Iterator iter2,Iterator lane) mutable { \
}; \ __VA_ARGS__; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \ }; \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ dim3 cu_threads(nsimd,acceleratorThreads(),1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
} LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ } \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
} }
#define accelerator_for6dNB(iter1, num1, \ #define accelerator_for6dNB(iter1, num1, \
@ -175,19 +165,6 @@ inline void cuda_mem(void)
} }
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__ template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{ {
@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
Lambda(x,y,z); Lambda(x,y,z);
} }
} }
template<typename lambda> __global__
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
// Weird permute is to make lane coalesce for large blocks
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
template<typename lambda> __global__ template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
} \ } \
} }
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
}
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
void *ptr=NULL; void *ptr=NULL;
@ -264,18 +241,34 @@ inline void *acceleratorAllocDevice(size_t bytes)
} }
return ptr; return ptr;
}; };
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{ {
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream); cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
} }
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); }; inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr) inline int acceleratorIsCommunicable(void *ptr)
@ -302,7 +295,7 @@ NAMESPACE_END(Grid);
// Force deterministic reductions // Force deterministic reductions
#define SYCL_REDUCTION_DETERMINISTIC #define SYCL_REDUCTION_DETERMINISTIC
#include <sycl/CL/sycl.hpp> #include <sycl/sycl.hpp>
#include <sycl/usm.hpp> #include <sycl/usm.hpp>
#include <level_zero/ze_api.h> #include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp> #include <sycl/ext/oneapi/backend/level_zero.hpp>
@ -314,8 +307,8 @@ inline void acceleratorMem(void)
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl; std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
} }
extern cl::sycl::queue *theGridAccelerator; extern sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator; extern sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__ #ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT #define GRID_SIMT
@ -326,24 +319,24 @@ extern cl::sycl::queue *theCopyAccelerator;
accelerator_inline int acceleratorSIMTlane(int Nsimd) { accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT #ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2]; return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
#else #else
return 0; return 0;
#endif #endif
} // SYCL specific } // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ theGridAccelerator->submit([&](sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \ unsigned long nt=acceleratorThreads(); \
if(nt < 8)nt=8; \ if(nt < 8)nt=8; \
unsigned long unum1 = num1; \ unsigned long unum1 = num1; \
unsigned long unum2 = num2; \ unsigned long unum2 = num2; \
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
cl::sycl::range<3> local {nt,1,nsimd}; \ sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cgh.parallel_for( \ cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \ sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \ [=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \ [[intel::reqd_sub_group_size(16)]] \
{ \ { \
auto iter1 = item.get_global_id(0); \ auto iter1 = item.get_global_id(0); \
@ -356,26 +349,50 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} ///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
ev.wait();
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr) inline int acceleratorIsCommunicable(void *ptr)
{ {
#if 0 #if 0
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm = cl::sycl::usm::alloc::shared ) return 1; if ( uvm = sycl::usm::alloc::shared ) return 1;
else return 0; else return 0;
#endif #endif
return 1; return 1;
} }
#endif #endif
////////////////////////////////////////////// //////////////////////////////////////////////
@ -472,6 +489,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
} \ } \
} }
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void *acceleratorAllocShared(size_t bytes) inline void *acceleratorAllocShared(size_t bytes)
{ {
void *ptr=NULL; void *ptr=NULL;
@ -495,37 +522,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr; return ptr;
}; };
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch typedef int acceleratorEvent_t;
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{ {
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
} }
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); acceleratorCopyToDevice(from,to,bytes);
return 0;
} }
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); acceleratorCopyFromDevice(from,to,bytes);
return 0;
} }
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif #endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
#endif
}
////////////////////////////////////////////// //////////////////////////////////////////////
// Common on all GPU targets // Common on all GPU targets
////////////////////////////////////////////// //////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB // FIXME -- the non-blocking nature got broken March 30 2023 by PAB
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} ); #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
accelerator_barrier(dummy);
#define accelerator_for( iter, num, nsimd, ... ) \ #define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \ accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
@ -547,6 +590,8 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#undef GRID_SIMT #undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void) inline void acceleratorMem(void)
{ {
/* /*
@ -566,16 +611,21 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); } inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);} inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);} inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {}; inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);}; inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else #else
@ -655,9 +705,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise(); acceleratorCopySynchronise();
} }
template<class T> void acceleratorPut(T& dev,T&host) template<class T> void acceleratorPut(T& dev,const T&host)
{ {
acceleratorCopyToDevice(&host,&dev,sizeof(T)); acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
} }
template<class T> T acceleratorGet(T& dev) template<class T> T acceleratorGet(T& dev)
{ {

View File

@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical) #define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP #ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes) inline void thread_bcopy(const void *from, void *to,size_t bytes)
{ {
uint64_t *ufrom = (uint64_t *)from; const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to; uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0); assert(bytes%8==0);
uint64_t words=bytes/8; uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
}); });
} }
#else #else
inline void thread_bcopy(void *from, void *to,size_t bytes) inline void thread_bcopy(const void *from, void *to,size_t bytes)
{ {
bcopy(from,to,bytes); bcopy(from,to,bytes);
} }

View File

@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode; int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms; int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend; int FlightRecorder::ChecksumCommsSend;
const char * FlightRecorder::StepName;
int32_t FlightRecorder::StepLoggingCounter;
int32_t FlightRecorder::XmitLoggingCounter; int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter; int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter; int32_t FlightRecorder::CsumLoggingCounter;
@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
CsumLoggingCounter=0; CsumLoggingCounter=0;
NormLoggingCounter=0; NormLoggingCounter=0;
ReductionLoggingCounter=0; ReductionLoggingCounter=0;
StepName = "No steps started";
StepLoggingCounter=0;
} }
void FlightRecorder::Truncate(void) void FlightRecorder::Truncate(void)
{ {
@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
assert(0); assert(0);
} }
} }
bool FlightRecorder::StepLog(const char *name)
{
StepName = name;
StepLoggingCounter ++;
return true;
}
void FlightRecorder::SetLoggingModePrint(void) void FlightRecorder::SetLoggingModePrint(void)
{ {
@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
{ {
return ErrorCounter; return ErrorCounter;
} }
void FlightRecorder::NormLog(double value) bool FlightRecorder::NormLog(double value)
{ {
uint64_t hex = * ( (uint64_t *)&value ); uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value); NormLogVector.push_back(value);
NormLoggingCounter++; NormLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
NormLoggingCounter,NormLogVector.size(), NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr); value, NormLogVector[NormLoggingCounter]); fflush(stderr);
if(!ContinueOnFail)assert(0); // Force takedown of job BACKTRACEFP(stderr);
if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
} else { } else {
@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
} }
NormLoggingCounter++; NormLoggingCounter++;
} }
return true;
} }
void FlightRecorder::CsumLog(uint64_t hex) bool FlightRecorder::CsumLog(uint64_t hex)
{ {
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeRecord) { if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex); CsumLogVector.push_back(hex);
CsumLoggingCounter++; CsumLoggingCounter++;
return true;
} }
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
if ( hex != hexref ) { if ( hex != hexref ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref); CsumLoggingCounter,hex, hexref);
BACKTRACEFP(stderr);
fflush(stderr); fflush(stderr);
if(!ContinueOnFail) assert(0); // Force takedown of job if(!ContinueOnFail) return false;
ErrorCounter++; ErrorCounter++;
@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
} }
CsumLoggingCounter++; CsumLoggingCounter++;
} }
return true;
} }
void FlightRecorder::ReductionLog(double local,double global) void FlightRecorder::ReductionLog(double local,double global)
{ {
uint64_t hex_l = * ( (uint64_t *)&local ); uint64_t hex_l = * ( (uint64_t *)&local );
@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){ if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) { if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(), ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -250,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeNone) return; if(LoggingMode == LoggingModeNone) return;
if ( ChecksumCommsSend ){ if ( ChecksumCommsSend ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return; if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t *ubuf = (uint64_t *)buf;
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -267,11 +298,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){ if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) { if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n", fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(), XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);
@ -293,9 +328,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{ {
if ( ChecksumComms ){ if ( ChecksumComms ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return; if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL #ifdef GRID_SYCL
uint64_t *ubuf = (uint64_t *)buf;
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) { if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -309,11 +344,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if(LoggingMode == LoggingModeVerify) { if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){ if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) { if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n", fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(), GridHostname(),
GlobalSharedMemory::WorldShmRank, GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(), RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0); if ( !ContinueOnFail ) assert(0);

View File

@ -12,6 +12,8 @@ class FlightRecorder {
static int LoggingMode; static int LoggingMode;
static uint64_t ErrorCounter; static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
static int32_t XmitLoggingCounter; static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter; static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter; static int32_t CsumLoggingCounter;
@ -30,8 +32,9 @@ class FlightRecorder {
static void SetLoggingModeRecord(void); static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void); static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode); static void SetLoggingMode(LoggingMode_t mode);
static void NormLog(double value); static bool StepLog(const char *name);
static void CsumLog(uint64_t csum); static bool NormLog(double value);
static bool CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl); static void ReductionLog(double lcl, double glbl);
static void Truncate(void); static void Truncate(void);
static void ResetCounters(void); static void ResetCounters(void);

View File

@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt, Grid_default_latt,
Grid_default_mpi); Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -549,8 +556,34 @@ void GridLogLayout() {
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// x86 64bit
#ifdef __linux__
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
return;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr); fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code); fprintf(stderr," code %d\n",si->si_code);
@ -561,7 +594,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) printf(" %s %lx\n",#A,sc-> A); #define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi); REG(rdi);
REG(rsi); REG(rsi);
REG(rbp); REG(rbp);
@ -594,8 +627,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
void Grid_exit_handler(void) void Grid_exit_handler(void)
{ {
BACKTRACEFP(stdout); // BACKTRACEFP(stdout);
fflush(stdout); // fflush(stdout);
} }
void Grid_debug_handler_init(void) void Grid_debug_handler_init(void)
{ {
@ -603,10 +636,10 @@ void Grid_debug_handler_init(void)
sigemptyset (&sa.sa_mask); sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler; sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL); // sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL); sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL); sigaction(SIGBUS,&sa,NULL);
sigaction(SIGUSR2,&sa,NULL); // sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
@ -614,7 +647,15 @@ void Grid_debug_handler_init(void)
sigaction(SIGKILL,&sa,NULL); sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL); sigaction(SIGILL,&sa,NULL);
atexit(Grid_exit_handler); // Non terminating SIGUSR1/2 handler
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGHUP,&sa_ping,NULL);
// atexit(Grid_exit_handler);
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -50,7 +50,7 @@ namespace Grid{
int64_t index64; int64_t index64;
IndexFromCoorReversed(coor,index64,dims); IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){ if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl; // std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
} }
assert(index64<2*1024*1024*1024LL); assert(index64<2*1024*1024*1024LL);
index = (int) index64; index = (int) index64;

View File

@ -1,5 +1,5 @@
# additional include paths necessary to compile the C++ library # additional include paths necessary to compile the C++ library
SUBDIRS = Grid HMC benchmarks tests examples SUBDIRS = Grid benchmarks tests examples HMC
include $(top_srcdir)/doxygen.inc include $(top_srcdir)/doxygen.inc

View File

@ -118,7 +118,7 @@ public:
fprintf(FP,"Packet bytes, direction, GB/s per node\n"); fprintf(FP,"Packet bytes, direction, GB/s per node\n");
for(int lat=16;lat<=maxlat;lat+=8){ for(int lat=16;lat<=maxlat;lat+=8){
// for(int Ls=8;Ls<=8;Ls*=2){ // for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=12; { int Ls=8;
Coordinate latt_size ({lat*mpi_layout[0], Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1], lat*mpi_layout[1],
@ -175,8 +175,8 @@ public:
timestat.statistics(t_time); timestat.statistics(t_time);
dbytes=dbytes*ppn; dbytes=dbytes*ppn;
double xbytes = dbytes*0.5; double xbytes = dbytes;
double bidibytes = dbytes; double bidibytes = dbytes*2.0;
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t " std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t " << bytes << " \t "
@ -492,17 +492,18 @@ public:
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); double t1=usecond();
uint64_t ncall = 500; uint64_t no = 50;
uint64_t ni = 100;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat; time_statistics timestat;
std::vector<double> t_time(ncall); std::vector<double> t_time(no);
for(uint64_t i=0;i<ncall;i++){ for(uint64_t i=0;i<no;i++){
t0=usecond(); t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo); for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond(); t1=usecond();
t_time[i] = t1-t0; t_time[i] = t1-t0;
} }
@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err; double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time); timestat.statistics(t_time);
mf_hi = flops/timestat.min; mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max; mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean; mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean; mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops); mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
} }
@ -654,17 +656,19 @@ public:
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall)); uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat; time_statistics timestat;
std::vector<double> t_time(ncall); std::vector<double> t_time(no);
for(uint64_t i=0;i<ncall;i++){ for(uint64_t i=0;i<no;i++){
t0=usecond(); t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo); for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond(); t1=usecond();
t_time[i] = t1-t0; t_time[i] = t1-t0;
} }
@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err; double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time); timestat.statistics(t_time);
mf_hi = flops/timestat.min; mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max; mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean; mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean; mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops); mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
} }
@ -792,19 +797,18 @@ public:
Dc.M(src,r); Dc.M(src,r);
} }
FGrid->Barrier(); FGrid->Barrier();
double t1=usecond(); uint64_t ni = 100;
uint64_t ncall = 500; uint64_t no = 50;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat; time_statistics timestat;
std::vector<double> t_time(ncall); std::vector<double> t_time(no);
for(uint64_t i=0;i<ncall;i++){ for(uint64_t i=0;i<no;i++){
t0=usecond(); double t0=usecond();
Dc.M(src,r); for(uint64_t j=0;j<ni;j++){
t1=usecond(); Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0; t_time[i] = t1-t0;
} }
FGrid->Barrier(); FGrid->Barrier();
@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err; double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time); timestat.statistics(t_time);
mf_hi = flops/timestat.min; mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max; mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean; mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean; mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops); mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops; if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops; if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops; if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops; if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl; std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
} }

View File

@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h) AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@ -128,6 +129,20 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac esac
############### internal reduction
AC_ARG_ENABLE([reduction],
[AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
[ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
case ${ac_REDUCTION} in
mpi)
;;
grid)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
*)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
esac
############### tracing ############### tracing
AC_ARG_ENABLE([tracing], AC_ARG_ENABLE([tracing],
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
@ -226,6 +241,20 @@ case ${ac_SFW_FP16} in
esac esac
############### MPI BOUNCE TO HOST
AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
# Force accelerator CSHIFT now
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
case ${ac_ACCELERATOR_AWARE_MPI} in
yes)
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);;
esac
############### SYCL/CUDA/HIP/none ############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator], AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],

View File

@ -1,383 +0,0 @@
/*
* Warning: This code illustrative only: not well tested, and not meant for production use
* without regression / tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
RealD LLscale =1.0;
RealD LCscale =1.0;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
LatticePropagator prop5(FGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Axial_mu(UGrid);
LatticePropagator Vector_mu(UGrid);
LatticeComplex PA (UGrid);
LatticeComplex VV (UGrid);
LatticeComplex PJ5q(UGrid);
LatticeComplex PP (UGrid);
std::vector<TComplex> sumPA;
std::vector<TComplex> sumVV;
std::vector<TComplex> sumPP;
std::vector<TComplex> sumPJ5q;
Gamma g5(Gamma::Algebra::Gamma5);
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
sliceSum(PA,sumPA,Tdir);
int Nt{static_cast<int>(sumPA.size())};
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
sliceSum(PP,sumPP,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
D.ContractJ5q(prop5,PJ5q);
sliceSum(PJ5q,sumPJ5q,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
// auto ss=sliceSum(Vector_mu,Tdir);
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=3;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
{Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
{Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int main (int argc, char ** argv)
{
const int Ls=32;
Grid_init(&argc,&argv);
// Double precision grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
std::string config;
RealD M5=1.8;
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
M5=1.8;
}
else
{
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
// RealD P=1.0; // Don't scale
RealD P=0.5871119; // 48I
// RealD P=0.6153342; // 64I
// RealD P=0.6388238 // 32Ifine
RealD u0 = sqrt(sqrt(P));
RealD M5mf = M5 - 4.0*(1.0-u0);
RealD w0 = 1.0 - M5mf;
#if 0
// M5=1.8 with U=u0
Umu = Umu * u0;
LLscale = 1.0;
LCscale = 1.0;
std::cout<<GridLogMessage <<"Gauge links are u=u0= "<<u0<<std::endl;
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
#else
M5 = M5mf;
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
std::cout<<GridLogMessage <<"u0="<<u0<<std::endl;
std::cout<<GridLogMessage <<"M5=M5mf = "<<M5<<std::endl;
LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
#endif
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
}
std::vector<RealD> masses({ 0.00} ); // u/d, s, c ??
int nmass = masses.size();
std::vector<MobiusFermionD *> FermActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
for(auto mass: masses) {
RealD b=1.5;// Scale factor b+c=2, b-c=1
RealD c=0.5;
FermActs.push_back(new MobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
}
LatticePropagator point_source(UGrid);
// LatticePropagator wall_source(UGrid);
Coordinate Origin({0,0,0,0});
PointSource (Origin,point_source);
// Z2WallSource (RNG4,0,wall_source);
std::vector<LatticePropagator> PointProps(nmass,UGrid);
// std::vector<LatticePropagator> GaussProps(nmass,UGrid);
// std::vector<LatticePropagator> Z2Props (nmass,UGrid);
for(int m=0;m<nmass;m++) {
Solve(*FermActs[m],point_source ,PointProps[m]);
}
LatticeComplex phase(UGrid);
Coordinate mom({0,0,0,0});
MakePhase(mom,phase);
for(int m1=0 ;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
std::stringstream ssp,ssg,ssz;
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_wall_meson.xml";
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
// MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
}}
Grid_finalize();
}

View File

@ -1,479 +0,0 @@
/*
* Warning: This code illustrative only: not well tested, and not meant for production use
* without regression / tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
RealD LLscale =1.0;
RealD LCscale =1.0;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = source.Grid();
GridBase *FGrid = D.FermionGrid();
bool fiveD = true; //calculate 5d free propagator
RealD mass = D.Mass();
LatticeFermion src4 (UGrid);
LatticeFermion result4 (UGrid);
LatticeFermion result5(FGrid);
LatticeFermion src5(FGrid);
LatticePropagator prop5(FGrid);
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
D.FreePropagator(src5,result5,mass,true);
std::cout<<GridLogMessage
<<"Free 5D prop spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Vector_mu(UGrid);
LatticeComplex VV (UGrid);
std::vector<TComplex> sumVV;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
int Nt = sumVV.size();
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
RealD Cont=0;
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
}
}
}
template<class Action>
void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
bool fiveD = false; //calculate 4d free propagator
RealD mass = D.Mass();
GridBase *UGrid = source.Grid();
LatticeFermion src4 (UGrid);
LatticeFermion result4 (UGrid);
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.FreePropagator(src4,result4,mass,false);
FermToProp<Action>(propagator,result4,s,c);
}
}
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
LatticePropagator prop5(FGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-10,100000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Axial_mu(UGrid);
LatticePropagator Vector_mu(UGrid);
LatticeComplex PA (UGrid);
LatticeComplex VV (UGrid);
LatticeComplex PJ5q(UGrid);
LatticeComplex PP (UGrid);
std::vector<TComplex> sumPA;
std::vector<TComplex> sumVV;
std::vector<TComplex> sumPP;
std::vector<TComplex> sumPJ5q;
Gamma g5(Gamma::Algebra::Gamma5);
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
sliceSum(PA,sumPA,Tdir);
int Nt{static_cast<int>(sumPA.size())};
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
sliceSum(PP,sumPP,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
D.ContractJ5q(prop5,PJ5q);
sliceSum(PJ5q,sumPJ5q,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
// auto ss=sliceSum(Vector_mu,Tdir);
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
RealD Cont=0;
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=4;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
{Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
{Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
{Gamma::Algebra::Identity,Gamma::Algebra::Identity}
};
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
RealD Ct = real(corr[t]);
RealD Cont=0;
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t * Ct
<< " deltaC " <<Ct-Cont<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int main (int argc, char ** argv)
{
const int Ls=10;
Grid_init(&argc,&argv);
// Double precision grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
// std::vector<int> seeds4({1,2,3,4});
// GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
std::string config;
RealD M5=atof(getenv("M5"));
RealD mq = atof(getenv("mass"));
int tadpole = atof(getenv("tadpole"));
std::vector<RealD> masses({ mq} ); // u/d, s, c ??
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
LLscale = 1.0;
LCscale = 1.0;
}
else
{
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
// RealD P=1.0; // Don't scale
// RealD P=0.6388238 // 32Ifine
// RealD P=0.6153342; // 64I
RealD P=0.5871119; // 48I
RealD u0 = sqrt(sqrt(P));
RealD w0 = 1 - M5;
std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl;
if ( tadpole == 1 ) {
Umu = Umu * u0;
// LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
// LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
LLscale = 1.0;
LCscale = 1.0;
std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl;
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
} else if ( tadpole == 2) {
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
LLscale = 1.0;
LCscale = 1.0;
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
} else {
LLscale = 1.0/u0/u0;
LCscale = 1.0/u0/u0;
M5 = M5 - 4.0 * (1-u0);
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
std::cout<<GridLogMessage <<"M5mf = "<<M5<<std::endl;
}
std::cout<<GridLogMessage <<"mq = "<<mq<<std::endl;
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
}
int nmass = masses.size();
typedef DomainWallFermionD FermionActionD;
// typedef MobiusFermionD FermionActionD;
std::vector<FermionActionD *> FermActs;
std::vector<DomainWallFermionD *> DWFActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
for(auto mass: masses) {
std::vector<Complex> boundary = {1,1,1,-1};
FermionActionD::ImplParams Params(boundary);
RealD b=1.5;
RealD c=0.5;
std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
// DWFActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params));
// FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass+0.001,M5,b,c));
std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
}
LatticePropagator point_source(UGrid);
Coordinate Origin({0,0,0,0});
PointSource (Origin,point_source);
std::vector<LatticePropagator> PointProps(nmass,UGrid);
// std::vector<LatticePropagator> FreeProps(nmass,UGrid);
// LatticePropagator delta(UGrid);
for(int m=0;m<nmass;m++) {
Solve(*FermActs[m],point_source ,PointProps[m]);
// MasslessFreePropagator(*FermActs[m],point_source ,FreeProps[m]);
// delta = PointProps[m] - FreeProps[m];
// std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
}
LatticeComplex phase(UGrid);
Coordinate mom({0,0,0,0});
MakePhase(mom,phase);
for(int m1=0 ;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
std::stringstream ssp,ssg,ssz;
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
std::cout << "CG determined VV correlation function"<<std::endl;
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
// std::cout << "FFT derived VV correlation function"<<std::endl;
// MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
}}
Grid_finalize();
}

View File

@ -1,433 +0,0 @@
/*
* Warning: This code illustrative only: not well tested, and not meant for production use
* without regression / tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
RealD LLscale =1.0;
RealD LCscale =1.0;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = source.Grid();
GridBase *FGrid = D.FermionGrid();
bool fiveD = true; //calculate 4d free propagator
RealD mass = D.Mass();
LatticeFermion src4 (UGrid);
LatticeFermion result4 (UGrid);
LatticeFermion result5(FGrid);
LatticeFermion src5(FGrid);
LatticePropagator prop5(FGrid);
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
D.FreePropagator(src5,result5,mass,true);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Vector_mu(UGrid);
LatticeComplex VV (UGrid);
std::vector<TComplex> sumVV;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
int Nt = sumVV.size();
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
}
}
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
LatticePropagator prop5(FGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-6,100000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Axial_mu(UGrid);
LatticePropagator Vector_mu(UGrid);
LatticeComplex PA (UGrid);
LatticeComplex VV (UGrid);
LatticeComplex PJ5q(UGrid);
LatticeComplex PP (UGrid);
std::vector<TComplex> sumPA;
std::vector<TComplex> sumVV;
std::vector<TComplex> sumPP;
std::vector<TComplex> sumPJ5q;
Gamma g5(Gamma::Algebra::Gamma5);
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
sliceSum(PA,sumPA,Tdir);
int Nt{static_cast<int>(sumPA.size())};
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
sliceSum(PP,sumPP,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
D.ContractJ5q(prop5,PJ5q);
sliceSum(PJ5q,sumPJ5q,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
// auto ss=sliceSum(Vector_mu,Tdir);
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=3;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
{Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
// {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
{Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int main (int argc, char ** argv)
{
const int Ls=8;
Grid_init(&argc,&argv);
// Double precision grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
// std::vector<int> seeds4({1,2,3,4});
// GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
std::string config;
RealD M5=atof(getenv("M5"));
RealD mq = atof(getenv("mass"));
std::vector<RealD> masses({ mq} ); // u/d, s, c ??
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
LLscale = 1.0;
LCscale = 1.0;
}
else
{
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
// RealD P=1.0; // Don't scale
// RealD P=0.6153342; // 64I
// RealD P=0.6388238 // 32Ifine
// RealD P=0.5871119; // 48I
// RealD u0 = sqrt(sqrt(P));
// Umu = Umu * u0;
RealD w0 = 1 - M5;
LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
std::cout<<GridLogMessage <<"mq = "<<mq<<std::endl;
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
}
int nmass = masses.size();
std::vector<DomainWallFermionD *> FermActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
for(auto mass: masses) {
std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
FermActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
}
LatticePropagator point_source(UGrid);
Coordinate Origin({0,0,0,0});
PointSource (Origin,point_source);
// std::vector<LatticePropagator> PointProps(nmass,UGrid);
std::vector<LatticePropagator> FreeProps(nmass,UGrid);
LatticePropagator delta(UGrid);
for(int m=0;m<nmass;m++) {
// Solve(*FermActs[m],point_source ,PointProps[m]);
MasslessFreePropagator(*FermActs[m],point_source ,FreeProps[m]);
// delta = PointProps[m] - FreeProps[m];
// std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
}
LatticeComplex phase(UGrid);
Coordinate mom({0,0,0,0});
MakePhase(mom,phase);
for(int m1=0 ;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
std::stringstream ssp,ssg,ssz;
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
// std::cout << "CG determined VV correlation function"<<std::endl;
// MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
std::cout << "FFT derived VV correlation function"<<std::endl;
MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
}}
Grid_finalize();
}

View File

@ -0,0 +1,23 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx

View File

@ -0,0 +1,15 @@
#module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
git config --global http.proxy http://proxy.alcf.anl.gov:3128
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@ -0,0 +1,74 @@
#!/bin/bash
#PBS -l select=512
#PBS -q EarlyAppAccess
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=6:00:00
#PBS -N reproBigJob
#PBS -k doe
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
#module load oneapi/eng-compiler/2023.05.15.003
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR
cd $DIR
cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 6144 -ppn 12 -envall --hostfile nodefile \
../gpu_tile_compact.sh \
$BINARY --mpi 8.8.8.12 --grid 128.128.128.288 \
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --debug-stdout --log Message --debug-signals --comms-overlap"
echo $CMD > command-line
env > environment
$CMD
grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

View File

@ -1,27 +1,24 @@
#!/bin/bash #!/bin/bash
#PBS -q EarlyAppAccess ##PBS -q EarlyAppAccess
#PBS -q debug
#PBS -l select=1 #PBS -l select=1
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR cd $PBS_O_WORKDIR
source ../sourceme.sh source ../sourceme.sh
module load pti-gpu
#cat $PBS_NODEFILE cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
@ -29,39 +26,11 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
#
CMD="mpiexec -np 1 -ppn 1 -envall \
./gpu_tile_compact.sh \
./Benchmark_usqcd --mpi 1.1.1.1 --grid 24.32.32.24 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD | tee usqcd.log
CMD="mpiexec -np 1 -ppn 1 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
$CMD | tee 1tile.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \ CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \ ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.32.32.32.48.dwf
echo $CMD
CMD="mpiexec -np 12 -ppn 12 -envall \ $CMD
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.64.32.96.dwf
CMD="mpiexec -np 12 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
#$CMD | tee 1node.64.32.32.48.dwf

View File

@ -0,0 +1,74 @@
#!/bin/bash
##PBS -q LatticeQCD_aesp_CNDA
#PBS -q debug-scaling
##PBS -q prod
#PBS -l select=16
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA
cd $PBS_O_WORKDIR
source ../sourceme.sh
cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#
# Local vol 16.16.16.32
#
LX=16
LY=16
LZ=16
LT=32
NX=2
NY=2
NZ=4
NT=1
GX=2
GY=2
GZ=1
GT=3
PX=$((NX * GX ))
PY=$((NY * GY ))
PZ=$((NZ * GZ ))
PT=$((NT * GT ))
VX=$((PX * LX ))
VY=$((PY * LY ))
VZ=$((PZ * LZ ))
VT=$((PT * LT ))
NP=$((PX*PY*PZ*PT))
VOL=${VX}.${VY}.${VZ}.${VT}
AT=8
MPI=${PX}.${PY}.${PZ}.${PT}
CMD="mpiexec -np $NP -ppn 12 -envall \
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
echo VOL $VOL
echo MPI $MPI
echo NPROC $NP
echo $CMD
$CMD

View File

@ -1,55 +1,48 @@
#!/bin/bash #!/bin/bash
#PBS -q EarlyAppAccess ##PBS -q EarlyAppAccess
#PBS -q debug
#PBS -l select=2 #PBS -l select=2
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR cd $PBS_O_WORKDIR
source ../sourceme.sh source ../sourceme.sh
module load pti-gpu
#cat $PBS_NODEFILE cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 2 nodes, 24 ranks
# #
# Local vol 16.16.16.32
#
#VOL=32.64.64.96
for VOL in 32.32.32.96 32.64.64.96
do
for AT in 32
do
CMD="mpiexec -np 24 -ppn 12 -envall \ CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \ ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD | tee 2node.comms
echo $CMD
CMD="mpiexec -np 24 -ppn 12 -envall \ $CMD
./gpu_tile_compact.sh \ done
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ done
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
$CMD | tee 2node.32.32.64.48.dwf
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
$CMD | tee 2node.64.64.64.96.dwf

View File

@ -4,10 +4,12 @@
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) #export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
unset EnableWalkerPartition unset EnableWalkerPartition
@ -17,18 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
#export MPI_BUF_NUMA=$NUMAH
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
if [ $PALS_RANKID = "0" ] if [ $PALS_RANKID = "0" ]
then then
# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@" # numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" numactl -p $NUMAP -N $NUMAP "$@"
numactl -m $NUMA -N $NUMA "$@"
else else
numactl -m $NUMA -N $NUMA "$@" numactl -p $NUMAP -N $NUMAP "$@"
fi fi

View File

@ -1,17 +1,25 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl " export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions " export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
../../configure \
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-debug \ --enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--with-lime=$CLIME \ --with-lime=$CLIME \
--enable-shm=nvlink \ --enable-shm=nvlink \
--enable-accelerator=sycl \ --enable-accelerator=sycl \
--enable-accelerator-aware-mpi=yes\ --enable-accelerator-aware-mpi=no\
--enable-unified=no \ --enable-unified=no \
MPICXX=mpicxx \ MPICXX=mpicxx \
CXX=icpx CXX=icpx

View File

@ -1,5 +1,9 @@
module load oneapi/release/2023.12.15.001 #module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35 #module load intel_compute_runtime/release/821.35
module load pti-gpu
source ~/spack/share/spack/setup-env.sh source ~/spack/share/spack/setup-env.sh
spack load c-lime spack load c-lime
spack load openssl spack load openssl

View File

@ -15,13 +15,13 @@
# 56 cores / 6 threads ~9 # 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6 export OMP_NUM_THREADS=6
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 #export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
@ -30,20 +30,22 @@ export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export GRID_PRINT_ENTIRE_LOG=0 export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1 export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=1 export GRID_CHECKSUM_SEND_BUF=0
export MPICH_OFI_NIC_POLICY=GPU export MPICH_OFI_NIC_POLICY=GPU
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0 #export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0 #export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling #export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE #unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE #unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE #unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
cd $PBS_O_WORKDIR cd $PBS_O_WORKDIR
cp $PBS_NODEFILE nodefile
DIR=reproBigJob.$PBS_JOBID DIR=reproBigJob.$PBS_JOBID
mkdir -p $DIR mkdir -p $DIR
@ -51,10 +53,19 @@ cd $DIR
cp $PBS_NODEFILE nodefile cp $PBS_NODEFILE nodefile
BINARY=../Test_dwf_mixedcg_prec
echo > pingjob <<EOF
while read node ;
do
echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
done < nodefile
EOF
CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \ CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \
../gpu_tile_compact.sh \ ../gpu_tile_compact.sh \
../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96 \ $BINARY --mpi 4.4.4.6 --grid 128.128.128.96 \
--shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals" --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals"
echo $CMD > command-line echo $CMD > command-line
env > environment env > environment

View File

@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -0,0 +1,16 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7 export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1 export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM #export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 32.32.32.64 for vol in 64.64.32.64
do do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done done

View File

@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \ --with-lime=$CLIME \
--enable-unified=no \ --enable-unified=no \
--enable-shm=nvlink \ --enable-shm=nvlink \
--enable-tracing=timer \ --enable-tracing=none \
--enable-accelerator=hip \ --enable-accelerator=hip \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \ --with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \ --with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \ --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \ --disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \ CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \ CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas" LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh . /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs module load cce/15.0.1
module load PrgEnv-gnu module load rocm/5.3.0
module load rocm
module load cray-mpich
module load gmp
module load cray-fftw module load cray-fftw
module load craype-accel-amd-gfx90a module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib #Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH ##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

View File

@ -0,0 +1,18 @@
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--enable-accelerator=cuda \
--disable-gparity \
--disable-fermion-reps \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared -L$NVIDIALIB -lcublas" \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared"

View File

@ -0,0 +1,16 @@
. /home/paboyle/spack/share/spack/setup-env.sh
spack load cuda@12.0.0
spack load c-lime
spack load gmp
spack load mpfr
spack load hdf5
spack load fftw
spack load openmpi
export FFTW=`spack find --paths fftw | grep fftw | cut -c 14-`
export HDF5=`spack find --paths hdf5 | grep hdf5 | cut -c 14-`
export CUDA=`spack find --paths cuda@11.8.0 | grep cuda | cut -c 14-`
export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
export GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
export MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/
export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH:$HDF5/lib:$FFTW/lib:$CLIME/lib/:$MPFR/lib

View File

@ -1,7 +1,7 @@
spack load c-lime spack load c-lime
spack load gmp spack load gmp
spack load mpfr spack load mpfr
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-` CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-`
GMP=`spack find --paths gmp | grep gmp | cut -c 12-` GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-` MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
echo clime X$CLIME echo clime X$CLIME

206
systems/WorkArounds.txt Normal file
View File

@ -0,0 +1,206 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.
A repository of work-arounds.
Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling
************************
* 1. INTERCONNECT + MPI
************************
--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321
--------------------------------------
ROMIO fail with > 2GB per node read (32 bit issue)
--------------------------------------
Use later MPICH
https://github.com/paboyle/Grid/issues/381
https://github.com/pmodels/mpich/commit/3a479ab0
--------------------------------------------------------------------
Slingshot: Frontier and Perlmutter libfabric slow down
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2
--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
srun ./select_gpu BINARY
--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
https://github.com/pmodels/mpich/issues/7302
--enable-cuda-aware-mpi=no
--enable-unified=no
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
Do not use SVM
Ideally use MPICH with fix to issue 7302:
https://github.com/pmodels/mpich/pull/7312
Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
This gives good peformance without requiring
--enable-cuda-aware-mpi=no
But is an open issue reported by James Osborn
https://github.com/pmodels/mpich/issues/7139
Possibly resolved but unclear if in the installed software yet.
************************
* 2. COMPILATION
************************
--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------
9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264
Working (-) Broken (X):
4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -
https://github.com/paboyle/Grid/issues/100
--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------
multiple ROCM versions broken; use 5.3.0
manifests itself as wrong results in fp32
https://github.com/paboyle/Grid/issues/464
--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------
SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
Opt large register file required for good performance in fp64
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------
Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
************************
* 3. Visual profile tools
************************
--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------
--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------
--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------

View File

@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done

View File

@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
nodes=2
mpi=1.1.1.2
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done
done

View File

@ -0,0 +1,16 @@
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17"

View File

@ -0,0 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi

View File

@ -0,0 +1,17 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "

View File

@ -0,0 +1,28 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
ulimit -s 81920

View File

@ -0,0 +1,19 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh
spack install llvm@12
spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1

View File

@ -0,0 +1,239 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_cg_prec.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
typedef LatticeFermionD FermionField;
int VerifyOnDevice(const FermionField &res, FermionField &ref)
{
deviceVector<int> Fails(1);
int * Fail = &Fails[0];
int FailHost=0;
typedef typename FermionField::vector_object vobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const uint64_t NN = res.Grid()->oSites();
acceleratorPut(*Fail,FailHost);
accelerator_barrier();
// Inject an error
int injection=0;
if(getenv("GRID_ERROR_INJECT")) injection=1;
autoView(res_v,res,AcceleratorWrite);
autoView(ref_v,ref,AcceleratorRead);
if ( res.Grid()->ThisRank()== 0 )
{
if (((random()&0xF)==0)&&injection) {
uint64_t sF = random()%(NN);
int lane=0;
printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank());
auto vv = acceleratorGet(res_v[sF]);
double *dd = (double *)&vv;
*dd=M_PI;
acceleratorPut(res_v[sF],vv);
}
}
accelerator_for( sF, NN, vobj::Nsimd(), {
#ifdef GRID_SIMT
{
int blane = acceleratorSIMTlane(vobj::Nsimd());
#else
for(int blane;blane<vobj::Nsimd();blane++){
#endif
vector_type *vtrr = (vector_type *)&res_v[sF];
vector_type *vtrf = (vector_type *)&ref_v[sF];
int words = sizeof(vobj)/sizeof(vector_type);
for(int w=0;w<words;w++){
scalar_type rrtmp = getlane(vtrr[w], blane);
scalar_type rftmp = getlane(vtrf[w], blane);
if ( rrtmp != rftmp) {
*Fail=1;
}
}
}
});
FailHost = acceleratorGet(*Fail);
return FailHost;
}
void PrintFails(const FermionField &res, FermionField &ref,uint64_t *ids)
{
typedef typename FermionField::vector_object vobj;
const int Nsimd=vobj::Nsimd();
const uint64_t NN = res.Grid()->oSites();
///////////////////////////////
// Pull back to host
///////////////////////////////
autoView(res_v,res,CpuRead);
autoView(ref_v,ref,CpuRead);
std::vector<uint64_t> ids_host(NN*Nsimd);
acceleratorCopyFromDevice(ids,&ids_host[0],NN*Nsimd*sizeof(uint64_t));
//////////////////////////////////////////////////////////////
// Redo check on host and print IDs
//////////////////////////////////////////////////////////////
for(int ss=0;ss< NN; ss++){
int sF = ss;
for(int lane=0;lane<Nsimd;lane++){
auto rr = extractLane(lane,res_v[sF]);
auto rf = extractLane(lane,ref_v[sF]);
uint64_t id = ids_host[lane+Nsimd*sF];
// std::cout << GridHostname()<<" id["<<sF<<"] lane "<<lane<<" id "<<id<<std::endl;
for(int s=0;s<4;s++){
for(int c=0;c<3;c++){
if ( rr()(s)(c)!=rf()(s)(c) ) {
int subslice=(id>>0 )&0xFF;
int slice =(id>>8 )&0xFF;
int eu =(id>>16)&0xFF;
std::cout << GridHostname()<<" miscompare site "<<sF<<" "<<rr()(s)(c)<<" "<<rf()(s)(c)<<" EU "<<eu<<" slice "<<slice<<" subslice "<<subslice<<std::endl;
}
}
}
}
};
return;
}
int main (int argc, char ** argv)
{
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
std::string host(hostname);
Grid_init(&argc,&argv);
const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
LatticeFermionD src(FGrid); random(RNG5,src);
LatticeFermionD junk(FGrid); random(RNG5,junk);
LatticeFermionD result(FGrid); result=Zero();
LatticeFermionD ref(FGrid); ref=Zero();
SU<Nc>::HotConfiguration(RNG4,Umu);
RealD mass=0.1;
RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int nsecs=600;
if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
GridCmdOptionInt(arg,nsecs);
}
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
UGrid->Barrier();
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
std::cout << GridLogMessage << "::::::::::::: Starting DWF repro for "<<nsecs <<" seconds" << std::endl;
time_t now;
time_t start = time(NULL);
UGrid->Broadcast(0,(void *)&start,sizeof(start));
FlightRecorder::ContinueOnFail = 0;
FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0;
if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s);
if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s);
const uint64_t NN = FGrid->oSites()*vComplexD::Nsimd();
deviceVector<uint64_t> ids_device(NN);
uint64_t *ids = &ids_device[0];
Ddwf.DhopComms(src,ref);
Ddwf.DhopCalc(src,ref,ids);
Ddwf.DhopComms(src,result);
int iter=0;
do {
result=junk;
Ddwf.DhopCalc(src,result,ids);
if ( VerifyOnDevice(result, ref) ) {
printf("Node %s Iter %d detected fails\n",GridHostname(),iter);
PrintFails(result,ref,ids);
// std::cout << " Dslash "<<iter<<" is WRONG! "<<std::endl;
}
//else {
// printf("Node %s Iter %d detected NO fails\n",GridHostname(),iter);
// PrintFails(result,ref,ids);
// std::cout << " Dslash "<<iter<<" is OK! "<<std::endl;
//}
iter ++;
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
} while (now < (start + nsecs) );
Grid_finalize();
}

View File

@ -124,6 +124,8 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf); SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f); SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
// SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
// SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
int nsecs=600; int nsecs=600;
if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
@ -131,6 +133,10 @@ int main (int argc, char ** argv)
GridCmdOptionInt(arg,nsecs); GridCmdOptionInt(arg,nsecs);
} }
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier " << std::endl;
UGrid->Barrier();
std::cout << GridLogMessage << "::::::::::::: Job startup Barrier complete" << std::endl;
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl; std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
@ -148,7 +154,7 @@ int main (int argc, char ** argv)
FlightRecorder::ContinueOnFail = 0; FlightRecorder::ContinueOnFail = 0;
FlightRecorder::PrintEntireLog = 0; FlightRecorder::PrintEntireLog = 0;
FlightRecorder::ChecksumComms = 1; FlightRecorder::ChecksumComms = 0;
FlightRecorder::ChecksumCommsSend=0; FlightRecorder::ChecksumCommsSend=0;
if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s); if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
@ -180,7 +186,7 @@ int main (int argc, char ** argv)
iter ++; iter ++;
now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now)); now = time(NULL); UGrid->Broadcast(0,(void *)&now,sizeof(now));
} while (now < (start + nsecs/10) ); } while (now < (start + nsecs/10) );
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl; std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
int i=0; int i=0;

View File

@ -31,7 +31,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
using namespace Grid; using namespace Grid;
const int TSRC = 0; //timeslice where rho is nonzero const int TSRC = 0; //timeslice where rho is nonzero
const int VDIM = 5; //length of each vector const int VDIM = 8; //length of each vector
typedef typename DomainWallFermionD::ComplexField ComplexField; typedef typename DomainWallFermionD::ComplexField ComplexField;
typedef typename DomainWallFermionD::FermionField FermionField; typedef typename DomainWallFermionD::FermionField FermionField;
@ -55,19 +55,26 @@ int main(int argc, char *argv[])
pRNG.SeedFixedIntegers(seeds); pRNG.SeedFixedIntegers(seeds);
// MesonField lhs and rhs vectors // MesonField lhs and rhs vectors
const int Nem=1;
std::vector<FermionField> phi(VDIM,&grid); std::vector<FermionField> phi(VDIM,&grid);
std::vector<FermionField> rho(VDIM,&grid); std::vector<ComplexField> B0(Nem,&grid);
FermionField rho_tmp(&grid); std::vector<ComplexField> B1(Nem,&grid);
std::cout << GridLogMessage << "Initialising random meson fields" << std::endl; std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
for (unsigned int i = 0; i < VDIM; ++i){ for (unsigned int i = 0; i < VDIM; ++i){
random(pRNG,phi[i]); random(pRNG,phi[i]);
random(pRNG,rho_tmp); //ideally only nonzero on t=0 }
rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0 for (unsigned int i = 0; i < Nem; ++i){
random(pRNG,B0[i]);
random(pRNG,B1[i]);
} }
std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl; std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;
// Gamma matrices used in the contraction // Gamma matrices used in the contraction
std::vector<Gamma::Algebra> Gmu = { std::vector<Gamma::Algebra> Gmu = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ, Gamma::Algebra::GammaZ,
@ -78,11 +85,15 @@ int main(int argc, char *argv[])
std::vector<std::vector<double>> momenta = { std::vector<std::vector<double>> momenta = {
{0.,0.,0.}, {0.,0.,0.},
{1.,0.,0.}, {1.,0.,0.},
{-1.,0.,0.},
{0,1.,0.},
{0,-1.,0.},
{0,0,1.},
{0,0,-1.},
{1.,1.,0.}, {1.,1.,0.},
{1.,1.,1.}, {1.,1.,1.},
{2.,0.,0.} {2.,0.,0.}
}; };
std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl; std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;
std::cout << GridLogMessage << "Computing complex phases" << std::endl; std::cout << GridLogMessage << "Computing complex phases" << std::endl;
@ -102,28 +113,29 @@ int main(int argc, char *argv[])
std::cout << GridLogMessage << "Computing complex phases done." << std::endl; std::cout << GridLogMessage << "Computing complex phases done." << std::endl;
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM); Eigen::Tensor<ComplexD,5, Eigen::RowMajor> App(B0.size(),1,Nt,VDIM,VDIM);
Eigen::Tensor<ComplexD,5, Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
// timer // timer
double start,stop; double start,stop;
/////////////////////////////////////////////////////////////////////////
//execute meson field routine //execute meson field routine
/////////////////////////////////////////////////////////////////////////
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
start = usecond(); start = usecond();
A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp); A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
stop = usecond(); stop = usecond();
std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl; std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
start = usecond();
/* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
* to the routine so that all the compnents which are predictably equal to zero are not computed. */
A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
start = usecond();
A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
stop = usecond();
std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
/////////////////////////////////////////////////////////////////////////
//execute aslash field routine
/////////////////////////////////////////////////////////////////////////
A2Autils<WilsonImplR>::AslashField(App,&phi[0],&phi[0],B0,B1,Tp);
start = usecond();
A2Autils<WilsonImplR>::AslashField(App,&phi[0],&phi[0],B0,B1,Tp);
stop = usecond();
std::cout << GridLogMessage << "Alash(phi,phi) created, execution time " << stop-start << " us" << std::endl;
std::string FileName = "Meson_Fields"; std::string FileName = "Meson_Fields";
#ifdef HAVE_HDF5 #ifdef HAVE_HDF5
using Default_Reader = Grid::Hdf5Reader; using Default_Reader = Grid::Hdf5Reader;
@ -134,12 +146,11 @@ int main(int argc, char *argv[])
using Default_Writer = Grid::BinaryWriter; using Default_Writer = Grid::BinaryWriter;
FileName.append(".bin"); FileName.append(".bin");
#endif #endif
{
Default_Writer w(FileName); Default_Writer w(FileName);
write(w,"phi_phi",Mpp); write(w,"MesonField",Mpp);
write(w,"phi_rho",Mpr); write(w,"AslashField",App);
write(w,"rho_rho",Mrr); }
// epilogue // epilogue
std::cout << GridLogMessage << "Grid is finalizing now" << std::endl; std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
Grid_finalize(); Grid_finalize();

View File

@ -39,7 +39,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;
@ -279,6 +279,7 @@ int main (int argc, char ** argv)
result5 = result5 - Kinetic; result5 = result5 - Kinetic;
std::cout<<"diff "<< norm2(result5)<<std::endl; std::cout<<"diff "<< norm2(result5)<<std::endl;
assert(norm2(result5)<1.0e-4);
} }
@ -357,6 +358,7 @@ int main (int argc, char ** argv)
diff = ref - result4; diff = ref - result4;
std::cout << "result - ref "<<norm2(diff)<<std::endl; std::cout << "result - ref "<<norm2(diff)<<std::endl;
assert(norm2(diff)<1.0e-4);
} }
@ -440,6 +442,7 @@ int main (int argc, char ** argv)
diff = ref - result4; diff = ref - result4;
std::cout << "result - ref "<<norm2(diff)<<std::endl; std::cout << "result - ref "<<norm2(diff)<<std::endl;
assert(norm2(diff)<1.0e-4);
} }

View File

@ -38,7 +38,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexD::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;
@ -74,7 +74,7 @@ int main (int argc, char ** argv)
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator \n"; std::cout << "Testing OverlapWilsonPartialFractionTanhFermionD Hw kernel Mom space 4d propagator \n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
// LatticeFermionD src(&GRID); gaussian(pRNG,src); // LatticeFermionD src(&GRID); gaussian(pRNG,src);
@ -88,7 +88,7 @@ int main (int argc, char ** argv)
RealD mass=0.1; RealD mass=0.1;
RealD M5 =0.8; RealD M5 =0.8;
OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0); OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
// Momentum space prop // Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl; std::cout << " Solving by FFT and Feynman rules" <<std::endl;
@ -119,9 +119,10 @@ int main (int argc, char ** argv)
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl; std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
Dov.Mdag(src5,tmp5); Dov.Mdag(src5,tmp5);
src5=tmp5; src5=tmp5;
MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov); MdagMLinearOperator<OverlapWilsonPartialFractionTanhFermionD,LatticeFermionD> HermOp(Dov);
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOp,src5,result5); CG(HermOp,src5,result5);
std::cout << " Solved by Conjugate Gradient (CGNE)" <<std::endl;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Domain wall physical field propagator // Domain wall physical field propagator
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -153,7 +154,7 @@ int main (int argc, char ** argv)
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing Dov(Hw) Mom space 4d propagator \n"; std::cout << "Testing OverlapWilsonCayleyTanhFermionD space 4d propagator \n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
LatticeFermionD tmp(&GRID); LatticeFermionD tmp(&GRID);
@ -228,7 +229,7 @@ int main (int argc, char ** argv)
{ {
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
std::cout << "Testing PartialFraction Hw kernel Mom space 4d propagator with q\n"; std::cout<<"Testing OverlapWilsonPartialFractionTanhFermionD Hw kernel Mom space 4d propagator with q\n";
std::cout<<"****************************************"<<std::endl; std::cout<<"****************************************"<<std::endl;
// LatticeFermionD src(&GRID); gaussian(pRNG,src); // LatticeFermionD src(&GRID); gaussian(pRNG,src);
@ -242,7 +243,9 @@ int main (int argc, char ** argv)
RealD mass=0.1; RealD mass=0.1;
RealD M5 =0.8; RealD M5 =0.8;
OverlapWilsonPartialFractionZolotarevFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,0.001,8.0); OverlapWilsonPartialFractionTanhFermionD Dov(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5,1.0);
std::vector<RealD> qmu({1.0,0.0,0.0,0.0});
Dov.set_qmu(qmu);
// Momentum space prop // Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl; std::cout << " Solving by FFT and Feynman rules" <<std::endl;
@ -273,7 +276,7 @@ int main (int argc, char ** argv)
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl; std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
Dov.Mdag(src5,tmp5); Dov.Mdag(src5,tmp5);
src5=tmp5; src5=tmp5;
MdagMLinearOperator<OverlapWilsonPartialFractionZolotarevFermionD,LatticeFermionD> HermOp(Dov); MdagMLinearOperator<OverlapWilsonPartialFractionTanhFermionD,LatticeFermionD> HermOp(Dov);
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000); ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
CG(HermOp,src5,result5); CG(HermOp,src5,result5);
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////

View File

@ -39,7 +39,8 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
Coordinate latt_size = GridDefaultLatt(); Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1}); Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
// Coordinate simd_layout( { vComplexF::Nsimd(),1,1,1});
Coordinate mpi_layout = GridDefaultMpi(); Coordinate mpi_layout = GridDefaultMpi();
int vol = 1; int vol = 1;

View File

@ -0,0 +1,781 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_general_coarse_hdcg.cc
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
#include <Grid/algorithms/iterative/PowerSpectrum.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
using namespace std;
using namespace Grid;
template<class aggregation>
void SaveFineEvecs(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg[0].Grid()->IsBoss());
WR.open(file);
for(int b=0;b<Agg.size();b++){
WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
}
WR.close();
#endif
}
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg.FineGrid->IsBoss());
WR.open(file);
for(int b=0;b<Agg.subspace.size();b++){
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
}
WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
template<class aggregation>
void LoadBasisSkip(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
for(int n=0;n<N;n++){
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
if(n==0) precisionChange(Agg.subspace[b],tmp);
}
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
template<class aggregation>
void LoadBasisSum(aggregation &Agg, std::string file,int N,LatticeFermionF & tmp)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
LatticeFermionF sum(tmp.Grid());
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
sum=Zero();
for(int n=0;n<N;n++){
RD.readScidacFieldRecord(tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
sum=sum+tmp;
}
precisionChange(Agg.subspace[b],sum);
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(evec[0].Grid()->IsBoss());
WR.open(evec_file);
for(int b=0;b<evec.size();b++){
WR.writeScidacFieldRecord(evec[b],record,0,0);
}
WR.close();
XmlWriter WRx(eval_file);
write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
XmlReader RDx(eval_file);
read(RDx,"evals",eval);
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(evec_file);
assert(evec.size()==eval.size());
for(int k=0;k<eval.size();k++) {
RD.readScidacFieldRecord(evec[k],record);
}
RD.close();
#endif
}
// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
LinearOperatorBase<Field> & wrapped;
public:
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};
template<class Field> class FixedCGPolynomial : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
ConjugateGradientPolynomial<Field> CG;
int iters;
bool record;
int replay_count;
FixedCGPolynomial(int _iters, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
iters(_iters),
record(true),
CG(0.0,_iters,false)
{
std::cout << GridLogMessage<<" FixedCGPolynomial order "<<iters<<std::endl;
replay_count = 0;
};
void operator() (const Field &in, Field &out)
{
#if 1
GridBase *grid = in.Grid();
Field Mx0(grid);
Field r0(grid);
Field Minvr0(grid);
_SmootherOperator.HermOp(out,Mx0);
r0 = in - Mx0;
Minvr0 = Zero();
Minvr0.Checkerboard()=in.Checkerboard();
if ( record ) {
std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
CG.Solve(_SmootherOperator,r0,Minvr0);
record = false;
/*
std::cout << "P(x) = 0 "<<std::endl;
for(int i=0;i<CG.polynomial.size();i++){
std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
}
*/
Field tmp(Minvr0.Grid());
CG.CGsequenceHermOp(_SmootherOperator,r0,tmp);
tmp = tmp - Minvr0;
std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
} else {
std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
CG.CGsequenceHermOp(_SmootherOperator,r0,Minvr0);
if ( replay_count %5== 0 ) record=true;
replay_count++;
}
out = out + Minvr0;
_SmootherOperator.HermOp(out,r0);
r0 = r0 - in;
RealD rr=norm2(r0);
RealD ss=norm2(in);
std::cout << " FixedCGPolynomial replayed polynomial resid "<<::sqrt(rr/ss)<<std::endl;
#else
out = Zero();
out.Checkerboard()=in.Checkerboard();
if ( record ) {
std::cout << " FixedCGPolynomial recording polynomial "<<std::endl;
CG.Solve(_SmootherOperator,in,out);
record = false;
std::cout << "P(x) = 0 "<<std::endl;
for(int i=0;i<CG.polynomial.size();i++){
std::cout<<" + "<< CG.polynomial[i]<<" * (x**"<<i<<")"<<std::endl;
}
Field tmp(in.Grid());
CG.CGsequenceHermOp(_SmootherOperator,in,tmp);
tmp = tmp - out;
std::cout << " CGsequence error "<<norm2(tmp)<<" / "<<norm2(out)<<std::endl;
} else {
std::cout << " FixedCGPolynomial replaying polynomial "<<std::endl;
CG.CGsequenceHermOp(_SmootherOperator,in,out);
if ( replay_count %5== 5 ) record=true;
replay_count++;
}
#endif
}
void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
for(int i=0;i<out.size();i++){
out[i]=Zero();
}
int blockDim = 0;//not used for BlockCGVec
BlockConjugateGradient<Field> BCGV (BlockCGrQVec,blockDim,0.0,iters,false);
BCGV(_SmootherOperator,in,out);
}
};
template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
int iters;
CGSmoother(int _iters, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
iters(_iters)
{
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
};
void operator() (const Field &in, Field &out)
{
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
out=Zero();
CG(_SmootherOperator,in,out);
}
};
RealD InverseApproximation(RealD x){
return 1.0/x;
}
template<class Field> class ChebyshevSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
Chebyshev<Field> Cheby;
ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
Cheby(_lo,_hi,_ord,InverseApproximation)
{
std::cout << GridLogMessage<<" Chebyshev smoother order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
};
void operator() (const Field &in, Field &out)
{
// Field r(out.Grid());
Cheby(_SmootherOperator,in,out);
// _SmootherOperator.HermOp(out,r);
// r=r-in;
// RealD rr=norm2(r);
// RealD ss=norm2(in);
// std::cout << GridLogMessage<<" Chebyshev smoother resid "<<::sqrt(rr/ss)<<std::endl;
}
};
template<class Field> class ChebyshevInverter : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _Operator;
Chebyshev<Field> Cheby;
ChebyshevInverter(RealD _lo,RealD _hi,int _ord, FineOperator &Operator) :
_Operator(Operator),
Cheby(_lo,_hi,_ord,InverseApproximation)
{
std::cout << GridLogMessage<<" Chebyshev Inverter order "<<_ord<<" ["<<_lo<<","<<_hi<<"]"<<std::endl;
};
void operator() (const Field &in, Field &out)
{
Field r(in.Grid());
Field AinvR(in.Grid());
_Operator.HermOp(out,r);
r = in - r; // b - A x
Cheby(_Operator,r,AinvR); // A^{-1} ( b - A x ) ~ A^{-1} b - x
out = out + AinvR;
_Operator.HermOp(out,r);
r = in - r; // b - A x
RealD rr = norm2(r);
RealD ss = norm2(in);
std::cout << "ChebshevInverse resid " <<::sqrt(rr/ss)<<std::endl;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int sample=1;
if( GridCmdOptionExists(argv,argv+argc,"--sample") ){
std::string arg;
arg = GridCmdOptionPayload(argv,argv+argc,"--sample");
GridCmdOptionInt(arg,sample);
}
const int Ls=24;
const int nbasis = 62;
const int cb = 0 ;
RealD mass=0.00078;
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
std::string arg;
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
std::cout << GridLogMessage << " *************************** " <<std::endl;
std::cout << GridLogMessage << " Mass " <<mass<<std::endl;
std::cout << GridLogMessage << " M5 " <<M5<<std::endl;
std::cout << GridLogMessage << " Ls " <<Ls<<std::endl;
std::cout << GridLogMessage << " b " <<b<<std::endl;
std::cout << GridLogMessage << " c " <<c<<std::endl;
std::cout << GridLogMessage << " *************************** " <<std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
//////////////////////////////////////////
// Single precision grids -- lanczos + smoother
//////////////////////////////////////////
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
///////////////////////// Configuration /////////////////////////////////
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.1000");
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
std::cout << "**************************************"<<std::endl;
std::cout << " Fine Power method "<<std::endl;
std::cout << "**************************************"<<std::endl;
{
LatticeFermionD pm_src(FrbGrid);
pm_src = ComplexD(1.0);
PowerMethod<LatticeFermionD> fPM;
fPM(HermOpEO,pm_src);
}
if(0)
{
std::cout << "**************************************"<<std::endl;
std::cout << " Fine Lanczos "<<std::endl;
std::cout << "**************************************"<<std::endl;
typedef LatticeFermionF FermionField;
LatticeGaugeFieldF UmuF(UGridF);
precisionChange(UmuF,Umu);
MobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionF, LatticeFermionF> HermOpEOF(DdwfF);
const int Fine_Nstop = 200;
const int Fine_Nk = 200;
const int Fine_Np = 200;
const int Fine_Nm = Fine_Nk+Fine_Np;
const int Fine_MaxIt= 10;
RealD Fine_resid = 1.0e-4;
std::cout << GridLogMessage << "Fine Lanczos "<<std::endl;
std::cout << GridLogMessage << "Nstop "<<Fine_Nstop<<std::endl;
std::cout << GridLogMessage << "Nk "<<Fine_Nk<<std::endl;
std::cout << GridLogMessage << "Np "<<Fine_Np<<std::endl;
std::cout << GridLogMessage << "resid "<<Fine_resid<<std::endl;
Chebyshev<FermionField> Cheby(0.002,92.0,401);
// Chebyshev<FermionField> Cheby(0.1,92.0,401);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOpEOF);
PlainHermOp<FermionField> Op (HermOpEOF);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Fine_Nstop,Fine_Nk,Fine_Nm,Fine_resid,Fine_MaxIt);
std::vector<RealD> Fine_eval(Fine_Nm);
FermionField Fine_src(FrbGridF);
Fine_src = ComplexF(1.0);
std::vector<FermionField> Fine_evec(Fine_Nm,FrbGridF);
int Fine_Nconv;
std::cout << GridLogMessage <<" Calling IRL.calc single prec"<<std::endl;
IRL.calc(Fine_eval,Fine_evec,Fine_src,Fine_Nconv);
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
SaveFineEvecs(Fine_evec,evec_file);
}
//////////////////////////////////////////
// Construct a coarsened grid with 4^4 cell
//////////////////////////////////////////
Coordinate Block({4,4,6,4});
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/Block[d];
}
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
///////////////////////// RNGs /////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
HermFineMatrix FineHermOp(HermOpEO);
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
// // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
// std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
bool load_agg=true;
bool load_refine=true;
bool load_mat=false;
bool load_evec=false;
int refine=1;
if ( load_agg ) {
if ( !(refine) || (!load_refine) ) {
LoadBasis(Aggregates,subspace_file);
}
} else {
// Aggregates.CreateSubspaceMultishift(RNG5,HermOpEO,
// 0.0003,1.0e-5,2000); // Lo, tol, maxit
// Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,95.,0.01,1500);// <== last run
Aggregates.CreateSubspaceChebyshevNew(RNG5,HermOpEO,95.);
SaveBasis(Aggregates,subspace_file);
}
std::cout << "**************************************"<<std::endl;
std::cout << "Building MultiRHS Coarse operator"<<std::endl;
std::cout << "**************************************"<<std::endl;
ConjugateGradient<CoarseVector> coarseCG(4.0e-2,20000,true);
const int nrhs=24;
Coordinate mpi=GridDefaultMpi();
Coordinate rhMpi ({1,1,mpi[0],mpi[1],mpi[2],mpi[3]});
Coordinate rhLatt({nrhs,1,clatt[0],clatt[1],clatt[2],clatt[3]});
Coordinate rhSimd({vComplex::Nsimd(),1, 1,1,1,1});
GridCartesian *CoarseMrhs = new GridCartesian(rhLatt,rhSimd,rhMpi);
typedef MultiGeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> MultiGeneralCoarsenedMatrix_t;
MultiGeneralCoarsenedMatrix_t mrhs(geom,CoarseMrhs);
std::cout << "**************************************"<<std::endl;
std::cout << " Coarse Lanczos "<<std::endl;
std::cout << "**************************************"<<std::endl;
typedef HermitianLinearOperator<MultiGeneralCoarsenedMatrix_t,CoarseVector> MrhsHermMatrix;
Chebyshev<CoarseVector> IRLCheby(0.005,42.0,301); // 1 iter
MrhsHermMatrix MrhsCoarseOp (mrhs);
// CoarseVector pm_src(CoarseMrhs);
// pm_src = ComplexD(1.0);
// PowerMethod<CoarseVector> cPM; cPM(MrhsCoarseOp,pm_src);
int Nk=192;
int Nm=384;
int Nstop=Nk;
int Nconv_test_interval=1;
ImplicitlyRestartedBlockLanczosCoarse<CoarseVector> IRL(MrhsCoarseOp,
Coarse5d,
CoarseMrhs,
nrhs,
IRLCheby,
Nstop,
Nconv_test_interval,
nrhs,
Nk,
Nm,
1e-5,10);
int Nconv;
std::vector<RealD> eval(Nm);
std::vector<CoarseVector> evec(Nm,Coarse5d);
std::vector<CoarseVector> c_src(nrhs,Coarse5d);
///////////////////////
// Deflation guesser object
///////////////////////
MultiRHSDeflation<CoarseVector> MrhsGuesser;
//////////////////////////////////////////
// Block projector for coarse/fine
//////////////////////////////////////////
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
//////////////////////////
// Extra HDCG parameters
//////////////////////////
int maxit=300;
ConjugateGradient<CoarseVector> CG(5.0e-2,maxit,false);
ConjugateGradient<CoarseVector> CGstart(5.0e-2,maxit,false);
RealD lo=2.0;
int ord = 7;
// int ord = 11;
int blockDim = 0;//not used for BlockCG
BlockConjugateGradient<CoarseVector> BCG (BlockCGrQ,blockDim,5.0e-5,maxit,true);
DoNothingGuesser<CoarseVector> DoNothing;
// HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,CG,DoNothing);
// HPDSolver<CoarseVector> HPDSolveMrhsStart(MrhsCoarseOp,CGstart,DoNothing);
// HPDSolver<CoarseVector> HPDSolveMrhs(MrhsCoarseOp,BCG,DoNothing);
// HPDSolver<CoarseVector> HPDSolveMrhsRefine(MrhsCoarseOp,BCG,DoNothing);
// FixedCGPolynomial<CoarseVector> HPDSolveMrhs(maxit,MrhsCoarseOp);
ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); //
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,110,MrhsCoarseOp); // 114 iter with Chebysmooth and BlockCG
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,120,MrhsCoarseOp); // 138 iter with Chebysmooth
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-2,40.0,200,MrhsCoarseOp); // 139 iter
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-3,40.0,200,MrhsCoarseOp); // 137 iter, CG smooth, flex
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(1.0e-3,40.0,200,MrhsCoarseOp); // 146 iter, CG smooth, flex
// ChebyshevInverter<CoarseVector> HPDSolveMrhs(3.0e-4,40.0,200,MrhsCoarseOp); // 156 iter, CG smooth, flex
/////////////////////////////////////////////////
// Mirs smoother
/////////////////////////////////////////////////
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,lo);
// FixedCGPolynomial<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
// CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
ChebyshevSmoother<LatticeFermionD> CGsmooth(2.0,92.0,8,HermOpEO) ;
if ( load_refine ) {
LoadBasis(Aggregates,refine_file);
// LatticeFermionF conv_tmp(FrbGridF);
// LoadBasisSum(Aggregates,refine_file,sample,conv_tmp);
} else {
Aggregates.RefineSubspace(HermOpEO,0.001,1.0e-3,3000); // 172 iters
SaveBasis(Aggregates,refine_file);
}
Aggregates.Orthogonalise();
std::cout << "**************************************"<<std::endl;
std::cout << "Coarsen after refine"<<std::endl;
std::cout << "**************************************"<<std::endl;
mrhs.CoarsenOperator(FineHermOp,Aggregates,Coarse5d);
std::cout << "**************************************"<<std::endl;
std::cout << " Recompute coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
evec.resize(Nm,Coarse5d);
eval.resize(Nm);
for(int r=0;r<nrhs;r++){
random(CRNG,c_src[r]);
}
IRL.calc(eval,evec,c_src,Nconv,LanczosType::irbl);
std::cout << "**************************************"<<std::endl;
std::cout << " Reimport coarse evecs "<<std::endl;
std::cout << "**************************************"<<std::endl;
MrhsGuesser.ImportEigenBasis(evec,eval);
std::cout << "**************************************"<<std::endl;
std::cout << " Setting up mRHS HDCG"<<std::endl;
std::cout << "**************************************"<<std::endl;
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
MrhsProjector.ImportBasis(Aggregates.subspace);
std::cout << "**************************************"<<std::endl;
std::cout << "Calling mRHS HDCG"<<std::endl;
std::cout << "**************************************"<<std::endl;
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
HDCGmrhs(1.0e-8, 300,
FineHermOp,
CGsmooth,
HPDSolveMrhs, // Used in M1
HPDSolveMrhs, // Used in Vstart
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
std::vector<LatticeFermionD> src_mrhs(nrhs,FrbGrid);
std::vector<LatticeFermionD> res_mrhs(nrhs,FrbGrid);
LatticeFermionD result_accurate(FrbGrid);
LatticeFermionD result_sloppy(FrbGrid);
LatticeFermionD error(FrbGrid);
LatticeFermionD residual(FrbGrid);
for(int r=0;r<nrhs;r++){
random(RNG5,src_mrhs[r]);
res_mrhs[r]=Zero();
}
HDCGmrhs(src_mrhs,res_mrhs);
result_accurate = res_mrhs[0];
#if 0
std::vector<RealD> bins({1.0e-3,1.0e-2,1.0e-1,1.0,10.0,100.0});
std::vector<int> orders({6000 ,4000 ,1000 ,500,500 ,500});
PowerSpectrum GraphicEqualizer(bins,orders);
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of rrr "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,HDCGmrhs.rrr);
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of sss "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,HDCGmrhs.sss);
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of qqq "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,HDCGmrhs.qqq);
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of zzz "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,HDCGmrhs.zzz);
std::vector<RealD> tols({1.0e-3,1.0e-4,1.0e-5});
for(auto tol : tols) {
TwoLevelADEF2mrhs<LatticeFermion,CoarseVector>
HDCGmrhsSloppy(tol, 500,
FineHermOp,
CGsmooth,
HPDSolveMrhs, // Used in M1
HPDSolveMrhs, // Used in Vstart
MrhsProjector,
MrhsGuesser,
CoarseMrhs);
// Solve again to 10^-5
for(int r=0;r<nrhs;r++){
res_mrhs[r]=Zero();
}
HDCGmrhsSloppy(src_mrhs,res_mrhs);
result_sloppy = res_mrhs[0];
error = result_sloppy - result_accurate;
FineHermOp.HermOp(result_sloppy,residual);
residual = residual - src_mrhs[0];
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " Converged to tolerance "<< tol<<std::endl;
std::cout << GridLogMessage << " Absolute error "<<norm2(error)<<std::endl;
std::cout << GridLogMessage << " Residual "<<norm2(residual)<<std::endl;
std::cout << "**************************************"<<std::endl;
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of error "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,error);
std::cout << "**************************************"<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum of residual "<<std::endl;
std::cout << "**************************************"<<std::endl;
GraphicEqualizer(FineHermOp,residual);
};
#endif
// Standard CG
#if 0
{
std::cout << "**************************************"<<std::endl;
std::cout << "Calling red black CG"<<std::endl;
std::cout << "**************************************"<<std::endl;
LatticeFermion result(FrbGrid); result=Zero();
LatticeFermion src(FrbGrid); random(RNG5,src);
result=Zero();
ConjugateGradient<LatticeFermionD> CGfine(1.0e-8,30000,false);
CGfine(HermOpEO, src, result);
}
#endif
Grid_finalize();
return 0;
}

View File

@ -0,0 +1,355 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_general_coarse_hdcg.cc
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczosCoarse.h>
#include <Grid/algorithms/iterative/AdefMrhs.h>
using namespace std;
using namespace Grid;
template<class aggregation>
void SaveFineEvecs(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg[0].Grid()->IsBoss());
WR.open(file);
for(int b=0;b<Agg.size();b++){
WR.writeScidacFieldRecord(Agg[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
}
WR.close();
#endif
}
template<class aggregation>
void SaveBasis(aggregation &Agg,std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(Agg.FineGrid->IsBoss());
WR.open(file);
for(int b=0;b<Agg.subspace.size();b++){
WR.writeScidacFieldRecord(Agg.subspace[b],record,0,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// WR.writeScidacFieldRecord(Agg.subspace[b],record);
}
WR.close();
#endif
}
template<class aggregation>
void LoadBasis(aggregation &Agg, std::string file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.subspace.size();b++){
RD.readScidacFieldRecord(Agg.subspace[b],record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
// RD.readScidacFieldRecord(Agg.subspace[b],record,0);
}
RD.close();
#endif
}
template<class aggregation>
void LoadFineEvecs(aggregation &Agg, std::string file,LatticeFermionF & conv_tmp)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacReader RD ;
RD.open(file);
for(int b=0;b<Agg.size();b++){
RD.readScidacFieldRecord(conv_tmp,record,Grid::BinaryIO::BINARYIO_LEXICOGRAPHIC);
precisionChange(Agg[b],conv_tmp);
}
RD.close();
#endif
}
template<class CoarseVector>
void SaveEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
emptyUserRecord record;
ScidacWriter WR(evec[0].Grid()->IsBoss());
WR.open(evec_file);
for(int b=0;b<evec.size();b++){
WR.writeScidacFieldRecord(evec[b],record,0,0);
}
WR.close();
XmlWriter WRx(eval_file);
write(WRx,"evals",eval);
#endif
}
template<class CoarseVector>
void LoadEigenvectors(std::vector<RealD> &eval,
std::vector<CoarseVector> &evec,
std::string evec_file,
std::string eval_file)
{
#ifdef HAVE_LIME
XmlReader RDx(eval_file);
read(RDx,"evals",eval);
emptyUserRecord record;
Grid::ScidacReader RD ;
RD.open(evec_file);
assert(evec.size()==eval.size());
for(int k=0;k<eval.size();k++) {
RD.readScidacFieldRecord(evec[k],record);
}
RD.close();
#endif
}
// Want Op in CoarsenOp to call MatPcDagMatPc
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
LinearOperatorBase<Field> & wrapped;
public:
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
void Op (const Field &in, Field &out) { wrapped.HermOp(in,out); }
void HermOp(const Field &in, Field &out) { wrapped.HermOp(in,out); }
void AdjOp (const Field &in, Field &out){ wrapped.HermOp(in,out); }
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out) { assert(0); };
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
};
template<class Field> class CGSmoother : public LinearFunction<Field>
{
public:
using LinearFunction<Field>::operator();
typedef LinearOperatorBase<Field> FineOperator;
FineOperator & _SmootherOperator;
int iters;
CGSmoother(int _iters, FineOperator &SmootherOperator) :
_SmootherOperator(SmootherOperator),
iters(_iters)
{
std::cout << GridLogMessage<<" Mirs smoother order "<<iters<<std::endl;
};
void operator() (const Field &in, Field &out)
{
ConjugateGradient<Field> CG(0.0,iters,false); // non-converge is just fine in a smoother
out=Zero();
CG(_SmootherOperator,in,out);
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=24;
const int nbasis = 62;
const int cb = 0 ;
RealD mass=0.00078;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
// Construct a coarsened grid with 4^4 cell
Coordinate Block({4,4,6,4});
Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/Block[d];
}
//////////////////////////////////////////
// Double precision grids
//////////////////////////////////////////
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
//////////////////////////////////////////
// Single precision grids -- lanczos + smoother
//////////////////////////////////////////
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
///////////////////////// RNGs /////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::vector<int> cseeds({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds);
///////////////////////// Configuration /////////////////////////////////
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("ckpoint_lat.1000");
NerscIO::readConfiguration(Umu,header,file);
//////////////////////// Fermion action //////////////////////////////////
MobiusFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c);
SchurDiagMooeeOperator<MobiusFermionD, LatticeFermion> HermOpEO(Ddwf);
const int Fine_Nstop = 200;
const int Fine_Nk = 100;
const int Fine_Np = 100;
const int Fine_Nm = Fine_Nk+Fine_Np;
typedef LatticeFermion FermionField;
std::vector<RealD> Fine_eval;
std::vector<FermionField> Fine_evec;
LatticeFermionF conv_tmp(FrbGridF);
Fine_eval.resize(Fine_Nstop);
Fine_evec.resize(Fine_Nstop,FrbGrid);
std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evecF");
LoadFineEvecs(Fine_evec,evec_file,conv_tmp);
typedef HermOpAdaptor<LatticeFermionD> HermFineMatrix;
HermFineMatrix FineHermOp(HermOpEO);
////////////////////////////////////////////////////////////
///////////// Coarse basis and Little Dirac Operator ///////
////////////////////////////////////////////////////////////
typedef GeneralCoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> LittleDiracOperator;
typedef LittleDiracOperator::CoarseVector CoarseVector;
NextToNextToNextToNearestStencilGeometry5D geom(Coarse5d);
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace Aggregates(Coarse5d,FrbGrid,cb);
////////////////////////////////////////////////////////////
// Need to check about red-black grid coarsening
////////////////////////////////////////////////////////////
// std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.mixed.2500.60");
// // std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.new.62");
// std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys48.evec");
std::string refine_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Refine.phys48.mixed.2500.60");
// std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys48.mixed.60");
// std::string evec_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/evecs.scidac");
// std::string eval_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/eval.xml");
bool load_agg=true;
bool load_refine=true;
//////////////////////////////////////////
// Block projector for coarse/fine
//////////////////////////////////////////
MultiRHSBlockProject<LatticeFermionD> MrhsProjector;
/////////////////////////////////////////////////
// Mirs smoother
/////////////////////////////////////////////////
int ord=8;
RealD lo=2.0;
RealD MirsShift = lo;
ShiftedHermOpLinearOperator<LatticeFermionD> ShiftedFineHermOp(HermOpEO,MirsShift);
CGSmoother<LatticeFermionD> CGsmooth(ord,ShiftedFineHermOp) ;
LoadBasis(Aggregates,refine_file);
Aggregates.Orthogonalise();
std::cout << "**************************************"<<std::endl;
std::cout << " Using filtered subspace"<<std::endl;
std::cout << "**************************************"<<std::endl;
MrhsProjector.Allocate(nbasis,FrbGrid,Coarse5d);
MrhsProjector.ImportBasis(Aggregates.subspace);
FermionField Ftmp(FrbGrid);
std::vector<FermionField> Fine_ev(1,FrbGrid);
std::vector<FermionField> Fine_ev_compressed(1,FrbGrid);
std::vector<CoarseVector> c_evec(1,Coarse5d);
for(int ev=0;ev<Fine_evec.size();ev++){
Fine_ev[0] = Fine_evec[ev];
MrhsProjector.blockProject(Fine_ev,c_evec);
MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
Ftmp = Fine_ev_compressed[0];
RealD div = 1.0/ sqrt(norm2(Ftmp));
Ftmp = Ftmp * div;
std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
std::cout << GridLogMessage<<" "<<ev<<" comp "<< norm2(Ftmp) <<std::endl;
Ftmp = Fine_ev[0] - Ftmp;
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
CGsmooth(Fine_ev_compressed[0],Ftmp);
Ftmp = Ftmp *lo;
std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
div = 1.0/ sqrt(norm2(Ftmp));
Ftmp=Ftmp*div;
Ftmp = Fine_ev[0]-Ftmp;
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
}
std::cout << "**************************************"<<std::endl;
std::cout << " Using eigenvector subspace "<<std::endl;
std::cout << "**************************************"<<std::endl;
for(int i=0;i<Aggregates.subspace.size();i++){
Aggregates.subspace[i] = Fine_evec[i];
}
Aggregates.Orthogonalise();
MrhsProjector.ImportBasis(Aggregates.subspace);
for(int ev=0;ev<Fine_evec.size();ev++){
Fine_ev[0] = Fine_evec[ev];
MrhsProjector.blockProject(Fine_ev,c_evec);
MrhsProjector.blockPromote(Fine_ev_compressed,c_evec);
Ftmp = Fine_ev_compressed[0];
RealD div = 1.0/ sqrt(norm2(Ftmp));
Ftmp = Ftmp * div;
std::cout << GridLogMessage<<" "<<ev<<" uncomp "<< norm2(Fine_ev[0]) <<std::endl;
std::cout << GridLogMessage<<" "<<ev<<" comp "<< norm2(Ftmp) <<std::endl;
Ftmp = Fine_ev[0] - Ftmp;
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
CGsmooth(Fine_ev_compressed[0],Ftmp);
Ftmp = Ftmp *lo;
std::cout << GridLogMessage<<" "<<ev<<" smoothed "<< norm2(Ftmp) <<std::endl;
div = 1.0/ sqrt(norm2(Ftmp));
Ftmp=Ftmp*div;
Ftmp = Fine_ev[0]-Ftmp;
std::cout << GridLogMessage<<" "<<ev<<" diff "<< norm2(Ftmp) <<std::endl;
}
// Standard CG
Grid_finalize();
return 0;
}

View File

@ -36,28 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
template<class Field>
class HermOpAdaptor : public LinearOperatorBase<Field>
{
LinearOperatorBase<Field> & wrapped;
public:
HermOpAdaptor(LinearOperatorBase<Field> &wrapme) : wrapped(wrapme) {};
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
wrapped.HermOp(in,out);
}
void AdjOp (const Field &in, Field &out){
wrapped.HermOp(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
wrapped.HermOp(in,out);
}
};
template<class Matrix,class Field> template<class Matrix,class Field>
class PVdagMLinearOperator : public LinearOperatorBase<Field> { class PVdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
@ -69,78 +47,169 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); } void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }; void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){ void Op (const Field &in, Field &out){
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.M(in,tmp); _Mat.M(in,tmp);
_PV.Mdag(tmp,out); _PV.Mdag(tmp,out);
} }
void AdjOp (const Field &in, Field &out){ void AdjOp (const Field &in, Field &out){
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_PV.M(tmp,out); _PV.M(in,tmp);
_Mat.Mdag(in,tmp); _Mat.Mdag(tmp,out);
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); } void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
std::cout << "HermOp"<<std::endl; // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
// _PV.M(out,tmp);
// _Mat.Mdag(tmp,out);
Op(in,tmp);
AdjOp(tmp,out);
// std::cout << "HermOp done "<<norm2(out)<<std::endl;
}
};
template<class Matrix,class Field>
class ShiftedPVdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
Matrix &_PV;
RealD shift;
public:
ShiftedPVdagMLinearOperator(RealD _shift,Matrix &Mat,Matrix &PV): shift(_shift),_Mat(Mat),_PV(PV){};
void OpDiag (const Field &in, Field &out) { assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid()); Field tmp(in.Grid());
_Mat.M(in,tmp); _Mat.M(in,tmp);
_PV.Mdag(tmp,out); _PV.Mdag(tmp,out);
_PV.M(out,tmp); out = out + shift * in;
_Mat.Mdag(tmp,out);
std::cout << "HermOp done "<<norm2(out)<<std::endl;
} }
}; void AdjOp (const Field &in, Field &out){
// std::cout << "AdjOp: Mdag PV "<<std::endl;
template<class Field> class DumbOperator : public LinearOperatorBase<Field> { Field tmp(in.Grid());
public: _PV.M(tmp,out);
LatticeComplex scale; _Mat.Mdag(in,tmp);
DumbOperator(GridBase *grid) : scale(grid) out = out + shift * in;
{
scale = 0.0;
LatticeComplex scalesft(grid);
LatticeComplex scaletmp(grid);
for(int d=0;d<4;d++){
Lattice<iScalar<vInteger> > x(grid); LatticeCoordinate(x,d+1);
LatticeCoordinate(scaletmp,d+1);
scalesft = Cshift(scaletmp,d+1,1);
scale = 100.0*scale + where( mod(x ,2)==(Integer)0, scalesft,scaletmp);
}
std::cout << " scale\n" << scale << std::endl;
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {};
void OpDir (const Field &in, Field &out,int dir,int disp){};
void OpDirAll (const Field &in, std::vector<Field> &out) {};
void Op (const Field &in, Field &out){
out = scale * in;
}
void AdjOp (const Field &in, Field &out){
out = scale * in;
} }
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){ void HermOp(const Field &in, Field &out){
double n1, n2; // std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
HermOpAndNorm(in,out,n1,n2); Field tmp(in.Grid());
} Op(in,tmp);
void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){ AdjOp(tmp,out);
ComplexD dot;
out = scale * in;
dot= innerProduct(in,out);
n1=real(dot);
dot = innerProduct(out,out);
n2=real(dot);
} }
}; };
template<class Fobj,class CComplex,int nbasis>
class MGPreconditioner : public LinearFunction< Lattice<Fobj> > {
public:
using LinearFunction<Lattice<Fobj> >::operator();
typedef Aggregation<Fobj,CComplex,nbasis> Aggregates;
typedef typename Aggregation<Fobj,CComplex,nbasis>::FineField FineField;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseVector CoarseVector;
typedef typename Aggregation<Fobj,CComplex,nbasis>::CoarseMatrix CoarseMatrix;
typedef LinearOperatorBase<FineField> FineOperator;
typedef LinearFunction <FineField> FineSmoother;
typedef LinearOperatorBase<CoarseVector> CoarseOperator;
typedef LinearFunction <CoarseVector> CoarseSolver;
Aggregates & _Aggregates;
FineOperator & _FineOperator;
FineSmoother & _PreSmoother;
FineSmoother & _PostSmoother;
CoarseOperator & _CoarseOperator;
CoarseSolver & _CoarseSolve;
int level; void Level(int lv) {level = lv; };
MGPreconditioner(Aggregates &Agg,
FineOperator &Fine,
FineSmoother &PreSmoother,
FineSmoother &PostSmoother,
CoarseOperator &CoarseOperator_,
CoarseSolver &CoarseSolve_)
: _Aggregates(Agg),
_FineOperator(Fine),
_PreSmoother(PreSmoother),
_PostSmoother(PostSmoother),
_CoarseOperator(CoarseOperator_),
_CoarseSolve(CoarseSolve_),
level(1) { }
virtual void operator()(const FineField &in, FineField & out)
{
GridBase *CoarseGrid = _Aggregates.CoarseGrid;
// auto CoarseGrid = _CoarseOperator.Grid();
CoarseVector Csrc(CoarseGrid);
CoarseVector Csol(CoarseGrid);
FineField vec1(in.Grid());
FineField vec2(in.Grid());
std::cout<<GridLogMessage << "Calling PreSmoother " <<std::endl;
// std::cout<<GridLogMessage << "Calling PreSmoother input residual "<<norm2(in) <<std::endl;
double t;
// Fine Smoother
// out = in;
out = Zero();
t=-usecond();
_PreSmoother(in,out);
t+=usecond();
std::cout<<GridLogMessage << "PreSmoother took "<< t/1000.0<< "ms" <<std::endl;
// Update the residual
_FineOperator.Op(out,vec1); sub(vec1, in ,vec1);
// std::cout<<GridLogMessage <<"Residual-1 now " <<norm2(vec1)<<std::endl;
// Fine to Coarse
t=-usecond();
_Aggregates.ProjectToSubspace (Csrc,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Project to coarse took "<< t/1000.0<< "ms" <<std::endl;
// Coarse correction
t=-usecond();
Csol = Zero();
_CoarseSolve(Csrc,Csol);
//Csol=Zero();
t+=usecond();
std::cout<<GridLogMessage << "Coarse solve took "<< t/1000.0<< "ms" <<std::endl;
// Coarse to Fine
t=-usecond();
// _CoarseOperator.PromoteFromSubspace(_Aggregates,Csol,vec1);
_Aggregates.PromoteFromSubspace(Csol,vec1);
add(out,out,vec1);
t+=usecond();
std::cout<<GridLogMessage << "Promote to this level took "<< t/1000.0<< "ms" <<std::endl;
// Residual
_FineOperator.Op(out,vec1); sub(vec1 ,in , vec1);
// std::cout<<GridLogMessage <<"Residual-2 now " <<norm2(vec1)<<std::endl;
// Fine Smoother
t=-usecond();
// vec2=vec1;
vec2=Zero();
_PostSmoother(vec1,vec2);
t+=usecond();
std::cout<<GridLogMessage << "PostSmoother took "<< t/1000.0<< "ms" <<std::endl;
add( out,out,vec2);
std::cout<<GridLogMessage << "Done " <<std::endl;
}
};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
const int Ls=2; const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -151,7 +220,8 @@ int main (int argc, char ** argv)
// Construct a coarsened grid // Construct a coarsened grid
Coordinate clatt = GridDefaultLatt(); Coordinate clatt = GridDefaultLatt();
for(int d=0;d<clatt.size();d++){ for(int d=0;d<clatt.size();d++){
clatt[d] = clatt[d]/4; clatt[d] = clatt[d]/2;
// clatt[d] = clatt[d]/4;
} }
GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
@ -173,15 +243,14 @@ int main (int argc, char ** argv)
FieldMetaData header; FieldMetaData header;
std::string file("ckpoint_lat.4000"); std::string file("ckpoint_lat.4000");
NerscIO::readConfiguration(Umu,header,file); NerscIO::readConfiguration(Umu,header,file);
//Umu = 1.0;
RealD mass=0.5; RealD mass=0.01;
RealD M5=1.8; RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5); DomainWallFermionD Dpv(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,1.0,M5);
const int nbasis = 1; const int nbasis = 20;
const int cb = 0 ; const int cb = 0 ;
LatticeFermion prom(FGrid); LatticeFermion prom(FGrid);
@ -193,25 +262,51 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl; std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM(Ddwf,Dpv); typedef PVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> PVdagM_t;
HermOpAdaptor<LatticeFermionD> HOA(PVdagM); typedef ShiftedPVdagMLinearOperator<DomainWallFermionD,LatticeFermionD> ShiftedPVdagM_t;
PVdagM_t PVdagM(Ddwf,Dpv);
// ShiftedPVdagM_t ShiftedPVdagM(2.0,Ddwf,Dpv); // 355
// ShiftedPVdagM_t ShiftedPVdagM(1.0,Ddwf,Dpv); // 246
// ShiftedPVdagM_t ShiftedPVdagM(0.5,Ddwf,Dpv); // 183
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 145
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 134
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 127 -- NULL space via inverse iteration
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 57 -- NULL space via inverse iteration; 3 iterations
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 57 , tighter inversion
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 49 iters
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // nbasis 20 -- 70 iters; asymmetric
// ShiftedPVdagM_t ShiftedPVdagM(0.25,Ddwf,Dpv); // 58; Loosen coarse, tighten fine
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 56 ...
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 51 ... with 24 vecs
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 31 ... with 24 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 43 ... with 16 vecs and 2^4 blocking, sloppier
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 35 ... with 20 vecs and 2^4 blocking, looser coarse
// ShiftedPVdagM_t ShiftedPVdagM(0.1,Ddwf,Dpv); // 64 ... with 20 vecs, Christoph setup, and 2^4 blocking, looser coarse
ShiftedPVdagM_t ShiftedPVdagM(0.01,Ddwf,Dpv); //
// Run power method on HOA?? // Run power method on HOA??
PowerMethod<LatticeFermion> PM; PM(HOA,src); // PowerMethod<LatticeFermion> PM; PM(PVdagM,src);
// Warning: This routine calls PVdagM.Op, not PVdagM.HermOp // Warning: This routine calls PVdagM.Op, not PVdagM.HermOp
typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace; typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
Subspace AggregatesPD(Coarse5d,FGrid,cb); Subspace AggregatesPD(Coarse5d,FGrid,cb);
/*
AggregatesPD.CreateSubspaceChebyshev(RNG5, AggregatesPD.CreateSubspaceChebyshev(RNG5,
HOA, PVdagM,
nbasis, nbasis,
5000.0, 4000.0,
0.02, 2.0,
100, 200,
50, 200,
50, 200,
0.0); 0.0);
*/
AggregatesPD.CreateSubspaceGCR(RNG5,
PVdagM,
nbasis);
LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d); LittleDiracOperator LittleDiracOpPV(geom,FGrid,Coarse5d);
LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD); LittleDiracOpPV.CoarsenOperator(PVdagM,AggregatesPD);
@ -257,6 +352,60 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl; std::cout<<GridLogMessage<<" ldop error: "<<norm2(c_proj)<<std::endl;
// std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl; // std::cout<<GridLogMessage<<" error "<< c_proj<<std::endl;
/**********
* Some solvers
**********
*/
///////////////////////////////////////
// Coarse grid solver test
///////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Coarse Grid Solve -- Level 3 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<CoarseVector> simple;
NonHermitianLinearOperator<LittleDiracOperator,CoarseVector> LinOpCoarse(LittleDiracOpPV);
// PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(1.0e-4, 100, LinOpCoarse,simple,10,10);
PrecGeneralisedConjugateResidualNonHermitian<CoarseVector> L2PGCR(3.0e-2, 100, LinOpCoarse,simple,10,10);
L2PGCR.Level(3);
c_res=Zero();
L2PGCR(c_src,c_res);
////////////////////////////////////////
// Fine grid smoother
////////////////////////////////////////
std::cout<<GridLogMessage<<"******************* "<<std::endl;
std::cout<<GridLogMessage<<" Fine Grid Smoother -- Level 2 "<<std::endl;
std::cout<<GridLogMessage<<"******************* "<<std::endl;
TrivialPrecon<LatticeFermionD> simple_fine;
// NonHermitianLinearOperator<PVdagM_t,LatticeFermionD> LinOpSmooth(PVdagM);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermionD> SmootherGCR(0.01,1,ShiftedPVdagM,simple_fine,16,16);
SmootherGCR.Level(2);
LatticeFermionD f_src(FGrid);
LatticeFermionD f_res(FGrid);
f_src = one; // 1 in every element for vector 1.
f_res=Zero();
SmootherGCR(f_src,f_res);
typedef MGPreconditioner<vSpinColourVector, vTComplex,nbasis> TwoLevelMG;
TwoLevelMG TwoLevelPrecon(AggregatesPD,
PVdagM,
simple_fine,
SmootherGCR,
LinOpCoarse,
L2PGCR);
PrecGeneralisedConjugateResidualNonHermitian<LatticeFermion> L1PGCR(1.0e-8,1000,PVdagM,TwoLevelPrecon,16,16);
L1PGCR.Level(1);
f_res=Zero();
L1PGCR(f_src,f_res);
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"*******************************************"<<std::endl; std::cout<<GridLogMessage<<"*******************************************"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;

Some files were not shown because too many files have changed in this diff Show More