1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port

Conflicts:
	Grid/stencil/Stencil.h
This commit is contained in:
Peter Boyle 2019-07-12 17:14:11 +01:00
commit 705a8098b2
45 changed files with 35 additions and 75 deletions

View File

@ -62,7 +62,7 @@ public:
LatticeCoordinate(coor, nu + shift);
ph = ph + twist[nu]*coor*((1./(in.Grid()->FullDimensions()[nu+shift])));
}
in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
in_buf = exp(Scalar(2.0*M_PI)*ci*ph*(-1.0))*in;
if(fiveD){//FFT only on temporal and spatial dimensions
std::vector<int> mask(Nd+1,1); mask[0] = 0;
@ -77,7 +77,7 @@ public:
}
//phase for boundary condition
out = out * exp((Real)(2.0*M_PI)*ci*ph);
out = out * exp(Scalar(2.0*M_PI)*ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {

View File

@ -323,10 +323,8 @@ public:
this->HaloExchangeOptGather(source,compress);
double t1=usecond();
// Asynchronous MPI calls multidirectional, Isend etc...
// this->CommunicateBegin(reqs);
// this->CommunicateComplete(reqs);
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
this->Communicate();
// this->Communicate();
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiationGparity.cc.master

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiationGparity.cc.master

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiationGparity.cc.master

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiationGparity.cc.master

View File

@ -777,7 +777,7 @@ public:
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int wrap = sshift/rd; wrap=wrap % ly; // but it is local anyway
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = (wrap+1)%ly;

View File

@ -69,6 +69,11 @@ void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,
// vstream(vec, extracted);
vec = extracted;
}
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
vstream(vec, extracted);
}
#else
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
@ -92,6 +97,11 @@ void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object &
{
insertLane(lane,vec,extracted);
}
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
insertLane(lane,vec,extracted);
}
#endif

50
TODO
View File

@ -1,66 +1,58 @@
- Lattice_arith - are the mult, mac etc.. still needed after ET engine?
- LinalgUtils ssp loop not offloaded
- Mobius/Domain EOFA cache header implementaiotn has thread_loop
- ImprovedStaggered accelerate
- Lattice_reduction - remnant thread_loops must offload. Audit thread_loop in main code for non-accelerated code
Lattice_rng
Lattice_transfer.h
- Stencil.h : Thread loops in exchange code. Need to offload these
- Lebesque order reintroduction. StencilView should have pointer
- accelerate A2Autils
- accelerate A2Autils -- off critical path for HMC
- Lebesque order reintroduction. StencilView should have pointer to it
GPU branch code item work list
-----------------------------
7) Accelerate the cshift
7) Accelerate the cshift & benchmark
* 0) Single GPU
- 128 bit integer table load in GPU code.
- coalescedRead <- threadIdx.x
- Gianluca's changes to Cayley into gpu-port
- GPU accelerate EOFA
- Staggered kernels -> GPU coalesced loop
- Staggered kernels -> GPU coalesced loop, loop in kernels
- Staggered kernels inline for GPU -- DONE
* 2) 5D terms & Gianluca
* Gianluca merger
- Cayley coefficients -> GPU retention or prefetch
- Mobius kernel fusion. -- Gianluca?
- Make GPU offload reductions optionally deterministic -- Gianluca
- Gianluca's changes to Cayley into gpu-port
- Mobius kernel fusion. -- Gianluca?
- Make GPU offload reductions deterministic -- Gianluca merge
- Lattice_reduction - remnant thread_loops must offload. Audit thread_loop in main code for non-accelerated code
* 3) Comms/NVlink
- OpenMP tasks to run comms threads.
- OpenMP tasks to run comms threads. Experiment with it
- Remove explicit openMP in staggered.
- Single parallel region around both the Kernel call
and the comms.
- Single parallel region around both the Kernel call and the comms.
- Fix the halo exchange SIMT loop
- Stencil gather
- Stencil gather ??
- SIMD dirs in stencil
* 4) ET enhancements
- eval -> scalar ops in ET engine
- coalescedRead, coalescedWrite in expressions.
- coalescedRead, coalescedWrite in expressions.
* 5) Misc
- Conserved current clean up.
- multLinkProp eliminate
8) Merge develop and test HMC
9) Gamma tables on GPU; check this.
9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU?
10) Audit
- pragma once uniformly
- Audit NAMESPACE CHANGES
- Audit changes
=============================================================================================
- GPU accelerate EOFA -- DONE
- LinalgUtils ssp loop not offloaded -- DONE
- coalescedRead <- threadIdx.x -- DONE
- Stencil.h : Thread loops in exchange code. Need to offload these -- DONE ; pending debug
- Mobius/Domain EOFA cache header implementaiotn has thread_loop -- DONE ; pending test
- Differentiate non-temporal coalescedWrite from temporal -- DONE
- Clean up PRAGMAS, and SIMT_loop -- DONE
thread_loop interface revisit.
_foreach