mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-11 14:40:46 +01:00
Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port
Conflicts: Grid/stencil/Stencil.h
This commit is contained in:
commit
705a8098b2
@ -62,7 +62,7 @@ public:
|
|||||||
LatticeCoordinate(coor, nu + shift);
|
LatticeCoordinate(coor, nu + shift);
|
||||||
ph = ph + twist[nu]*coor*((1./(in.Grid()->FullDimensions()[nu+shift])));
|
ph = ph + twist[nu]*coor*((1./(in.Grid()->FullDimensions()[nu+shift])));
|
||||||
}
|
}
|
||||||
in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
|
in_buf = exp(Scalar(2.0*M_PI)*ci*ph*(-1.0))*in;
|
||||||
|
|
||||||
if(fiveD){//FFT only on temporal and spatial dimensions
|
if(fiveD){//FFT only on temporal and spatial dimensions
|
||||||
std::vector<int> mask(Nd+1,1); mask[0] = 0;
|
std::vector<int> mask(Nd+1,1); mask[0] = 0;
|
||||||
@ -77,7 +77,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
//phase for boundary condition
|
//phase for boundary condition
|
||||||
out = out * exp((Real)(2.0*M_PI)*ci*ph);
|
out = out * exp(Scalar(2.0*M_PI)*ci*ph);
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
|
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
|
||||||
|
@ -323,10 +323,8 @@ public:
|
|||||||
this->HaloExchangeOptGather(source,compress);
|
this->HaloExchangeOptGather(source,compress);
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
// Asynchronous MPI calls multidirectional, Isend etc...
|
// Asynchronous MPI calls multidirectional, Isend etc...
|
||||||
// this->CommunicateBegin(reqs);
|
|
||||||
// this->CommunicateComplete(reqs);
|
|
||||||
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
|
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
|
||||||
this->Communicate();
|
// this->Communicate();
|
||||||
double t2=usecond(); timer1 += t2-t1;
|
double t2=usecond(); timer1 += t2-t1;
|
||||||
this->CommsMerge(compress);
|
this->CommsMerge(compress);
|
||||||
double t3=usecond(); timer2 += t3-t2;
|
double t3=usecond(); timer2 += t3-t2;
|
||||||
|
@ -1 +0,0 @@
|
|||||||
../CayleyFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../DomainWallEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../MobiusEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../PartialFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonCloverFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiationGparity.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonTMFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../CayleyFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../DomainWallEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../MobiusEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../PartialFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonCloverFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiationGparity.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonTMFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../CayleyFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../DomainWallEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../MobiusEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../PartialFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonCloverFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiationGparity.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonTMFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../CayleyFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../DomainWallEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../MobiusEOFAFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../PartialFractionFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonCloverFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermion5DInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonFermionInstantiation.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiationGparity.cc.master
|
|
@ -1 +0,0 @@
|
|||||||
../WilsonTMFermionInstantiation.cc.master
|
|
@ -777,7 +777,7 @@ public:
|
|||||||
|
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
if(permute_dim){
|
if(permute_dim){
|
||||||
int wrap = sshift/rd;
|
int wrap = sshift/rd; wrap=wrap % ly; // but it is local anyway
|
||||||
int num = sshift%rd;
|
int num = sshift%rd;
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = (wrap+1)%ly;
|
else permute_slice = (wrap+1)%ly;
|
||||||
|
@ -69,6 +69,11 @@ void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,
|
|||||||
// vstream(vec, extracted);
|
// vstream(vec, extracted);
|
||||||
vec = extracted;
|
vec = extracted;
|
||||||
}
|
}
|
||||||
|
template<class vobj> accelerator_inline
|
||||||
|
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
|
||||||
|
{
|
||||||
|
vstream(vec, extracted);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
|
accelerator_inline int SIMTlane(int Nsimd) { return threadIdx.y; } // CUDA specific
|
||||||
|
|
||||||
@ -92,6 +97,11 @@ void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object &
|
|||||||
{
|
{
|
||||||
insertLane(lane,vec,extracted);
|
insertLane(lane,vec,extracted);
|
||||||
}
|
}
|
||||||
|
template<class vobj> accelerator_inline
|
||||||
|
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
|
||||||
|
{
|
||||||
|
insertLane(lane,vec,extracted);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
50
TODO
50
TODO
@ -1,66 +1,58 @@
|
|||||||
- Lattice_arith - are the mult, mac etc.. still needed after ET engine?
|
- Lattice_arith - are the mult, mac etc.. still needed after ET engine?
|
||||||
- LinalgUtils ssp loop not offloaded
|
|
||||||
- Mobius/Domain EOFA cache header implementaiotn has thread_loop
|
|
||||||
- ImprovedStaggered accelerate
|
- ImprovedStaggered accelerate
|
||||||
- Lattice_reduction - remnant thread_loops must offload. Audit thread_loop in main code for non-accelerated code
|
|
||||||
Lattice_rng
|
Lattice_rng
|
||||||
Lattice_transfer.h
|
Lattice_transfer.h
|
||||||
|
|
||||||
- Stencil.h : Thread loops in exchange code. Need to offload these
|
- accelerate A2Autils -- off critical path for HMC
|
||||||
|
- Lebesque order reintroduction. StencilView should have pointer to it
|
||||||
- Lebesque order reintroduction. StencilView should have pointer
|
|
||||||
|
|
||||||
- accelerate A2Autils
|
|
||||||
|
|
||||||
GPU branch code item work list
|
GPU branch code item work list
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
7) Accelerate the cshift
|
7) Accelerate the cshift & benchmark
|
||||||
|
|
||||||
* 0) Single GPU
|
* 0) Single GPU
|
||||||
- 128 bit integer table load in GPU code.
|
- 128 bit integer table load in GPU code.
|
||||||
- coalescedRead <- threadIdx.x
|
- Staggered kernels -> GPU coalesced loop, loop in kernels
|
||||||
- Gianluca's changes to Cayley into gpu-port
|
|
||||||
- GPU accelerate EOFA
|
|
||||||
- Staggered kernels -> GPU coalesced loop
|
|
||||||
- Staggered kernels inline for GPU -- DONE
|
- Staggered kernels inline for GPU -- DONE
|
||||||
|
|
||||||
|
* Gianluca merger
|
||||||
* 2) 5D terms & Gianluca
|
|
||||||
- Cayley coefficients -> GPU retention or prefetch
|
- Cayley coefficients -> GPU retention or prefetch
|
||||||
- Mobius kernel fusion. -- Gianluca?
|
- Gianluca's changes to Cayley into gpu-port
|
||||||
- Make GPU offload reductions optionally deterministic -- Gianluca
|
- Mobius kernel fusion. -- Gianluca?
|
||||||
|
- Make GPU offload reductions deterministic -- Gianluca merge
|
||||||
|
- Lattice_reduction - remnant thread_loops must offload. Audit thread_loop in main code for non-accelerated code
|
||||||
|
|
||||||
* 3) Comms/NVlink
|
* 3) Comms/NVlink
|
||||||
- OpenMP tasks to run comms threads.
|
- OpenMP tasks to run comms threads. Experiment with it
|
||||||
- Remove explicit openMP in staggered.
|
- Remove explicit openMP in staggered.
|
||||||
- Single parallel region around both the Kernel call
|
- Single parallel region around both the Kernel call and the comms.
|
||||||
and the comms.
|
|
||||||
- Fix the halo exchange SIMT loop
|
- Fix the halo exchange SIMT loop
|
||||||
- Stencil gather
|
- Stencil gather ??
|
||||||
- SIMD dirs in stencil
|
- SIMD dirs in stencil
|
||||||
|
|
||||||
* 4) ET enhancements
|
* 4) ET enhancements
|
||||||
- eval -> scalar ops in ET engine
|
- eval -> scalar ops in ET engine
|
||||||
- coalescedRead, coalescedWrite in expressions.
|
- coalescedRead, coalescedWrite in expressions.
|
||||||
|
|
||||||
* 5) Misc
|
* 5) Misc
|
||||||
|
|
||||||
- Conserved current clean up.
|
- Conserved current clean up.
|
||||||
- multLinkProp eliminate
|
- multLinkProp eliminate
|
||||||
|
|
||||||
|
|
||||||
8) Merge develop and test HMC
|
8) Merge develop and test HMC
|
||||||
|
9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU?
|
||||||
9) Gamma tables on GPU; check this.
|
|
||||||
|
|
||||||
10) Audit
|
10) Audit
|
||||||
- pragma once uniformly
|
- pragma once uniformly
|
||||||
- Audit NAMESPACE CHANGES
|
- Audit NAMESPACE CHANGES
|
||||||
- Audit changes
|
- Audit changes
|
||||||
|
|
||||||
|
|
||||||
=============================================================================================
|
=============================================================================================
|
||||||
|
- GPU accelerate EOFA -- DONE
|
||||||
|
- LinalgUtils ssp loop not offloaded -- DONE
|
||||||
|
- coalescedRead <- threadIdx.x -- DONE
|
||||||
|
- Stencil.h : Thread loops in exchange code. Need to offload these -- DONE ; pending debug
|
||||||
|
- Mobius/Domain EOFA cache header implementaiotn has thread_loop -- DONE ; pending test
|
||||||
|
- Differentiate non-temporal coalescedWrite from temporal -- DONE
|
||||||
|
|
||||||
- Clean up PRAGMAS, and SIMT_loop -- DONE
|
- Clean up PRAGMAS, and SIMT_loop -- DONE
|
||||||
thread_loop interface revisit.
|
thread_loop interface revisit.
|
||||||
_foreach
|
_foreach
|
||||||
|
Loading…
x
Reference in New Issue
Block a user