mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-18 07:47:06 +01:00
Working simple OpenMP offloading with cudaMallocManaged; cshift not working
This commit is contained in:
@ -266,7 +266,8 @@ protected:
|
|||||||
T = Zero();
|
T = Zero();
|
||||||
autoView(T_v,T,AcceleratorWrite);
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
autoView(F_v,F,AcceleratorRead);
|
autoView(F_v,F,AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
||||||
@ -284,7 +285,8 @@ protected:
|
|||||||
|
|
||||||
autoView(T_v, T,AcceleratorWrite);
|
autoView(T_v, T,AcceleratorWrite);
|
||||||
autoView(F_v, F,AcceleratorRead);
|
autoView(F_v, F,AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -F_v[i]()();
|
T_v[i]()(0, 1) = -F_v[i]()();
|
||||||
T_v[i]()(1, 0) = F_v[i]()();
|
T_v[i]()(1, 0) = F_v[i]()();
|
||||||
@ -302,7 +304,8 @@ protected:
|
|||||||
|
|
||||||
autoView(T_v,T,AcceleratorWrite);
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
autoView(F_v,F,AcceleratorRead);
|
autoView(F_v,F,AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
||||||
@ -320,7 +323,8 @@ protected:
|
|||||||
|
|
||||||
autoView( T_v , T, AcceleratorWrite);
|
autoView( T_v , T, AcceleratorWrite);
|
||||||
autoView( F_v , F, AcceleratorRead);
|
autoView( F_v , F, AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
||||||
@ -338,7 +342,8 @@ protected:
|
|||||||
|
|
||||||
autoView( T_v ,T,AcceleratorWrite);
|
autoView( T_v ,T,AcceleratorWrite);
|
||||||
autoView( F_v ,F,AcceleratorRead);
|
autoView( F_v ,F,AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = (F_v[i]()());
|
T_v[i]()(1, 0) = (F_v[i]()());
|
||||||
@ -357,7 +362,8 @@ protected:
|
|||||||
|
|
||||||
autoView( T_v , T,AcceleratorWrite);
|
autoView( T_v , T,AcceleratorWrite);
|
||||||
autoView( F_v , F,AcceleratorRead);
|
autoView( F_v , F,AcceleratorRead);
|
||||||
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
int size=CloverTerm.Grid()->oSites();
|
||||||
|
accelerator_for(i, size,1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
||||||
|
@ -110,8 +110,10 @@ public:
|
|||||||
autoView( out_v, out, AcceleratorWrite);
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
autoView( phi_v, phi, AcceleratorRead);
|
autoView( phi_v, phi, AcceleratorRead);
|
||||||
autoView( Umu_v, Umu, AcceleratorRead);
|
autoView( Umu_v, Umu, AcceleratorRead);
|
||||||
|
int size=out.Grid()->oSites();
|
||||||
|
|
||||||
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
|
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
|
||||||
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
|
accelerator_for(sss,size,Nsimd,{
|
||||||
calcSpinor tmp;
|
calcSpinor tmp;
|
||||||
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
|
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
|
||||||
coalescedWrite(out_v[sss],tmp);
|
coalescedWrite(out_v[sss],tmp);
|
||||||
@ -203,7 +205,8 @@ public:
|
|||||||
autoView( tmp_v , tmp, AcceleratorWrite);
|
autoView( tmp_v , tmp, AcceleratorWrite);
|
||||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
accelerator_for(sss,tmp.Grid()->oSites(),1,{
|
int size=tmp.Grid()->oSites();
|
||||||
|
accelerator_for(sss,size,1,{
|
||||||
int sU=sss;
|
int sU=sss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
@ -217,7 +220,8 @@ public:
|
|||||||
const int Nsimd = SiteSpinor::Nsimd();
|
const int Nsimd = SiteSpinor::Nsimd();
|
||||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
|
int size=mat.Grid()->oSites();
|
||||||
|
accelerator_for(sss,size,Nsimd,{
|
||||||
int sU=sss;
|
int sU=sss;
|
||||||
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
||||||
ColorMatrixType sum;
|
ColorMatrixType sum;
|
||||||
|
@ -88,7 +88,8 @@ public:
|
|||||||
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
|
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
|
||||||
autoView(U_v,U,AcceleratorWrite);
|
autoView(U_v,U,AcceleratorWrite);
|
||||||
autoView(W_v,W,AcceleratorRead);
|
autoView(W_v,W,AcceleratorRead);
|
||||||
accelerator_for( ss, U.Grid()->oSites(), 1, {
|
int size=U.Grid()->oSites();
|
||||||
|
accelerator_for( ss, size, 1, {
|
||||||
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
|
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -133,7 +134,8 @@ public:
|
|||||||
//auto start = std::chrono::high_resolution_clock::now();
|
//auto start = std::chrono::high_resolution_clock::now();
|
||||||
autoView(U_v,U,AcceleratorWrite);
|
autoView(U_v,U,AcceleratorWrite);
|
||||||
autoView(P_v,P,AcceleratorRead);
|
autoView(P_v,P,AcceleratorRead);
|
||||||
accelerator_for(ss, P.Grid()->oSites(),1,{
|
int size=P.Grid()->oSites();
|
||||||
|
accelerator_for(ss, size,1,{
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
|
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
|
||||||
}
|
}
|
||||||
|
@ -28,6 +28,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
//#ifndef ACCELERATOR_H
|
||||||
|
//#define ACCELERATOR_H
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef HAVE_MALLOC_MALLOC_H
|
#ifdef HAVE_MALLOC_MALLOC_H
|
||||||
@ -434,30 +437,33 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
|
|||||||
|
|
||||||
//OpenMP Target Offloading
|
//OpenMP Target Offloading
|
||||||
#ifdef OMPTARGET
|
#ifdef OMPTARGET
|
||||||
uint32_t nt=acceleratorThreads();
|
|
||||||
|
//uint32_t gpu_threads=acceleratorThreads();
|
||||||
|
|
||||||
#define accelerator
|
// In this OpenMP-target section the `accelerator` qualifier expands to nothing.
#define accelerator
|
||||||
#define accelerator_inline strong_inline
|
// In this OpenMP-target section `accelerator_inline` is an alias for strong_inline (defined elsewhere).
#define accelerator_inline strong_inline
|
||||||
#define accelerator_for(iterator,num,nsimd, ... ) \
|
#define accelerator_for(i,num,nsimd, ... ) \
|
||||||
DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt)) \
|
_Pragma("omp target teams distribute parallel for") \
|
||||||
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||||
#define accelerator_forNB(iterator,num,nsimd, ... ) \
|
#define accelerator_forNB(i,num,nsimd, ... ) \
|
||||||
DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt) nowait) \
|
_Pragma("omp target teams distribute parallel for nowait") \
|
||||||
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
||||||
#define accelerator_barrier(dummy) DO_PRAGMA(omp barrier)
|
// Expands to `#pragma omp barrier`; the `dummy` argument is ignored.
#define accelerator_barrier(dummy) _Pragma("omp barrier")
|
||||||
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
|
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
|
||||||
DO_PRAGMA(omp target teams distribute parallel for thread_limit(nt) collapse(2)) \
|
_Pragma("omp target teams distribute parallel for collapse(2)") \
|
||||||
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
|
for ( uint64_t iter1=0;iter1<num1;iter1++) \
|
||||||
|
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
|
||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
// SIMT lane query (a CUDA-specific notion): this OpenMP-target section always
// reports lane 0. Nsimd is accepted but not consulted.
accelerator_inline int acceleratorSIMTlane(int Nsimd)
{
  return 0;
}
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { DO_PRAGMA(omp target enter data map(to:from[0:bytes])}
|
// Copy `bytes` bytes from `from` into `to`.
// Device allocations in this OpenMP-target section come from memalign, i.e. ordinary
// host memory, so the transfer is a host-side copy. An empty body would leave `to`
// untouched. memmove (not memcpy) keeps the call well defined when from == to.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memmove(to,from,bytes);}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ DO_PRAGMA(omp target exit data map(from:from[0:bytes])}
|
// Copy `bytes` bytes from `from` into `to`.
// Device allocations in this OpenMP-target section come from memalign, i.e. ordinary
// host memory, so the transfer is a host-side copy. An empty body would leave `to`
// untouched. memmove (not memcpy) keeps the call well defined when from == to.
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memmove(to,from,bytes);}
|
||||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);}
|
// Device-to-device copy for the OpenMP-target section: implemented as a host memcpy
// of `bytes` bytes from `from` to `to`, so the data is in place when the call returns.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
  memcpy(to,from,bytes);
}
|
||||||
inline void acceleratorCopySynchronize(void) {};
|
// Intentionally empty: this section issues no asynchronous copies that would need waiting on.
inline void acceleratorCopySynchronize(void)
{
}
|
||||||
|
|
||||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
// Always returns 1 in this section; `ptr` is not inspected.
inline int acceleratorIsCommunicable(void *ptr)
{
  return 1;
}
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
|
// Fill `bytes` bytes starting at `base` with `value`, using the host memset.
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
  memset(base,value,bytes);
}
|
||||||
#ifdef OMPTARGET_MANAGED
|
#ifdef OMPTARGET_MANAGED
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
inline void *acceleratorAllocShared(size_t bytes)
|
inline void *acceleratorAllocShared(size_t bytes)
|
||||||
{
|
{
|
||||||
void *ptr=NULL;
|
void *ptr=NULL;
|
||||||
@ -469,11 +475,14 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
inline void acceleratorFreeShared(void *ptr){cudaFree(ptr);};
|
// Release a shared allocation through cudaFree.
// NOTE(review): presumably pairs with the managed acceleratorAllocShared above — confirm.
inline void acceleratorFreeShared(void *ptr)
{
  cudaFree(ptr);
}
|
||||||
|
// "Device" allocation for the managed-memory branch: returns `bytes` bytes of host
// memory from memalign, aligned to GRID_ALLOC_ALIGN.
inline void *acceleratorAllocDevice(size_t bytes)
{
  return memalign(GRID_ALLOC_ALIGN,bytes);
}
|
||||||
|
// Release a "device" allocation with free().
inline void acceleratorFreeDevice(void *ptr)
{
  free(ptr);
}
|
||||||
#else
|
#else
|
||||||
inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
|
// Shared allocation for the non-managed branch: `bytes` bytes of host memory from
// memalign, aligned to GRID_ALLOC_ALIGN.
inline void *acceleratorAllocShared(size_t bytes)
{
  return memalign(GRID_ALLOC_ALIGN,bytes);
}
|
||||||
inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
|
// "Device" allocation for the non-managed branch: `bytes` bytes of host memory from
// memalign, aligned to GRID_ALLOC_ALIGN.
inline void *acceleratorAllocDevice(size_t bytes)
{
  return memalign(GRID_ALLOC_ALIGN,bytes);
}
|
||||||
inline void acceleratorFreeShared(void *ptr){free(ptr);};
|
// Release a shared allocation with free().
inline void acceleratorFreeShared(void *ptr)
{
  free(ptr);
}
|
||||||
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
|
// Release a "device" allocation with free().
inline void acceleratorFreeDevice(void *ptr)
{
  free(ptr);
}
|
||||||
|
#endif
|
||||||
|
|
||||||
//OpenMP CPU threads
|
//OpenMP CPU threads
|
||||||
#else
|
#else
|
||||||
@ -504,6 +513,7 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
|
|||||||
inline void acceleratorFreeShared(void *ptr){free(ptr);};
|
inline void acceleratorFreeShared(void *ptr){free(ptr);};
|
||||||
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
|
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // CPU target
|
#endif // CPU target
|
||||||
|
|
||||||
@ -567,3 +577,5 @@ accelerator_inline void acceleratorFence(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#define DO_PRAGMA_(x) _Pragma (#x)
|
// Turn the argument into the string literal that _Pragma requires.
// The stringify operator (#x) is essential: a parameter name written inside a string
// literal ("x") is never substituted, which would make every use expand to `#pragma x`.
#define DO_PRAGMA_(x) _Pragma (#x)
|
||||||
#define DO_PRAGMA(x) DO_PRAGMA_(x)
|
// Public entry point: forwards to DO_PRAGMA_ through one extra macro level, so any
// macros appearing in x are expanded before DO_PRAGMA_ sees the argument.
#define DO_PRAGMA(x) DO_PRAGMA_(x)
|
||||||
#define thread_num(a) omp_get_thread_num()
|
// Expands to omp_get_thread_num(); the argument `a` is ignored.
#define thread_num(a) omp_get_thread_num()
|
||||||
#define thread_max(a) omp_get_max_threads()
|
// Expands to omp_get_max_threads(); the argument `a` is ignored.
#define thread_max(a) omp_get_max_threads()
|
||||||
|
Reference in New Issue
Block a user