mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
commit
80fd6ab407
@ -34,6 +34,12 @@
|
|||||||
#define __SYCL__REDEFINE__
|
#define __SYCL__REDEFINE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* HIP save and restore compile environment*/
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#pragma push
|
||||||
|
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#endif
|
||||||
|
#define EIGEN_NO_HIP
|
||||||
|
|
||||||
#include <Grid/Eigen/Dense>
|
#include <Grid/Eigen/Dense>
|
||||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||||
@ -52,6 +58,12 @@
|
|||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*HIP restore*/
|
||||||
|
#ifdef __HIP__REDEFINE__
|
||||||
|
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#pragma pop
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
@ -138,21 +138,6 @@ public:
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int xmit_to_rank,
|
|
||||||
void *recv,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
|
||||||
|
|
||||||
double StencilSendToRecvFrom(void *xmit,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
|
@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
|||||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||||
|
|
||||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Basic Halo comms primitive -- should never call in single node
|
// Basic Halo comms primitive -- should never call in single node
|
||||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int dest,
|
|
||||||
void *recv,
|
|
||||||
int from,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(in,out,bytes*words);
|
bcopy(in,out,bytes*words);
|
||||||
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
std::vector<CommsRequest_t> list;
|
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
SendToRecvFromComplete(list);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::StencilBarrier(void){};
|
void CartesianCommunicator::StencilBarrier(void){};
|
||||||
|
@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#include <hip/hip_runtime_api.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#define header "SharedMemoryMpi: "
|
#define header "SharedMemoryMpi: "
|
||||||
@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_CUDA
|
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifndef GRID_MPI3_SHM_NONE
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
|
||||||
#else
|
|
||||||
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
|
|
||||||
#endif
|
|
||||||
if ( err != cudaSuccess) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
if ( WorldRank == 0 ){
|
if ( WorldRank == 0 ){
|
||||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
|
||||||
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
}
|
}
|
||||||
SharedMemoryZero(ShmCommBuf,bytes);
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_CUDA
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
|
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipIpcMemHandle_t handle;
|
||||||
|
if ( r==WorldShmRank ) {
|
||||||
|
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// Share this IPC handle across the Shm Comm
|
// Share this IPC handle across the Shm Comm
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If I am not the source, overwrite thisBuf with remote buffer
|
// If I am not the source, overwrite thisBuf with remote buffer
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * thisBuf = ShmCommBuf;
|
void * thisBuf = ShmCommBuf;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
if ( r!=WorldShmRank ) {
|
if ( r!=WorldShmRank ) {
|
||||||
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
if ( r!=WorldShmRank ) {
|
||||||
|
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Save a copy of the device buffers
|
// Save a copy of the device buffers
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
|
auto tmp =ret_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs_t);
|
mac(&tmp,&lhs_t,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs);
|
mac(&tmp,&lhs_t,&rhs);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
mac(&tmp,&lhs,&rhs_t);
|
mac(&tmp,&lhs,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
extern hipDeviceProp_t *gpu_props;
|
extern hipDeviceProp_t *gpu_props;
|
||||||
|
#define WARP_SIZE 64
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
extern cudaDeviceProp *gpu_props;
|
extern cudaDeviceProp *gpu_props;
|
||||||
|
#define WARP_SIZE 32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
|
||||||
__device__ unsigned int retirementCount = 0;
|
__device__ unsigned int retirementCount = 0;
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
|
|
||||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
const Iterator VEC = WARP_SIZE;
|
const Iterator VEC = WARP_SIZE;
|
||||||
const Iterator vid = tid & (VEC-1);
|
const Iterator vid = tid & (VEC-1);
|
||||||
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
beta += temp;
|
beta += temp;
|
||||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
beta = Zero();
|
beta = Zero();
|
||||||
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
}
|
}
|
||||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic Nc kernels
|
// Generic Nc kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
template<int Naik>
|
||||||
|
static accelerator_inline
|
||||||
void DhopSiteGeneric(StencilView &st,
|
void DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericInt(StencilView &st,
|
void DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericExt(StencilView &st,
|
void DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Nc=3 specific kernels
|
// Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHand(StencilView &st,
|
void DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandInt(StencilView &st,
|
void DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandExt(StencilView &st,
|
void DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Asm Nc=3 specific kernels
|
// Asm Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void DhopSiteAsm(StencilView &st,
|
void DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
|
|
||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
for (int s=0;s<Ls;s++) {
|
for (int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
|
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
|
@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
|
@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
|||||||
// Int, Ext, Int+Ext cases for comms overlap
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
// Only contributions from interior of our node
|
// Only contributions from interior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
// Only contributions from exterior of our node
|
// Only contributions from exterior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Driving / wrapping routine to select right kernel
|
// Driving / wrapping routine to select right kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
|
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
|
||||||
{
|
{
|
||||||
@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
|
|||||||
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
|
||||||
});
|
});
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
}
|
}
|
||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_RESULT_EXT(ss,F)
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
nmu = 0; \
|
nmu = 0; \
|
||||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT_EXT(ss);
|
HAND_RESULT_EXT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// All legs kernels ; comms then compute
|
// All legs kernels ; comms then compute
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
coalescedWrite(out[sF],result,lane);
|
coalescedWrite(out[sF],result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
coalescedWrite(out[sF], result,lane);
|
coalescedWrite(out[sF], result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
};
|
};
|
||||||
|
|
||||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||||
template <class Impl> \
|
template <class Impl> accelerator_inline \
|
||||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||||
{ \
|
{ \
|
||||||
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
|||||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
|
@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
||||||
|
typedef struct { uint16_t x;} half;
|
||||||
|
#endif
|
||||||
|
typedef struct Half2_t { half x; half y; } Half2;
|
||||||
|
|
||||||
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
|
#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
|
||||||
|
|
||||||
template<class pair>
|
template<class pair>
|
||||||
@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const
|
|||||||
}
|
}
|
||||||
|
|
||||||
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
|
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
|
||||||
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
|
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2);
|
||||||
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
|
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
|
||||||
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
|
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
|
||||||
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
|
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
|
||||||
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
|
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
|
||||||
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
|
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
|
||||||
|
|
||||||
typedef GpuComplex<half2 > GpuComplexH;
|
typedef GpuComplex<Half2 > GpuComplexH;
|
||||||
typedef GpuComplex<float2 > GpuComplexF;
|
typedef GpuComplex<float2 > GpuComplexF;
|
||||||
typedef GpuComplex<double2> GpuComplexD;
|
typedef GpuComplex<double2> GpuComplexD;
|
||||||
|
|
||||||
@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
|
|||||||
accelerator_inline float half2float(half h)
|
accelerator_inline float half2float(half h)
|
||||||
{
|
{
|
||||||
float f;
|
float f;
|
||||||
#ifdef GRID_SIMT
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
f = __half2float(h);
|
f = __half2float(h);
|
||||||
#else
|
#else
|
||||||
//f = __half2float(h);
|
|
||||||
__half_raw hr(h);
|
|
||||||
Grid_half hh;
|
Grid_half hh;
|
||||||
hh.x = hr.x;
|
hh.x = hr.x;
|
||||||
f= sfw_half_to_float(hh);
|
f= sfw_half_to_float(hh);
|
||||||
@ -161,13 +164,11 @@ accelerator_inline float half2float(half h)
|
|||||||
accelerator_inline half float2half(float f)
|
accelerator_inline half float2half(float f)
|
||||||
{
|
{
|
||||||
half h;
|
half h;
|
||||||
#ifdef GRID_SIMT
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
h = __float2half(f);
|
h = __float2half(f);
|
||||||
#else
|
#else
|
||||||
Grid_half hh = sfw_float_to_half(f);
|
Grid_half hh = sfw_float_to_half(f);
|
||||||
__half_raw hr;
|
h.x = hh.x;
|
||||||
hr.x = hh.x;
|
|
||||||
h = __half(hr);
|
|
||||||
#endif
|
#endif
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
@ -523,7 +524,7 @@ namespace Optimization {
|
|||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Single / Half
|
// Single / Half
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
|
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
|
||||||
int N = GpuVectorCF::N;
|
int N = GpuVectorCF::N;
|
||||||
GpuVectorCH h;
|
GpuVectorCH h;
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
|
@ -59,6 +59,7 @@ void acceleratorInit(void)
|
|||||||
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
|
printf("AcceleratorCudaInit[%d]: ========================\n",rank);
|
||||||
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||||
|
|
||||||
|
|
||||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
||||||
GPU_PROP(managedMemory);
|
GPU_PROP(managedMemory);
|
||||||
GPU_PROP(isMultiGpuBoard);
|
GPU_PROP(isMultiGpuBoard);
|
||||||
@ -113,20 +114,24 @@ void acceleratorInit(void)
|
|||||||
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
|
||||||
|
|
||||||
|
printf("world_rank %d has %d devices\n",world_rank,nDevices);
|
||||||
|
size_t totalDeviceMem=0;
|
||||||
for (int i = 0; i < nDevices; i++) {
|
for (int i = 0; i < nDevices; i++) {
|
||||||
|
|
||||||
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
|
||||||
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
|
||||||
|
|
||||||
hipGetDeviceProperties(&gpu_props[i], i);
|
hipGetDeviceProperties(&gpu_props[i], i);
|
||||||
|
hipDeviceProp_t prop;
|
||||||
|
prop = gpu_props[i];
|
||||||
|
totalDeviceMem = prop.totalGlobalMem;
|
||||||
if ( world_rank == 0) {
|
if ( world_rank == 0) {
|
||||||
hipDeviceProp_t prop;
|
|
||||||
prop = gpu_props[i];
|
|
||||||
printf("AcceleratorHipInit: ========================\n");
|
printf("AcceleratorHipInit: ========================\n");
|
||||||
printf("AcceleratorHipInit: Device Number : %d\n", i);
|
printf("AcceleratorHipInit: Device Number : %d\n", i);
|
||||||
printf("AcceleratorHipInit: ========================\n");
|
printf("AcceleratorHipInit: ========================\n");
|
||||||
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
|
printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
|
||||||
|
|
||||||
|
GPU_PROP_FMT(totalGlobalMem,"%lu");
|
||||||
// GPU_PROP(managedMemory);
|
// GPU_PROP(managedMemory);
|
||||||
GPU_PROP(isMultiGpuBoard);
|
GPU_PROP(isMultiGpuBoard);
|
||||||
GPU_PROP(warpSize);
|
GPU_PROP(warpSize);
|
||||||
@ -135,6 +140,7 @@ void acceleratorInit(void)
|
|||||||
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
// GPU_PROP(singleToDoublePrecisionPerfRatio);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
|
||||||
#undef GPU_PROP_FMT
|
#undef GPU_PROP_FMT
|
||||||
#undef GPU_PROP
|
#undef GPU_PROP
|
||||||
#ifdef GRID_IBM_SUMMIT
|
#ifdef GRID_IBM_SUMMIT
|
||||||
|
@ -307,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
|||||||
|
|
||||||
inline void *acceleratorAllocShared(size_t bytes)
|
inline void *acceleratorAllocShared(size_t bytes)
|
||||||
{
|
{
|
||||||
#if 0
|
|
||||||
void *ptr=NULL;
|
void *ptr=NULL;
|
||||||
auto err = hipMallocManaged((void **)&ptr,bytes);
|
auto err = hipMallocManaged((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err));
|
printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
#else
|
|
||||||
return malloc(bytes);
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||||
|
|
||||||
@ -327,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
auto err = hipMalloc((void **)&ptr,bytes);
|
auto err = hipMalloc((void **)&ptr,bytes);
|
||||||
if( err != hipSuccess ) {
|
if( err != hipSuccess ) {
|
||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err));
|
printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err));
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
|
@ -36,12 +36,12 @@ int main (int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
#define LMAX (48)
|
#define LMAX (40)
|
||||||
#define LMIN (8)
|
#define LMIN (8)
|
||||||
#define LADD (8)
|
#define LADD (8)
|
||||||
|
|
||||||
int64_t Nwarm=50;
|
int64_t Nwarm=10;
|
||||||
int64_t Nloop=500;
|
int64_t Nloop=100;
|
||||||
|
|
||||||
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
Coordinate mpi_layout = GridDefaultMpi();
|
Coordinate mpi_layout = GridDefaultMpi();
|
||||||
@ -118,6 +118,41 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z=z+ x*y"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
|
||||||
|
|
||||||
|
for(int lat=LMIN;lat<=LMAX;lat+=LADD){
|
||||||
|
|
||||||
|
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
|
||||||
|
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
|
||||||
|
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
|
|
||||||
|
LatticeColourMatrix z(&Grid); random(pRNG,z);
|
||||||
|
LatticeColourMatrix x(&Grid); random(pRNG,x);
|
||||||
|
LatticeColourMatrix y(&Grid); random(pRNG,y);
|
||||||
|
|
||||||
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
|
z=z+x*y;
|
||||||
|
}
|
||||||
|
double start=usecond();
|
||||||
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
|
z=z+x*y;
|
||||||
|
}
|
||||||
|
double stop=usecond();
|
||||||
|
double time = (stop-start)/Nloop*1000.0;
|
||||||
|
|
||||||
|
double bytes=4*vol*Nc*Nc*sizeof(Complex);
|
||||||
|
double flops=Nc*Nc*(6+8+8)*vol;
|
||||||
|
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 mult(z,x,y)"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
@ -143,7 +178,6 @@ int main (int argc, char ** argv)
|
|||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int64_t i=0;i<Nloop;i++){
|
for(int64_t i=0;i<Nloop;i++){
|
||||||
mult(z,x,y);
|
mult(z,x,y);
|
||||||
// mac(z,x,y);
|
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
double time = (stop-start)/Nloop*1000.0;
|
double time = (stop-start)/Nloop*1000.0;
|
||||||
|
@ -187,7 +187,8 @@ int main (int argc, char ** argv)
|
|||||||
auto xx = coalescedRead(x_v[ss]);
|
auto xx = coalescedRead(x_v[ss]);
|
||||||
auto yy = coalescedRead(y_v[ss]);
|
auto yy = coalescedRead(y_v[ss]);
|
||||||
auto zz = coalescedRead(z_v[ss]);
|
auto zz = coalescedRead(z_v[ss]);
|
||||||
zz = zz+xx*yy;
|
//zz = zz+xx*yy;
|
||||||
|
mac(&zz,&xx,&yy);
|
||||||
coalescedWrite(z_v[ss],zz);
|
coalescedWrite(z_v[ss],zz);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -330,12 +330,18 @@ case ${CXXTEST} in
|
|||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
hipcc)
|
hipcc)
|
||||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
# CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr"
|
||||||
|
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
|
||||||
CXXLD=${CXX}
|
CXXLD=${CXX}
|
||||||
if test $ac_openmp = yes; then
|
if test $ac_openmp = yes; then
|
||||||
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
|
CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp"
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
dpcpp)
|
||||||
|
LDFLAGS="$LDFLAGS"
|
||||||
|
CXXFLAGS="$CXXFLAGS"
|
||||||
|
CXXLD=${CXX}
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
CXXLD=${CXX}
|
CXXLD=${CXX}
|
||||||
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
|
CXXFLAGS="$CXXFLAGS -fno-strict-aliasing"
|
||||||
|
@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu
|
|||||||
|
|
||||||
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
|
This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging:
|
||||||
|
|
||||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none
|
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug
|
||||||
|
|
||||||
#### 2. `Release`
|
#### 2. `Release`
|
||||||
|
|
||||||
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
|
Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation):
|
||||||
|
|
||||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none
|
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release
|
||||||
|
|
||||||
#### 3. `MPIDebug`
|
#### 3. `MPIDebug`
|
||||||
|
|
||||||
Debug configuration with MPI:
|
Debug configuration with MPI:
|
||||||
|
|
||||||
../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx
|
../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug
|
||||||
|
|
||||||
### 5.3 Build Grid
|
### 5.3 Build Grid
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user