mirror of https://github.com/paboyle/Grid.git (synced 2025-06-22 17:52:02 +01:00)

Compare commits: 913fbca74a ... 477ebf24f4 (6 commits)

Commits (SHA1):
477ebf24f4
0d5639f707
413312f9a9
03508448f8
e1e5c75023
9296299b61
@@ -117,19 +117,19 @@ public:
   typedef decltype(coalescedRead(*in))   sobj;
   typedef decltype(coalescedRead(*out0)) hsobj;

-  unsigned int Nsimd = vobj::Nsimd();
+  constexpr unsigned int Nsimd = vobj::Nsimd();
   unsigned int mask = Nsimd >> (type + 1);
   int lane = acceleratorSIMTlane(Nsimd);
   int j0 = lane &(~mask); // inner coor zero
   int j1 = lane |(mask) ; // inner coor one
-  const vobj *vp0 = &in[k];
-  const vobj *vp1 = &in[m];
-  const vobj *vp = (lane&mask) ? vp1:vp0;
-  auto sa = coalescedRead(*vp,j0);
-  auto sb = coalescedRead(*vp,j1);
+  const vobj *vp0 = &in[k];               // out0[j] = merge low bit of type from in[k] and in[m]
+  const vobj *vp1 = &in[m];               // out1[j] = merge hi bit of type from in[k] and in[m]
+  const vobj *vp = (lane&mask) ? vp1:vp0; // if my lane has high bit take vp1, low bit take vp0
+  auto sa = coalescedRead(*vp,j0);        // lane to read for out 0, NB 50% read coalescing
+  auto sb = coalescedRead(*vp,j1);        // lane to read for out 1
   hsobj psa, psb;
-  projector::Proj(psa,sa,mu,dag);
-  projector::Proj(psb,sb,mu,dag);
+  projector::Proj(psa,sa,mu,dag); // spin project the result0
+  projector::Proj(psb,sb,mu,dag); // spin project the result1
   coalescedWrite(out0[j],psa);
   coalescedWrite(out1[j],psb);
 #else
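The comments added above document a lane-pairing trick: `mask` isolates one bit of the SIMT lane index, and every lane reconstructs the pair `(j0, j1)` that differs only in that bit, so lanes `l` and `l^mask` read the same two source elements and split them between `out0` and `out1`. A standalone sketch of the arithmetic, not Grid code, with `Nsimd = 8` assumed for the demo:

```cpp
// Illustration of the lane pairing used by the exchange above: lanes l and
// l^mask agree on (j0, j1) and split the two reads between out0/out1.
#include <cstdio>

int main() {
  const unsigned Nsimd = 8;                 // assumed vector width for the demo
  for (unsigned type = 0; type < 3; type++) {
    unsigned mask = Nsimd >> (type + 1);    // the bit being exchanged at this level
    printf("type %u, mask 0x%x:\n", type, mask);
    for (unsigned lane = 0; lane < Nsimd; lane++) {
      unsigned j0 = lane & ~mask;           // inner coordinate zero
      unsigned j1 = lane |  mask;           // inner coordinate one
      printf("  lane %u: j0=%u j1=%u source=%s\n",
             lane, j0, j1, (lane & mask) ? "vp1" : "vp0");
    }
  }
  return 0;
}
```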
@@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
 ///////////////////////////////////////////////////////////////////
 template<class cobj,class vobj,class compressor>
 void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-                                 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
+                                 commVector<cobj *> pointers,
+                                 int dimension,int plane,
+                                 int cbmask,compressor &compress,int type) __attribute__((noinline));

 template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
-                                 Vector<cobj *> pointers,int dimension,int plane,int cbmask,
+void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
+                                 const Lattice<vobj> &rhs,
+                                 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
                                  compressor &compress,int type)
 {
   assert( (table.size()&0x1)==0);
@@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane

   auto rhs_v = rhs.View(AcceleratorRead);
+  auto rhs_p = &rhs_v[0];
   auto p0=&pointers[0][0];
   auto p1=&pointers[1][0];
   auto tp=&table[0];
   accelerator_forNB(j, num, vobj::Nsimd(), {
-    compress.CompressExchange(p0,p1, &rhs_v[0], j,
+    compress.CompressExchange(p0,p1, rhs_p, j,
                               so+tp[2*j  ].second,
                               so+tp[2*j+1].second,
                               type);
   });
   rhs_v.ViewClose();
 }
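The only functional change in this hunk is hoisting `rhs_p = &rhs_v[0]` out of the device lambda, so the kernel captures a raw pointer by value rather than indexing the view object inside the loop body. A minimal host-side sketch of the pattern, not Grid's API, with a plain loop standing in for `accelerator_forNB`:

```cpp
// Sketch of the pointer-hoisting pattern: the kernel lambda captures by
// value, so capturing a raw pointer taken once on the host keeps the
// capture trivially copyable for offload.
#include <cstdio>
#include <vector>

struct View {                              // stand-in for a LatticeView
  std::vector<double> data;
  double &operator[](size_t i) { return data[i]; }
};

int main() {
  View rhs_v{std::vector<double>(8, 1.0)};
  double *rhs_p = &rhs_v[0];               // hoisted once, like `auto rhs_p = &rhs_v[0];`
  auto kernel = [=](int j) {               // capture-by-value, as a device lambda would
    return 2.0 * rhs_p[j];                 // plain pointer arithmetic inside the kernel
  };
  for (int j = 0; j < 8; j++) printf("%g ", kernel(j));
  printf("\n");
  return 0;
}
```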
@@ -230,8 +234,8 @@ public:
 };
 struct Merge {
   cobj * mpointer;
-  Vector<scalar_object *> rpointers;
-  Vector<cobj *> vpointers;
+  //  std::vector<scalar_object *> rpointers;
+  std::vector<cobj *> vpointers;
   Integer buffer_size;
   Integer type;
 };
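`Merge` now keeps its pointer table in `std::vector`; the same `Vector` to `std::vector` swap recurs in `AddMerge` and in the `rpointers`/`spointers` buffers further down. The apparent rationale, hedged: Grid's `Vector` allocates device-visible (unified) storage, while these pointer tables are only walked on the host when staging communications, so a plain `std::vector` keeps them out of unified memory. A SYCL sketch of that split, with illustrative names:

```cpp
// Hedged sketch of the host-metadata / device-payload split implied by the
// Vector -> std::vector changes in this diff: only buffers the kernels
// touch need USM; pointer tables staged on the host can be std::vector.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q;
  const size_t n = 1024;
  float *buf = sycl::malloc_shared<float>(n, q);       // device-visible payload
  std::vector<float *> pointers = {buf, buf + n / 2};  // host-only metadata
  q.parallel_for(sycl::range<1>(n),
                 [=](sycl::id<1> i) { buf[i] = 1.0f; }).wait();
  (void)pointers;                                      // consumed on the host only
  sycl::free(buf, q);
  return 0;
}
```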
@@ -406,6 +410,7 @@ public:
       comms_bytes+=bytes;
       shm_bytes  +=2*Packets[i].bytes-bytes;
     }
+    _grid->StencilBarrier();// Synch shared memory on a single nodes
   }

   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
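The added `StencilBarrier()` closes an ordering gap: when a neighbour's halo is delivered by writing directly into node-shared memory rather than through MPI, every rank on the node must reach the barrier before any rank consumes those buffers. A generic illustration with two threads standing in for two ranks on one node (hedged: Grid's barrier is over the node's MPI ranks, not threads):

```cpp
// Two threads stand in for two ranks sharing a node; std::barrier stands in
// for _grid->StencilBarrier(): the reader must not consume the shared halo
// buffer until the writer's stores are complete.
#include <barrier>
#include <cstdio>
#include <thread>

int main() {
  double shm_buf[4] = {};            // stand-in for the shared-memory halo buffer
  std::barrier sync(2);              // stand-in for StencilBarrier()
  std::thread writer([&] {
    for (int i = 0; i < 4; i++) shm_buf[i] = i + 1.0; // "send" via shared memory
    sync.arrive_and_wait();          // publish before anyone reads
  });
  std::thread reader([&] {
    sync.arrive_and_wait();          // wait for the neighbour's writes
    for (int i = 0; i < 4; i++) printf("%g ", shm_buf[i]);
    printf("\n");
  });
  writer.join();
  reader.join();
  return 0;
}
```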
@@ -420,7 +425,7 @@ public:
   ////////////////////////////////////////////////////////////////////////
   void Communicate(void)
   {
-    if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){
+    if ( 0 ){
       thread_region {
         // must be called in parallel region
         int mythread = thread_num();
@@ -569,7 +574,7 @@ public:
     d.buffer_size = buffer_size;
     dv.push_back(d);
   }
-  void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
+  void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
     Merge m;
     m.type      = type;
     m.mpointer  = merge_p;
@@ -582,6 +587,7 @@ public:
   }
   template<class decompressor> void CommsMergeSHM(decompressor decompress) {
     mpi3synctime-=usecond();
+    accelerator_barrier();
     _grid->StencilBarrier();// Synch shared memory on a single nodes
     mpi3synctime+=usecond();
     shmmergetime-=usecond();
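`accelerator_barrier()` is added before the node barrier because the gather kernels are launched with the non-blocking `accelerator_forNB`; the device queue has to drain before other ranks may be told the shared buffers are ready. In SYCL terms, a sketch with a local queue standing in for `theGridAccelerator`:

```cpp
// Sketch of the ordering CommsMergeSHM now enforces: drain the device queue
// (accelerator_barrier) before entering the node-level barrier.
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;                                   // stands in for *theGridAccelerator
  float *buf = sycl::malloc_shared<float>(16, q);
  q.parallel_for(sycl::range<1>(16),
                 [=](sycl::id<1> i) { buf[i] = 1.0f; }); // async, like accelerator_forNB
  q.wait();                                        // accelerator_barrier(): kernels done
  // ... only now enter StencilBarrier() so neighbours may read buf ...
  sycl::free(buf, q);
  return 0;
}
```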
@@ -1114,8 +1120,8 @@ public:
     int bytes = (reduced_buffer_size*datum_bytes)/simd_layout;
     assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);

-    Vector<cobj *> rpointers(maxl);
-    Vector<cobj *> spointers(maxl);
+    std::vector<cobj *> rpointers(maxl);
+    std::vector<cobj *> spointers(maxl);

     ///////////////////////////////////////////
     // Work out what to send where
@@ -298,14 +298,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       });                                                  \
     });

-#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }
+#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

-inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
+inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
 inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes);}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
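Both edits in this hunk simply remove debug `printf`s from the synchronisation wrappers. For orientation, a hedged standalone sketch of what this wrapper set boils down to in plain SYCL, with local queues standing in for the global `theGridAccelerator` and `theCopyAccelerator`:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue compute;                                           // *theGridAccelerator
  sycl::queue copy(compute.get_context(), compute.get_device()); // *theCopyAccelerator
  const size_t n = 256;
  std::vector<float> host(n, 3.0f);
  float *dev = sycl::malloc_device<float>(n, compute);       // acceleratorAllocDevice
  copy.memcpy(dev, host.data(), n * sizeof(float)).wait();   // acceleratorCopyToDevice
  compute.parallel_for(sycl::range<1>(n),
                       [=](sycl::id<1> i) { dev[i] *= 2.0f; });
  compute.wait();                                            // accelerator_barrier(), now silent
  copy.memcpy(host.data(), dev, n * sizeof(float)).wait();   // acceleratorCopyFromDevice
  sycl::free(dev, compute);
  return 0;
}
```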
benchmarks/Benchmark_halo.cc (new file, 131 lines)
@@ -0,0 +1,131 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid
+    Source file: ./benchmarks/Benchmark_dwf.cc
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#include <Grid/Grid.h>
+#ifdef GRID_CUDA
+#define CUDA_PROFILE
+#endif
+
+#ifdef CUDA_PROFILE
+#include <cuda_profiler_api.h>
+#endif
+
+using namespace std;
+using namespace Grid;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+Gamma::Algebra Gmu [] = {
+  Gamma::Algebra::GammaX,
+  Gamma::Algebra::GammaY,
+  Gamma::Algebra::GammaZ,
+  Gamma::Algebra::GammaT
+};
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  Coordinate latt4= GridDefaultLatt();
+  Coordinate mpi  = GridDefaultMpi();
+  Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
+
+  GridLogLayout();
+
+  int Ls=16;
+  for(int i=0;i<argc;i++)
+    if(std::string(argv[i]) == "-Ls"){
+      std::stringstream ss(argv[i+1]); ss >> Ls;
+    }
+
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
+  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
+  GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG"));
+  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
+  GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG"));
+  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
+
+  LatticeFermionF src   (FGrid); random(RNG5,src);
+  RealD N2 = 1.0/::sqrt(norm2(src));
+  src = src*N2;
+
+  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
+  LatticeGaugeFieldF Umu(UGrid);
+  SU<Nc>::HotConfiguration(RNG4,Umu);
+  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  RealD NP = UGrid->_Nprocessors;
+  RealD NN = UGrid->NodeCount();
+
+  DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  const int ncall = 500;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt         "<<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  {
+    typename DomainWallFermionF::Compressor compressor(0);
+    FGrid->Barrier();
+    Dw.Stencil.HaloExchangeOptGather(src,compressor);
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Stencil.HaloExchangeOptGather(src,compressor);
+    }
+    double t1=usecond();
+    FGrid->Barrier();
+
+    double bytes=0.0;
+    if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
+    if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
+    if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
+    if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
+    bytes = bytes * Ls * 8.* (24.+12.)* 2.0;
+
+    std::cout<<GridLogMessage << "Gather us /call = "<< (t1-t0)/ncall<<std::endl;
+    std::cout<<GridLogMessage << "Gather MBs /call = "<< bytes*ncall/(t1-t0)<<std::endl;
+
+  }
+
+  Grid_finalize();
+  exit(0);
+}
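The benchmark's traffic estimate sums the global face volumes of every partitioned direction and scales them by the factors in the source; note that the printed "Gather MBs /call" is bytes times calls divided by elapsed microseconds, i.e. a rate in MB/s rather than a per-call volume. A worked instance for the `--grid 32.32.32.64 --mpi 1.1.1.2` run line used in the job script later in this diff (the meaning of the individual factors is inferred, hedged):

```cpp
// Worked instance of the bytes formula above for --grid 32.32.32.64
// --mpi 1.1.1.2 (only mpi[3] partitioned) and the default Ls = 16.
#include <cstdio>

int main() {
  double bytes = 32.0 * 32.0 * 32.0;           // latt4[0]*latt4[1]*latt4[2], the t-face
  bytes = bytes * 16 * 8. * (24. + 12.) * 2.0; // same scaling as Benchmark_halo.cc
  printf("bytes/call = %.0f (~%.0f MB)\n", bytes, bytes / 1.0e6); // ~302 MB
  // bytes*ncall/(t1-t0) with usecond() timings is bytes per microsecond = MB/s.
  return 0;
}
```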
@@ -1,6 +1,7 @@
 #!/bin/bash
 ##SBATCH -p PVC-SPR-QZEH
 ##SBATCH -p PVC-ICX-QZNW
+
 #SBATCH -p QZ1J-ICX-PVC

 source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
@@ -19,8 +20,14 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
 export I_MPI_OFFLOAD_CELL=tile
 export EnableImplicitScaling=0
 export EnableWalkerPartition=0
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

-mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
-mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.1x2.log
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.2x1.log
+
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
@@ -7,8 +7,8 @@ echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK

 if [ $MPI_LOCALRANKID = "0" ]
 then
-#  ~psteinbr/build_pti/ze_tracer -c $@
-  onetrace --chrome-kernel-timeline $@
+#  ~psteinbr/build_pti/ze_tracer -h $@
+  onetrace --chrome-device-timeline $@
 else
   $@
 fi