#include "ghost/config.h"
// ...
#include <cuda_runtime.h>
// ...
#define MAX_COLS_PER_BLOCK 16
#define MAX_COLS_PER_BLOCK_COLMAJOR 16
#define SELL_CUDA_THREADSPERBLOCK 512
// ...
#define LOCALDOT_ONTHEFLY
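/*
 * Illustrative sketch, not part of the original header: how the SELL-C arrays
 * used by the kernels below are laid out. Rows are grouped into chunks of C
 * rows; within a chunk, values and column indices are stored column-by-column,
 * so element j of row r lives at index chunkstart[r / C] + (r % C) + j * C.
 * The scalar types and the helper name are placeholders.
 */
static void sell_spmv_reference(double *y, const double *x, const double *val,
    const int *mcol, const int *chunkstart, const int *rowlen, int nrows, int C)
{
    for (int r = 0; r < nrows; r++) {
        double tmp = 0.;
        int offs = chunkstart[r / C] + r % C; /* start of row r inside its chunk */
        for (int j = 0; j < rowlen[r]; j++) {
            tmp += val[offs + j * C] * x[mcol[offs + j * C]];
        }
        y[r] = tmp; /* plain y = A*x; the kernels below add shift/scale/axpby on top */
    }
}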
template<typename m_t,
    typename v_t,
    typename v_t_b,
    int nrowsinblock,
    int C,
    int ncols,
    bool do_axpby,
    bool do_scale,
    bool do_vshift,
    bool do_dot_yy,
    bool do_dot_xy,
    bool do_dot_xx,
    bool do_chain_axpby>
__global__ void SELL_kernel_CU_rm_tmpl(v_t *const __restrict__ lhs, const ghost_lidx lhs_lda,
    const v_t *const __restrict__ rhs, const ghost_lidx rhs_lda, const ghost_spmv_flags flags,
    const ghost_lidx nrows, const ghost_lidx *const __restrict__ rowlen,
    const ghost_lidx *const __restrict__ mcol, const m_t *const __restrict__ val,
    const ghost_lidx *const __restrict__ chunkstart, const v_t *const __restrict__ shift,
    const v_t alpha, const v_t beta, v_t *const __restrict__ localdot,
    v_t *const __restrict__ z, const ghost_lidx z_lda, const v_t delta, const v_t eta)
{
    const int ncolsinblock = blockDim.x / nrowsinblock;
    int row = blockIdx.x * nrowsinblock + threadIdx.x / ncolsinblock;
    int col = blockIdx.y * ncolsinblock + threadIdx.x % ncolsinblock;

    if (row < nrows && col < ncols) {
        int offs = chunkstart[row / C] + row % C; // start of this row's data inside its SELL chunk
        v_t tmp; // accumulator; zero-initialization as in the elided original lines
        zero<v_t>(tmp);
        for (ghost_lidx j = 0; j < rowlen[row]; j++) {
            tmp = axpy<v_t, m_t>(tmp, rhs[rhs_lda * mcol[offs + j * C] + col], val[offs + j * C]);
        }
        if (do_vshift) { // guards reconstructed from the template parameters
            tmp = axpy<v_t, v_t>(tmp, rhs[rhs_lda * row + col], scale2<v_t, float>(shift[col], -1.f));
        }
        if (do_scale) {
            tmp = scale<v_t>(alpha, tmp);
        }
        if (do_axpby) {
            lhs[lhs_lda * row + col] = axpy<v_t, v_t>(tmp, lhs[lhs_lda * row + col], beta);
        } else {
            lhs[lhs_lda * row + col] = tmp;
        }
        if (do_chain_axpby) {
            z[z_lda * row + col] = axpby<v_t>(lhs[lhs_lda * row + col], z[z_lda * row + col], eta, delta);
        }
    }
#ifdef LOCALDOT_ONTHEFLY
    // Re-map threads so that consecutive threads of one column group cover consecutive rows.
    row = blockIdx.x * nrowsinblock + threadIdx.x % nrowsinblock;
    col = blockIdx.y * ncolsinblock + threadIdx.x / nrowsinblock;

    if (do_dot_yy || do_dot_xy || do_dot_xx) {
        v_t dot2, dot3; // zero-initialization elided in this listing
        if (!do_dot_yy && do_dot_xy && do_dot_xx) {
            dot2 = mulConj<v_t>(rhs[rhs_lda * row + col], lhs[lhs_lda * row + col]);
            dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * row + col]);
            // ... (warp/block reduction of dot2 and dot3 elided in this listing)
            if (nrowsinblock <= 32) {
                // The reduction fits into a single warp; the first lane of each row group writes out.
                if (threadIdx.x % nrowsinblock == 0) {
                    localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
                    fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
                }
            } else {
                if ((threadIdx.x < blockDim.x / warpSize) && threadIdx.x % (nrowsinblock / warpSize) == 0) {
                    col = threadIdx.x / (blockDim.x / (warpSize * ncolsinblock));
                    localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
                    fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
                }
            }
        } else {
            v_t dot1 = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * row + col]);
            dot2 = mulConj<v_t>(rhs[rhs_lda * row + col], lhs[lhs_lda * row + col]);
            dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * row + col]);
            // ... (warp/block reduction of dot1, dot2 and dot3 elided in this listing)
            if (nrowsinblock <= 32) {
                if (threadIdx.x % nrowsinblock == 0) {
                    fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
                    localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
                    fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
                }
            } else {
                if ((threadIdx.x < blockDim.x / warpSize) && threadIdx.x % (nrowsinblock / warpSize) == 0) {
                    col = threadIdx.x / (blockDim.x / (warpSize * ncolsinblock));
                    fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
                    localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
                    fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
                }
            }
        }
    }
#endif
}
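/*
 * Illustrative sketch, not part of the original header: the launch geometry the
 * row-major kernel above expects. Each block serves nrowsinblock rows and
 * blockDim.x / nrowsinblock right-hand-side columns, so a matching grid could
 * be computed as below. The helper name is hypothetical; the launcher's actual
 * setup code is elided from this listing.
 */
static inline void sell_rm_launch_dims(int nrows, int ncols, int nrowsinblock,
    int threadsPerBlock, dim3 *grid, dim3 *block)
{
    const int ncolsinblock = threadsPerBlock / nrowsinblock; /* as computed inside the kernel */
    *block = dim3(threadsPerBlock, 1, 1);                    /* the kernel uses a 1-D block */
    *grid = dim3((nrows + nrowsinblock - 1) / nrowsinblock,  /* blockIdx.x walks row groups */
                 (ncols + ncolsinblock - 1) / ncolsinblock,  /* blockIdx.y walks column groups */
                 1);
}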
template<typename m_t,
    typename v_t,
    typename v_t_b,
    int nrowsinblock,
    int C,
    int ncols,
    bool do_axpby,
    bool do_scale,
    bool do_vshift,
    bool do_dot_yy,
    bool do_dot_xy,
    bool do_dot_xx,
    bool do_chain_axpby>
__global__ void SELL_kernel_CU_cm_tmpl(v_t *const __restrict__ lhs, const ghost_lidx lhs_lda,
    const v_t *const __restrict__ rhs, const ghost_lidx rhs_lda, const ghost_spmv_flags flags,
    const ghost_lidx nrows, const ghost_lidx *const __restrict__ rowlen,
    const ghost_lidx *const __restrict__ mcol, const m_t *const __restrict__ val,
    const ghost_lidx *const __restrict__ chunkstart, const v_t *const __restrict__ shift,
    const v_t alpha, const v_t beta, v_t *const __restrict__ localdot,
    v_t *const __restrict__ z, const ghost_lidx z_lda, const v_t delta, const v_t eta)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int col = blockDim.y * blockIdx.y + threadIdx.y;

    if (i < nrows && col < ncols) {
        ghost_lidx cs, tid; // declarations elided in this listing
        if (C == blockDim.x) {
            // The chunk height equals the block width, so blockIdx.x is the chunk index.
            cs = chunkstart[blockIdx.x];
            tid = threadIdx.x;
        } else {
            cs = chunkstart[i / C];
            tid = threadIdx.x % C;
        }
        v_t tmp; // accumulator; zero-initialization as in the elided original lines
        zero<v_t>(tmp);
        for (ghost_lidx j = 0; j < rowlen[i]; j++) {
            tmp = axpy<v_t, m_t>(tmp, rhs[rhs_lda * col + mcol[cs + tid + j * C]], val[cs + tid + j * C]);
        }
        if (do_vshift) { // guards reconstructed from the template parameters
            tmp = axpy<v_t, v_t>(tmp, rhs[rhs_lda * col + i], scale2<v_t, float>(shift[col], -1.f));
        }
        if (do_scale) {
            tmp = scale<v_t>(alpha, tmp);
        }
        if (do_axpby) {
            lhs[lhs_lda * col + i] = axpy<v_t, float>(scale<v_t>(lhs[lhs_lda * col + i], beta), tmp, 1.f);
        } else {
            lhs[lhs_lda * col + i] = tmp;
        }
        if (do_chain_axpby) {
            z[z_lda * col + i] = axpby<v_t>(lhs[lhs_lda * col + i], z[z_lda * col + i], eta, delta);
        }
#ifdef LOCALDOT_ONTHEFLY
        if (do_dot_yy && do_dot_xy && do_dot_xx) {
            v_t dot1, dot2, dot3; // zero-initialization elided in this listing
            dot1 = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * col + i]);
            dot2 = mulConj<v_t>(rhs[rhs_lda * col + i], lhs[lhs_lda * col + i]);
            dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * col + i]);
            // ... (block-wide reduction of dot1, dot2 and dot3 elided in this listing)
            if (threadIdx.x == 0) {
                fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
                localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
                fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
            }
        } else { // guards below reconstructed from the localdot slot each branch writes
            if (do_dot_yy) {
                v_t dot = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * col + i]);
                // ... (reduction elided in this listing)
                if (threadIdx.x == 0) {
                    fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot);
                }
            }
            if (do_dot_xy) {
                v_t dot = mulConj<v_t>(rhs[rhs_lda * col + i], lhs[lhs_lda * col + i]);
                // ... (reduction elided in this listing)
                if (threadIdx.x == 0) {
                    localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot;
                }
            }
            if (do_dot_xx) {
                v_t dot = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * col + i]);
                // ... (reduction elided in this listing)
                if (threadIdx.x == 0) {
                    fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot);
                }
            }
        }
#endif
    }
}
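/*
 * Illustrative sketch, not part of the original header: both kernels store their
 * per-block partial dot products as
 * localdot[s * ncols * gridDim.x + col * gridDim.x + blockIdx.x], where slot
 * s = 0/1/2 holds the lhs.lhs, rhs.lhs and rhs.rhs partials. A host-side final
 * reduction over the blocks could therefore look as follows (the helper name
 * and the scalar type are placeholders).
 */
static void sell_reduce_localdot(double *dot, const double *partials, int ncols, int nblocks)
{
    for (int s = 0; s < 3; s++) {            /* the three dot-product slots */
        for (int c = 0; c < ncols; c++) {
            double sum = 0.;
            for (int b = 0; b < nblocks; b++) {
                sum += partials[s * ncols * nblocks + c * nblocks + b];
            }
            dot[s * ncols + c] = sum;        /* one reduced value per slot and column */
        }
    }
}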
template<typename m_dt,
    typename v_dt_host,
    typename v_dt_device,
    typename v_dt_base,
    int C,
    int ncols,
    bool do_axpby,
    bool do_scale,
    bool do_vshift,
    bool do_dot_yy,
    bool do_dot_xy,
    bool do_dot_xx,
    bool do_chain_axpby>
ghost_error ghost_sellspmv_cu_tmpl(ghost_densemat *lhs, ghost_sparsemat *mat, ghost_densemat *rhs, ghost_spmv_opts opts)
{
    GHOST_FUNC_ENTER(GHOST_FUNCTYPE_MATH);
    void *lhsval, *rhsval, *zval;
    // ... (creation of compact clones lhscompact/rhscompact for scattered densemats elided in this listing)
    lhsval = lhscompact->cu_val;
    rhsval = rhscompact->cu_val;
    // ...
    v_dt_device *cu_localdot = NULL;
    v_dt_device *cu_shift = NULL;
    v_dt_host *localdot = NULL;
    v_dt_device *shift = NULL;
    v_dt_device scale, beta, sdelta, seta;
    zero<v_dt_device>(scale);
    zero<v_dt_device>(beta);
    zero<v_dt_device>(sdelta);
    zero<v_dt_device>(seta);
    // ... (parsing of alpha/beta/shift/delta/eta from the SPMV traits elided in this listing)
    v_dt_host hbeta = 1.;
    memcpy(&beta, &hbeta, sizeof(beta));
    // ... (allocation and upload of cu_localdot/cu_shift, device properties, z handling,
    //      grid/block setup, and the storage-order dispatch elided in this listing)
    if (rhs->traits.storage == GHOST_DENSEMAT_COLMAJOR) { // assumed guard; elided in the listing
        size_t reqSmem = sizeof(v_dt_device) * 32 * block.y; // one partial sum per warp and column
        if (prop.sharedMemPerBlock < reqSmem) {
            GHOST_WARNING_LOG("Not enough shared memory available! CUDA kernel will not execute!");
        }
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d shmem %zu", grid.x, grid.y, block.x, block.y, reqSmem);
        SELL_kernel_CU_cm_tmpl<m_dt, v_dt_device, v_dt_base, 0, C, ncols, do_axpby, do_scale, do_vshift,
            do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, reqSmem>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    } else if (ncols > 8) { // row-major path; this first guard is assumed, elided in the listing
        const int nrowsinblock = 16;
        // ... (grid/block setup elided in this listing)
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d", grid.x, grid.y, block.x, block.y, nrowsinblock);
        SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale,
            do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, 0>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    } else if (ncols > 4) {
        const int nrowsinblock = 32;
        // ... (grid/block setup elided in this listing)
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d", grid.x, grid.y, block.x, block.y, nrowsinblock);
        SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale,
            do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, 0>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    } else if (ncols > 2) {
        const int nrowsinblock = 64;
        // ... (grid/block setup elided in this listing)
        int smem = (block.x / 32) * sizeof(v_dt_device); // one shared-memory slot per warp
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
        SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale,
            do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    } else if (ncols > 1) {
        const int nrowsinblock = 128;
        // ... (grid/block setup elided in this listing)
        int smem = (block.x / 32) * sizeof(v_dt_device); // one shared-memory slot per warp
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
        SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale,
            do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    } else {
        const int nrowsinblock = 256;
        // ... (grid/block setup elided in this listing)
        int smem = (block.x / 32) * sizeof(v_dt_device); // one shared-memory slot per warp
        GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
        SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale,
            do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>(
            (v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags,
            mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val,
            mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta,
            (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
    }
    if (lhscompact != lhs) {
        // ... (copy the compacted result back into the scattered lhs and free the clone; elided)
    }
    if (rhscompact != rhs) {
        // ... (free the temporary clone; elided)
    }
    cudaDeviceSynchronize();
#ifdef LOCALDOT_ONTHEFLY
    v_dt_device *cu_localdot_result;
    // ... (allocation of cu_localdot_result and reduction of the per-block partial dots elided)
    memset(localdot, 0, rhs->traits.ncols * 3 * sizeof(v_dt_host));
    // ... (download of the reduced dot products into localdot elided)
#endif
    GHOST_FUNC_EXIT(GHOST_FUNCTYPE_MATH);
    // ... (return of the accumulated ghost_error elided)
}
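/*
 * Illustrative sketch, not part of the original header: the nrowsinblock choice
 * the launcher above makes for the row-major kernel. Wider right-hand sides get
 * fewer rows per block; the first branch's guard is elided in this listing, so
 * the ncols > 8 condition here is an assumption.
 */
static inline int sell_rm_nrowsinblock(int ncols)
{
    if (ncols > 8) return 16;   /* assumed guard for the first branch */
    if (ncols > 4) return 32;
    if (ncols > 2) return 64;
    if (ncols > 1) return 128;
    return 256;                 /* single right-hand-side column */
}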