GHOST 1.1.2
General, Hybrid, and Optimized Sparse Toolkit
sell_spmv_cu_kernel.h
1 #include "ghost/config.h"
2 #include "ghost/types.h"
3 #include "ghost/instr.h"
4 #include "ghost/log.h"
5 #include "ghost/error.h"
6 #include "ghost/util.h"
7 #include "ghost/math.h"
8 
9 #include <cuComplex.h>
10 #include <cuda_runtime.h>
11 #include <cuda.h>
12 #include <complex.h>
13 #include <complex>
14 
15 #include "ghost/cu_complex.h"
16 #include "ghost/cu_sell_kernel.h"
17 
18 #define MAX_COLS_PER_BLOCK 16
19 #define MAX_COLS_PER_BLOCK_COLMAJOR 16
20 #define SELL_CUDA_THREADSPERBLOCK 512
21 
22 
23 // comment this out if the dot products should not be computed on the fly but _after_ the SpMV
24 #define LOCALDOT_ONTHEFLY
25 
26 template<typename m_t, typename v_t, typename v_t_b, int nrowsinblock, int C, int ncols, bool do_axpby, bool do_scale, bool do_vshift, bool do_dot_yy, bool do_dot_xy, bool do_dot_xx, bool do_chain_axpby>
27 __global__ void SELL_kernel_CU_rm_tmpl(v_t *const __restrict__ lhs, const ghost_lidx lhs_lda, const v_t *const __restrict__ rhs, const ghost_lidx rhs_lda, const ghost_spmv_flags flags, const ghost_lidx nrows, const ghost_lidx *const __restrict__ rowlen, const ghost_lidx *const __restrict__ mcol, const m_t *const __restrict__ val, const ghost_lidx *const __restrict__ chunkstart, const v_t *const __restrict__ shift, const v_t alpha, const v_t beta, v_t *const __restrict__ localdot, v_t *const __restrict__ z, const ghost_lidx z_lda, const v_t delta, const v_t eta)
28 {
29 
30  const int ncolsinblock = blockDim.x / nrowsinblock;
31  int row = blockIdx.x * nrowsinblock + threadIdx.x / ncolsinblock;
32  int col = blockIdx.y * ncolsinblock + threadIdx.x % ncolsinblock;
33 
34  if (row < nrows && col < ncols) {
35  int offs = chunkstart[row / C] + row % C;// chunkstart + rowinchunk
36  int j;
37  v_t tmp;
38 
39  zero<v_t>(tmp);
40 
41  for (j = 0; j < rowlen[row]; j++) {
42  tmp = axpy<v_t, m_t>(tmp, rhs[rhs_lda * mcol[offs + j * C] + col], val[offs + j * C]);
43  }
44 
45  if (do_vshift) {
46  tmp = axpy<v_t, v_t>(tmp, rhs[rhs_lda * row + col], scale2<v_t, float>(shift[col], -1.f));
47  }
48  if (do_scale) {
49  tmp = scale<v_t>(alpha, tmp);
50  }
51  if (do_axpby) {
52  lhs[lhs_lda * row + col] = axpy<v_t, v_t>(tmp, lhs[lhs_lda * row + col], beta);
53  } else {
54  lhs[lhs_lda * row + col] = tmp;
55  }
56  if (do_chain_axpby) {
57  z[z_lda * row + col] = axpby<v_t>(lhs[lhs_lda * row + col], z[z_lda * row + col], eta, delta);
58  }
59  }
60 #ifdef LOCALDOT_ONTHEFLY
61  row = blockIdx.x * nrowsinblock + threadIdx.x % nrowsinblock;
62  col = blockIdx.y * ncolsinblock + threadIdx.x / nrowsinblock;
63  if (col < ncols) {
64  if (do_dot_yy || do_dot_xy || do_dot_xx) {
65  if (!do_dot_yy && do_dot_xy && do_dot_xx) {
66  v_t_b dot3;
67  v_t dot2;
68 
69  __syncthreads();
70  if (row < nrows) {
71  dot2 = mulConj<v_t>(rhs[rhs_lda * row + col], lhs[lhs_lda * row + col]);
72  dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * row + col]);
73  } else {
74  zero<v_t>(dot2);
75  zero<v_t_b>(dot3);
76  }
77 
78  if (nrowsinblock <= 32) {
79  dot2 = ghost_partialWarpReduceSum(dot2, nrowsinblock, warpSize);
80  dot3 = ghost_partialWarpReduceSum(dot3, nrowsinblock, warpSize);
81  if (threadIdx.x % nrowsinblock == 0) {
82  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
83  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
84  }
85  } else {
86  dot2 = ghost_1dPartialBlockReduceSum(dot2, nrowsinblock / warpSize);
87  __syncthreads();
88  dot3 = ghost_1dPartialBlockReduceSum(dot3, nrowsinblock / warpSize);
89  __syncthreads();
90 
91  if ((threadIdx.x < blockDim.x / warpSize) && threadIdx.x % (nrowsinblock / warpSize) == 0) {
92  col = threadIdx.x / (blockDim.x / (warpSize * ncolsinblock));
93  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
94  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
95  }
96  }
97  } else {
98  v_t_b dot1, dot3;
99  v_t dot2;
100 
101  __syncthreads();
102  if (row < nrows) {
103  dot1 = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * row + col]);
104  dot2 = mulConj<v_t>(rhs[rhs_lda * row + col], lhs[lhs_lda * row + col]);
105  dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * row + col]);
106  } else {
107  zero<v_t_b>(dot1);
108  zero<v_t>(dot2);
109  zero<v_t_b>(dot3);
110  }
111 
112  if (nrowsinblock <= 32) {
113  dot1 = ghost_partialWarpReduceSum(dot1, nrowsinblock, warpSize);
114  dot2 = ghost_partialWarpReduceSum(dot2, nrowsinblock, warpSize);
115  dot3 = ghost_partialWarpReduceSum(dot3, nrowsinblock, warpSize);
116  if (threadIdx.x % nrowsinblock == 0) {
117  fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
118  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
119  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
120  }
121  } else {
122  dot1 = ghost_1dPartialBlockReduceSum(dot1, nrowsinblock / warpSize);
123  __syncthreads();
124  dot2 = ghost_1dPartialBlockReduceSum(dot2, nrowsinblock / warpSize);
125  __syncthreads();
126  dot3 = ghost_1dPartialBlockReduceSum(dot3, nrowsinblock / warpSize);
127  __syncthreads();
128 
129  if ((threadIdx.x < blockDim.x / warpSize) && threadIdx.x % (nrowsinblock / warpSize) == 0) {
130  col = threadIdx.x / (blockDim.x / (warpSize * ncolsinblock));
131  fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
132  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
133  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
134  }
135  }
136  }
137  }
138  }
139 
140 #endif
141 }
142 
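Both kernels address the matrix through the same SELL-C layout: a chunk holds C consecutive rows stored column by column, so element j of a row sits at val[chunkstart[row/C] + row%C + j*C], with its column index at the same position in mcol. The following host-side reference loop is an editorial sketch of that indexing for a single right-hand-side vector (the array names are illustrative, not GHOST API):

// Reference SpMV over a SELL-C structure, mirroring the kernels' indexing.
// sell_val/sell_col/chunkstart/rowlen are hypothetical host-side arrays.
template<typename m_t, typename v_t>
void sell_spmv_reference(int nrows, int C,
                         const int *chunkstart, // first entry of each chunk
                         const int *rowlen,     // nonzeros per row
                         const int *sell_col,   // column indices, chunk-wise
                         const m_t *sell_val,   // values, chunk-wise
                         const v_t *x, v_t *y)
{
    for (int row = 0; row < nrows; row++) {
        int offs = chunkstart[row / C] + row % C; // chunk start + row-in-chunk
        v_t tmp = v_t();
        for (int j = 0; j < rowlen[row]; j++) {
            // consecutive elements of one row are C entries apart
            tmp += x[sell_col[offs + j * C]] * sell_val[offs + j * C];
        }
        y[row] = tmp;
    }
}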
143 template<typename m_t, typename v_t, typename v_t_b, int nrowsinblock, int C, int ncols, bool do_axpby, bool do_scale, bool do_vshift, bool do_dot_yy, bool do_dot_xy, bool do_dot_xx, bool do_chain_axpby>
144 __global__ void SELL_kernel_CU_cm_tmpl(
145  v_t *const __restrict__ lhs,
146  const ghost_lidx lhs_lda,
147  const v_t *const __restrict__ rhs,
148  const ghost_lidx rhs_lda,
149  const ghost_spmv_flags flags,
150  const ghost_lidx nrows,
151  const ghost_lidx *const __restrict__ rowlen,
152  const ghost_lidx *const __restrict__ mcol,
153  const m_t *const __restrict__ val,
154  const ghost_lidx *const __restrict__ chunkstart,
155  const v_t *const __restrict__ shift,
156  const v_t alpha,
157  const v_t beta,
158  v_t *const __restrict__ localdot,
159  v_t *const __restrict__ z,
160  const ghost_lidx z_lda,
161  const v_t delta,
162  const v_t eta)
163 {
164  int i = threadIdx.x + blockIdx.x * blockDim.x;
165  int col = blockDim.y * blockIdx.y + threadIdx.y;
166 
167  if (i < nrows && col < ncols) {
168  int cs, tid;
169  if (C == blockDim.x) {
170  cs = chunkstart[blockIdx.x];
171  tid = threadIdx.x;
172  } else {
173  cs = chunkstart[i / C];
174  tid = threadIdx.x % C;
175  }
176  int j;
177  v_t tmp;
178 
179  zero<v_t>(tmp);
180 
181  for (j = 0; j < rowlen[i]; j++) {
182  tmp = axpy<v_t, m_t>(tmp, rhs[rhs_lda * col + mcol[cs + tid + j * C]], val[cs + tid + j * C]);
183  }
184 
185  if (do_vshift) {
186  tmp = axpy<v_t, v_t>(tmp, rhs[rhs_lda * col + i], scale2<v_t, float>(shift[col], -1.f));
187  }
188  if (do_scale) {
189  tmp = scale<v_t>(alpha, tmp);
190  }
191  if (do_axpby) {
192  lhs[lhs_lda * col + i] = axpy<v_t, float>(scale<v_t>(lhs[lhs_lda * col + i], beta), tmp, 1.f);
193  } else {
194  lhs[lhs_lda * col + i] = tmp;
195  }
196  if (do_chain_axpby) {
197  z[z_lda * col + i] = axpby<v_t>(lhs[lhs_lda * col + i], z[z_lda * col + i], eta, delta);
198  }
199  }
200 #ifdef LOCALDOT_ONTHEFLY
201  if (do_dot_yy && do_dot_xy && do_dot_xx) {
202  v_t_b dot1, dot3;
203  v_t dot2;
204  zero<v_t_b>(dot1);
205  zero<v_t>(dot2);
206  zero<v_t_b>(dot3);
207 
208  if (i < nrows) {
209  dot1 = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * col + i]);
210  dot2 = mulConj<v_t>(rhs[rhs_lda * col + i], lhs[lhs_lda * col + i]);
211  dot3 = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * col + i]);
212  }
213 
214  dot1 = ghost_blockReduceSum(dot1);
215  __syncthreads();
216  dot2 = ghost_blockReduceSum(dot2);
217  __syncthreads();
218  dot3 = ghost_blockReduceSum(dot3);
219 
220  if (threadIdx.x == 0) {
221  fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot1);
222  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot2;
223  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot3);
224  }
225  } else {
226  if (do_dot_yy) {
227  v_t_b dot;
228  zero<v_t_b>(dot);
229 
230  if (i < nrows) {
231  dot = mulConjSame<v_t, v_t_b>(lhs[lhs_lda * col + i]);
232  }
233 
234  dot = ghost_blockReduceSum(dot);
235  __syncthreads();
236 
237  if (threadIdx.x == 0) {
238  fromReal<v_t, v_t_b>(localdot[0 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot);
239  }
240  }
241  if (do_dot_xy) {
242  v_t dot;
243  zero<v_t>(dot);
244 
245  if (i < nrows) {
246  dot = mulConj<v_t>(rhs[rhs_lda * col + i], lhs[lhs_lda * col + i]);
247  }
248 
249  dot = ghost_blockReduceSum(dot);
250  __syncthreads();
251 
252  if (threadIdx.x == 0) {
253  localdot[1 * ncols * gridDim.x + gridDim.x * col + blockIdx.x] = dot;
254  }
255  }
256  if (do_dot_xx) {
257  v_t_b dot;
258  zero<v_t_b>(dot);
259 
260  if (i < nrows) {
261  dot = mulConjSame<v_t, v_t_b>(rhs[rhs_lda * col + i]);
262  }
263 
264  dot = ghost_blockReduceSum(dot);
265  __syncthreads();
266 
267  if (threadIdx.x == 0) {
268  fromReal<v_t, v_t_b>(localdot[2 * ncols * gridDim.x + gridDim.x * col + blockIdx.x], dot);
269  }
270  }
271  }
272 
273 #endif
274 }
275 
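The on-the-fly dot products above are staged in localdot as three consecutive slabs of size ncols*gridDim.x: slab 0 holds the lhs-lhs partials, slab 1 the rhs-lhs partials, and slab 2 the rhs-rhs partials, one partial sum per (column, thread block). The launcher below reduces these with ghost_cu_reduce_multiple; a minimal host-side equivalent of that reduction, shown only to document the layout, would be:

// Collapse per-block partial dots into one value per column and quantity.
// Layout matches the kernels: localdot[slab*ncols*nblocks + nblocks*col + blk].
template<typename v_t>
void reduce_localdot(const v_t *localdot, int ncols, int nblocks,
                     v_t *dot_yy, v_t *dot_xy, v_t *dot_xx)
{
    for (int col = 0; col < ncols; col++) {
        v_t yy = v_t(), xy = v_t(), xx = v_t();
        for (int blk = 0; blk < nblocks; blk++) {
            yy += localdot[0 * ncols * nblocks + nblocks * col + blk];
            xy += localdot[1 * ncols * nblocks + nblocks * col + blk];
            xx += localdot[2 * ncols * nblocks + nblocks * col + blk];
        }
        dot_yy[col] = yy;
        dot_xy[col] = xy;
        dot_xx[col] = xx;
    }
}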
276 template<typename m_dt, typename v_dt_host, typename v_dt_device, typename v_dt_base, int C, int ncols, bool do_axpby, bool do_scale, bool do_vshift, bool do_dot_yy, bool do_dot_xy, bool do_dot_xx, bool do_chain_axpby>
277 ghost_error ghost_sellspmv_cu_tmpl(ghost_densemat *lhs, ghost_sparsemat *mat, ghost_densemat *rhs, ghost_spmv_opts opts)
278 {
279  GHOST_FUNC_ENTER(GHOST_FUNCTYPE_MATH)
280  cudaGetLastError(); /* Remove previous error */
281  ghost_error ret = GHOST_SUCCESS;
282  void *lhsval, *rhsval, *zval;
283  ghost_lidx zstride;
284  ghost_densemat *lhscompact, *rhscompact, *zcompact;
285  if (lhs->traits.flags & GHOST_DENSEMAT_SCATTERED) {
286  GHOST_PERFWARNING_LOG("Cloning (and compressing) lhs before operation");
287  GHOST_CALL_RETURN(ghost_densemat_clone(&lhscompact, lhs, lhs->traits.ncols, 0));
288  } else {
289  lhscompact = lhs;
290  }
291  if (rhs->traits.flags & GHOST_DENSEMAT_SCATTERED) {
292  GHOST_PERFWARNING_LOG("Cloning (and compressing) rhs before operation");
293  GHOST_CALL_RETURN(ghost_densemat_clone(&rhscompact, rhs, rhs->traits.ncols, 0));
294  } else {
295  rhscompact = rhs;
296  }
297  lhsval = lhscompact->cu_val;
298  rhsval = rhscompact->cu_val;
299  int cu_device;
300  GHOST_CALL_RETURN(ghost_cu_device(&cu_device));
301  v_dt_device *cu_localdot = NULL;
302  v_dt_device *cu_shift = NULL;
303  v_dt_host *localdot = NULL;
304  v_dt_device *shift = NULL;
305  v_dt_device scale, beta, sdelta, seta;
306  zero<v_dt_device>(scale);
307  zero<v_dt_device>(beta);
308  zero<v_dt_device>(sdelta);
309  zero<v_dt_device>(seta);
310  ghost_densemat *z = NULL;
311  dim3 block, grid;
312  GHOST_SPMV_PARSE_TRAITS(opts, scale, beta, shift, localdot, z, sdelta, seta, v_dt_host, v_dt_device);
313  if (z && (z->traits.flags & GHOST_DENSEMAT_SCATTERED)) {
314  GHOST_PERFWARNING_LOG("Cloning (and compressing) z before operation");
315  GHOST_CALL_RETURN(ghost_densemat_clone(&zcompact, z, z->traits.ncols, 0));
316  } else {
317  zcompact = z;
318  }
319  if (z) {
320  zstride = z->stride;
321  zval = zcompact->cu_val;
322  } else {
323  zstride = 0;
324  }
325  if (opts.flags & GHOST_SPMV_AXPY) {
326  v_dt_host hbeta = 1.;
327  memcpy(&beta, &hbeta, sizeof(beta));// use memcpy because no conversion between "complex double" and cuDoubleComplex exists but they are compatible
328  }
329  if (opts.flags & (GHOST_SPMV_SHIFT | GHOST_SPMV_VSHIFT)) {
330  size_t shiftsize = sizeof(v_dt_device) * (opts.flags & (GHOST_SPMV_VSHIFT | GHOST_SPMV_SHIFT) ? rhs->traits.ncols : 0);
331  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_shift, shiftsize));
332  if (opts.flags & GHOST_SPMV_SHIFT) {
333  ghost_lidx c;
334  for (c = 0; c < rhs->traits.ncols; c++) {
335  GHOST_CALL_RETURN(ghost_cu_upload(&cu_shift[c], shift, sizeof(v_dt_device)));
336  }
337  } else {
338  GHOST_CALL_RETURN(ghost_cu_upload(cu_shift, shift, shiftsize));
339  }
340  }
341 
342  ghost_cu_deviceprop prop;
343  GHOST_CALL_RETURN(ghost_cu_deviceprop_get(&prop));
344 
345  GHOST_INSTR_START("spmv_cuda")
346  if (rhs->traits.storage == GHOST_DENSEMAT_COLMAJOR || (rhs->stride == 1)) {
348  block.y = MIN(MAX_COLS_PER_BLOCK_COLMAJOR, rhs->traits.ncols);
349  while (block.x * block.y > SELL_CUDA_THREADSPERBLOCK && block.x > 32) {
350  block.x -= 32;
351  }
352 
353  grid.x = CEILDIV(SPM_NROWSPAD(mat), block.x);
355  if (opts.flags & GHOST_SPMV_DOT) {
356  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
357  }
358  size_t reqSmem = 0;
359  if (opts.flags & GHOST_SPMV_DOT) {
360  reqSmem = sizeof(v_dt_device) * 32 * block.y;
361  }
362  if (prop.sharedMemPerBlock < reqSmem) {
363  GHOST_WARNING_LOG("Not enough shared memory available! CUDA kernel will not execute!");
364  }
365  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d shmem %zu", grid.x, grid.y, block.x, block.y, reqSmem);
366  SELL_kernel_CU_cm_tmpl<m_dt, v_dt_device, v_dt_base, 0, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, reqSmem>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
367  } else {
368  if (ncols > 8) {// 16 rows per block, 9...16 columns per block, 144...256 threads per block
369  const int nrowsinblock = 16;
370  grid.x = CEILDIV(SPM_NROWS(mat), nrowsinblock);
371  grid.y = CEILDIV(rhs->traits.ncols, nrowsinblock);
372  block.x = nrowsinblock * CEILDIV(rhs->traits.ncols, grid.y);
373  if (opts.flags & GHOST_SPMV_DOT) {
374  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
375  }
376  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d", grid.x, grid.y, block.x, block.y, nrowsinblock);
377  SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, 0>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
378  } else if (ncols > 4) {// 32 rows per block, 5...8 columns per block, 160...256 threads per block
379  const int nrowsinblock = 32;
380  grid.x = CEILDIV(SPM_NROWS(mat), nrowsinblock);
381  grid.y = 1;
382  block.x = nrowsinblock * rhs->traits.ncols;
383  if (opts.flags & GHOST_SPMV_DOT) {
384  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
385  }
386  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d", grid.x, grid.y, block.x, block.y, nrowsinblock);
387  SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, 0>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
388  } else if (ncols > 2) {// 64 rows per block, 3...4 columns per block, 192...256 threads per block
389  const int nrowsinblock = 64;
390  grid.x = CEILDIV(SPM_NROWS(mat), nrowsinblock);
391  grid.y = 1;
392  block.x = nrowsinblock * rhs->traits.ncols;
393  if (opts.flags & GHOST_SPMV_DOT) {
394  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
395  }
396  int smem = (block.x / 32) * sizeof(v_dt_device);
397  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
398  SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
399  } else if (ncols > 1) {// 128 rows per block, 2 columns per block, 256 threads per block
400  const int nrowsinblock = 128;
401  grid.x = CEILDIV(SPM_NROWS(mat), nrowsinblock);
402  grid.y = 1;
403  block.x = nrowsinblock * rhs->traits.ncols;
404  if (opts.flags & GHOST_SPMV_DOT) {
405  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
406  }
407  int smem = (block.x / 32) * sizeof(v_dt_device);
408  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
409  SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
410  } else {
411  const int nrowsinblock = 256;
412  grid.x = CEILDIV(SPM_NROWS(mat), nrowsinblock);
413  grid.y = 1;
414  block.x = nrowsinblock * rhs->traits.ncols;
415  if (opts.flags & GHOST_SPMV_DOT) {
416  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot, sizeof(v_dt_device) * rhs->traits.ncols * 3 * grid.x));
417  }
418  int smem = (block.x / 32) * sizeof(v_dt_device);
419  GHOST_DEBUG_LOG(1, "grid %dx%d block %dx%d nrowsinblock %d smem %d", grid.x, grid.y, block.x, block.y, nrowsinblock, smem);
420  SELL_kernel_CU_rm_tmpl<m_dt, v_dt_device, v_dt_base, nrowsinblock, C, ncols, do_axpby, do_scale, do_vshift, do_dot_yy, do_dot_xy, do_dot_xx, do_chain_axpby><<<grid, block, smem>>>((v_dt_device *)lhsval, lhs->stride, (v_dt_device *)rhsval, rhs->stride, opts.flags, mat->context->row_map->dim, mat->cu_rowLen, mat->cu_col, (m_dt *)mat->cu_val, mat->cu_chunkStart, (v_dt_device *)cu_shift, (v_dt_device)scale, (v_dt_device)beta, (v_dt_device *)cu_localdot, (v_dt_device *)zval, zstride, (v_dt_device)sdelta, (v_dt_device)seta);
421  }
422  }
423  CUDA_CALL_RETURN(cudaGetLastError());
424  if (lhscompact != lhs) {
425  GHOST_DEBUG_LOG(1, "Transform lhs back");
426  GHOST_CALL_RETURN(ghost_densemat_init_densemat(lhs, lhscompact, 0, 0));
427  ghost_densemat_destroy(lhscompact);
428  }
429  if (rhscompact != rhs) {
430  GHOST_DEBUG_LOG(1, "Transform rhs back");
431  GHOST_CALL_RETURN(ghost_densemat_init_densemat(rhs, rhscompact, 0, 0));
432  ghost_densemat_destroy(rhscompact);
433  }
434  cudaDeviceSynchronize();
435  GHOST_INSTR_STOP("spmv_cuda")
436  if (opts.flags & GHOST_SPMV_DOT) {
437 #ifdef LOCALDOT_ONTHEFLY
438  GHOST_INSTR_START("spmv_cuda_dot_reduction")
439  v_dt_device *cu_localdot_result;
440  GHOST_CALL_RETURN(ghost_cu_malloc((void **)&cu_localdot_result, sizeof(v_dt_device) * rhs->traits.ncols));
441  if (opts.flags & GHOST_SPMV_DOT_YY) {
442  GHOST_CALL_RETURN(ghost_cu_memset(cu_localdot_result, 0, sizeof(v_dt_device) * rhs->traits.ncols));
443 
444  /* ghost_lidx col;
445  for (col = 0; col < rhs->traits.ncols; col++) {
446  ghost_cu_reduce(&cu_localdot_result[col], &cu_localdot[grid.x * col + 0 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x);
447  }*/
448 
449  ghost_cu_reduce_multiple(cu_localdot_result, &cu_localdot[0 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x, rhs->traits.ncols);
450  GHOST_CALL_RETURN(ghost_cu_download(localdot, cu_localdot_result, rhs->traits.ncols * sizeof(v_dt_host)));
451  }
452  if (opts.flags & GHOST_SPMV_DOT_XY) {
453  GHOST_CALL_RETURN(ghost_cu_memset(cu_localdot_result, 0, sizeof(v_dt_device) * rhs->traits.ncols));
454 
455  /*ghost_lidx col;
456  for (col = 0; col < rhs->traits.ncols; col++) {
457  ghost_cu_reduce(&cu_localdot_result[col], &cu_localdot[grid.x * col + 1 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x);
458  }*/
459 
460  ghost_cu_reduce_multiple(cu_localdot_result, &cu_localdot[1 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x, rhs->traits.ncols);
461  GHOST_CALL_RETURN(ghost_cu_download(&localdot[rhs->traits.ncols], cu_localdot_result, rhs->traits.ncols * sizeof(v_dt_host)));
462  }
463  if (opts.flags & GHOST_SPMV_DOT_XX) {
464  GHOST_CALL_RETURN(ghost_cu_memset(cu_localdot_result, 0, sizeof(v_dt_device) * rhs->traits.ncols));
465 
466  /* ghost_lidx col;
467  for (col = 0; col < rhs->traits.ncols; col++) {
468  ghost_cu_reduce(&cu_localdot_result[col], &cu_localdot[grid.x * col + 2 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x);
469  }*/
470 
471  ghost_cu_reduce_multiple(cu_localdot_result, &cu_localdot[2 * rhs->traits.ncols * grid.x], rhs->traits.datatype, grid.x, rhs->traits.ncols);
472 
473  GHOST_CALL_RETURN(ghost_cu_download(&localdot[2 * rhs->traits.ncols], cu_localdot_result, rhs->traits.ncols * sizeof(v_dt_host)));
474  }
475  GHOST_CALL_RETURN(ghost_cu_free(cu_localdot_result));
476  GHOST_INSTR_STOP("spmv_cuda_dot_reduction")
477 #else
478  GHOST_INSTR_START("spmv_cuda_dot")
479  GHOST_PERFWARNING_LOG("Not doing the local dot product on-the-fly!");
480  memset(localdot, 0, rhs->traits.ncols * 3 * sizeof(v_dt_host));
481  ghost_localdot(&localdot[0], lhs, lhs);
482  ghost_localdot(&localdot[rhs->traits.ncols], rhs, lhs);
483  ghost_localdot(&localdot[2 * rhs->traits.ncols], rhs, rhs);
484  GHOST_INSTR_STOP("spmv_cuda_dot")
485 #endif
486  }
487  //if (traits.flags & GHOST_SPMV_CHAIN_AXPBY) {
488  // GHOST_PERFWARNING_LOG("AXPBY will not be done on-the-fly!");
489  // z->axpby(z,lhs,&seta,&sdelta);
490  // }
491  if (opts.flags & GHOST_SPMV_DOT) {
492  GHOST_CALL_RETURN(ghost_cu_free(cu_localdot));
493  }
494  if (opts.flags & (GHOST_SPMV_SHIFT | GHOST_SPMV_VSHIFT)) {
495  GHOST_CALL_RETURN(ghost_cu_free(cu_shift));
496  }
497  GHOST_FUNC_EXIT(GHOST_FUNCTYPE_MATH);
498  return ret;
499 }
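For the row-major branches above, the number of rows per block is chosen from the number of densemat columns so that block.x (nrowsinblock times the columns handled per block) stays in the 144 to 256 thread range; the ncols > 8 case additionally splits the columns across grid.y. A condensed restatement of that selection, as an editorial sketch of the arithmetic only:

// Rows-per-block choice used by the row-major launches, as a plain function.
static inline int sell_rm_nrowsinblock(int ncols)
{
    if (ncols > 8) return 16;  //  9..16 cols -> 144..256 threads per block
    if (ncols > 4) return 32;  //  5..8  cols -> 160..256 threads per block
    if (ncols > 2) return 64;  //  3..4  cols -> 192..256 threads per block
    if (ncols > 1) return 128; //  2     cols -> 256 threads per block
    return 256;                //  1     col  -> 256 threads per block
}
// Example: ncols = 4 gives nrowsinblock = 64, block.x = 64*4 = 256 threads,
// grid.x = CEILDIV(SPM_NROWS(mat), 64).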