GHOST 1.1.2
General, Hybrid, and Optimized Sparse Toolkit

cu_sell_kernel.h

#ifndef GHOST_CU_SELL_KERNEL_H
#define GHOST_CU_SELL_KERNEL_H

#include "ghost/types.h"
#include "ghost/cu_complex.h"
#include <cuda.h>

extern __shared__ char shared[];


template<typename v_t>
__device__ inline v_t ghost_shfl_down32(v_t var, unsigned int srcLane) {
    return ghost_shfl_down(var, srcLane, 32);
}


template<typename v_t>
__device__ inline v_t ghost_shfl_down(v_t var, unsigned int srcLane, int width)
{
#if __CUDACC_VER_MAJOR__ < 9
    return __shfl_down(var, srcLane, width);
#else
    return __shfl_down_sync(0xFFFFFFFF, var, srcLane, width);
#endif
}

template<>
__device__ inline cuFloatComplex ghost_shfl_down<cuFloatComplex>(cuFloatComplex var, unsigned int srcLane, int width)
{
    float2 a = *reinterpret_cast<float2 *>(&var);
    a.x = ghost_shfl_down(a.x, srcLane, width);
    a.y = ghost_shfl_down(a.y, srcLane, width);
    return *reinterpret_cast<cuFloatComplex *>(&a);
}

template<>
__device__ inline cuDoubleComplex ghost_shfl_down<cuDoubleComplex>(cuDoubleComplex var, unsigned int srcLane, int width)
{
    double2 a = *reinterpret_cast<double2 *>(&var);
    a.x = ghost_shfl_down(a.x, srcLane, width);
    a.y = ghost_shfl_down(a.y, srcLane, width);
    return *reinterpret_cast<cuDoubleComplex *>(&a);
}

// This assumes that the warpSize is 32.
// Hard-coding this enhances the performance significantly due to unrolling.
template<typename v_t>
__inline__ __device__ v_t
ghost_warpReduceSum(v_t val)
{
#pragma unroll
    for (int offset = 32 / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down32(val, offset));
    }
    return val;
}
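/*
  Example (illustrative sketch, not part of the original header): a hypothetical
  kernel in which each warp of 32 threads sums one value per lane via
  ghost_warpReduceSum. After the call, lane 0 of every warp holds that warp's
  total; the kernel name and output layout are made up for illustration.

__global__ void example_warp_sums(const double *in, double *out, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    double val = (tid < n) ? in[tid] : 0.;

    val = ghost_warpReduceSum(val); // shuffle-based tree reduction over 32 lanes

    if ((threadIdx.x & 31) == 0) { // lane 0 of each warp writes the warp's sum
        out[tid / 32] = val;
    }
}
*/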

template<typename v_t>
__inline__ __device__ v_t
ghost_partialWarpReduceSum(v_t val, int size, int width)
{
    for (int offset = size / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down(val, offset, width));
    }
    return val;
}

// Fixed width/warpSize=32, templated size.
template<typename v_t, int size>
__inline__ __device__ v_t
ghost_partialWarpReduceSumFast(v_t val)
{
#pragma unroll
    for (int offset = size / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down32(val, offset));
    }
    return val;
}

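/*
  Example (illustrative): when the reduction width is known at compile time, the
  templated variant lets the compiler unroll the loop completely. For instance,
  with a group size of 16 (a hypothetical chunk height),

      val = ghost_partialWarpReduceSumFast<double, 16>(val);

  leaves the sum of each aligned group of 16 lanes in that group's first lane,
  assuming the group size is a power of two.
*/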

template<>
__inline__ __device__ double3
ghost_warpReduceSum<double3>(double3 val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val.x += ghost_shfl_down(val.x, offset, warpSize);
        val.y += ghost_shfl_down(val.y, offset, warpSize);
        val.z += ghost_shfl_down(val.z, offset, warpSize);
    }
    return val;
}

template<typename v_t>
__inline__ __device__ v_t
ghost_partialBlockReduceSum(v_t val, int size)
{
    v_t *shmem = (v_t *)shared; // Shared mem for 32 partial sums

    int lane = (threadIdx.x % warpSize);
    int wid = (threadIdx.x / warpSize);

    val = ghost_warpReduceSum(val); // Each warp performs a partial reduction

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val);
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_partialWarpReduceSum(val, size, warpSize); // Final reduce within first warp

    return val;
}
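/*
  Usage note with an illustrative sketch (names hypothetical): the partial block
  reductions here store one partial sum per warp in the dynamically allocated
  `shared` buffer, so a kernel using them must be launched with at least
  (blockDim.x / warpSize) * sizeof(v_t) bytes of dynamic shared memory. The
  `size` argument presumably is the per-block warp count, matching the `nwarps`
  parameter of ghost_1dPartialBlockReduceSum below.

__global__ void example_block_partial_sums(const double *in, double *out, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    double val = (tid < n) ? in[tid] : 0.;
    int nwarps = blockDim.x / warpSize;

    val = ghost_partialBlockReduceSum(val, nwarps);
    if (threadIdx.x == 0) {
        out[blockIdx.x] = val; // one partial sum per block
    }
}

// Host side: the third launch parameter supplies the shared-memory bytes, e.g.
// example_block_partial_sums<<<grid, block, (block.x / 32) * sizeof(double)>>>(d_in, d_out, n);
*/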

template<typename v_t>
__inline__ __device__ v_t
ghost_1dPartialBlockReduceSum(v_t val, int nwarps)
{
    v_t *shmem = (v_t *)shared; // Shared mem for 32 partial sums

    int lane = (threadIdx.x % warpSize);
    int wid = (threadIdx.x / warpSize);

    val = ghost_warpReduceSum(val); // Each warp performs a partial reduction

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val);
    }

    if (threadIdx.x / warpSize == 0) {
        val = ghost_partialWarpReduceSum(val, nwarps, nwarps); // Final reduce within first warp
    }

    return val;
}

template<typename v_t>
__inline__ __device__ v_t
ghost_blockReduceSum(v_t val)
{
    __shared__ v_t shmem[32]; // Shared mem for 32 partial sums

    int lane = (threadIdx.x % warpSize) + (32 * threadIdx.y);
    int wid = (threadIdx.x / warpSize) + (32 * threadIdx.y);

    val = ghost_warpReduceSum(val); // Each warp performs a partial reduction

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val);
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_warpReduceSum(val); // Final reduce within first warp

    return val;
}
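/*
  Example (illustrative): ghost_blockReduceSum keeps its per-warp partial sums in
  a statically sized 32-entry shared array, so no dynamic shared memory is
  required; with a one-dimensional block of up to 1024 threads (at most 32
  warps), thread 0 ends up with the block-wide sum. The kernel below is a
  made-up grid-stride example.

__global__ void example_block_total(const double *in, double *out, int n)
{
    double val = 0.;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        val += in[i];
    }
    val = ghost_blockReduceSum(val);
    if (threadIdx.x == 0) {
        out[blockIdx.x] = val; // still one value per block; combine in a second pass
    }
}
*/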

template<>
__inline__ __device__ double3
ghost_blockReduceSum<double3>(double3 val)
{
    double3 *shmem = (double3 *)shared; // Shared mem for 32 partial sums

    int lane = (threadIdx.x % warpSize) + (32 * threadIdx.y);
    int wid = (threadIdx.x / warpSize) + (32 * threadIdx.y);

    val = ghost_warpReduceSum(val); // Each warp performs a partial reduction

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val; // Write reduced value to shared memory

    __syncthreads(); // Wait for all partial reductions

    // Read from shared memory only if that warp existed
    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        val.x = 0.;
        val.y = 0.;
        val.z = 0.;
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_warpReduceSum(val); // Final reduce within first warp

    return val;
}
/*
template<typename v_t>
__global__ void deviceReduceKernel(v_t *in, v_t *out, int N) {
    v_t sum;
    //printf("<%d,%d>::: %f {%p} %f\n",threadIdx.x,threadIdx.y,in[0],in,sum);
    zero<v_t>(sum);
    // reduce multiple elements per thread
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x) {
        sum = axpy<v_t>(sum, in[i], 1.f);
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0) {
        out[blockIdx.x * blockDim.y + threadIdx.y] = sum;
    }
    if (gridDim.x > 1 && threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
        int N = gridDim.x;
        dim3 grid((int)(ceil(gridDim.x / (double)blockDim.x)), gridDim.y);
        dim3 block(blockDim.x, blockDim.y);
        printf("recursive call with grid %dx%d block %dx%d N %d from grid %dx%d block %dx%d\n",
               grid.x, grid.y, block.x, block.y, N, gridDim.x, gridDim.y, blockDim.x, blockDim.y);
        __syncthreads();
        deviceReduceKernel<<<grid, block, grid.y * block.y * 32 * sizeof(v_t)>>>(in, out, N);
        __syncthreads();
    }
}

template<typename v_t>
__global__ void localdotKernel(v_t *lhs, int lhs_lda, v_t *rhs, int rhs_lda, v_t *localdot) {
    v_t dot1, dot2, dot3;
    zero<v_t>(dot1);
    zero<v_t>(dot2);
    zero<v_t>(dot3);
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int col = blockDim.y * blockIdx.y + threadIdx.y;

    dot1 = axpy<v_t>(dot1, lhs[lhs_lda * i + col], lhs[lhs_lda * i + col]);
    dot2 = axpy<v_t>(dot2, rhs[rhs_lda * i + col], lhs[lhs_lda * i + col]);
    dot3 = axpy<v_t>(dot3, rhs[rhs_lda * i + col], rhs[rhs_lda * i + col]);

    dot1 = blockReduceSum(dot1);
    __syncthreads();
    dot2 = blockReduceSum(dot2);
    __syncthreads();
    dot3 = blockReduceSum(dot3);
    __syncthreads();

    if (threadIdx.x == 0) {
        localdot[3 * col + 0] = dot1;
        localdot[3 * col + 1] = dot2;
        localdot[3 * col + 2] = dot3;
    }
}
*/

struct CustomSum {
    template<typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const
    {
        return a + b;
    }
    __device__ __forceinline__ cuDoubleComplex operator()(const cuDoubleComplex &a, const cuDoubleComplex &b) const
    {
        return cuCadd(a, b);
    }
    __device__ __forceinline__ cuFloatComplex operator()(const cuFloatComplex &a, const cuFloatComplex &b) const
    {
        return cuCaddf(a, b);
    }
};
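/*
  Example (illustrative, assuming CUB is available via #include <cub/cub.cuh>):
  CustomSum is a binary functor with overloads for the CUDA complex types, so it
  can be passed as the reduction operator of a generic device-wide reduction such
  as cub::DeviceReduce::Reduce. Buffer names below are hypothetical.

void example_complex_sum(const cuDoubleComplex *d_in, cuDoubleComplex *d_out, int n)
{
    void *d_temp = NULL;
    size_t temp_bytes = 0;
    CustomSum op;
    cuDoubleComplex init = make_cuDoubleComplex(0., 0.);

    // First call only queries the required temporary storage size.
    cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, op, init);
    cudaMalloc(&d_temp, temp_bytes);
    // Second call performs the reduction: *d_out = sum over d_in[0..n-1].
    cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n, op, init);
    cudaFree(d_temp);
}
*/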


#if (__CUDA_ARCH__ >= 600)
template<typename v_t>
__device__ inline void ghost_atomicAdd(v_t *addr, v_t val)
{
    atomicAdd(addr, val);
}

template<>
__device__ inline void ghost_atomicAdd(cuDoubleComplex *addr, cuDoubleComplex val)
{
    atomicAdd((double *)addr, Real<cuDoubleComplex, double>(val));
    atomicAdd((double *)addr + 1, Imag<cuDoubleComplex, double>(val));
}

template<>
__device__ inline void ghost_atomicAdd(cuFloatComplex *addr, cuFloatComplex val)
{
    atomicAdd((float *)addr, Real<cuFloatComplex, float>(val));
    atomicAdd((float *)addr + 1, Imag<cuFloatComplex, float>(val));
}

template<typename v_t>
__global__ void ghost_deviceReduceSum(v_t *in, v_t *out, ghost_lidx N)
{
    ghost_lidx i;
    v_t sum;
    zero<v_t>(sum);

    for (i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        sum = accu<v_t>(sum, in[i]);
    }
    sum = ghost_warpReduceSum(sum);
    if ((threadIdx.x & (warpSize - 1)) == 0)
        ghost_atomicAdd<v_t>(out, sum);
}


template<typename v_t>
__global__ void ghost_deviceReduceSumMultiple(v_t *in, v_t *out, ghost_lidx N, ghost_lidx ncols)
{
    ghost_lidx col = blockIdx.x;
    if (col >= ncols) // valid columns are 0..ncols-1
        return;

    v_t sum;
    zero<v_t>(sum);
    for (ghost_lidx i = threadIdx.x; i < N; i += blockDim.x) {
        sum = accu<v_t>(sum, in[col * N + i]);
    }

    sum = ghost_warpReduceSum(sum);
    if ((threadIdx.x & (warpSize - 1)) == 0)
        ghost_atomicAdd<v_t>(out + col, sum);
}

#else

template<typename v_t>
__global__ void ghost_deviceReduceSum(v_t *in, v_t *out, ghost_lidx N)
{
    ghost_lidx i;
    v_t sum;
    zero<v_t>(sum);

    for (i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        sum = accu<v_t>(sum, in[i]);
    }
    sum = ghost_blockReduceSum(sum);
    if (threadIdx.x == 0) {
        out[blockIdx.x] = sum;
    }
}

template<typename v_t>
__global__ void ghost_deviceReduceSumMultiple(v_t *in, v_t *out, ghost_lidx N, ghost_lidx ncols)
{
    ghost_lidx col = blockIdx.x;
    if (col >= ncols) // valid columns are 0..ncols-1
        return;

    v_t sum;
    zero<v_t>(sum);
    for (ghost_lidx i = threadIdx.x; i < N; i += blockDim.x) {
        sum = accu<v_t>(sum, in[col * N + i]);
    }
    sum = ghost_blockReduceSum(sum);
    if (threadIdx.x == 0) {
        out[col] = sum;
    }
}

#endif
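/*
  Usage note with an illustrative launch (names hypothetical): on compute
  capability >= 6.0, ghost_deviceReduceSum accumulates directly into *out with
  atomics, so out must be zero-initialized before the launch; on older devices
  each block writes its partial result to out[blockIdx.x], which still has to be
  summed afterwards (e.g. by a second launch with a single block).
  ghost_deviceReduceSumMultiple expects one block per column, i.e. gridDim.x ==
  ncols, and reduces each length-N column of `in` into out[col].

      cudaMemset(d_out, 0, sizeof(double)); // required for the atomic variant
      ghost_deviceReduceSum<double><<<nblocks, 256>>>(d_in, d_out, N);
*/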

template<typename T>
__device__ __inline__ T streaming_load(const T *addr)
{
    return *addr;
}
template<>
__device__ __inline__ double streaming_load(const double *addr)
{
    double ret;
    asm("ld.global.cs.f64 %0, [%1];"
        : "=d"(ret)
        : "l"(addr));
    return ret;
}
template<>
__device__ __inline__ float streaming_load(const float *addr)
{
    float ret;
    asm("ld.global.cs.f32 %0, [%1];"
        : "=f"(ret)
        : "l"(addr));
    return ret;
}
template<>
__device__ __inline__ cuDoubleComplex streaming_load(const cuDoubleComplex *addr)
{
    double re, im;
    asm("ld.global.cs.f64 %0, [%1];"
        : "=d"(re)
        : "l"((const double *)addr));
    asm("ld.global.cs.f64 %0, [%1+8];"
        : "=d"(im)
        : "l"((const double *)addr));
    return make_cuDoubleComplex(re, im);
}
template<>
__device__ __inline__ cuFloatComplex streaming_load(const cuFloatComplex *addr)
{
    float re, im;
    asm("ld.global.cs.f32 %0, [%1];"
        : "=f"(re)
        : "l"((const float *)addr));
    asm("ld.global.cs.f32 %0, [%1+4];"
        : "=f"(im)
        : "l"((const float *)addr));
    return make_cuFloatComplex(re, im);
}

template<typename T>
__device__ __inline__ void streaming_store(T *addr, const T val)
{
    *addr = val;
}
template<>
__device__ __inline__ void streaming_store(double *addr, const double val)
{
    asm("st.global.cs.f64 [%0], %1;" ::"l"(addr), "d"(val));
}
template<>
__device__ __inline__ void streaming_store(float *addr, const float val)
{
    asm("st.global.cs.f32 [%0], %1;" ::"l"(addr), "f"(val));
}
template<>
__device__ __inline__ void streaming_store(cuDoubleComplex *addr, const cuDoubleComplex val)
{
    double re, im;
    re = cuCreal(val);
    im = cuCimag(val);
    asm("st.global.cs.f64 [%0], %1;" ::"l"((double *)addr), "d"(re));
    asm("st.global.cs.f64 [%0+8], %1;" ::"l"((double *)addr), "d"(im));
}
471 
472 #endif