#ifndef GHOST_CU_SELL_KERNEL_H
#define GHOST_CU_SELL_KERNEL_H

#include "ghost/types.h"      // ghost_lidx
#include "ghost/cu_complex.h" // accu<>, zero<>, Real<>, Imag<>

// One dynamic shared-memory buffer, declared once and cast to the element
// type inside each reduction that needs it.
extern __shared__ char shared[];
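/*
 * Warp-shuffle wrappers. CUDA 9 replaced the implicitly synchronizing
 * __shfl_down() with __shfl_down_sync(), which takes an explicit member
 * mask; these wrappers hide that difference behind one interface. The
 * full mask 0xFFFFFFFF assumes all 32 lanes of the warp participate.
 */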
template<typename v_t>
__device__ v_t ghost_shfl_down32(v_t var, unsigned int srcLane)
{
#if __CUDACC_VER_MAJOR__ < 9
    return __shfl_down(var, srcLane);
#else
    return __shfl_down_sync(0xFFFFFFFF, var, srcLane);
#endif
}

template<typename v_t>
__device__ v_t ghost_shfl_down(v_t var, unsigned int srcLane, int width)
{
#if __CUDACC_VER_MAJOR__ < 9
    return __shfl_down(var, srcLane, width);
#else
    return __shfl_down_sync(0xFFFFFFFF, var, srcLane, width);
#endif
}
template<>
__device__ cuFloatComplex ghost_shfl_down<cuFloatComplex>(cuFloatComplex var, unsigned int srcLane, int width)
{
    float2 a = *reinterpret_cast<float2 *>(&var);
    a.x = ghost_shfl_down(a.x, srcLane, width); // shuffle real and imaginary parts separately
    a.y = ghost_shfl_down(a.y, srcLane, width);
    return *reinterpret_cast<cuFloatComplex *>(&a);
}

template<>
__device__ cuDoubleComplex ghost_shfl_down<cuDoubleComplex>(cuDoubleComplex var, unsigned int srcLane, int width)
{
    double2 a = *reinterpret_cast<double2 *>(&var);
    a.x = ghost_shfl_down(a.x, srcLane, width);
    a.y = ghost_shfl_down(a.y, srcLane, width);
    return *reinterpret_cast<cuDoubleComplex *>(&a);
}
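/*
 * Warp-level reductions: a classic shuffle tree. Each step adds in a value
 * from `offset` lanes away and halves the offset, so lane 0 holds the sum
 * of all participating lanes after log2(width) steps. accu<> is the
 * type-generic addition from cu_complex.h.
 */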
template<typename v_t>
__inline__ __device__ v_t ghost_warpReduceSum(v_t val)
{
    for (int offset = 32 / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down32(val, offset));
    }
    return val;
}
template<typename v_t>
__inline__ __device__ v_t ghost_partialWarpReduceSum(v_t val, int size, int width)
{
    // Reduce the first `size` lanes of a shuffle segment of the given width.
    for (int offset = size / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down(val, offset, width));
    }
    return val;
}
template<typename v_t, int size>
__inline__ __device__ v_t ghost_partialWarpReduceSumFast(v_t val)
{
    // `size` is a compile-time constant here, so the compiler can fully
    // unroll this loop.
    for (int offset = size / 2; offset > 0; offset /= 2) {
        val = accu<v_t>(val, ghost_shfl_down(val, offset, size));
    }
    return val;
}
template<>
__inline__ __device__ double3 ghost_warpReduceSum<double3>(double3 val)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        // Reduce the three components independently (reconstructed body).
        val.x += ghost_shfl_down32(val.x, offset);
        val.y += ghost_shfl_down32(val.y, offset);
        val.z += ghost_shfl_down32(val.z, offset);
    }
    return val;
}
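/*
 * Block-level reductions: every warp first reduces its own values, lane 0
 * of each warp publishes its partial sum to shared memory, and the first
 * warp then reduces those partials. The variants that cast `shared` rely
 * on the launcher providing at least blockDim.x / warpSize elements of
 * dynamic shared memory per block.
 */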
template<typename v_t>
__inline__ __device__ v_t ghost_partialBlockReduceSum(v_t val, int size)
{
    v_t *shmem = (v_t *)shared; // dynamic shared memory, one slot per warp

    int lane = (threadIdx.x % warpSize);
    int wid = (threadIdx.x / warpSize);

    val = ghost_warpReduceSum(val); // each warp reduces its own values

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val;

    __syncthreads();

    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val); // zero<> helper from cu_complex.h (reconstructed)
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_partialWarpReduceSum(val, size, size); // final step reconstructed

    return val;
}
template<typename v_t>
__inline__ __device__ v_t ghost_1dPartialBlockReduceSum(v_t val, int nwarps)
{
    v_t *shmem = (v_t *)shared;

    int lane = (threadIdx.x % warpSize);
    int wid = (threadIdx.x / warpSize);

    val = ghost_warpReduceSum(val);

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val;

    __syncthreads();

    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val);
    }

    if (threadIdx.x / warpSize == 0) {
        val = ghost_partialWarpReduceSum(val, nwarps, nwarps); // reconstructed
    }

    return val;
}
template<typename v_t>
__inline__ __device__ v_t ghost_blockReduceSum(v_t val)
{
    __shared__ v_t shmem[32]; // static staging buffer, one slot per warp;
                              // note: the (32 * threadIdx.y) offsets below
                              // require 32 slots per y-slice when blockDim.y > 1

    int lane = (threadIdx.x % warpSize) + (32 * threadIdx.y);
    int wid = (threadIdx.x / warpSize) + (32 * threadIdx.y);

    val = ghost_warpReduceSum(val);

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val;

    __syncthreads();

    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        zero<v_t>(val);
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_warpReduceSum(val);

    return val;
}
template<>
__inline__ __device__ double3 ghost_blockReduceSum<double3>(double3 val)
{
    double3 *shmem = (double3 *)shared; // this variant stages through dynamic shared memory

    int lane = (threadIdx.x % warpSize) + (32 * threadIdx.y);
    int wid = (threadIdx.x / warpSize) + (32 * threadIdx.y);

    val = ghost_warpReduceSum(val);

    if (threadIdx.x % warpSize == 0)
        shmem[wid] = val;

    __syncthreads();

    if (threadIdx.x < blockDim.x / warpSize) {
        val = shmem[lane];
    } else {
        val.x = 0.;
        val.y = 0.;
        val.z = 0.;
    }

    if (threadIdx.x / warpSize == 0)
        val = ghost_warpReduceSum(val);

    return val;
}
// Binary sum functor; the struct's original name is not preserved in this
// extract, so "Sum" is a stand-in.
struct Sum {
    template<typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const
    {
        return a + b;
    }

    __device__ __forceinline__ cuDoubleComplex operator()(const cuDoubleComplex &a, const cuDoubleComplex &b) const
    {
        return cuCadd(a, b);
    }

    __device__ __forceinline__ cuFloatComplex operator()(const cuFloatComplex &a, const cuFloatComplex &b) const
    {
        return cuCaddf(a, b);
    }
};
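/*
 * Atomic accumulation helpers. Native double-precision atomicAdd() exists
 * only from compute capability 6.0 on, hence the guard below. Complex
 * values are accumulated as two independent real atomics on the real and
 * imaginary parts; the pair is not updated atomically as a whole.
 */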
#if (__CUDA_ARCH__ >= 600)
template<typename v_t>
__device__ inline void ghost_atomicAdd(v_t *addr, v_t val)
{
    atomicAdd(addr, val);
}

template<>
__device__ inline void ghost_atomicAdd(cuDoubleComplex *addr, cuDoubleComplex val)
{
    atomicAdd((double *)addr, Real<cuDoubleComplex, double>(val));
    atomicAdd(((double *)addr) + 1, Imag<cuDoubleComplex, double>(val));
}

template<>
__device__ inline void ghost_atomicAdd(cuFloatComplex *addr, cuFloatComplex val)
{
    atomicAdd((float *)addr, Real<cuFloatComplex, float>(val));
    atomicAdd(((float *)addr) + 1, Imag<cuFloatComplex, float>(val));
}
#endif // __CUDA_ARCH__ >= 600
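/*
 * Global reduction kernels in two flavors: the atomic variants fold each
 * warp's partial sum straight into the output with ghost_atomicAdd(),
 * while the per-block variants write one partial result per block and
 * expect a second pass (or a single-block launch) to finish the reduction.
 */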
template<typename v_t>
__global__ void ghost_deviceReduceSumAtomic(v_t *in, v_t *out, ghost_lidx N)
{
    // Kernel name is not preserved in this extract and is reconstructed here.
    v_t sum;
    zero<v_t>(sum);

    ghost_lidx i;
    for (i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        sum = accu<v_t>(sum, in[i]);
    }
    sum = ghost_warpReduceSum(sum); // warp-level combine reconstructed
    if ((threadIdx.x & (warpSize - 1)) == 0)
        ghost_atomicAdd<v_t>(out, sum);
}
template<typename v_t>
__global__ void ghost_deviceReduceSumMultipleAtomic(v_t *in, v_t *out, ghost_lidx N, ghost_lidx ncols)
{
    // Kernel name, signature, and block-to-column mapping reconstructed.
    ghost_lidx col = blockIdx.x;
    if (col >= ncols) {
        return;
    }

    v_t sum;
    zero<v_t>(sum);

    for (ghost_lidx i = threadIdx.x; i < N; i += blockDim.x) {
        sum = accu<v_t>(sum, in[col * N + i]);
    }
    sum = ghost_warpReduceSum(sum);
    if ((threadIdx.x & (warpSize - 1)) == 0)
        ghost_atomicAdd<v_t>(out + col, sum);
}
template<typename v_t>
__global__ void ghost_deviceReduceSum(v_t *in, v_t *out, ghost_lidx N)
{
    v_t sum;
    zero<v_t>(sum);

    ghost_lidx i;
    for (i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        sum = accu<v_t>(sum, in[i]);
    }
    sum = ghost_blockReduceSum(sum); // block-level combine reconstructed
    if (threadIdx.x == 0) {
        out[blockIdx.x] = sum; // one partial result per block
    }
}
template<typename v_t>
__global__ void ghost_deviceReduceSumMultiple(v_t *in, v_t *out, ghost_lidx N, ghost_lidx ncols)
{
    ghost_lidx col = blockIdx.x; // one block per column (mapping reconstructed)
    if (col >= ncols) {
        return;
    }

    v_t sum;
    zero<v_t>(sum);

    for (ghost_lidx i = threadIdx.x; i < N; i += blockDim.x) {
        sum = accu<v_t>(sum, in[col * N + i]);
    }
    sum = ghost_blockReduceSum(sum);
    if (threadIdx.x == 0) {
        out[col] = sum;
    }
}
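/*
 * Streaming loads and stores. The PTX ".cs" cache hint marks the access as
 * "cache streaming": the data is expected to be touched only once, so it is
 * fetched or written with an evict-first policy to limit cache pollution.
 * Complex values are moved as two scalar accesses at byte offsets 0 and
 * sizeof(real part).
 */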
template<typename T>
__device__ __inline__ T streaming_load(const T *addr)
{
    return *addr; // generic fallback (body reconstructed): a plain load
}

template<>
__device__ __inline__ double streaming_load(const double *addr)
{
    double ret;
    asm("ld.global.cs.f64 %0, [%1];"
        : "=d"(ret)
        : "l"(addr));
    return ret;
}

template<>
__device__ __inline__ float streaming_load(const float *addr)
{
    float ret;
    asm("ld.global.cs.f32 %0, [%1];"
        : "=f"(ret)
        : "l"(addr));
    return ret;
}
template<>
__device__ __inline__ cuDoubleComplex streaming_load(const cuDoubleComplex *addr)
{
    double re, im;
    asm("ld.global.cs.f64 %0, [%1];"
        : "=d"(re)
        : "l"((const double *)addr));
    asm("ld.global.cs.f64 %0, [%1+8];"
        : "=d"(im)
        : "l"((const double *)addr));
    return make_cuDoubleComplex(re, im);
}
template<>
__device__ __inline__ cuFloatComplex streaming_load(const cuFloatComplex *addr)
{
    float re, im;
    asm("ld.global.cs.f32 %0, [%1];"
        : "=f"(re)
        : "l"((const float *)addr));
    asm("ld.global.cs.f32 %0, [%1+4];"
        : "=f"(im)
        : "l"((const float *)addr));
    return make_cuFloatComplex(re, im);
}
template<typename T>
__device__ __inline__ void streaming_store(T *addr, const T val)
{
    *addr = val; // generic fallback (body reconstructed): a plain store
}

template<>
__device__ __inline__ void streaming_store(double *addr, const double val)
{
    asm("st.global.cs.f64 [%0], %1;" ::
        "l"(addr),
        "d"(val));
}

template<>
__device__ __inline__ void streaming_store(float *addr, const float val)
{
    asm("st.global.cs.f32 [%0], %1;" ::
        "l"(addr),
        "f"(val));
}
template<>
__device__ __inline__ void streaming_store(cuDoubleComplex *addr, const cuDoubleComplex val)
{
    double re = Real<cuDoubleComplex, double>(val);
    double im = Imag<cuDoubleComplex, double>(val);
    asm("st.global.cs.f64 [%0], %1;" ::
        "l"((double *)addr),
        "d"(re));
    asm("st.global.cs.f64 [%0+8], %1;" ::
        "l"((double *)addr),
        "d"(im));
}