GHOST  1.1.2
General, Hybrid, and Optimized Sparse Toolkit
tsmttsm_cu_kernel.h
#ifndef GHOST_TSMTTSM_CU_KERNEL_H
#define GHOST_TSMTTSM_CU_KERNEL_H
#include <cublas_v2.h>
#include <iostream>
#include <typeinfo>
#include "ghost/config.h"
#include "ghost/cu_complex.h"
#include "ghost/cu_util.h"
#include "ghost/types.h"

namespace {
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;

// Final reduction: blockResults holds blockCount partial M x N results stored
// back to back (index m contiguous). Each thread sums one output element across
// all partials and writes result = beta * result + alpha * sum into the output
// with leading dimension ldc. (lda and ldb are accepted but not used here.)
template<typename oT, int M, int N>
__global__ void deviceReduce(
    oT *blockResults, oT *result, oT alpha, oT beta, int blockCount, int lda, int ldb, int ldc)
{
    int tidx = blockDim.x * blockIdx.x + threadIdx.x;
    if (tidx >= M * N) return;
    int n = tidx / M;
    int m = tidx % M;

    oT sum;
    zero(sum);
    for (int i = 0; i < blockCount; i++) {
        sum = accu(sum, blockResults[i * N * M + n * M + m]);
    }

    result[n * ldc + m] = accu(scale(result[n * ldc + m], beta), scale2(sum, alpha));
}

template<typename T, bool conjv, bool TRANSPOSE>
__device__ T condConj1(T v)
{
    if (conjv && !TRANSPOSE) v = conj(v);
    return v;
}
template<typename T, bool conjv, bool TRANSPOSE>
__device__ T condConj2(T v)
{
    if (conjv && TRANSPOSE) v = conj(v);
    return v;
}

// Round v down to the next smaller or equal power of two,
// e.g. roundPoT(12) == 8, roundPoT(16) == 16.
__device__ int roundPoT(int v)
{
    int r = v;
    r |= r >> 1;
    r |= r >> 2;
    r |= r >> 4;
    r |= r >> 8;
    r |= r >> 16;
    r -= (r >> 1);
    return r;
}

template<typename T, typename oT, int conjv, int M, int N, int BLOCKSIZE, bool TRANSPOSE, bool SELF>
__global__ void __launch_bounds__(BLOCKSIZE) genv7_blockProductKernel(
    const T *A, const T *B, oT *out, const int K, const int lda, const int ldb, const int ldc)
{
    const int rowsPerBlock = BLOCKSIZE / M;
    int m = threadIdx.x % M;
    int localRow = threadIdx.x / M;
    int bOffset = localRow * ldb + m;
    int aOffset = localRow * lda + m;
    if (m >= N) bOffset = localRow * ldb + 0;
    if (bOffset >= rowsPerBlock * ldb) bOffset = 0;
    if (aOffset >= rowsPerBlock * lda) aOffset = 0;

    __shared__ oT blockStorage[rowsPerBlock * M * (sizeof(T) > sizeof(oT) ? 2 : 1)];
    T *rowCache = reinterpret_cast<T *>(blockStorage);

    zero(blockStorage[threadIdx.x]);
    __syncthreads();

    oT threadSum[N];
    for (int n = 0; n < N; n++) {
        zero(threadSum[n]);
    }

    // Block synchronous loop
    int idx = blockIdx.x * rowsPerBlock;
    T avNow = __ldg(A + idx * lda + aOffset);
    T bvNow = __ldg(B + idx * ldb + bOffset);
    T avNext;
    T bvNext;
    zero(avNext);
    zero(bvNext);

    for (; idx < K - rowsPerBlock; idx += gridDim.x * rowsPerBlock) {
        int idxNext = min(K - rowsPerBlock, idx + gridDim.x * rowsPerBlock);
        avNext = __ldg(A + idxNext * lda + aOffset);

        if (!SELF) {
            bvNext = __ldg(B + idxNext * ldb + bOffset);
        } else {
            bvNext = avNext;
        }
        __syncthreads();
        rowCache[threadIdx.x] = bvNow;
        __syncthreads();

        int localAddress = threadIdx.x - m;
        for (int n = 0; n < N; n++) {
            threadSum[n] = axpy(threadSum[n], condConj1<oT, conjv, TRANSPOSE>((oT)avNow),
                condConj2<oT, conjv, TRANSPOSE>((oT)rowCache[localAddress + n]));
        }
        avNow = avNext;
        bvNow = bvNext;
    }

    // Remainder loop
    for (idx = idx + localRow; idx < K; idx += gridDim.x * rowsPerBlock) {
        T av = A[idx * lda + m];
        for (int n = 0; n < N; n++) {
            threadSum[n] = axpy(threadSum[n], condConj1<oT, conjv, TRANSPOSE>((oT)av),
                condConj2<oT, conjv, TRANSPOSE>((oT)B[idx * ldb + n]));
        }
    }

    const int redSteps = roundPoT(rowsPerBlock);

    // Calculate block results
    for (int n = 0; n < N; n++) {
        __syncthreads();
        blockStorage[threadIdx.x] = threadSum[n];
        __syncthreads();

        for (unsigned int s = redSteps; s > 0; s /= 2) {
            if (localRow < s && localRow < rowsPerBlock - s) {
                blockStorage[localRow * M + m] =
                    accu(blockStorage[localRow * M + m], blockStorage[(localRow + s) * M + m]);
            }
            __syncthreads();
        }

        if (threadIdx.x < M) {
            if (TRANSPOSE) {
                out[blockIdx.x * M * N + m * N + n] = blockStorage[m];
            } else {
                out[blockIdx.x * N * M + n * M + m] = blockStorage[m];
            }
        }
    }
}
} // namespace
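
Taken together, the kernels above compute C = beta * C + alpha * A^T * B for a tall-skinny K x M block A and a tall-skinny K x N block B, with optional conjugation when conjv is set: genv7_blockProductKernel writes one partial M x N product per thread block, and deviceReduce sums those partials and applies alpha and beta. The routine below is not part of the header; it is a minimal host-side reference sketch of that operation for real-valued data (conjugation omitted), using the index conventions visible in the kernels (A[k * lda + m], B[k * ldb + n], C[n * ldc + m]):

template<typename T, typename oT>
void tsmttsm_reference(oT *C, const T *A, const T *B, oT alpha, oT beta,
    int M, int N, int K, int ldc, int lda, int ldb)
{
    for (int n = 0; n < N; n++) {
        for (int m = 0; m < M; m++) {
            // Dot product of column m of A with column n of B over all K rows
            oT sum = 0;
            for (int k = 0; k < K; k++) {
                sum += static_cast<oT>(A[k * lda + m]) * static_cast<oT>(B[k * ldb + n]);
            }
            C[n * ldc + m] = beta * C[n * ldc + m] + alpha * sum;
        }
    }
}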

template<typename T, typename oT, int M, int N, int conjv>
static ghost_error ghost_tsmttsm_cu_rm(oT *const __restrict__ C, const T *const __restrict__ A,
    const T *const __restrict__ B, const oT alpha, const oT beta, ghost_lidx K, ghost_lidx ldc,
    ghost_lidx lda, ghost_lidx ldb)
{
    ghost_error ret = GHOST_SUCCESS;

    const int targetBlockSize = 256;
    int deviceUsed;
    cudaGetDevice(&deviceUsed);
    cudaDeviceProp prop;
    ghost_cu_deviceprop_get(&prop);

    int numBlocks;

    if (N > M) {
        int const blockSize = (targetBlockSize / N) * N;
        CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
                      genv7_blockProductKernel<T, oT, conjv, N, M, blockSize, true, false>, blockSize, 0),
            ret);
    } else {
        int const blockSize = (targetBlockSize / M) * M;
        if (M == N && A == B) {
            CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
                          genv7_blockProductKernel<T, oT, conjv, M, N, blockSize, false, true>,
                          blockSize, 0),
                ret);
        } else {
            CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
                          genv7_blockProductKernel<T, oT, conjv, M, N, blockSize, false, false>,
                          blockSize, 0),
                ret);
        }
    }
    int blockCount = min(prop.multiProcessorCount * numBlocks, K * N / 10 / targetBlockSize + 1);

    size_t required_temp_storage_bytes = M * N * blockCount * sizeof(oT);
    ghost_cu_temp_buffer_malloc(&d_temp_storage, required_temp_storage_bytes);

    if (N > M) {
        int const blockSize = (targetBlockSize / N) * N;
        genv7_blockProductKernel<T, oT, conjv, N, M, blockSize, true, false>
            <<<blockCount, blockSize>>>(B, A, (oT *)d_temp_storage, K, ldb, lda, ldc);
    } else {
        int const blockSize = (targetBlockSize / M) * M;
        if (M == N && A == B) {
            genv7_blockProductKernel<T, oT, conjv, M, N, blockSize, false, true>
                <<<blockCount, blockSize>>>(A, B, (oT *)d_temp_storage, K, lda, ldb, ldc);
        } else {
            genv7_blockProductKernel<T, oT, conjv, M, N, blockSize, false, false>
                <<<blockCount, blockSize>>>(A, B, (oT *)d_temp_storage, K, lda, ldb, ldc);
        }
    }

    CUDA_CALL(cudaGetLastError(), ret);
    deviceReduce<oT, M, N>
        <<<(M * N) / 256 + 1, 256>>>((oT *)d_temp_storage, C, alpha, beta, blockCount, lda, ldb, ldc);
    CUDA_CALL(cudaGetLastError(), ret);
    ghost_cu_temp_buffer_free(d_temp_storage);
    return ret;
}
#endif
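
As a usage illustration (this snippet is not from GHOST; the wrapper name, the include paths, and the choice M = N = 4 are made up for the example, and the device buffers are assumed to be allocated and filled elsewhere), the template could be instantiated for double precision without conjugation like this:

#include "ghost/types.h"
#include "tsmttsm_cu_kernel.h"

// Hypothetical wrapper: C (4 x 4) = A^T * B for two K x 4 blocks stored
// row-wise on the device with leading dimension 4.
static ghost_error example_tsmttsm(double *dC, const double *dA, const double *dB, ghost_lidx K)
{
    const ghost_lidx lda = 4, ldb = 4, ldc = 4;
    // T = double, oT = double, M = 4, N = 4, conjv = 0
    return ghost_tsmttsm_cu_rm<double, double, 4, 4, 0>(
        dC, dA, dB, 1.0 /* alpha */, 0.0 /* beta */, K, ldc, lda, ldb);
}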
Referenced definitions:

ghost/types.h (Header file for type definitions.)
    ghost_lidx: int32_t (types.h:503)

ghost/error.h
    ghost_error: Error return type. (error.h:23)
    GHOST_SUCCESS: No error occurred. (error.h:27)
    CUDA_CALL(call, __err) (error.h:238)

ghost/cu_complex.h (Inline template functions for CUDA complex number handling.)
    __device__ __host__ void zero(T &val) (cu_complex.h:12)
    __device__ t accu(t val, t val2) (cu_complex.h:103)
    __device__ T axpy(T val, T val2, T2 val3) (cu_complex.h:122)
    __device__ T conj(T x) (cu_complex.h:226)
    __device__ T scale(T y, T a) (cu_complex.h:275)
    __device__ T1 scale2(T1 y, T2 a) (cu_complex.h:293)

ghost/cu_util.h (CUDA utility functions.)
    ghost_error ghost_cu_deviceprop_get(ghost_cu_deviceprop *prop): Get the CUDA device properties. (cu_util.c:517)
    ghost_error ghost_cu_temp_buffer_malloc(void **mem, size_t bytesize): Useful for allocating small temporary buffers. Keeps a list of previously allocated and freed buffers... (cu_temp_buffer_malloc.cpp:27)
    ghost_error ghost_cu_temp_buffer_free(void *mem): Frees memory allocated with ghost_cu_temp_buffer_malloc. Threadsafe. (cu_temp_buffer_malloc.cpp:65)