spgemm_rmerge.hpp
#ifndef VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
#define VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

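/** @file viennacl/linalg/cuda/spgemm_rmerge.hpp
    @brief Sparse matrix-matrix multiplication (SpGEMM) for compressed_matrix on CUDA, based on a row-merging ("RMerge") approach.
*/
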
#include <stdexcept>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/tools/timer.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

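/** @brief Loads a value from global memory; on devices with compute capability 3.5 or higher the read-only data cache is used via __ldg(). */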
template<typename NumericT>
static inline __device__ NumericT load_and_cache(const NumericT *address)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}


//
// Stage 1: Obtain upper bound for number of elements per row in C:
//
template<typename IndexT>
__device__ IndexT round_to_next_power_of_2(IndexT val)
{
  if (val > 32)
    return 64; // just to indicate that we need to split/factor the matrix!
  else if (val > 16)
    return 32;
  else if (val > 8)
    return 16;
  else if (val > 4)
    return 8;
  else if (val > 2)
    return 4;
  else if (val > 1)
    return 2;
  else
    return 1;
}
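// Example: a longest row of 13 nonzeros in A yields a subwarp size of 16; anything above 32
// maps to 64, which signals that A has to be decomposed first (see the A = A2 * G1 kernels below).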
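/** @brief Computes, for each thread block (group of rows of A), the maximum row length in A,
*          the maximum length of the rows of B referenced by that group, and the subwarp size
*          (next power of two of the longest row of A) to be used in stages 2 and 3.
*/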
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          IndexT *subwarpsize_per_group,
          IndexT *max_nnz_row_A_per_group,
          IndexT *max_nnz_row_B_per_group)
{
  unsigned int subwarpsize_in_thread = 0;
  unsigned int max_nnz_row_A = 0;
  unsigned int max_nnz_row_B = 0;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
  {
    unsigned int A_row_start = A_row_indices[row];
    unsigned int A_row_end   = A_row_indices[row+1];
    unsigned int row_num     = A_row_end - A_row_start;
    subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
    max_nnz_row_A = max(max_nnz_row_A, row_num);
    for (unsigned int j = A_row_start; j < A_row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
    }
  }

  // reduction to obtain maximum in thread block
  __shared__ unsigned int shared_subwarpsize[256];
  __shared__ unsigned int shared_max_nnz_row_A[256];
  __shared__ unsigned int shared_max_nnz_row_B[256];

  shared_subwarpsize[threadIdx.x]   = subwarpsize_in_thread;
  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_subwarpsize[threadIdx.x]   = max(shared_subwarpsize[threadIdx.x],   shared_subwarpsize[threadIdx.x + stride]);
      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    subwarpsize_per_group[blockIdx.x]   = round_to_next_power_of_2(shared_subwarpsize[0]);
    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
  }
}

//
// Stage 2: Determine sparsity pattern of C
//

// Using warp shuffle routines (CUDA arch 3.5)
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shuffle(IndexT min_index)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
  return min_index;
}

// Using shared memory
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shared(IndexT min_index, IndexT id_in_warp, IndexT *shared_buffer)
{
  shared_buffer[threadIdx.x] = min_index;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] = min(shared_buffer[threadIdx.x], shared_buffer[(threadIdx.x + i) % 512]);
  return shared_buffer[threadIdx.x - id_in_warp];
}


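/** @brief Computes the number of nonzeros of each row of C = A * B. One subwarp processes one row of A
*          and merges the sorted column-index lists of the corresponding rows of B.
*/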
template<unsigned int SubWarpSizeV, typename IndexT>
__global__ void compressed_matrix_gemm_stage_2(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          IndexT B_size2,
          IndexT * C_row_indices)
{
  __shared__ unsigned int shared_buffer[512];

  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    unsigned int my_row_B = row_A_start + id_in_warp;
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

    unsigned int num_nnz = 0;
    if (row_A_end - row_A_start > 1) // rows of A with zero entries or a single entry are handled faster below
    {
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

      while (1)
      {
        // determine current minimum across the subwarp (shared-memory reduction):
        unsigned int min_index = current_front_index;
        min_index = subwarp_minimum_shared<SubWarpSizeV>(min_index, id_in_warp, shared_buffer);

        if (min_index == B_size2)
          break;

        // update front:
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
        }

        ++num_nnz;
      }
    }
    else
    {
      num_nnz = row_B_end - row_B_start;
    }

    if (id_in_warp == 0)
      C_row_indices[row] = num_nnz;
  }

}


//
// Stage 3: Fill C with values
//

// Using warp shuffle routines (CUDA arch 3.5)
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shuffle(NumericT output_value)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    output_value += __shfl_xor((int)output_value, (int)i);
  return output_value;
}

// Using shared memory
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shared(NumericT output_value, unsigned int id_in_warp, NumericT *shared_buffer)
{
  shared_buffer[threadIdx.x] = output_value;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] += shared_buffer[(threadIdx.x + i) % 512];
  return shared_buffer[threadIdx.x - id_in_warp];
}


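/** @brief Computes the column indices and values of C = A * B using the row pointers obtained in stage 2.
*          Each subwarp merges the rows of B selected by one row of A, accumulates the products, and writes
*          the results through a small register buffer to obtain coalesced stores.
*/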
template<unsigned int SubWarpSizeV, typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_stage_3(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          const NumericT * A_elements,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          const NumericT * B_elements,
          IndexT B_size2,
          IndexT const * C_row_indices,
          IndexT * C_col_indices,
          NumericT * C_elements)
{
  __shared__ unsigned int shared_indices[512];
  __shared__ NumericT     shared_values[512];

  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? id_in_warp : 0); // special case: a single nonzero in this row of A
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index)     : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
    NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

    unsigned int index_in_C = C_row_indices[row];

    if (row_A_end - row_A_start > 1)
    {
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
      NumericT     current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;

      unsigned int index_buffer = 0;
      NumericT     value_buffer = 0;
      unsigned int buffer_size  = 0;
      while (1)
      {
        // determine current minimum:
        unsigned int min_index = subwarp_minimum_shared<SubWarpSizeV>(current_front_index, id_in_warp, shared_indices);

        if (min_index == B_size2) // done
          break;

        // compute entry in C:
        NumericT output_value = (current_front_index == min_index) ? val_A * current_front_value : 0;
        output_value = subwarp_accumulate_shared<SubWarpSizeV>(output_value, id_in_warp, shared_values);

        // update front:
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
          current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;
        }

        // write current front to register buffer:
        index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
        value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
        ++buffer_size;

        // flush register buffer via a coalesced write once full:
        if (buffer_size == SubWarpSizeV)
        {
          C_col_indices[index_in_C + id_in_warp] = index_buffer;
          C_elements[index_in_C + id_in_warp]    = value_buffer;
        }

        index_in_C += (buffer_size == SubWarpSizeV) ? SubWarpSizeV : 0;
        buffer_size = (buffer_size == SubWarpSizeV) ? 0            : buffer_size;
      }

      // write remaining entries in register buffer to C:
      if (id_in_warp < buffer_size)
      {
        C_col_indices[index_in_C + id_in_warp] = index_buffer;
        C_elements[index_in_C + id_in_warp]    = value_buffer;
      }
    }
    else // write respective row using the full subwarp:
    {
      for (unsigned int i = row_B_start + id_in_warp; i < row_B_end; i += SubWarpSizeV)
      {
        C_col_indices[index_in_C + id_in_warp] = load_and_cache(B_col_indices + i);
        C_elements[index_in_C + id_in_warp]    = val_A * load_and_cache(B_elements + i);
        index_in_C += SubWarpSizeV;
      }
    }

  }

}



//
// Decomposition kernels:
//
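// If some row of A holds more than 32 nonzeros, A is factored as A = A2 * G1:
// G1 contains the entries of A re-partitioned into chunks of at most max_per_row nonzeros per row,
// and A2 is a selector matrix with unit entries that sums the partial products of the chunks.
// The product is then computed recursively as C = A2 * (G1 * B).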
template<typename IndexT>
__global__ void compressed_matrix_gemm_decompose_1(
          const IndexT * A_row_indices,
          IndexT A_size1,
          IndexT max_per_row,
          IndexT *chunks_per_row)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1);
  }
}


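/** @brief Fills the selector matrix A2: row i of A2 holds one unit entry per chunk of row i of A,
*          with consecutive column indices derived from the exclusive scan stored in new_row_buffer.
*/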
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_A2(
          IndexT * A2_row_indices,
          IndexT * A2_col_indices,
          NumericT * A2_elements,
          IndexT A2_size1,
          IndexT *new_row_buffer)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int index_start = new_row_buffer[i];
    unsigned int index_stop  = new_row_buffer[i+1];

    A2_row_indices[i] = index_start;

    for (IndexT j = index_start; j < index_stop; ++j)
    {
      A2_col_indices[j] = j;
      A2_elements[j]    = NumericT(1);
    }
  }

  // write last entry in row_buffer with global thread 0:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
}

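/** @brief Fills G1: column indices and values are copied verbatim from A, while the row pointers are
*          rewritten so that each row of G1 holds at most max_per_row of the original entries.
*/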
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_G1(
          IndexT * G1_row_indices,
          IndexT * G1_col_indices,
          NumericT * G1_elements,
          IndexT G1_size1,
          IndexT const *A_row_indices,
          IndexT const *A_col_indices,
          NumericT const *A_elements,
          IndexT A_size1,
          IndexT A_nnz,
          IndexT max_per_row,
          IndexT *new_row_buffer)
{
  // Part 1: Copy column indices and entries:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
  {
    G1_col_indices[i] = A_col_indices[i];
    G1_elements[i]    = A_elements[i];
  }

  // Part 2: Derive new row indices:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int old_start  = A_row_indices[i];
    unsigned int new_start  = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i+1] - new_start;

    for (IndexT j = 0; j < row_chunks; ++j)
      G1_row_indices[new_start + j] = old_start + j * max_per_row;
  }

  // write last entry in row_buffer with global thread 0:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    G1_row_indices[G1_size1] = A_row_indices[A_size1];
}


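/** @brief Carries out sparse-matrix-sparse-matrix multiplication C = A * B for compressed_matrix objects
*          using the row-merging ("RMerge") approach on CUDA.
*
* If a row of A holds more than 32 nonzeros, A is first decomposed as A = A2 * G1 and the product is
* computed recursively as C = A2 * (G1 * B).
*
* @param A    Left factor (CSR)
* @param B    Right factor (CSR)
* @param C    Result matrix (CSR), resized as needed
*/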
template<class NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV>       & C)
{
  C.resize(A.size1(), B.size2(), false);

  unsigned int blocknum  = 256;
  unsigned int threadnum = 128;

  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size to use for each work group (from round_to_next_power_of_2)
  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of A encountered in each work group
  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of B encountered in each work group

  //
  // Stage 1: Determine upper bound for number of nonzeros
  //
  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B)
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");

  subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());

  max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());

  max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());

  unsigned int max_subwarp_size = 0;
  //std::cout << "Scratchpad offsets: " << std::endl;
  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
  unsigned int A_max_nnz_per_row = 0;
  for (std::size_t i=0; i<max_nnz_row_A.size(); ++i)
    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);

  if (max_subwarp_size > 32)
  {
    // determine augmented size:
    unsigned int max_entries_in_G = 32;
    if (A_max_nnz_per_row <= 256)
      max_entries_in_G = 16;
    if (A_max_nnz_per_row <= 64)
      max_entries_in_G = 8;

    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                static_cast<unsigned int>(max_entries_in_G),
                                                                viennacl::cuda_arg(exclusive_scan_helper)
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");

    viennacl::linalg::exclusive_scan(exclusive_scan_helper);
    unsigned int augmented_size = exclusive_scan_helper[A.size1()];

    // split A = A2 * G1
    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));

    // fill A2:
    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
                                                       static_cast<unsigned int>(A2.size1()),
                                                       viennacl::cuda_arg(exclusive_scan_helper)
                                                      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");

    // fill G1:
    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
                                                       static_cast<unsigned int>(G1.size1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       static_cast<unsigned int>(A.size1()),
                                                       static_cast<unsigned int>(A.nnz()),
                                                       static_cast<unsigned int>(max_entries_in_G),
                                                       viennacl::cuda_arg(exclusive_scan_helper)
                                                      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");

    // compute tmp = G1 * B;
    // C = A2 * tmp;
    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
    prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
    prod_impl(A2, tmp, C); // this may split A2 again
    return;
  }

  //std::cout << "Running RMerge with subwarp size " << max_subwarp_size << std::endl;

  subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
  max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));

  //
  // Stage 2: Determine pattern of C
  //

  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_2<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1())
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_2<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1())
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
  else
  {
    compressed_matrix_gemm_stage_2<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1())
                                                              );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }

  // exclusive scan on C.handle1(), ultimately allowing to allocate remaining memory for C
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer.set(i, current_offset);
    current_offset += tmp;
  }
  row_buffer.set(C.size1(), current_offset);
  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());


  //
  // Stage 3: Compute entries in C
  //
  C.reserve(current_offset, false);

  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_3<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle())
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_3<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle())
                                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
  else
  {
    compressed_matrix_gemm_stage_3<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               viennacl::cuda_arg<NumericT>(A.handle()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               viennacl::cuda_arg<NumericT>(B.handle()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                               viennacl::cuda_arg<NumericT>(C.handle())
                                                              );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }

}

} // namespace cuda
} // namespace linalg
} // namespace viennacl


#endif
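
Usage note (not part of the header): the kernels above are internal; user code reaches them through the generic sparse product interface. The following is a minimal sketch, assuming the standard ViennaCL 1.7 API with VIENNACL_WITH_CUDA defined; the matrix sizes and values are made up for illustration.

// Minimal sketch: C = A * B for two CSR matrices. The expression viennacl::linalg::prod(A, B)
// dispatches to a backend prod_impl() such as the RMerge implementation in this header.
#include <vector>
#include <map>

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/prod.hpp"

int main()
{
  std::size_t n = 5;

  // simple band matrices as example data (row -> {column : value})
  std::vector<std::map<unsigned int, float> > host_A(n), host_B(n);
  for (unsigned int i = 0; i < n; ++i)
  {
    host_A[i][i] = 2.0f;
    if (i + 1 < n) host_A[i][i+1] = -1.0f;
    host_B[i][i] = 1.0f;
  }

  viennacl::compressed_matrix<float> A(n, n), B(n, n), C;
  viennacl::copy(host_A, A);   // transfer CSR data to the active (CUDA) context
  viennacl::copy(host_B, B);

  C = viennacl::linalg::prod(A, B);   // sparse-sparse product (SpGEMM)

  return 0;
}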