1 #ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_
2 #define VIENNACL_LINALG_CUDA_SPGEMM_HPP_
27 #include <thrust/scan.h>
28 #include <thrust/device_ptr.h>
48 template<
typename NumericT>
51 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
52 return __ldg(address);
62 template<
typename IndexT>
81 template<
typename IndexT>
83 const IndexT * A_row_indices,
84 const IndexT * A_col_indices,
86 const IndexT * B_row_indices,
87 IndexT *subwarpsize_per_group,
88 IndexT *max_nnz_row_A_per_group,
89 IndexT *max_nnz_row_B_per_group)
91 unsigned int subwarpsize_in_thread = 0;
92 unsigned int max_nnz_row_A = 0;
93 unsigned int max_nnz_row_B = 0;
95 unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
96 unsigned int row_per_group_end =
min(A_size1, rows_per_group * (blockIdx.x + 1));
98 for (
unsigned int row = rows_per_group * blockIdx.x + threadIdx.x;
row < row_per_group_end;
row += blockDim.x)
100 unsigned int A_row_start = A_row_indices[
row];
101 unsigned int A_row_end = A_row_indices[
row+1];
102 unsigned int row_num = A_row_end - A_row_start;
105 unsigned int subwarp_sqrt = (
unsigned int)sqrt(
double(row_num)) + 1;
106 subwarpsize_in_thread =
max(subwarp_sqrt, subwarpsize_in_thread);
109 subwarpsize_in_thread =
max(A_row_end - A_row_start, subwarpsize_in_thread);
110 max_nnz_row_A =
max(max_nnz_row_A, row_num);
111 for (
unsigned int j = A_row_start; j < A_row_end; ++j)
113 unsigned int col = A_col_indices[j];
114 unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
115 max_nnz_row_B =
max(row_len_B, max_nnz_row_B);
120 __shared__
unsigned int shared_subwarpsize[256];
121 __shared__
unsigned int shared_max_nnz_row_A[256];
122 __shared__
unsigned int shared_max_nnz_row_B[256];
124 shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread;
125 shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
126 shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
132 shared_subwarpsize[threadIdx.x] =
max( shared_subwarpsize[threadIdx.x], shared_subwarpsize[threadIdx.x +
stride]);
133 shared_max_nnz_row_A[threadIdx.x] =
max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x +
stride]);
134 shared_max_nnz_row_B[threadIdx.x] =
max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x +
stride]);
138 if (threadIdx.x == 0)
141 max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
142 max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
149 inline __device__
unsigned int merge_subwarp_symbolic(
unsigned int row_B_start,
unsigned int row_B_end,
unsigned int const *B_col_indices,
unsigned int B_size2,
unsigned int subwarpsize)
151 unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
153 unsigned int num_nnz = 0;
157 unsigned int min_index = current_front_index;
158 for (
unsigned int i = subwarpsize/2; i >= 1; i /= 2)
159 min_index =
min(min_index, __shfl_xor((
int)min_index, (
int)i));
161 if (min_index == B_size2)
165 current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
166 : current_front_index;
173 inline __device__
unsigned int merge_subwarp_symbolic_double(
unsigned int row_B_start,
unsigned int row_B_end,
unsigned int const *B_col_indices,
unsigned int B_size2,
174 unsigned int *output_array,
unsigned int id_in_warp,
unsigned int subwarpsize)
176 unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
178 unsigned int num_nnz = 0;
179 unsigned int index_buffer = 0;
180 unsigned int buffer_size = 0;
184 unsigned int min_index = current_front_index;
185 for (
unsigned int i = subwarpsize/2; i >= 1; i /= 2)
186 min_index =
min(min_index, __shfl_xor((
int)min_index, (
int)i));
188 if (min_index == B_size2)
192 current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
193 : current_front_index;
196 index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
199 if (buffer_size == subwarpsize)
201 output_array[id_in_warp] = index_buffer;
202 output_array += subwarpsize;
210 if (id_in_warp < buffer_size)
211 output_array[id_in_warp] = index_buffer;
216 template<
typename IndexT>
218 const IndexT * A_row_indices,
219 const IndexT * A_col_indices,
221 const IndexT * B_row_indices,
222 const IndexT * B_col_indices,
224 IndexT * C_row_indices,
225 unsigned int *subwarpsize_array,
226 unsigned int *max_row_size_A,
227 unsigned int *max_row_size_B,
228 unsigned int *scratchpad_offsets,
229 unsigned int *scratchpad_indices)
231 unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
233 unsigned int num_warps = blockDim.x / subwarpsize;
234 unsigned int warp_id = threadIdx.x / subwarpsize;
235 unsigned int id_in_warp = threadIdx.x % subwarpsize;
237 unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize;
238 unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
239 unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
241 unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
242 unsigned int row_per_group_end =
min(A_size1, rows_per_group * (blockIdx.x + 1));
244 for (
unsigned int row = rows_per_group * blockIdx.x + warp_id;
row < row_per_group_end;
row += num_warps)
246 unsigned int row_A_start = A_row_indices[
row];
247 unsigned int row_A_end = A_row_indices[
row+1];
249 if (row_A_end - row_A_start > subwarpsize)
251 unsigned int final_merge_start = 0;
252 unsigned int final_merge_end = 0;
255 unsigned int *subwarp_scratchpad = subwarp_scratchpad_start;
256 unsigned int iter = 0;
257 while (row_A_end > row_A_start)
259 unsigned int my_row_B = row_A_start + id_in_warp;
260 unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
261 unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
262 unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
265 subwarp_scratchpad, id_in_warp, subwarpsize);
267 final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start;
268 final_merge_end = (iter == id_in_warp) ? final_merge_start + nnz_in_merge : final_merge_end;
271 row_A_start += subwarpsize;
272 subwarp_scratchpad += scratchpad_rowlength;
276 unsigned int num_nnz =
merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize);
279 C_row_indices[
row] = num_nnz;
284 unsigned int my_row_B = row_A_start + id_in_warp;
285 unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
286 unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
287 unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
289 unsigned int num_nnz =
merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize);
292 C_row_indices[
row] = num_nnz;
302 template<
typename NumericT>
304 unsigned int input_start,
unsigned int input_end,
const unsigned int *input_indices,
const NumericT *input_values,
unsigned int invalid_token,
305 unsigned int *output_indices,
NumericT *output_values,
306 unsigned int id_in_warp,
unsigned int subwarpsize)
308 unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
309 NumericT current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0;
311 unsigned int index_buffer = 0;
313 unsigned int buffer_size = 0;
314 unsigned int nnz_written = 0;
318 unsigned int min_index = current_front_index;
319 for (
unsigned int i = subwarpsize/2; i >= 1; i /= 2)
320 min_index =
min(min_index, __shfl_xor((
int)min_index, (int)i));
322 if (min_index == invalid_token)
326 NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0;
327 for (
unsigned int i = subwarpsize/2; i >= 1; i /= 2)
328 output_value += __shfl_xor((
int)output_value, (int)i);
331 if (current_front_index == min_index)
334 current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
335 current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0;
339 index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
340 value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
344 if (buffer_size == subwarpsize)
346 output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize;
347 output_values[id_in_warp] = value_buffer; output_values += subwarpsize;
355 if (id_in_warp < buffer_size)
357 output_indices[id_in_warp] = index_buffer;
358 output_values[id_in_warp] = value_buffer;
364 template<
typename IndexT,
typename NumericT>
366 const IndexT * A_row_indices,
367 const IndexT * A_col_indices,
370 const IndexT * B_row_indices,
371 const IndexT * B_col_indices,
374 IndexT
const * C_row_indices,
375 IndexT * C_col_indices,
377 unsigned int *subwarpsize_array,
378 unsigned int *max_row_size_A,
379 unsigned int *max_row_size_B,
380 unsigned int *scratchpad_offsets,
381 unsigned int *scratchpad_indices,
384 unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
386 unsigned int num_warps = blockDim.x / subwarpsize;
387 unsigned int warp_id = threadIdx.x / subwarpsize;
388 unsigned int id_in_warp = threadIdx.x % subwarpsize;
390 unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize;
391 unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
392 unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
394 unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
395 unsigned int row_per_group_end =
min(A_size1, rows_per_group * (blockIdx.x + 1));
397 for (
unsigned int row = rows_per_group * blockIdx.x + warp_id;
row < row_per_group_end;
row += num_warps)
399 unsigned int row_A_start = A_row_indices[
row];
400 unsigned int row_A_end = A_row_indices[
row+1];
402 if (row_A_end - row_A_start > subwarpsize)
405 unsigned int final_merge_start = 0;
406 unsigned int final_merge_end = 0;
407 unsigned int iter = 0;
408 unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift;
409 NumericT *scratchpad_values_ptr = scratchpad_values + subwarp_scratchpad_shift;
411 while (row_A_start < row_A_end)
413 unsigned int my_row_B = row_A_start + id_in_warp;
414 unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
415 unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
416 unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
417 NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
420 row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
421 scratchpad_indices_ptr, scratchpad_values_ptr,
422 id_in_warp, subwarpsize);
424 if (iter == id_in_warp)
426 final_merge_start = scratchpad_indices_ptr - scratchpad_indices;
427 final_merge_end = final_merge_start + nnz_written;
431 row_A_start += subwarpsize;
432 scratchpad_indices_ptr += scratchpad_rowlength;
433 scratchpad_values_ptr += scratchpad_rowlength;
437 unsigned int index_in_C = C_row_indices[
row];
439 final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2,
440 C_col_indices + index_in_C, C_elements + index_in_C,
441 id_in_warp, subwarpsize);
445 unsigned int my_row_B = row_A_start + id_in_warp;
446 unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
447 unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
448 unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
449 NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
451 unsigned int index_in_C = C_row_indices[
row];
454 row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
455 C_col_indices + index_in_C, C_elements + index_in_C,
456 id_in_warp, subwarpsize);
468 template<
typename IndexT>
470 const IndexT * A_row_indices,
473 IndexT *chunks_per_row)
475 for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
477 IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
478 chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1);
483 template<
typename IndexT,
typename NumericT>
485 IndexT * A2_row_indices,
486 IndexT * A2_col_indices,
489 IndexT *new_row_buffer)
491 for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
493 unsigned int index_start = new_row_buffer[i];
494 unsigned int index_stop = new_row_buffer[i+1];
496 A2_row_indices[i] = index_start;
498 for (IndexT j = index_start; j < index_stop; ++j)
500 A2_col_indices[j] = j;
506 if (threadIdx.x == 0 && blockIdx.x == 0)
507 A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
510 template<
typename IndexT,
typename NumericT>
512 IndexT * G1_row_indices,
513 IndexT * G1_col_indices,
516 IndexT
const *A_row_indices,
517 IndexT
const *A_col_indices,
522 IndexT *new_row_buffer)
525 for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
527 G1_col_indices[i] = A_col_indices[i];
528 G1_elements[i] = A_elements[i];
532 for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
534 unsigned int old_start = A_row_indices[i];
535 unsigned int new_start = new_row_buffer[i];
536 unsigned int row_chunks = new_row_buffer[i+1] - new_start;
538 for (IndexT j=0; j<row_chunks; ++j)
539 G1_row_indices[new_start + j] = old_start + j * max_per_row;
543 if (threadIdx.x == 0 && blockIdx.x == 0)
544 G1_row_indices[G1_size1] = A_row_indices[A_size1];
558 template<
class NumericT,
unsigned int AlignmentV>
565 unsigned int blocknum = 256;
566 unsigned int threadnum = 128;
572 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
579 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
580 cudaDeviceSynchronize();
584 compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
585 viennacl::cuda_arg<unsigned int>(A.
handle2()),
586 static_cast<unsigned int>(A.
size1()),
587 viennacl::cuda_arg<unsigned int>(B.
handle1()),
593 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
594 cudaDeviceSynchronize();
595 std::cout <<
"Stage 1 device: " << timer.
get() << std::endl;
600 unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.
handle());
603 unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.
handle());
606 unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.
handle());
611 unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle());
613 unsigned int max_subwarp_size = 0;
614 unsigned int A_max_nnz_per_row = 0;
615 unsigned int scratchpad_offset = 0;
617 for (std::size_t i=0; i<subwarp_sizes.
size(); ++i)
619 max_subwarp_size =
std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
620 A_max_nnz_per_row =
std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
622 scratchpad_offsets_ptr[i] = scratchpad_offset;
626 unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1;
627 unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i];
628 unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i];
629 scratchpad_offset += max_warp_reloads
630 * max_row_length_after_warp_merge
635 if (max_subwarp_size > 32)
638 unsigned int max_entries_in_G = 1024;
639 if (A_max_nnz_per_row <= 512*512)
640 max_entries_in_G = 512;
641 if (A_max_nnz_per_row <= 256*256)
642 max_entries_in_G = 256;
643 if (A_max_nnz_per_row <= 128*128)
644 max_entries_in_G = 128;
645 if (A_max_nnz_per_row <= 64*64)
646 max_entries_in_G = 64;
649 compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
650 static_cast<unsigned int>(A.
size1()),
651 static_cast<unsigned int>(max_entries_in_G),
657 thrust::device_ptr<unsigned int>(
viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()),
660 unsigned int augmented_size = exclusive_scan_helper[A.
size1()];
667 compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
668 viennacl::cuda_arg<unsigned int>(A2.handle2()),
669 viennacl::cuda_arg<NumericT>(A2.handle()),
670 static_cast<unsigned int>(A2.size1()),
676 compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
677 viennacl::cuda_arg<unsigned int>(G1.handle2()),
678 viennacl::cuda_arg<NumericT>(G1.handle()),
679 static_cast<unsigned int>(G1.size1()),
680 viennacl::cuda_arg<unsigned int>(A.
handle1()),
681 viennacl::cuda_arg<unsigned int>(A.
handle2()),
682 viennacl::cuda_arg<NumericT>(A.
handle()),
683 static_cast<unsigned int>(A.
size1()),
684 static_cast<unsigned int>(A.
nnz()),
685 static_cast<unsigned int>(max_entries_in_G),
705 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
706 std::cout <<
"Intermediate host stage: " << timer.
get() << std::endl;
714 compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
715 viennacl::cuda_arg<unsigned int>(A.
handle2()),
716 static_cast<unsigned int>(A.
size1()),
717 viennacl::cuda_arg<unsigned int>(B.
handle1()),
718 viennacl::cuda_arg<unsigned int>(B.
handle2()),
719 static_cast<unsigned int>(B.
size2()),
720 viennacl::cuda_arg<unsigned int>(C.
handle1()),
728 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
729 cudaDeviceSynchronize();
730 std::cout <<
"Stage 2: " << timer.
get() << std::endl;
738 unsigned int current_offset = 0;
739 for (std::size_t i=0; i<C.
size1(); ++i)
741 unsigned int tmp = row_buffer[i];
742 row_buffer.set(i, current_offset);
743 current_offset += tmp;
745 row_buffer.
set(C.
size1(), current_offset);
752 C.
reserve(current_offset,
false);
756 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
757 std::cout <<
"Intermediate stage 2->3: " << timer.
get() << std::endl;
761 compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
762 viennacl::cuda_arg<unsigned int>(A.
handle2()),
763 viennacl::cuda_arg<NumericT>(A.
handle()),
764 static_cast<unsigned int>(A.
size1()),
765 viennacl::cuda_arg<unsigned int>(B.
handle1()),
766 viennacl::cuda_arg<unsigned int>(B.
handle2()),
767 viennacl::cuda_arg<NumericT>(B.
handle()),
768 static_cast<unsigned int>(B.
size2()),
769 viennacl::cuda_arg<unsigned int>(C.
handle1()),
770 viennacl::cuda_arg<unsigned int>(C.
handle2()),
771 viennacl::cuda_arg<NumericT>(C.
handle()),
780 #ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
781 cudaDeviceSynchronize();
782 std::cout <<
"Stage 3: " << timer.
get() << std::endl;
783 std::cout <<
"----------" << std::endl;
const vcl_size_t & size2() const
Returns the number of columns.
Helper class implementing an array on the host. Default case: No conversion necessary.
void memory_write(mem_handle &dst_buffer, vcl_size_t dst_offset, vcl_size_t bytes_to_write, const void *ptr, bool async=false)
Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'.
const vcl_size_t & size1() const
Returns the number of rows.
__global__ void compressed_matrix_gemm_decompose_1(const IndexT *A_row_indices, IndexT A_size1, IndexT max_per_row, IndexT *chunks_per_row)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM.
void exclusive_scan(vector_base< NumericT > const &input, vector_base< NumericT > &output)
This function implements an exclusive scan using CUDA.
T max(const T &lhs, const T &rhs)
Maximum.
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
const vcl_size_t & nnz() const
Returns the number of nonzero entries.
__global__ void compressed_matrix_gemm_stage_3(const IndexT *A_row_indices, const IndexT *A_col_indices, const NumericT *A_elements, IndexT A_size1, const IndexT *B_row_indices, const IndexT *B_col_indices, const NumericT *B_elements, IndexT B_size2, IndexT const *C_row_indices, IndexT *C_col_indices, NumericT *C_elements, unsigned int *subwarpsize_array, unsigned int *max_row_size_A, unsigned int *max_row_size_B, unsigned int *scratchpad_offsets, unsigned int *scratchpad_indices, NumericT *scratchpad_values)
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
__device__ IndexT round_to_next_power_of_2(IndexT val)
__device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize)
__global__ void compressed_matrix_gemm_stage_1(const IndexT *A_row_indices, const IndexT *A_col_indices, IndexT A_size1, const IndexT *B_row_indices, IndexT *subwarpsize_per_group, IndexT *max_nnz_row_A_per_group, IndexT *max_nnz_row_B_per_group)
__global__ void compressed_matrix_gemm_A2(IndexT *A2_row_indices, IndexT *A2_col_indices, NumericT *A2_elements, IndexT A2_size1, IndexT *new_row_buffer)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize)
Implementations of direct triangular solvers for sparse matrices using CUDA.
void reserve(vcl_size_t new_nonzeros, bool preserve=true)
Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved...
A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling.
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
NumericT max(std::vector< NumericT > const &v1)
void set(vcl_size_t index, U value)
size_type size() const
Returns the length of the vector (cf. std::vector)
__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor, unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token, unsigned int *output_indices, NumericT *output_values, unsigned int id_in_warp, unsigned int subwarpsize)
__global__ void compressed_matrix_gemm_G1(IndexT *G1_row_indices, IndexT *G1_col_indices, NumericT *G1_elements, IndexT G1_size1, IndexT const *A_row_indices, IndexT const *A_col_indices, NumericT const *A_elements, IndexT A_size1, IndexT A_nnz, IndexT max_per_row, IndexT *new_row_buffer)
vcl_size_t raw_size() const
Returns the number of bytes of the currently active buffer.
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
Implementation of the ViennaCL scalar class.
void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve=true)
Resize the matrix.
const handle_type & handle() const
Returns the memory handle.
__global__ void compressed_matrix_gemm_stage_2(const IndexT *A_row_indices, const IndexT *A_col_indices, IndexT A_size1, const IndexT *B_row_indices, const IndexT *B_col_indices, IndexT B_size2, IndexT *C_row_indices, unsigned int *subwarpsize_array, unsigned int *max_row_size_A, unsigned int *max_row_size_B, unsigned int *scratchpad_offsets, unsigned int *scratchpad_indices)
NumericT min(std::vector< NumericT > const &v1)
void switch_memory_context(viennacl::context new_ctx)