#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#include "viennacl/forwards.h"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

//
// Compressed matrix (CSR format)
//

template<typename NumericT>
__global__ void csr_row_info_extractor_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size,
          unsigned int option)
{
  // grid-stride loop: one thread per row
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT value = 0;
    unsigned int row_end = row_indices[row+1];

    switch (option)
    {
    case 0: // inf-norm of the row
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value = max(value, fabs(elements[i]));
      break;

    case 1: // 1-norm of the row
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value += fabs(elements[i]);
      break;

    case 2: // 2-norm of the row
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
        value += elements[i] * elements[i];
      value = sqrt(value);
      break;

    case 3: // diagonal entry
      for (unsigned int i = row_indices[row]; i < row_end; ++i)
      {
        if (column_indices[i] == row)
        {
          value = elements[i];
          break;
        }
      }
      break;

    default:
      break;
    }
    result[row] = value;
  }
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                              viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                              viennacl::cuda_arg<NumericT>(mat.handle()),
                                              viennacl::cuda_arg(vec),
                                              static_cast<unsigned int>(mat.size1()),
                                              static_cast<unsigned int>(info_selector)
                                             );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
}
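// Exposition only (not part of the original header): the option codes handled by the
// kernel above presumably mirror the enumerators of
// viennacl::linalg::detail::row_info_types, i.e. 0 = row-wise inf-norm,
// 1 = row-wise 1-norm, 2 = row-wise 2-norm, 3 = diagonal entry. These per-row
// statistics feed, among others, the row-scaling and Jacobi preconditioners.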
namespace detail
{
  // Functors distinguishing the general update result = alpha * A * x + beta * result
  // from the common special case result = A * x (alpha == 1, beta == 0):

  struct spmv_alpha_beta
  {
    template<typename NumericT>
    static __device__ void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta)
    {
      result = (beta != 0) ? alpha * Ax + beta * result : alpha * Ax;
    }
  };

  struct spmv_pure
  {
    template<typename NumericT>
    static __device__ void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta)
    {
      result = Ax;
    }
  };
} // namespace detail
template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  __shared__ NumericT shared_elements[512];

  const unsigned int id_in_row       = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start     = blockIdx.x * block_increment;
  const unsigned int block_stop      = min(block_start + block_increment, size_result);

  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
                    row  < block_stop;
                    row += blockDim.x / SubWarpSizeV)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];

    // sub-warp reduction via an XOR butterfly in shared memory:
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    if (id_in_row == 0)
      AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
  }
}
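// Exposition only: the five conditional lines above form a butterfly reduction within
// each sub-warp. After step k, shared_elements[t] holds the sum over all entries whose
// index agrees with t outside the lowest k+1 bits, so after log2(SubWarpSizeV) steps
// every thread of the sub-warp holds the full row sum and the thread with
// id_in_row == 0 writes it out. The comparisons against SubWarpSizeV are compile-time
// constants, so unused steps are eliminated by the compiler. Sketch for
// SubWarpSizeV == 4 and one row holding partial sums [a0 a1 a2 a3]:
//
//   step 1 (XOR 1):  t0 = a0+a1,  t1 = a1+a0,  t2 = a2+a3,  t3 = a3+a2
//   step 2 (XOR 2):  t0 = a0+a1+a2+a3   (likewise for t1..t3)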
template<typename AlphaBetaHandlerT, typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,
          const NumericT * elements,
          unsigned int num_blocks,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream: several short rows per block
    {
      // load the products for the whole row block into the shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // sum the products using one thread per row:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
      }
    }
    else  // CSR vector: a single long row processed by the whole block
    {
      // accumulate a partial sum per thread:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // tree reduction over the thread block:
      for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
      }

      if (threadIdx.x == 0)
        AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
    }

    __syncthreads();
  }
}
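// Exposition only: this is an adaptive CSR kernel in the spirit of CSR-Adaptive
// (Greathouse and Daga). The host-side row_blocks array (mat.handle3()) partitions
// the rows so that the nonzeros of each row block fit into the 1024-entry shared
// buffer. A block covering many short rows takes the "CSR stream" branch above,
// while a block covering a single long row takes the "CSR vector" branch with a
// block-wide tree reduction.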
/** @brief Carries out the sparse-matrix-vector product result = alpha * mat * vec + beta * result for a CSR matrix */
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  static bool first      = true;
  static bool is_maxwell = false;

  // Query the device generation only once, since the query is fairly expensive:
  if (first)
  {
    cudaDeviceProp prop;
    int device_index = 0;

    cudaError_t err_flag = cudaGetDevice(&device_index);
    if (err_flag == cudaSuccess)
    {
      err_flag = cudaGetDeviceProperties(&prop, device_index);
      if (err_flag == cudaSuccess && prop.major >= 5)
        is_maxwell = true;
    }
    first = false;
  }

  if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4)  // enough nonzeros per row for sub-warps of size 8
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
      compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    else
      compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0)  // enough nonzeros per row for sub-warps of size 16
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
      compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    else
      compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else  // mostly short rows: use the adaptive kernel
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<unsigned int>(mat.handle3()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  static_cast<unsigned int>(mat.blocks1()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    else
      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(
                  viennacl::cuda_arg<unsigned int>(mat.handle1()),
                  viennacl::cuda_arg<unsigned int>(mat.handle2()),
                  viennacl::cuda_arg<unsigned int>(mat.handle3()),
                  viennacl::cuda_arg<NumericT>(mat.handle()),
                  static_cast<unsigned int>(mat.blocks1()),
                  viennacl::cuda_arg(vec),
                  static_cast<unsigned int>(vec.start()), static_cast<unsigned int>(vec.stride()),
                  alpha,
                  viennacl::cuda_arg(result),
                  static_cast<unsigned int>(result.start()), static_cast<unsigned int>(result.stride()),
                  static_cast<unsigned int>(result.size()),
                  beta);
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
  }
}
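// Exposition only: a minimal host-side usage sketch for the CSR SpMV path above,
// assuming a CUDA-enabled ViennaCL build. viennacl::linalg::prod() creates an
// expression that is ultimately evaluated by the prod_impl() overload in this file:
//
//   viennacl::compressed_matrix<float> A(m, n);   // filled e.g. via viennacl::copy()
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);             // y = A * x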
namespace detail
{
  // Helper structs computing the linear buffer index of entry (i, j) of a possibly
  // padded and strided row-major or column-major dense matrix:

  template<typename LayoutT>
  struct mat_mult_matrix_index
  {
    static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                         unsigned int row_start, unsigned int row_inc,
                                         unsigned int col_start, unsigned int col_inc,
                                         unsigned int internal_rows, unsigned int internal_cols)
    {
      return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
    }
  };

  template<>
  struct mat_mult_matrix_index<viennacl::column_major>
  {
    static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                         unsigned int row_start, unsigned int row_inc,
                                         unsigned int col_start, unsigned int col_inc,
                                         unsigned int internal_rows, unsigned int internal_cols)
    {
      return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
    }
  };
} // namespace detail
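// Exposition only: with internal_cols = 8 (padded), zero starts and unit increments,
// the row-major helper maps entry (2, 3) to 2 * 8 + 3 = 19, while the column-major
// helper with internal_rows = 8 maps it to 2 + 3 * 8 = 26. The internal_* sizes refer
// to the padded buffer dimensions, not the logical matrix dimensions.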
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start, unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start, unsigned int result_col_start,
          unsigned int result_row_inc,   unsigned int result_col_inc,
          unsigned int result_row_size,  unsigned int result_col_size,
          unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  // one block per sparse row, threads striding over the columns of the dense factor:
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}
/** @brief Carries out the product of a CSR matrix with a dense matrix; dispatches on the storage layouts of the dense operands */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                       detail::mat_mult_matrix_index<viennacl::row_major>,
                                       NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                       detail::mat_mult_matrix_index<viennacl::column_major>,
                                       NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                       detail::mat_mult_matrix_index<viennacl::row_major>,
                                       NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                       detail::mat_mult_matrix_index<viennacl::column_major>,
                                       NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
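// Exposition only: a minimal usage sketch for the sparse-times-dense path above,
// assuming a CUDA-enabled build. The generic prod() interface dispatches here:
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);   // C = A * B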
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start, unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start, unsigned int result_col_start,
          unsigned int result_row_inc,   unsigned int result_col_inc,
          unsigned int result_row_size,  unsigned int result_col_size,
          unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        // transposed access: entry (j, col) of trans(d_mat) is entry (col, j) of d_mat
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }

      result [ ResultIndexT::apply(row, col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols) ] = r;
    }
  }
}
/** @brief Carries out the product of a CSR matrix with a transposed dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                                 const viennacl::matrix_base<NumericT>,
                                                 viennacl::op_trans> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                          detail::mat_mult_matrix_index<viennacl::row_major>,
                                          NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                          detail::mat_mult_matrix_index<viennacl::column_major>,
                                          NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                          detail::mat_mult_matrix_index<viennacl::row_major>,
                                          NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                          detail::mat_mult_matrix_index<viennacl::column_major>,
                                          NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}
template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}
/** @brief Inplace solve of a unit lower triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(SparseMatrixT const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                         viennacl::cuda_arg<NumericT>(mat.handle()),
                                         viennacl::cuda_arg(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}

/** @brief Inplace solve of a lower triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(SparseMatrixT const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                    viennacl::cuda_arg<NumericT>(mat.handle()),
                                    viennacl::cuda_arg(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}

/** @brief Inplace solve of a unit upper triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(SparseMatrixT const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}

/** @brief Inplace solve of an upper triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(SparseMatrixT const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                     viennacl::cuda_arg(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
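// Exposition only: the four solver launches above use a single thread block
// (<<<1, 128>>>) because each row of a triangular solve depends on the rows solved
// before it, and synchronization is only possible within one block. A typical call
// from user code goes through the generic interface:
//
//   viennacl::compressed_matrix<double> A(n, n);
//   viennacl::vector<double> rhs(n);
//   viennacl::linalg::inplace_solve(A, rhs, viennacl::linalg::lower_tag());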
/** @brief Inplace solve of a transposed unit lower triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                               viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                               viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                               viennacl::cuda_arg(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}

/** @brief Inplace solve of a transposed lower triangular CSR system (the diagonal is extracted first) */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                          viennacl::cuda_arg(diagonal),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}
/** @brief Inplace solve of a transposed unit upper triangular CSR system */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}

/** @brief Inplace solve of a transposed upper triangular CSR system (the diagonal is extracted first) */
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                           viennacl::cuda_arg(diagonal),
                                           viennacl::cuda_arg(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}
/** @brief Block substitution for a transposed unit lower triangular factor (one CUDA block per ILU block) */
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                 const compressed_matrix<NumericT, AlignmentV>,
                                                 op_trans> & L,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<NumericT> const & /* L_diagonal */,  // ignored for unit triangular factors
                         vector_base<NumericT> & vec,
                         viennacl::linalg::unit_lower_tag)
{
  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
                                                       viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
                                                       viennacl::cuda_arg<NumericT>(L.lhs().handle()),
                                                       viennacl::cuda_arg<unsigned int>(block_indices),
                                                       viennacl::cuda_arg(vec),
                                                       static_cast<unsigned int>(L.lhs().size1())
                                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_unit_lu_forward");
}

/** @brief Block substitution for a transposed upper triangular factor (one CUDA block per ILU block) */
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                 const compressed_matrix<NumericT, AlignmentV>,
                                                 op_trans> & U,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<NumericT> const & U_diagonal,
                         vector_base<NumericT> & vec,
                         viennacl::linalg::upper_tag)
{
  csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
                                                   viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
                                                   viennacl::cuda_arg<NumericT>(U.lhs().handle()),
                                                   viennacl::cuda_arg(U_diagonal),
                                                   viennacl::cuda_arg<unsigned int>(block_indices),
                                                   viennacl::cuda_arg(vec),
                                                   static_cast<unsigned int>(U.lhs().size1())
                                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_lu_backward");
}
//
// Compressed compressed matrix (CSR structure over the nonzero rows only)
//

template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];

    unsigned int index = row_indices[i] * inc_result + start_result;
    if (beta != 0) result[index] += alpha * dot_prod;
    else           result[index]  = alpha * dot_prod;
  }
}
/** @brief Carries out result = alpha * mat * vec + beta * result for a compressed_compressed_matrix */
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (beta < 0 || beta > 0)  // beta != 0, written this way to avoid floating-point comparison warnings
    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
  else
    result.clear();

  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            alpha,
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size()),
                                                            beta
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
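// Exposition only: compressed_compressed_matrix stores a CSR structure for the
// nonzero rows only (mat.nnz1() of them), with handle3() holding the global row index
// of each stored row. Because the kernel touches only those rows, the host code above
// first scales the whole result vector by beta (or clears it), and the kernel then
// applies alpha * <row, x> at the scattered row positions.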
//
// Coordinate matrix (COO format)
//

template<typename NumericT>
__global__ void coo_row_info_extractor(const unsigned int * coords, // (row_index, column_index) pairs
                                       const NumericT * elements,
                                       const unsigned int * group_boundaries,
                                       NumericT * result,
                                       unsigned int option)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int last_index  = blockDim.x - 1;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

    // merge the carry from the previous loop iteration:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[last_index])  // row continues into this buffer
      {
        switch (option)
        {
        case 0: // inf-norm
        case 3: // diagonal entry
          val = max(val, fabs(inter_results[last_index]));
          break;
        case 1: // 1-norm
          val = fabs(val) + inter_results[last_index];
          break;
        case 2: // 2-norm
          val = sqrt(val * val + inter_results[last_index]);
          break;
        default:
          break;
        }
      }
      else  // previous row is complete; write it out
      {
        switch (option)
        {
        case 0: // inf-norm
        case 1: // 1-norm
        case 3: // diagonal entry
          result[shared_rows[last_index]] = inter_results[last_index];
          break;
        case 2: // 2-norm
          result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
        default:
          break;
        }
      }
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    switch (option)
    {
    case 0:
    case 3:
      inter_results[threadIdx.x] = val;
      break;
    case 1:
      inter_results[threadIdx.x] = fabs(val);
      break;
    case 2:
      inter_results[threadIdx.x] = val * val;
    default:
      break;
    }
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      switch (option)
      {
      case 0: // inf-norm
      case 3: // diagonal entry
        inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
        break;
      case 1: // 1-norm
      case 2: // 2-norm
        inter_results[threadIdx.x] += left;
        break;
      default:
        break;
      }
      __syncthreads();
    }
    // segmented parallel reduction end

    if (threadIdx.x != last_index &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
        inter_results[threadIdx.x] != 0)
    {
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
    result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                      viennacl::cuda_arg<NumericT>(mat.handle()),
                                      viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                      viennacl::cuda_arg(vec),
                                      static_cast<unsigned int>(info_selector)
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
}
template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, // (row_index, column_index) pairs
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT alpha,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 NumericT beta)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    // merge the carry from the previous loop iteration:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else if (beta != 0)
        result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = alpha * inter_results[blockDim.x-1];
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    // segmented parallel reduction end

    if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
      else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end)
  {
    if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
    else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
  }
}
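// Exposition only: the COO kernel above is a segmented reduction. handle3() supplies
// group boundaries that split the nonzeros evenly across thread blocks; within a
// block, products sharing a row index are summed by the doubling-stride scan over
// shared_rows/inter_results, rows completed inside the shared buffer are written out
// directly, and a row crossing the buffer boundary is carried into the next iteration
// of the k loop (the carry branch executed by threadIdx.x == 0).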
/** @brief Carries out result = alpha * mat * vec + beta * result for a coordinate_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (beta < 0 || beta > 0)  // beta != 0
    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
  else
    result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                alpha,
                                                viennacl::cuda_arg(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride()),
                                                beta
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, // (row_index, column_index) pairs
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start, unsigned int result_col_start,
                                                   unsigned int result_row_inc,   unsigned int result_col_inc,
                                                   unsigned int result_row_size,  unsigned int result_col_size,
                                                   unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      // merge the carry from the previous loop iteration:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      // segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      // segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } // for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  } // for result_col
}
/** @brief Carries out the product of a coordinate_matrix with a dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                       detail::mat_mult_matrix_index<viennacl::row_major>,
                                       NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                       detail::mat_mult_matrix_index<viennacl::column_major>,
                                       NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                       detail::mat_mult_matrix_index<viennacl::row_major>,
                                       NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                       detail::mat_mult_matrix_index<viennacl::column_major>,
                                       NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, // (row_index, column_index) pairs
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start, unsigned int result_col_start,
                                                      unsigned int result_row_inc,   unsigned int result_col_inc,
                                                      unsigned int result_row_size,  unsigned int result_col_size,
                                                      unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      // transposed access to the dense factor:
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      // merge the carry from the previous loop iteration:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      // segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      // segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } // for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  } // for result_col
}
/** @brief Carries out the product of a coordinate_matrix with a transposed dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                                 const viennacl::matrix_base<NumericT>,
                                                 viennacl::op_trans> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                          detail::mat_mult_matrix_index<viennacl::row_major>,
                                          NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                          detail::mat_mult_matrix_index<viennacl::column_major>,
                                          NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                          detail::mat_mult_matrix_index<viennacl::row_major>,
                                          NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                          detail::mat_mult_matrix_index<viennacl::column_major>,
                                          NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
}
//
// ELL matrix (ELLPACK format)
//

template<typename AlphaBetaHandlerT, typename NumericT>
__global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
                                          const NumericT * elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT alpha,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          NumericT beta,
                                          unsigned int row_num,
                                          unsigned int col_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];
      if (val != NumericT(0))  // skip zero padding
      {
        int col = coords[offset];
        sum += x[col * inc_x + start_x] * val;
      }
    }

    AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
  }
}
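// Exposition only: ELL stores a dense items_per_row x rows slab in column-major
// order, padded with zeros, so entry k of row r sits at offset r + k * internal_row_num
// and consecutive threads (consecutive rows) read consecutive memory addresses.
// For example, the rows {a b | c | d e} with items_per_row = 2 and
// internal_row_num = 4 are stored as elements = [a c d 0, b 0 e 0] with matching
// column indices in coords; the padding is skipped by the val != 0 test above.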
/** @brief Carries out result = alpha * mat * vec + beta * result for an ell_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
    ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                                                     viennacl::cuda_arg(vec),
                                                                     static_cast<unsigned int>(vec.start()),
                                                                     static_cast<unsigned int>(vec.stride()),
                                                                     alpha,
                                                                     viennacl::cuda_arg(result),
                                                                     static_cast<unsigned int>(result.start()),
                                                                     static_cast<unsigned int>(result.stride()),
                                                                     beta,
                                                                     static_cast<unsigned int>(mat.size1()),
                                                                     static_cast<unsigned int>(mat.size2()),
                                                                     static_cast<unsigned int>(mat.internal_size1()),
                                                                     static_cast<unsigned int>(mat.maxnnz()),
                                                                     static_cast<unsigned int>(mat.internal_maxnnz()));
  else
    ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                               viennacl::cuda_arg<NumericT>(mat.handle()),
                                                               viennacl::cuda_arg(vec),
                                                               static_cast<unsigned int>(vec.start()),
                                                               static_cast<unsigned int>(vec.stride()),
                                                               alpha,
                                                               viennacl::cuda_arg(result),
                                                               static_cast<unsigned int>(result.start()),
                                                               static_cast<unsigned int>(result.stride()),
                                                               beta,
                                                               static_cast<unsigned int>(mat.size1()),
                                                               static_cast<unsigned int>(mat.size2()),
                                                               static_cast<unsigned int>(mat.internal_size1()),
                                                               static_cast<unsigned int>(mat.maxnnz()),
                                                               static_cast<unsigned int>(mat.internal_maxnnz()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                            const NumericT * sp_mat_elements,
                                            unsigned int sp_mat_row_num,
                                            unsigned int sp_mat_col_num,
                                            unsigned int sp_mat_internal_row_num,
                                            unsigned int sp_mat_items_per_row,
                                            unsigned int sp_mat_aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start, unsigned int result_col_start,
                                            unsigned int result_row_inc,   unsigned int result_col_inc,
                                            unsigned int result_row_size,  unsigned int result_col_size,
                                            unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // one thread per entry of the result matrix:
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = sp_mat_elements[offset];

      if (x != (NumericT)0)  // skip zero padding
      {
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out the product of an ell_matrix with a dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                detail::mat_mult_matrix_index<viennacl::row_major>,
                                NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                detail::mat_mult_matrix_index<viennacl::column_major>,
                                NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                detail::mat_mult_matrix_index<viennacl::row_major>,
                                NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
  else
  {
    ell_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                detail::mat_mult_matrix_index<viennacl::column_major>,
                                NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                               const NumericT * sp_mat_elements,
                                               unsigned int sp_mat_row_num,
                                               unsigned int sp_mat_col_num,
                                               unsigned int sp_mat_internal_row_num,
                                               unsigned int sp_mat_items_per_row,
                                               unsigned int sp_mat_aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start, unsigned int result_col_start,
                                               unsigned int result_row_inc,   unsigned int result_col_inc,
                                               unsigned int result_row_size,  unsigned int result_col_size,
                                               unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // one thread per entry of the result matrix (columns of the transposed dense factor are its rows):
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = sp_mat_elements[offset];

      if (x != (NumericT)0)  // skip zero padding
      {
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out the product of an ell_matrix with a transposed dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                                 const viennacl::matrix_base<NumericT>,
                                                 viennacl::op_trans> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                   detail::mat_mult_matrix_index<viennacl::row_major>,
                                   NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                   detail::mat_mult_matrix_index<viennacl::column_major>,
                                   NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                   detail::mat_mult_matrix_index<viennacl::row_major>,
                                   NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    ell_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                   detail::mat_mult_matrix_index<viennacl::column_major>,
                                   NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz()),
       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
  }
}
//
// Sliced ELL matrix
//

template<typename AlphaBetaHandlerT, typename NumericT>
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
                                                 const unsigned int * column_indices,
                                                 const unsigned int * block_start,
                                                 const NumericT * elements,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 unsigned int size_x,
                                                 NumericT alpha,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result,
                                                 NumericT beta,
                                                 unsigned int block_size)
{
  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block            = threadIdx.x % block_size;
  unsigned int num_blocks             = (size_result - 1) / block_size + 1;
  unsigned int global_warp_count      = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id         = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
  {
    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * block_size + id_in_block;
      NumericT val = elements[index];

      sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
    }

    if (row < size_result)
      AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, sum, beta);
  }
}
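// Exposition only: sliced ELL (in the spirit of SELL-C-sigma) pads each slice of
// block_size rows only to the length of the longest row within that slice instead of
// the global maximum. block_start gives the offset of each slice, columns_per_block
// its padded width, and entry k of a row sits at offset + k * block_size + id_in_block,
// again yielding contiguous loads for consecutive threads of a slice.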
/** @brief Carries out result = alpha * mat * vec + beta * result for a sliced_ell_matrix */
template<typename NumericT, typename IndexT>
void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
    sliced_ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                                            viennacl::cuda_arg(vec),
                                                                            static_cast<unsigned int>(vec.start()),
                                                                            static_cast<unsigned int>(vec.stride()),
                                                                            static_cast<unsigned int>(vec.size()),
                                                                            alpha,
                                                                            viennacl::cuda_arg(result),
                                                                            static_cast<unsigned int>(result.start()),
                                                                            static_cast<unsigned int>(result.stride()),
                                                                            static_cast<unsigned int>(result.size()),
                                                                            beta,
                                                                            static_cast<unsigned int>(mat.rows_per_block()));
  else
    sliced_ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                                      viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                                      viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                                      viennacl::cuda_arg<NumericT>(mat.handle()),
                                                                      viennacl::cuda_arg(vec),
                                                                      static_cast<unsigned int>(vec.start()),
                                                                      static_cast<unsigned int>(vec.stride()),
                                                                      static_cast<unsigned int>(vec.size()),
                                                                      alpha,
                                                                      viennacl::cuda_arg(result),
                                                                      static_cast<unsigned int>(result.start()),
                                                                      static_cast<unsigned int>(result.stride()),
                                                                      static_cast<unsigned int>(result.size()),
                                                                      beta,
                                                                      static_cast<unsigned int>(mat.rows_per_block()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
}
//
// Hybrid matrix (ELL + CSR overflow)
//

template<typename AlphaBetaHandlerT, typename NumericT>
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
                                          const NumericT * ell_elements,
                                          const unsigned int * csr_rows,
                                          const unsigned int * csr_cols,
                                          const NumericT * csr_elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT alpha,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          NumericT beta,
                                          unsigned int row_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    // ELL part:
    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];
      if (val != NumericT(0))  // skip zero padding
      {
        int col = ell_coords[offset];
        sum += (x[col * inc_x + start_x] * val);
      }
    }

    // CSR overflow part:
    unsigned int col_begin = csr_rows[row_id];
    unsigned int col_end   = csr_rows[row_id + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];

    AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
  }
}
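// Exposition only: the HYB format splits each row into an ELL part holding the first
// items_per_row entries and a CSR remainder for rows with more nonzeros. The kernel
// above therefore accumulates the regular ELL portion first and then walks the
// per-row CSR overflow lists (csr_rows/csr_cols/csr_elements). This keeps the ELL
// slab narrow (good coalescing) while still supporting a few unusually long rows.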
/** @brief Carries out result = alpha * mat * vec + beta * result for a hyb_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
    hyb_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                                                     viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                                     viennacl::cuda_arg<unsigned int>(mat.handle4()),
                                                                     viennacl::cuda_arg<NumericT>(mat.handle5()),
                                                                     viennacl::cuda_arg(vec),
                                                                     static_cast<unsigned int>(vec.start()),
                                                                     static_cast<unsigned int>(vec.stride()),
                                                                     alpha,
                                                                     viennacl::cuda_arg(result),
                                                                     static_cast<unsigned int>(result.start()),
                                                                     static_cast<unsigned int>(result.stride()),
                                                                     beta,
                                                                     static_cast<unsigned int>(mat.size1()),
                                                                     static_cast<unsigned int>(mat.internal_size1()),
                                                                     static_cast<unsigned int>(mat.ell_nnz()),
                                                                     static_cast<unsigned int>(mat.internal_ellnnz()));
  else
    hyb_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                               viennacl::cuda_arg<NumericT>(mat.handle()),
                                                               viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                               viennacl::cuda_arg<unsigned int>(mat.handle4()),
                                                               viennacl::cuda_arg<NumericT>(mat.handle5()),
                                                               viennacl::cuda_arg(vec),
                                                               static_cast<unsigned int>(vec.start()),
                                                               static_cast<unsigned int>(vec.stride()),
                                                               alpha,
                                                               viennacl::cuda_arg(result),
                                                               static_cast<unsigned int>(result.start()),
                                                               static_cast<unsigned int>(result.stride()),
                                                               beta,
                                                               static_cast<unsigned int>(mat.size1()),
                                                               static_cast<unsigned int>(mat.internal_size1()),
                                                               static_cast<unsigned int>(mat.ell_nnz()),
                                                               static_cast<unsigned int>(mat.internal_ellnnz()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
                                            const NumericT * ell_elements,
                                            const unsigned int * csr_rows,
                                            const unsigned int * csr_cols,
                                            const NumericT * csr_elements,
                                            unsigned int row_num,
                                            unsigned int internal_row_num,
                                            unsigned int items_per_row,
                                            unsigned int aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start, unsigned int result_col_start,
                                            unsigned int result_row_inc,   unsigned int result_col_inc,
                                            unsigned int result_row_size,  unsigned int result_col_size,
                                            unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part:
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];
        if (val != NumericT(0))  // skip zero padding
        {
          sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
        }
      }

      // CSR overflow part:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      {
        sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
      }

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out the product of a hyb_matrix with a dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                detail::mat_mult_matrix_index<viennacl::row_major>,
                                NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                detail::mat_mult_matrix_index<viennacl::column_major>,
                                NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                detail::mat_mult_matrix_index<viennacl::row_major>,
                                NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
  else
  {
    hyb_matrix_d_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                detail::mat_mult_matrix_index<viennacl::column_major>,
                                NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
  }
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
                                               const NumericT * ell_elements,
                                               const unsigned int * csr_rows,
                                               const unsigned int * csr_cols,
                                               const NumericT * csr_elements,
                                               unsigned int row_num,
                                               unsigned int internal_row_num,
                                               unsigned int items_per_row,
                                               unsigned int aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start, unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,   unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,  unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start, unsigned int result_col_start,
                                               unsigned int result_row_inc,   unsigned int result_col_inc,
                                               unsigned int result_row_size,  unsigned int result_col_size,
                                               unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part (transposed access to the dense factor):
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];
        if (val != NumericT(0))  // skip zero padding
        {
          sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
        }
      }

      // CSR overflow part:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      {
        sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
      }

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out the product of a hyb_matrix with a transposed dense matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                                 const viennacl::matrix_base<NumericT>,
                                                 viennacl::op_trans> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                   detail::mat_mult_matrix_index<viennacl::row_major>,
                                   NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat.lhs()),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::row_major>,
                                   detail::mat_mult_matrix_index<viennacl::column_major>,
                                   NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat.lhs()),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                   detail::mat_mult_matrix_index<viennacl::row_major>,
                                   NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat.lhs()),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    hyb_matrix_d_tr_mat_mul_kernel<detail::mat_mult_matrix_index<viennacl::column_major>,
                                   detail::mat_mult_matrix_index<viennacl::column_major>,
                                   NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz()),
      viennacl::cuda_arg(d_mat.lhs()),
      static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
      viennacl::cuda_arg(result),
      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)));
    VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
  }
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif
vcl_size_t internal_ellnnz() const
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Simple enable-if variant that uses the SFINAE pattern.
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector.
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
const handle_type & handle() const
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
void av(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
static __device__ void apply(NumericT &result, NumericT alpha, NumericT Ax, NumericT beta)
vcl_size_t internal_size1() const
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
Expression template class for representing a tree of expressions which ultimately result in a matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
const handle_type & handle4() const
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, NumericT beta)
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
static __device__ void apply(NumericT &result, NumericT alpha, NumericT Ax, NumericT beta)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
vcl_size_t size1() const
Returns the size of the result vector.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
const vcl_size_t & nnz() const
Returns the number of nonzero entries.
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
const handle_type & handle2() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
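For illustration (an assumed example, not taken from the library docs): ELLPACK pads every row to a common number of entries per row and stores them item-major, so that consecutive threads touch consecutive addresses. For the 3x3 matrix [[1,0,2],[0,3,0],[4,0,5]] with items_per_row = 2 and internal_row_num = 3:

// Entry 'item' of row 'row' sits at index item * internal_row_num + row.
unsigned int coords[6]   = {0, 1, 0,   2, 0, 2};  // column indices, item-major
double       elements[6] = {1, 3, 4,   2, 0, 5};  // row 1 padded with a zero entry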
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
A tag class representing an upper triangular matrix.
Sparse matrix class using the sliced ELLPACK format with parameters C, σ.
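Roughly, rows are grouped into slices of C consecutive rows (cf. rows_per_block() above), and each slice is padded only to the longest row it contains. A hedged per-row sketch (sliced_ell_row_dot is an assumed helper; array names borrowed from sliced_ell_matrix_vec_mul_kernel below; the slice indexing is an assumption about the layout):

template<typename NumericT>
__device__ NumericT sliced_ell_row_dot(unsigned int row, unsigned int C,
                                       const unsigned int * columns_per_block,
                                       const unsigned int * column_indices,
                                       const unsigned int * block_start,
                                       const NumericT * elements,
                                       const NumericT * x)
{
  unsigned int slice = row / C, local_row = row % C;
  NumericT sum = 0;
  for (unsigned int item = 0; item < columns_per_block[slice]; ++item)
  {
    unsigned int idx = block_start[slice] + item * C + local_row;  // item-major inside the slice
    if (elements[idx] != NumericT(0))                              // skip slice padding
      sum += elements[idx] * x[column_indices[idx]];
  }
  return sum;
}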
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta, unsigned int block_size)
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
void clear()
Resets all entries to zero. Does not change the size of the vector.
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp.
NumericT max(std::vector< NumericT > const &v1)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT alpha, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result, NumericT beta)
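The row_blocks array consumed by this kernel partitions consecutive rows into blocks whose combined nonzeros fit into the kernel's shared buffer (1024 entries in the listing above); a row larger than the buffer gets a block of its own and takes the single-row reduction path. A hypothetical host-side sketch of building such an array (build_row_blocks is an assumed helper, not the library's routine):

#include <vector>

std::vector<unsigned int> build_row_blocks(std::vector<unsigned int> const & row_indices,
                                           unsigned int shared_size = 1024)
{
  std::vector<unsigned int> row_blocks(1, 0);
  unsigned int nnz_in_block = 0;
  unsigned int num_rows = static_cast<unsigned int>(row_indices.size()) - 1;
  for (unsigned int row = 0; row < num_rows; ++row)
  {
    unsigned int nnz_in_row = row_indices[row + 1] - row_indices[row];
    if (nnz_in_block > 0 && nnz_in_block + nnz_in_row > shared_size)
    {
      row_blocks.push_back(row);  // close the current block before this row
      nnz_in_block = 0;
    }
    nnz_in_block += nnz_in_row;   // an oversized row ends up alone in its block
  }
  row_blocks.push_back(num_rows); // final block runs to the last row
  return row_blocks;
}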
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
A tag class representing a lower triangular matrix with unit diagonal.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
A sparse square matrix in compressed sparse rows format.
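For illustration (an assumed example): the 3x3 matrix [[1,0,2],[0,3,0],[4,0,5]] is held in three arrays, exposed via handle1(), handle2(), and handle() respectively:

unsigned int row_indices[4]    = {0, 2, 3, 5};   // row i spans [row_indices[i], row_indices[i+1])
unsigned int column_indices[5] = {0, 2, 1, 0, 2};
double       elements[5]       = {1, 2, 3, 4, 5};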
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
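A hedged sketch of what a last-error check like this typically expands to (helper name and exact message format are assumptions, not the library's actual definition):

#include <cuda_runtime.h>
#include <iostream>

// Assumed helper: query and clear the sticky CUDA error flag after a launch.
inline void cuda_last_error_check(const char * message, const char * file, int line)
{
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    std::cerr << file << "(" << line << "): CUDA error "
              << cudaGetErrorString(err) << " @ " << message << std::endl;
    throw "CUDA error";
  }
}
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message) cuda_last_error_check(message, __FILE__, __LINE__)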
A tag for column-major storage of a dense matrix.
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
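Typical use is to unwrap ViennaCL handles and scalars right at a kernel launch; for a scalar this yields a raw device pointer to the value. A minimal sketch (my_kernel is an assumed kernel name):

viennacl::scalar<double> s = 1.5;
double * s_ptr = viennacl::cuda_arg(s);   // raw CUDA device pointer to the scalar's value
my_kernel<<<128, 128>>>(s_ptr);           // pass like any other device pointer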
const handle_type & handle5() const
LHS & lhs() const
Get left hand side operand.
size_type start() const
Returns the offset within the buffer.
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Implementation of the ViennaCL scalar class.
Implementations of NMF operations using CUDA.
A tag class representing an upper triangular matrix with unit diagonal.
NumericT min(std::vector< NumericT > const &v1)
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices, respectively, and val is the entry value.
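For illustration (an assumed example, reusing the 3x3 matrix from the CSR sketch above): the coordinate_matrix kernels above receive the (row, column) pairs as a single interleaved coords array (layout assumed):

unsigned int coords[10]  = {0,0,  0,2,  1,1,  2,0,  2,2};  // (row, column) pairs, interleaved
double       elements[5] = {1,    2,    3,    4,    5};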
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)