ViennaCL - The Vienna Computing Library 1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices on the CPU (single-threaded or via OpenMP).
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/host_based/vector_operations.hpp"

#include "viennacl/linalg/host_based/spgemm_vector.hpp"

#include <vector>

#ifdef VIENNACL_WITH_OPENMP
#include <omp.h>
#endif
namespace viennacl
{
namespace linalg
{
namespace host_based
{
//
// Compressed matrix
//

namespace detail
{
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
    NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
    unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
    unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

    for (vcl_size_t row = 0; row < mat.size1(); ++row)
    {
      NumericT value = 0;
      unsigned int row_end = row_buffer[row+1];

      switch (info_selector)
      {
        case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
            value = std::max<NumericT>(value, std::fabs(elements[i]));
          break;

        case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
            value += std::fabs(elements[i]);
          break;

        case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
            value += elements[i] * elements[i];
          value = std::sqrt(value);
          break;

        case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
          for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          {
            if (col_buffer[i] == row)
            {
              value = elements[i];
              break;
            }
          }
          break;
      }
      result_buf[row] = value;
    }
  }
} //namespace detail

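// Background: detail::row_info() backs viennacl::linalg::detail::row_info(),
// which preconditioners such as row-scaling use to extract per-row norms or
// the diagonal. A hypothetical toy CSR input and the values computed from it:
//
//   unsigned int row_buffer[] = {0, 2, 3};          // 2x2 matrix, 3 nonzeros
//   unsigned int col_buffer[] = {0, 1, 1};
//   double       elements[]   = {4.0, -1.0, 2.0};
//   // SPARSE_ROW_NORM_1 of row 0:   |4.0| + |-1.0| = 5.0
//   // SPARSE_ROW_DIAGONAL of row 1: 2.0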

/** @brief Carries out matrix-vector multiplication with a compressed_matrix, i.e. result = alpha * mat * vec + beta * result */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
  {
    NumericT dot_prod = 0;
    vcl_size_t row_end = row_buffer[row+1];
    for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
      dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];

    if (beta < 0 || beta > 0) // beta != 0, written this way to avoid compiler warnings
    {
      vcl_size_t index = static_cast<vcl_size_t>(row) * result.stride() + result.start();
      result_buf[index] = alpha * dot_prod + beta * result_buf[index];
    }
    else
      result_buf[static_cast<vcl_size_t>(row) * result.stride() + result.start()] = alpha * dot_prod;
  }

}

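// Usage sketch: a minimal host-backend example using the public ViennaCL API;
// cpu_A and cpu_x are placeholders for caller-provided data:
//
//   viennacl::compressed_matrix<double> A(n, n);
//   viennacl::vector<double> x(n), y(n);
//   viennacl::copy(cpu_A, A);          // cpu_A: e.g. std::vector< std::map<unsigned int, double> >
//   viennacl::copy(cpu_x, x);
//   y = viennacl::linalg::prod(A, x);  // routed to the prod_impl() above with alpha = 1, beta = 0
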
/** @brief Carries out sparse-matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) for a compressed_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
      vcl_size_t row_start = sp_mat_row_buffer[row];
      vcl_size_t row_end   = sp_mat_row_buffer[row+1];
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_row(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), col);
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
        vcl_size_t row_start = sp_mat_row_buffer[row];
        vcl_size_t row_end   = sp_mat_row_buffer[row+1];
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_col(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), static_cast<vcl_size_t>(col));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }

}

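// Usage sketch (assumes caller-side setup as in the vector example above):
//
//   viennacl::compressed_matrix<double> A(m, k);
//   viennacl::matrix<double> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);  // for row-major B this parallelizes over the rows of A
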
/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) for a compressed_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
      vcl_size_t row_start = sp_mat_row_buffer[row];
      vcl_size_t row_end   = sp_mat_row_buffer[row+1];
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_row(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
        vcl_size_t row_start = sp_mat_row_buffer[row];
        vcl_size_t row_end   = sp_mat_row_buffer[row+1];
        NumericT temp = 0;
        for (vcl_size_t k = row_start; k < row_end; ++k) {
          temp += sp_mat_elements[k] * d_mat_wrapper_col(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
        }
        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }

}

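// Usage sketch: viennacl::trans(B) builds the matrix_expression consumed above,
// so B is never materialized in transposed form:
//
//   C = viennacl::linalg::prod(A, viennacl::trans(B));  // B is n x k; accessed as B(col, c)
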

/** @brief Carries out sparse-matrix times sparse-matrix multiplication for CSR matrices, i.e. C = prod(A, B) */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
{

  NumericT     const * A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());

  NumericT     const * B_elements   = detail::extract_raw_pointer<NumericT>(B.handle());
  unsigned int const * B_row_buffer = detail::extract_raw_pointer<unsigned int>(B.handle1());
  unsigned int const * B_col_buffer = detail::extract_raw_pointer<unsigned int>(B.handle2());

  C.resize(A.size1(), B.size2(), false);
  unsigned int * C_row_buffer = detail::extract_raw_pointer<unsigned int>(C.handle1());

#if defined(VIENNACL_WITH_OPENMP)
  unsigned int block_factor = 10;
  unsigned int max_threads  = omp_get_max_threads();
  long chunk_size = long(A.size1()) / long(block_factor * max_threads) + 1;
#else
  unsigned int max_threads = 1;
#endif
  std::vector<unsigned int>   max_length_row_C(max_threads);
  std::vector<unsigned int *> row_C_temp_index_buffers(max_threads);
  std::vector<NumericT *>     row_C_temp_value_buffers(max_threads);


  /*
   * Stage 1: Determine maximum length of work buffers:
   */

#if defined(VIENNACL_WITH_OPENMP)
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i=0; i<long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_upper_bound_row = 0;
    for (unsigned int j = row_start_A; j<row_end_A; ++j)
    {
      unsigned int row_B = A_col_buffer[j];

      unsigned int entries_in_row = B_row_buffer[row_B+1] - B_row_buffer[row_B];
      row_C_upper_bound_row += entries_in_row;
    }

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif

    max_length_row_C[thread_id] = std::max(max_length_row_C[thread_id], std::min(row_C_upper_bound_row, static_cast<unsigned int>(B.size2())));
  }

  // determine global maximum row length
  for (std::size_t i=1; i<max_length_row_C.size(); ++i)
    max_length_row_C[0] = std::max(max_length_row_C[0], max_length_row_C[i]);

  // allocate work vectors:
  for (unsigned int i=0; i<max_threads; ++i)
    row_C_temp_index_buffers[i] = (unsigned int *)malloc(sizeof(unsigned int)*3*max_length_row_C[0]);


  /*
   * Stage 2: Determine sparsity pattern of C
   */

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i=0; i<long(A.size1()); ++i)
  {
    unsigned int thread_id = 0;
#ifdef VIENNACL_WITH_OPENMP
    thread_id = omp_get_thread_num();
#endif
    unsigned int buffer_len = max_length_row_C[0];

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + buffer_len;
    unsigned int *row_C_vector_3 = row_C_vector_2 + buffer_len;

    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    C_row_buffer[i] = row_C_scan_symbolic_vector(row_start_A, row_end_A, A_col_buffer,
                                                 B_row_buffer, B_col_buffer, static_cast<unsigned int>(B.size2()),
                                                 row_C_vector_1, row_C_vector_2, row_C_vector_3);
  }

  // exclusive scan to obtain row start indices:
  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = C_row_buffer[i];
    C_row_buffer[i] = current_offset;
    current_offset += tmp;
  }
  C_row_buffer[C.size1()] = current_offset;
  C.reserve(current_offset, false);

  // allocate work vectors:
  for (unsigned int i=0; i<max_threads; ++i)
    row_C_temp_value_buffers[i] = (NumericT *)malloc(sizeof(NumericT)*3*max_length_row_C[0]);

  /*
   * Stage 3: Compute product (code similar, maybe pull out into a separate function to avoid code duplication?)
   */
  NumericT     * C_elements   = detail::extract_raw_pointer<NumericT>(C.handle());
  unsigned int * C_col_buffer = detail::extract_raw_pointer<unsigned int>(C.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i = 0; i < long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_buffer_start = C_row_buffer[i];
    unsigned int row_C_buffer_end   = C_row_buffer[i+1];

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + max_length_row_C[0];
    unsigned int *row_C_vector_3 = row_C_vector_2 + max_length_row_C[0];

    NumericT *row_C_vector_1_values = row_C_temp_value_buffers[thread_id];
    NumericT *row_C_vector_2_values = row_C_vector_1_values + max_length_row_C[0];
    NumericT *row_C_vector_3_values = row_C_vector_2_values + max_length_row_C[0];

    row_C_scan_numeric_vector(row_start_A, row_end_A, A_col_buffer, A_elements,
                              B_row_buffer, B_col_buffer, B_elements, static_cast<unsigned int>(B.size2()),
                              row_C_buffer_start, row_C_buffer_end, C_col_buffer, C_elements,
                              row_C_vector_1, row_C_vector_1_values,
                              row_C_vector_2, row_C_vector_2_values,
                              row_C_vector_3, row_C_vector_3_values);
  }

  // clean up at the end:
  for (unsigned int i=0; i<max_threads; ++i)
  {
    free(row_C_temp_index_buffers[i]);
    free(row_C_temp_value_buffers[i]);
  }

}

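// Usage sketch. The routine above is a Gustavson-style row-by-row SpGEMM:
// stage 1 bounds the per-thread work-buffer size, stage 2 computes the
// sparsity pattern of C (symbolic phase), stage 3 fills in the values
// (numeric phase):
//
//   viennacl::compressed_matrix<double> A(m, k), B(k, n), C;
//   C = viennacl::linalg::prod(A, B);  // resizes C to m x n and runs the three stages above
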



//
// Triangular solve for compressed_matrix, A \ b
//
namespace detail
{
  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::unit_lower_tag)
  {
    vcl_size_t row_begin = row_buffer[1];
    for (vcl_size_t row = 1; row < num_cols; ++row)
    {
      NumericT vec_entry = vec_buffer[row];
      vcl_size_t row_end = row_buffer[row+1];
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index < row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
      }
      vec_buffer[row] = vec_entry;
      row_begin = row_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::lower_tag)
  {
    vcl_size_t row_begin = row_buffer[0];
    for (vcl_size_t row = 0; row < num_cols; ++row)
    {
      NumericT vec_entry = vec_buffer[row];

      // substitute and remember diagonal entry
      vcl_size_t row_end = row_buffer[row+1];
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index < row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
        else if (col_index == row)
          diagonal_entry = element_buffer[i];
      }

      vec_buffer[row] = vec_entry / diagonal_entry;
      row_begin = row_end;
    }
  }


  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::unit_upper_tag)
  {
    for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
    {
      vcl_size_t row = (num_cols - row2) - 1;
      NumericT vec_entry = vec_buffer[row];
      vcl_size_t row_begin = row_buffer[row];
      vcl_size_t row_end   = row_buffer[row+1];
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index > row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
      }
      vec_buffer[row] = vec_entry;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_inplace_solve(IndexArrayT const & row_buffer,
                         IndexArrayT const & col_buffer,
                         ConstScalarArrayT const & element_buffer,
                         ScalarArrayT & vec_buffer,
                         vcl_size_t num_cols,
                         viennacl::linalg::upper_tag)
  {
    for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
    {
      vcl_size_t row = (num_cols - row2) - 1;
      NumericT vec_entry = vec_buffer[row];

      // substitute and remember diagonal entry
      vcl_size_t row_begin = row_buffer[row];
      vcl_size_t row_end   = row_buffer[row+1];
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = row_begin; i < row_end; ++i)
      {
        vcl_size_t col_index = col_buffer[i];
        if (col_index > row)
          vec_entry -= vec_buffer[col_index] * element_buffer[i];
        else if (col_index == row)
          diagonal_entry = element_buffer[i];
      }

      vec_buffer[row] = vec_entry / diagonal_entry;
    }
  }

} //namespace detail


/** @brief Inplace solution of L x = b, where L is lower triangular with unit diagonal (compressed_matrix). Typically used for LU substitutions. */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}

/** @brief Inplace solution of L x = b, where L is lower triangular (compressed_matrix). Typically used for LU substitutions. */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}


/** @brief Inplace solution of U x = b, where U is upper triangular with unit diagonal (compressed_matrix). Typically used for LU substitutions. */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}

/** @brief Inplace solution of U x = b, where U is upper triangular (compressed_matrix). Typically used for LU substitutions. */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}

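// Usage sketch: the tag argument selects the variant at compile time;
// unit_lower_tag/lower_tag run forward substitution, unit_upper_tag/upper_tag
// run backward substitution:
//
//   viennacl::compressed_matrix<double> A(n, n);  // the respective triangular part is used
//   viennacl::vector<double> b(n);
//   viennacl::linalg::inplace_solve(A, b, viennacl::linalg::lower_tag());  // b <- A^{-1} b
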




//
// Triangular solve for compressed_matrix, A^T \ b
//

namespace detail
{
  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::unit_lower_tag)
  {
    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < num_cols; ++col)
    {
      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_end = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        unsigned int row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::lower_tag)
  {
    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < num_cols; ++col)
    {
      vcl_size_t col_end = row_buffer[col+1];

      // Stage 1: Find diagonal entry:
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index == col)
        {
          diagonal_entry = element_buffer[i];
          break;
        }
      }

      // Stage 2: Substitute
      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::unit_upper_tag)
  {
    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
    {
      vcl_size_t col = (num_cols - col2) - 1;

      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }

    }
  }

  template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
  void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                               IndexArrayT const & col_buffer,
                               ConstScalarArrayT const & element_buffer,
                               ScalarArrayT & vec_buffer,
                               vcl_size_t num_cols,
                               viennacl::linalg::upper_tag)
  {
    for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
    {
      vcl_size_t col = (num_cols - col2) - 1;
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];

      // Stage 1: Find diagonal entry:
      NumericT diagonal_entry = 0;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index == col)
        {
          diagonal_entry = element_buffer[i];
          break;
        }
      }

      // Stage 2: Substitute
      NumericT vec_entry = vec_buffer[col] / diagonal_entry;
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * element_buffer[i];
      }
    }
  }


  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
    {
      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_end = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        unsigned int row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
      col_begin = col_end;
    }
  }

  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & L_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::lower_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
    unsigned int const * col_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
    NumericT     const * elements        = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(L_diagonal.handle());
    NumericT           * vec_buffer      = detail::extract_raw_pointer<NumericT>(vec.handle());

    vcl_size_t col_begin = row_buffer[0];
    for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
    {
      vcl_size_t col_end = row_buffer[col+1];

      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index > col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
      col_begin = col_end;
    }
  }



  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & /* U_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_upper_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
    NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
    NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
    {
      vcl_size_t col = (U.lhs().size1() - col2) - 1;

      NumericT vec_entry = vec_buffer[col];
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }

    }
  }

  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.

    unsigned int const * row_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
    unsigned int const * col_buffer      = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
    NumericT     const * elements        = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
    NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(U_diagonal.handle());
    NumericT           * vec_buffer      = detail::extract_raw_pointer<NumericT>(vec.handle());

    for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
    {
      vcl_size_t col = (U.lhs().size1() - col2) - 1;
      vcl_size_t col_begin = row_buffer[col];
      vcl_size_t col_end   = row_buffer[col+1];

      // Substitute (diagonal comes from U_diagonal, so no search needed)
      NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
      vec_buffer[col] = vec_entry;
      for (vcl_size_t i = col_begin; i < col_end; ++i)
      {
        vcl_size_t row_index = col_buffer[i];
        if (row_index < col)
          vec_buffer[row_index] -= vec_entry * elements[i];
      }
    }
  }


} //namespace detail

/** @brief Inplace solution of trans(L) x = b, where the transposed system is lower triangular with unit diagonal */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

/** @brief Inplace solution of trans(L) x = b, where the transposed system is lower triangular */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::lower_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}


/** @brief Inplace solution of trans(U) x = b, where the transposed system is upper triangular with unit diagonal */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::unit_upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}


/** @brief Inplace solution of trans(U) x = b, where the transposed system is upper triangular */
template<typename NumericT, unsigned int AlignmentV>
void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
                                      const compressed_matrix<NumericT, AlignmentV>,
                                      op_trans> const & proxy,
                   vector_base<NumericT> & vec,
                   viennacl::linalg::upper_tag tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

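// Usage sketch: solving with the transpose reuses the CSR arrays of the
// original matrix and scatters updates column-wise instead of gathering
// row-wise:
//
//   viennacl::linalg::inplace_solve(viennacl::trans(A), b, viennacl::linalg::unit_upper_tag());
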


//
// Compressed Compressed Matrix
//

/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix, i.e. result = alpha * mat * vec + beta * result */
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  NumericT           * result_buf  = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf     = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements    = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  if (beta < 0 || beta > 0) // beta != 0
  {
    for (vcl_size_t i = 0; i < result.size(); ++i)
      result_buf[i * result.stride() + result.start()] *= beta;
  }
  else // flush
  {
    for (vcl_size_t i = 0; i < result.size(); ++i)
      result_buf[i * result.stride() + result.start()] = 0;
  }

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)
  {
    NumericT dot_prod = 0;
    vcl_size_t row_end = row_buffer[i+1];
    for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
      dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];

    if (beta > 0 || beta < 0)
      result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()] += alpha * dot_prod;
    else
      result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()] = alpha * dot_prod;
  }

}

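// Note: compressed_compressed_matrix stores only rows holding nonzeros
// (handle3() carries their row indices), so the kernel above iterates over
// nnz1() stored rows instead of size1() rows; result is scaled by beta (or
// flushed) up front because absent rows would otherwise keep stale values.
// Usage is the same convenience expression as for compressed_matrix:
//
//   y = viennacl::linalg::prod(A_ccs, x);  // A_ccs: viennacl::compressed_compressed_matrix<double>
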


//
// Coordinate Matrix
//

namespace detail
{
  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(vec.handle());
    NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

    NumericT value = 0;
    unsigned int last_row = 0;

    for (vcl_size_t i = 0; i < mat.nnz(); ++i)
    {
      unsigned int current_row = coord_buffer[2*i];

      if (current_row != last_row)
      {
        if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
          value = std::sqrt(value);

        result_buf[last_row] = value;
        value = 0;
        last_row = current_row;
      }

      switch (info_selector)
      {
        case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
          value = std::max<NumericT>(value, std::fabs(elements[i]));
          break;

        case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
          value += std::fabs(elements[i]);
          break;

        case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
          value += elements[i] * elements[i];
          break;

        case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
          if (coord_buffer[2*i+1] == current_row)
            value = elements[i];
          break;

        //default:
        //  break;
      }
    }

    if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
      value = std::sqrt(value);

    result_buf[last_row] = value;
  }
} //namespace detail

/** @brief Carries out matrix-vector multiplication with a coordinate_matrix, i.e. result = alpha * mat * vec + beta * result */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf      = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

  if (beta < 0 || beta > 0) // beta != 0
  {
    for (vcl_size_t i = 0; i < result.size(); ++i)
      result_buf[i * result.stride() + result.start()] *= beta;
  }
  else // flush
  {
    for (vcl_size_t i = 0; i < result.size(); ++i)
      result_buf[i * result.stride() + result.start()] = 0;
  }

  for (vcl_size_t i = 0; i < mat.nnz(); ++i)
    result_buf[coord_buffer[2*i] * result.stride() + result.start()]
      += alpha * elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
}

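// Usage sketch: coordinate_matrix keeps interleaved (row, col) index pairs in
// handle12(), so the kernel above is a single scatter loop over all nonzeros:
//
//   viennacl::coordinate_matrix<double> A(m, n);
//   viennacl::copy(cpu_A, A);
//   y = viennacl::linalg::prod(A, x);
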
/** @brief Carries out sparse-matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) for a coordinate_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT   x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        NumericT y = d_mat_wrapper_row(c, col);
        if (result.row_major())
          result_wrapper_row(r, col) += x * y;
        else
          result_wrapper_col(r, col) += x * y;
      }
    }
  }

  else {

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {

      for (vcl_size_t i = 0; i < sp_mat.nnz(); ++i) {

        NumericT   x = static_cast<NumericT>(sp_mat_elements[i]);
        vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
        vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
        NumericT   y = d_mat_wrapper_col(c, col);

        if (result.row_major())
          result_wrapper_row(r, col) += x*y;
        else
          result_wrapper_col(r, col) += x*y;
      }

    }
  }

}

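// Note: since COO entries are not grouped by row, the loops above first zero
// the result and then accumulate one update per nonzero; the user-facing call
// is the same convenience expression as for CSR:
//
//   C = viennacl::linalg::prod(A_coo, B);  // A_coo: viennacl::coordinate_matrix<double>
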

/** @brief Carries out sparse-matrix times transposed dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) for a coordinate_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat.lhs());
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat.lhs());
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT   x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      if (result.row_major())
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row(col, c);
          result_wrapper_row(r, col) += x * y;
        }
      }
      else
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row(col, c);
          result_wrapper_col(r, col) += x * y;
        }
      }
    }


  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      NumericT   x = static_cast<NumericT>(sp_mat_elements[i]);
      vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
      vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
      if (result.row_major())
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col(col, c);
          result_wrapper_row(r, col) += x * y;
        }
      }
      else
      {
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col(col, c);
          result_wrapper_col(r, col) += x * y;
        }
      }
    }
  }

}



//
// ELL Matrix
//

/** @brief Carries out matrix-vector multiplication with an ell_matrix, i.e. result = alpha * mat * vec + beta * result */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords     = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  for (vcl_size_t row = 0; row < mat.size1(); ++row)
  {
    NumericT sum = 0;

    for (unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
    {
      vcl_size_t offset = row + item_id * mat.internal_size1();
      NumericT   val    = elements[offset];

      if (val > 0 || val < 0) // val != 0, i.e. skip the zero padding
      {
        unsigned int col = coords[offset];
        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
      }
    }

    if (beta < 0 || beta > 0) // beta != 0
    {
      vcl_size_t index = row * result.stride() + result.start();
      result_buf[index] = alpha * sum + beta * result_buf[index];
    }
    else
      result_buf[row * result.stride() + result.start()] = alpha * sum;
  }
}

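// Note on the ELL layout used above: every row is padded to the same number of
// entries and the storage is entry-slot major, so slot 'item_id' of 'row' sits
// at offset = row + item_id * internal_size1(). Hypothetical 2-row example
// with maxnnz = 2 and internal_size1 = 2 (i.e. no alignment padding):
//
//   row 0: (col 0, 4.0), (col 2, 1.0)     elements = {4.0, 3.0, 1.0, 0.0}
//   row 1: (col 1, 3.0)                   coords   = {0,   1,   2,   0  }
//
// The zero padding is skipped by the 'val > 0 || val < 0' test in the kernel.
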
/** @brief Carries out sparse-matrix times dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) for an ell_matrix */
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
  vcl_size_t d_mat_inc1   = viennacl::traits::stride1(d_mat);
  vcl_size_t d_mat_inc2   = viennacl::traits::stride2(d_mat);
  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);

  vcl_size_t result_start1 = viennacl::traits::start1(result);
  vcl_size_t result_start2 = viennacl::traits::start2(result);
  vcl_size_t result_inc1   = viennacl::traits::stride1(result);
  vcl_size_t result_inc2   = viennacl::traits::stride2(result);
  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);

  detail::matrix_array_wrapper<NumericT const, row_major, false>
      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  detail::matrix_array_wrapper<NumericT const, column_major, false>
      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);

  detail::matrix_array_wrapper<NumericT, row_major, false>
      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  detail::matrix_array_wrapper<NumericT, column_major, false>
      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() ) {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
      {
        vcl_size_t offset = static_cast<vcl_size_t>(row) + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
        NumericT   sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
        vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);

        if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
        {
          if (result.row_major())
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_row(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row(sp_mat_col, col);
          else
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_col(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row(sp_mat_col, col);
        }
      }
    }
  }
  else {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      if (result.row_major())
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
      else
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {

      for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {

        for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {

          vcl_size_t offset = row + item_id * sp_mat.internal_size1();
          NumericT   sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
          vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);

          if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
          {
            if (result.row_major())
              result_wrapper_row(row, col) += sp_mat_val * d_mat_wrapper_col(sp_mat_col, col);
            else
              result_wrapper_col(row, col) += sp_mat_val * d_mat_wrapper_col(sp_mat_col, col);
          }
        }
      }
    }
  }

}

1626 template<typename NumericT, unsigned int AlignmentV>
1630  viennacl::op_trans > & d_mat,
1632 
1633  NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
1634  unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
1635 
1636  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
1637  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1638 
1639  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
1640  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
1641  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
1642  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
1643  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
1644  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
1645 
1646  vcl_size_t result_start1 = viennacl::traits::start1(result);
1647  vcl_size_t result_start2 = viennacl::traits::start2(result);
1648  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
1649  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
1650  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
1651  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
1652 
1654  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1656  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1657 
1659  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1661  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1662 
1663  if ( d_mat.lhs().row_major() )
1664  {
1665 #ifdef VIENNACL_WITH_OPENMP
1666  #pragma omp parallel for
1667 #endif
1668  for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
1669  {
1670  if (result.row_major())
1671  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1672  result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1673  else
1674  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1675  result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1676  }
1677 
1678  for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
1679 
1680  for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
1681 
1682  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
1683 
1684  vcl_size_t offset = row + item_id * sp_mat.internal_size1();
1685  NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
1686  vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
1687 
1688  if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
1689  {
1690  if (result.row_major())
1691  result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
1692  else
1693  result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
1694  }
1695  }
1696  }
1697  }
1698  }
1699  else
1700  {
1701 #ifdef VIENNACL_WITH_OPENMP
1702  #pragma omp parallel for
1703 #endif
1704  for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
1705  {
1706  if (result.row_major())
1707  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
1708  result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1709  else
1710  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
1711  result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
1712  }
1713 
1714 #ifdef VIENNACL_WITH_OPENMP
1715  #pragma omp parallel for
1716 #endif
1717  for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
1718 
1719  for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id) {
1720 
1721  vcl_size_t offset = row + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
1722  NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
1723  vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
1724 
1725  if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
1726  {
1727  if (result.row_major())
1728  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1729  result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
1730  else
1731  for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
1732  result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
1733  }
1734  }
1735  }
1736  }
1737 
1738 }
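At the user level this kernel is reached through the convenience expression result = prod(sp_mat, trans(d_mat)). A minimal usage sketch, assuming A has already been populated (e.g. via viennacl::copy) and that the sizes shown are consistent:

#include "viennacl/ell_matrix.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

// Sketch only: sizes are illustrative assumptions.
void ell_times_transposed_dense(viennacl::ell_matrix<double> const & A,  // assumed 100 x 100
                                viennacl::matrix<double> const & B)      // 64 x 100, so trans(B) is 100 x 64
{
  viennacl::matrix<double> C(100, 64);
  C = viennacl::linalg::prod(A, viennacl::trans(B));  // dispatches to the kernel above on the CPU backend
}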
1739 
1740 
1741 //
1742 // SELL-C-\sigma Matrix
1743 //
1752 template<typename NumericT, typename IndexT>
1753 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
1754                const viennacl::vector_base<NumericT> & vec,
1755                NumericT alpha,
1756                viennacl::vector_base<NumericT> & result,
1757                NumericT beta)
1758 {
1759  NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
1760  NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
1761  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1762  IndexT const * columns_per_block = detail::extract_raw_pointer<IndexT>(mat.handle1());
1763  IndexT const * column_indices = detail::extract_raw_pointer<IndexT>(mat.handle2());
1764  IndexT const * block_start = detail::extract_raw_pointer<IndexT>(mat.handle3());
1765 
1766  vcl_size_t num_blocks = mat.size1() / mat.rows_per_block() + 1;
1767 
1768 #ifdef VIENNACL_WITH_OPENMP
1769  #pragma omp parallel for
1770 #endif
1771  for (long block_idx2 = 0; block_idx2 < static_cast<long>(num_blocks); ++block_idx2)
1772  {
1773  vcl_size_t block_idx = static_cast<vcl_size_t>(block_idx2);
1774  vcl_size_t current_columns_per_block = columns_per_block[block_idx];
1775 
1776  std::vector<NumericT> result_values(mat.rows_per_block());
1777 
1778  for (IndexT column_entry_index = 0;
1779  column_entry_index < current_columns_per_block;
1780  ++column_entry_index)
1781  {
1782  vcl_size_t stride_start = block_start[block_idx] + column_entry_index * mat.rows_per_block();
1783  // Note: This for-loop may be unrolled by hand for exploiting vectorization
1784  // Careful benchmarking recommended first, memory channels may be saturated already!
1785  for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
1786  {
1787  NumericT val = elements[stride_start + row_in_block];
1788 
1789  result_values[row_in_block] += (val > 0 || val < 0) ? vec_buf[column_indices[stride_start + row_in_block] * vec.stride() + vec.start()] * val : 0;
1790  }
1791  }
1792 
1793  vcl_size_t first_row_in_matrix = block_idx * mat.rows_per_block();
1794  if (beta < 0 || beta > 0)
1795  {
1796  for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
1797  {
1798  if (first_row_in_matrix + row_in_block < result.size())
1799  {
1800  vcl_size_t index = (first_row_in_matrix + row_in_block) * result.stride() + result.start();
1801  result_buf[index] = alpha * result_values[row_in_block] + beta * result_buf[index];
1802  }
1803  }
1804  }
1805  else
1806  {
1807  for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
1808  {
1809  if (first_row_in_matrix + row_in_block < result.size())
1810  result_buf[(first_row_in_matrix + row_in_block) * result.stride() + result.start()] = alpha * result_values[row_in_block];
1811  }
1812  }
1813  }
1814 }
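The note at source lines 1783-1784 suggests hand-unrolling the row_in_block loop. A sketch of what such an unrolled variant could look like, under the stated assumptions that rows_per_block is a multiple of four and the vector has unit stride (illustrative, not the library's code):

#include <cstddef>
#include <vector>

// Hypothetical hand-unrolled inner loop for one slice. Processing four rows
// per iteration exposes independent multiply-adds to the auto-vectorizer;
// whether it pays off depends on the compiler and on memory bandwidth,
// so careful benchmarking is recommended first.
template<typename NumericT, typename IndexT>
void slice_accumulate_unrolled(NumericT const * elements,
                               IndexT const * column_indices,
                               NumericT const * vec_buf,        // unit stride assumed
                               std::size_t stride_start,
                               std::size_t rows_per_block,      // assumed % 4 == 0
                               std::vector<NumericT> & result_values)
{
  for (std::size_t r = 0; r < rows_per_block; r += 4)
  {
    NumericT v0 = elements[stride_start + r + 0];
    NumericT v1 = elements[stride_start + r + 1];
    NumericT v2 = elements[stride_start + r + 2];
    NumericT v3 = elements[stride_start + r + 3];
    // same zero test as the kernel (avoids float-equality warnings):
    result_values[r + 0] += (v0 < 0 || v0 > 0) ? vec_buf[column_indices[stride_start + r + 0]] * v0 : NumericT(0);
    result_values[r + 1] += (v1 < 0 || v1 > 0) ? vec_buf[column_indices[stride_start + r + 1]] * v1 : NumericT(0);
    result_values[r + 2] += (v2 < 0 || v2 > 0) ? vec_buf[column_indices[stride_start + r + 2]] * v2 : NumericT(0);
    result_values[r + 3] += (v3 < 0 || v3 > 0) ? vec_buf[column_indices[stride_start + r + 3]] * v3 : NumericT(0);
  }
}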
1815 
1816 
1817 //
1818 // Hybrid Matrix
1819 //
1828 template<typename NumericT, unsigned int AlignmentV>
1829 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1830                const viennacl::vector_base<NumericT> & vec,
1831                NumericT alpha,
1832                viennacl::vector_base<NumericT> & result,
1833                NumericT beta)
1834 {
1835  NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
1836  NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
1837  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1838  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
1839  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
1840  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
1841  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
1842 
1843 
1844  for (vcl_size_t row = 0; row < mat.size1(); ++row)
1845  {
1846  NumericT sum = 0;
1847 
1848  //
1849  // Part 1: Process ELL part
1850  //
1851  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
1852  {
1853  vcl_size_t offset = row + item_id * mat.internal_size1();
1854  NumericT val = elements[offset];
1855 
1856  if (val > 0 || val < 0)
1857  {
1858  unsigned int col = coords[offset];
1859  sum += (vec_buf[col * vec.stride() + vec.start()] * val);
1860  }
1861  }
1862 
1863  //
1864  // Part 2: Process CSR part of the HYB format
1865  //
1866  vcl_size_t col_begin = csr_row_buffer[row];
1867  vcl_size_t col_end = csr_row_buffer[row + 1];
1868 
1869  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1870  {
1871  sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);
1872  }
1873 
1874  if (beta < 0 || beta > 0)
1875  {
1876  vcl_size_t index = row * result.stride() + result.start();
1877  result_buf[index] = alpha * sum + beta * result_buf[index];
1878  }
1879  else
1880  result_buf[row * result.stride() + result.start()] = alpha * sum;
1881  }
1882 
1883 }
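As with the other matrix-vector kernels in this file, the alpha/beta handling implements a BLAS-style update result = alpha * (mat * vec) + beta * result, where the beta == 0 branch avoids reading possibly uninitialized result entries. A minimal usage sketch (sizes and population of A are assumptions):

#include "viennacl/hyb_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

// Sketch: A assumed square (1000 x 1000) and already populated via viennacl::copy.
void hyb_matvec(viennacl::hyb_matrix<double> const & A,
                viennacl::vector<double> const & x,
                viennacl::vector<double> & y)
{
  // Plain product corresponds to alpha = 1, beta = 0 in the kernel above.
  y = viennacl::linalg::prod(A, x);
}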
1884 
1885 //
1886 // Hybrid Matrix
1887 //
1896 template<typename NumericT, unsigned int AlignmentV>
1897 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1898                const viennacl::matrix_base<NumericT> & d_mat,
1899                viennacl::matrix_base<NumericT> & result)
1900 {
1901  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
1902  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1903 
1904  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
1905  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
1906  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
1907  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
1908  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
1909  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
1910 
1911  vcl_size_t result_start1 = viennacl::traits::start1(result);
1912  vcl_size_t result_start2 = viennacl::traits::start2(result);
1913  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
1914  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
1915  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
1916  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
1917 
1918  detail::matrix_array_wrapper<NumericT const, row_major, false>
1919      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1920  detail::matrix_array_wrapper<NumericT const, column_major, false>
1921      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
1922 
1923  detail::matrix_array_wrapper<NumericT, row_major, false>
1924      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1925  detail::matrix_array_wrapper<NumericT, column_major, false>
1926      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
1927 
1928  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
1929  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
1930  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
1931  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
1932  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
1933 
1934 
1935  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
1936  {
1937  for (vcl_size_t row = 0; row < mat.size1(); ++row)
1938  {
1939  NumericT sum = 0;
1940 
1941  //
1942  // Part 1: Process ELL part
1943  //
1944  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
1945  {
1946  vcl_size_t offset = row + item_id * mat.internal_size1();
1947  NumericT val = elements[offset];
1948 
1949  if (val < 0 || val > 0) // val != 0 without compiler warnings
1950  {
1951  vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
1952  if (d_mat.row_major())
1953  sum += d_mat_wrapper_row(col, result_col) * val;
1954  else
1955  sum += d_mat_wrapper_col(col, result_col) * val;
1956  }
1957  }
1958 
1959  //
1960  // Part 2: Process HYB/CSR part
1961  //
1962  vcl_size_t col_begin = csr_row_buffer[row];
1963  vcl_size_t col_end = csr_row_buffer[row + 1];
1964 
1965  if (d_mat.row_major())
1966  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1967  sum += d_mat_wrapper_row(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
1968  else
1969  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
1970  sum += d_mat_wrapper_col(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
1971 
1972  if (result.row_major())
1973  result_wrapper_row(row, result_col) = sum;
1974  else
1975  result_wrapper_col(row, result_col) = sum;
1976  }
1977  } // for result_col
1978 }
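The two-part loop body mirrors the HYB storage scheme itself: a fixed-width ELL block (handle/handle2) holds up to a fixed number of entries per row, and rows with more nonzeros spill the remainder into a small CSR structure (handle3/handle4/handle5). A toy standalone split with an assumed ELL width of 2 (illustrative only, not the library's assembly code):

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

// Toy HYB assembly: the first 'ell_width' entries of each row go into the
// padded, column-wise ELL arrays; any overflow goes into a CSR remainder.
int main()
{
  std::size_t const n = 3, ell_width = 2;
  std::vector<std::map<std::size_t, double> > A(n);
  A[0][0] = 1.0;
  A[1][0] = 2.0; A[1][1] = 3.0; A[1][2] = 4.0;   // third entry overflows into CSR
  A[2][2] = 5.0;

  std::vector<double>      ell_val(n * ell_width, 0);
  std::vector<std::size_t> ell_col(n * ell_width, 0);
  std::vector<std::size_t> csr_row(n + 1, 0);
  std::vector<std::size_t> csr_col;
  std::vector<double>      csr_val;

  for (std::size_t row = 0; row < n; ++row)
  {
    std::size_t item = 0;
    for (std::map<std::size_t, double>::const_iterator it = A[row].begin();
         it != A[row].end(); ++it, ++item)
    {
      if (item < ell_width)        // column-wise ELL offset, as in the kernels above
      {
        ell_val[row + item * n] = it->second;
        ell_col[row + item * n] = it->first;
      }
      else                         // overflow -> CSR remainder
      {
        csr_col.push_back(it->first);
        csr_val.push_back(it->second);
      }
    }
    csr_row[row + 1] = csr_val.size();
  }

  std::cout << "CSR remainder holds " << csr_val.size() << " entry\n"; // expect 1
}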
1979 
1980 
1989 template<typename NumericT, unsigned int AlignmentV>
1990 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
1991                const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1992                                                   const viennacl::matrix_base<NumericT>,
1993                                                   viennacl::op_trans > & d_mat,
1994                viennacl::matrix_base<NumericT> & result)
1995 {
1996  NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs()); // lhs(): the dense matrix wrapped by the op_trans expression
1997  NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
1998 
1999  vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
2000  vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
2001  vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
2002  vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
2003  vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
2004  vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
2005 
2006  vcl_size_t result_start1 = viennacl::traits::start1(result);
2007  vcl_size_t result_start2 = viennacl::traits::start2(result);
2008  vcl_size_t result_inc1 = viennacl::traits::stride1(result);
2009  vcl_size_t result_inc2 = viennacl::traits::stride2(result);
2010  vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
2011  vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
2012 
2013  detail::matrix_array_wrapper<NumericT const, row_major, false>
2014      d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
2015  detail::matrix_array_wrapper<NumericT const, column_major, false>
2016      d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
2017 
2018  detail::matrix_array_wrapper<NumericT, row_major, false>
2019      result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
2020  detail::matrix_array_wrapper<NumericT, column_major, false>
2021      result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
2022 
2023  NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
2024  unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
2025  NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
2026  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
2027  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
2028 
2029 
2030  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
2031  {
2032  for (vcl_size_t row = 0; row < mat.size1(); ++row)
2033  {
2034  NumericT sum = 0;
2035 
2036  //
2037  // Part 1: Process ELL part
2038  //
2039  for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
2040  {
2041  vcl_size_t offset = row + item_id * mat.internal_size1();
2042  NumericT val = elements[offset];
2043 
2044  if (val < 0 || val > 0) // val != 0 without compiler warnings
2045  {
2046  vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
2047  if (d_mat.lhs().row_major())
2048  sum += d_mat_wrapper_row(result_col, col) * val;
2049  else
2050  sum += d_mat_wrapper_col(result_col, col) * val;
2051  }
2052  }
2053 
2054  //
2055  // Part 2: Process HYB/CSR part
2056  //
2057  vcl_size_t col_begin = csr_row_buffer[row];
2058  vcl_size_t col_end = csr_row_buffer[row + 1];
2059 
2060  if (d_mat.lhs().row_major())
2061  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
2062  sum += d_mat_wrapper_row(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];
2063  else
2064  for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
2065  sum += d_mat_wrapper_col(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];
2066 
2067  if (result.row_major())
2068  result_wrapper_row(row, result_col) = sum;
2069  else
2070  result_wrapper_col(row, result_col) = sum;
2071  }
2072  } // for result_col
2073 }
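This last overload covers the convenience expression result = prod(mat, trans(d_mat)) for HYB matrices; compared to the non-transposed version, only the dense-matrix indices are swapped (d_mat_wrapper_*(result_col, col) reads B(result_col, col), i.e. trans(B)(col, result_col)), so the transpose is never materialized. A brief usage sketch under assumed sizes:

#include "viennacl/hyb_matrix.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

// Sketch: A assumed 100 x 100 and already populated.
void hyb_times_transposed_dense(viennacl::hyb_matrix<double> const & A,
                                viennacl::matrix<double> const & B)   // 64 x 100, so trans(B) is 100 x 64
{
  viennacl::matrix<double> C(100, 64);
  C = viennacl::linalg::prod(A, viennacl::trans(B));  // reaches the kernel above
}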
2074 
2075 
2076 } // namespace host_based
2077 } //namespace linalg
2078 } //namespace viennacl
2079 
2080 
2081 #endif