ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
sparse_matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/ocl/device.hpp"
27 #include "viennacl/ocl/handle.hpp"
28 #include "viennacl/ocl/kernel.hpp"
29 #include "viennacl/scalar.hpp"
30 #include "viennacl/vector.hpp"
31 #include "viennacl/tools/tools.hpp"
41 
42 namespace viennacl
43 {
44 namespace linalg
45 {
46 namespace opencl
47 {
48 
49 //
50 // Compressed matrix
51 //
52 
// Helper: extracts per-row information (selected by info_selector) from a
// compressed (CSR) matrix A into vector x via an OpenCL "row_info" kernel.
// NOTE(review): Doxygen-extracted listing -- original source lines 56-58 and
// 61-62 (the function signature and the lookup that binds 'row_info_kernel')
// are missing from this dump; the code below is kept byte-identical.
53 namespace detail
{
 template<typename NumericT, unsigned int AlignmentV>
 {
 // Fetch the OpenCL context owning A; const_cast because opencl_handle()
 // yields a const context reference here.
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // Kernel arguments: CSR row pointers (handle1), column indices (handle2),
 // values (handle()), output vector, row count, and the info selector.
 viennacl::ocl::enqueue(row_info_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(A.size1()),
 cl_uint(info_selector)
 )
 );
 }
}
72 
// Sparse matrix-vector product for compressed_matrix (CSR), presumably
// computing y = alpha * A * x + beta * y (kernel family "vec_mul...").
// NOTE(review): Doxygen-extracted listing -- the signature lines (orig. 82-85)
// and the kernel lookup binding 'k' (orig. 110) are missing; code is kept
// byte-identical to the extraction.
81 template<typename NumericT, unsigned int AlignmentV>
 NumericT alpha,
 NumericT beta)
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
 // Heuristic: use the NVIDIA-specific kernel only for unaligned matrices on
 // NVIDIA devices with more than 12 nonzeros per row on average.
 bool use_nvidia_specific = AlignmentV == 1 && ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0);
 // Exact "!= 1" / "!= 0" tests written with </> to avoid float-equality warnings.
 bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);


 // Assemble the kernel name from alignment / vendor / alpha-beta variants.
 std::stringstream ss;
 ss << "vec_mul";
 unsigned int alignment = AlignmentV; //prevent unreachable code warnings below
 if (use_nvidia_specific)
 ss << "_nvidia";
 else
 {
 if (alignment == 4)
 ss << "4";
 if (alignment == 8)
 ss << "8";
 }

 if (with_alpha_beta)
 ss << "_alpha_beta";


 // Pack strided-vector layout descriptors for x and y (start/stride/size).
 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 // Aligned variants (4/8): plain CSR kernel without row-block metadata.
 if (alignment == 4 || alignment == 8)
 {
 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
 x, layout_x,
 alpha,
 y, layout_y,
 beta
 ));
 else
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
 x, layout_x,
 y, layout_y
 ));
 }
 else
 {
 if (ctx.current_device().max_work_group_size() >= 256)
 k.local_work_size(0, 256);

 if (use_nvidia_specific)
 {
 // Fixed number of work groups (512); handle3/blocks1 carry row-block data.
 k.global_work_size(0, 512 * k.local_work_size(0));

 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
 x, layout_x,
 alpha,
 y, layout_y,
 beta
 ));
 else
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
 x, layout_x,
 y, layout_y
 ));
 }
 else // use CSR adaptive:
 {
 // One work group per row block.
 k.global_work_size(0, A.blocks1() * k.local_work_size(0));

 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
 x, layout_x,
 alpha,
 y, layout_y,
 beta
 ));
 else
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
 x, layout_x,
 y, layout_y
 ));
 }
 }
}
180 
181 
// Sparse (CSR) times dense matrix product; passes row/column start, stride
// and size descriptors for both the dense operand d_A and the result y.
// NOTE(review): Doxygen-extracted listing -- signature lines, the kernel
// lookup for 'k', and the internal_size arguments (orig. 205, 210) are
// missing from this dump; code kept byte-identical.
190 template< typename NumericT, unsigned int AlignmentV>

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());

 viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(d_A),
 cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
 cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
 cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
}
212 
// Sparse (CSR) times TRANSPOSED dense matrix product; kernel name is chosen
// by detail::sparse_dense_matmult_kernel_name(true /* transposed */, ...)
// based on the row-major flags of the dense operand and the result.
// NOTE(review): Doxygen-extracted listing -- signature lines and the
// 'ctx.get_kernel(...' part of the lookup (orig. 230-231) are missing;
// code kept byte-identical.
222 template<typename NumericT, unsigned int AlignmentV>
 viennacl::op_trans > const & d_A,

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
 detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));

 // Note: all descriptors below reference d_A.lhs(), the untransposed matrix.
 viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(d_A.lhs()),
 cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
 cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
 cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
 cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
}
246 
// Sparse-sparse product C = A * B for compressed (CSR) matrices using an
// RMerge-style three-stage scheme: (1) analyze A's row lengths, possibly
// decomposing A = A2 * G1 and recursing when rows are too long; (2) compute
// C's sparsity pattern; (3) fill C's entries.
// NOTE(review): Doxygen-extracted listing -- signature lines and several
// declarations (kernels k1/k2/k3, k_decompose_1, k_fill_A2, k_fill_G1, the
// matrices G1/tmp, and 'row_buffer') are missing from this dump; code kept
// byte-identical.
256 template<typename NumericT, unsigned int AlignmentV>
{

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 /*
 * Stage 1: Analyze sparsity pattern in order to properly allocate temporary arrays
 *
 * - Upper bound for the row lengths in C
 */
 viennacl::vector<unsigned int> upper_bound_nonzeros_per_row_A(256, ctx); // upper bound for the nonzeros per row encountered for each work group

 viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
 viennacl::traits::opencl_handle(upper_bound_nonzeros_per_row_A)
 ) );

 // Pull the 256 per-work-group bounds to the host and reduce to a maximum.
 upper_bound_nonzeros_per_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
 unsigned int * upper_bound_nonzeros_per_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(upper_bound_nonzeros_per_row_A.handle());

 unsigned int max_nnz_per_row_A = 0;
 for (std::size_t i=0; i<upper_bound_nonzeros_per_row_A.size(); ++i)
 max_nnz_per_row_A = std::max(max_nnz_per_row_A, upper_bound_nonzeros_per_row_A_ptr[i]);

 // Rows longer than 32 nonzeros: decompose A = A2 * G1 and recurse.
 if (max_nnz_per_row_A > 32)
 {
 // determine augmented size:
 unsigned int max_entries_in_G = 32;
 if (max_nnz_per_row_A <= 256)
 max_entries_in_G = 16;
 if (max_nnz_per_row_A <= 64)
 max_entries_in_G = 8;

 viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
 viennacl::ocl::enqueue(k_decompose_1(A.handle1().opencl_handle(), cl_uint(A.size1()),
 cl_uint(max_entries_in_G),
 viennacl::traits::opencl_handle(exclusive_scan_helper)
 ) );

 // exclusive scan of helper array to find new size:
 viennacl::linalg::exclusive_scan(exclusive_scan_helper);
 unsigned int augmented_size = exclusive_scan_helper[A.size1()];

 // split A = A2 * G1
 viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));

 // fill A2:
 viennacl::ocl::enqueue(k_fill_A2(A2.handle1().opencl_handle(), A2.handle2().opencl_handle(), A2.handle().opencl_handle(), cl_uint(A2.size1()),
 viennacl::traits::opencl_handle(exclusive_scan_helper)
 ) );

 // fill G1:
 viennacl::ocl::enqueue(k_fill_G1(G1.handle1().opencl_handle(), G1.handle2().opencl_handle(), G1.handle().opencl_handle(), cl_uint(G1.size1()),
 A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), cl_uint(A.nnz()),
 cl_uint(max_entries_in_G),
 viennacl::traits::opencl_handle(exclusive_scan_helper)
 ) );

 // compute tmp = G1 * B;
 // C = A2 * tmp;
 prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
 prod_impl(A2, tmp, C); // this may split A2 again
 return;
 }


 /*
 * Stage 2: Determine sparsity pattern of C
 */
 C.resize(A.size1(), B.size2(), false);

 k2.local_work_size(0, 32); // run with one warp/wavefront
 k2.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
 viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
 B.handle1().opencl_handle(), B.handle2().opencl_handle(), cl_uint(B.size2()),
 C.handle1().opencl_handle()
 ) );

 // exclusive scan on host to obtain row start indices:
 viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
 unsigned int current_offset = 0;
 for (std::size_t i=0; i<C.size1(); ++i)
 {
 // Convert per-row counts (written by k2) into CSR row-start offsets.
 unsigned int tmp = row_buffer[i];
 row_buffer.set(i, current_offset);
 current_offset += tmp;
 }
 row_buffer.set(C.size1(), current_offset);
 viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());


 /*
 * Stage 3: Compute entries in C
 */

 // current_offset now equals nnz(C).
 C.reserve(current_offset, false);

 k3.local_work_size(0, 32); // run with one warp/wavefront
 k3.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
 viennacl::ocl::enqueue(k3(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
 B.handle1().opencl_handle(), B.handle2().opencl_handle(), B.handle().opencl_handle(), cl_uint(B.size2()),
 C.handle1().opencl_handle(), C.handle2().opencl_handle(), C.handle().opencl_handle()
 ) );

}
373 
374 // triangular solvers
375 
// In-place triangular solve with a CSR matrix L (lower-triangular variant,
// judging from the parameter name -- tag argument is not visible here).
// NOTE(review): Doxygen-extracted listing -- signature lines (orig. 382-384)
// and the kernel lookup for 'k' are missing; code kept byte-identical.
381 template<typename NumericT, unsigned int MAT_AlignmentV>
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());

 // Single work group of 128 threads performs the sequential-per-row solve.
 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(L.size1())
 )
 );
}
398 
// In-place triangular solve with CSR matrix L (unit-diagonal lower variant,
// presumably -- the tag argument is not visible in this dump).
// NOTE(review): signature lines and kernel lookup missing from the
// Doxygen extraction; code kept byte-identical.
404 template<typename NumericT, unsigned int AlignmentV>
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());


 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(L.size1())
 )
 );
}
422 
423 
// In-place triangular solve with CSR matrix U (upper-triangular variant,
// judging from the parameter name -- tag argument not visible here).
// NOTE(review): signature lines and kernel lookup missing from the
// Doxygen extraction; code kept byte-identical.
429 template<typename NumericT, unsigned int AlignmentV>
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());

 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(U.size1())
 )
 );
}
446 
// In-place triangular solve with CSR matrix U (unit-diagonal upper variant,
// presumably -- the tag argument is not visible in this dump).
// NOTE(review): signature lines and kernel lookup missing from the
// Doxygen extraction; code kept byte-identical.
452 template<typename NumericT, unsigned int AlignmentV>
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());


 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(U.size1())
 )
 );
}
470 
471 
472 
473 
474 
475 // transposed triangular solvers
476 
// Block-wise in-place solves for transposed CSR matrices: one work group per
// block (block boundaries given by 'block_indices'). Lower variant ignores
// the diagonal vector; upper variant passes U_diagonal to the kernel.
// NOTE(review): Doxygen-extracted listing -- parts of both signatures and the
// kernel lookups binding 'block_solve_kernel' are missing; code kept
// byte-identical.
477 namespace detail
{
 //
 // block solves
 //
 template<typename NumericT, unsigned int AlignmentV>
 op_trans> & L,
 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
 vector_base<NumericT> const & /* L_diagonal */, //ignored
 {
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
 // One work group per independent block.
 block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));

 viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
 L.lhs().handle2().opencl_handle(),
 L.lhs().handle().opencl_handle(),
 block_indices.opencl_handle(),
 x,
 static_cast<cl_uint>(x.size())));
 }


 template<typename NumericT, unsigned int AlignmentV>
 op_trans> const & U,
 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
 vector_base<NumericT> const & U_diagonal,
 {
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
 block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));

 // Upper solve additionally needs the explicit diagonal entries of U.
 viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
 U.lhs().handle2().opencl_handle(),
 U.lhs().handle().opencl_handle(),
 U_diagonal,
 block_indices.opencl_handle(),
 x,
 static_cast<cl_uint>(x.size())));
 }


}
530 
531 
// In-place solve with the TRANSPOSE of CSR matrix proxy_L.lhs()
// (lower-triangular variant, judging from the parameter name).
// NOTE(review): signature lines and kernel lookup missing from the
// Doxygen extraction; code kept byte-identical.
537 template<typename NumericT, unsigned int AlignmentV>
 op_trans> const & proxy_L,
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());

 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(proxy_L.lhs().size1())
 )
 );
}
556 
557 
// In-place solve with the TRANSPOSE of CSR matrix proxy_L.lhs(); this
// variant first builds a 'diagonal' vector (extraction step at orig. lines
// 574-576 is missing from this dump) and passes it to the solve kernel.
// NOTE(review): signature lines and kernel lookup also missing; code kept
// byte-identical.
563 template<typename NumericT, unsigned int AlignmentV>
 op_trans> const & proxy_L,
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());

 viennacl::vector<NumericT> diagonal(x.size());


 // Single work group: global size equals the local size of 128.
 k.local_work_size(0, 128);
 k.global_work_size(0, k.local_work_size());
 viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
 viennacl::traits::opencl_handle(diagonal),
 viennacl::traits::opencl_handle(x),
 cl_uint(proxy_L.lhs().size1())
 )
 );
}
587 
// In-place solve with the TRANSPOSE of CSR matrix proxy_U.lhs()
// (upper-triangular variant, judging from the parameter name).
// NOTE(review): signature lines and kernel lookup missing from the
// Doxygen extraction; code kept byte-identical.
593 template<typename NumericT, unsigned int AlignmentV>
 op_trans> const & proxy_U,
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());

 k.local_work_size(0, 128);
 viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(proxy_U.lhs().size1())
 )
 );
}
612 
613 
// In-place solve with the TRANSPOSE of CSR matrix proxy_U.lhs(); builds a
// 'diagonal' vector first (extraction step at orig. lines 630-632 missing
// from this dump) and passes it to the solve kernel.
// NOTE(review): signature lines and kernel lookup also missing; code kept
// byte-identical.
619 template<typename NumericT, unsigned int AlignmentV>
 op_trans> const & proxy_U,
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());

 viennacl::vector<NumericT> diagonal(x.size());


 // Single work group: global size equals the local size of 128.
 k.local_work_size(0, 128);
 k.global_work_size(0, k.local_work_size());
 viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
 viennacl::traits::opencl_handle(diagonal),
 viennacl::traits::opencl_handle(x),
 cl_uint(proxy_U.lhs().size1())
 )
 );
}
643 
644 
645 //
646 // Compressed Compressed matrix
647 //
648 
// Matrix-vector product for compressed_compressed_matrix (doubly compressed:
// only nonzero rows stored). y is pre-scaled by beta (or cleared), then the
// kernel accumulates alpha * A * x.
// NOTE(review): signature lines, the kernel lookup for 'k', and the
// declarations of layout_x / layout_y are missing from this Doxygen
// extraction; code kept byte-identical.
657 template<typename NumericT>
 NumericT alpha,
 NumericT beta)
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // Pre-scale y: "beta != 0" written with </> to avoid float-equality warnings.
 if (beta < 0 || beta > 0) // multiply by beta
 viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
 else
 y.clear();

 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 // handle3 carries the row indices of the nonzero rows; nnz1() is their count.
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle3().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.nnz1()),
 x, layout_x,
 alpha,
 y, layout_y,
 beta
 ));
}
692 
693 
694 //
695 // Coordinate matrix
696 //
697 
// Helper: per-row information for a coordinate (COO) matrix A, computed into
// x by a "row_info" kernel using local-memory scratch buffers.
// NOTE(review): the function signature (orig. 701-703) and the lookup that
// binds 'row_info_kernel' are missing from this Doxygen extraction; code
// kept byte-identical.
698 namespace detail
{
 template<typename NumericT, unsigned int AlignmentV>
 {
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
 unsigned int thread_num = 128; //k.local_work_size(0);

 row_info_kernel.local_work_size(0, thread_num);

 row_info_kernel.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
 // handle12 packs (row, col) index pairs; handle3 holds group boundaries.
 viennacl::ocl::enqueue(row_info_kernel(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 cl_uint(info_selector),
 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
 viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
 }
}
720 
// Matrix-vector product for coordinate_matrix (COO): y is pre-scaled by
// beta (or cleared), then the kernel accumulates alpha * A * x using
// local-memory scratch for the segmented reduction.
// NOTE(review): signature lines, the kernel lookup for 'k', and the
// layout_x / layout_y declarations are missing from this Doxygen
// extraction; code kept byte-identical.
729 template<typename NumericT, unsigned int AlignmentV>
 NumericT alpha,
 NumericT beta)
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // "beta != 0" written with </> to avoid float-equality warnings.
 if (beta < 0 || beta > 0) // multiply by beta
 viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
 else
 y.clear();

 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 //std::cout << "prod(coordinate_matrix" << AlignmentV << ", vector) called with internal_nnz=" << A.internal_nnz() << std::endl;

 unsigned int thread_num = 128; //k.local_work_size(0);

 k.local_work_size(0, thread_num);

 k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
 //k.global_work_size(0, thread_num); //Only one work group
 viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 alpha,
 viennacl::traits::opencl_handle(y),
 layout_y,
 beta,
 viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
 viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );

}
776 
777 
// Coordinate (COO) matrix times dense matrix product; result y is cleared
// first, then accumulated by the kernel.
// NOTE(review): signature lines, the kernel lookup for 'k', the dense
// internal_size arguments (orig. 808, 813), and the trailing local_mem
// argument (orig. 815) are missing from this Doxygen extraction; code kept
// byte-identical.
786 template<typename NumericT, unsigned int AlignmentV>
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());


 y.clear();

 unsigned int thread_num = 128; //k.local_work_size(0);
 k.local_work_size(0, thread_num);
 k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

 viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
 viennacl::traits::opencl_handle(d_A),
 cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
 cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
 cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
 viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),

}
818 
// Coordinate (COO) matrix times TRANSPOSED dense matrix product; kernel
// chosen via detail::sparse_dense_matmult_kernel_name(true, ...).
// NOTE(review): signature lines and the kernel lookup for 'k' are missing
// from this Doxygen extraction; code kept byte-identical.
// NOTE(review): the handle passed below is opencl_handle(d_A), while all the
// start/stride/size descriptors use d_A.lhs() -- verify against upstream
// whether this should be opencl_handle(d_A.lhs()).
827 template<typename NumericT, unsigned int AlignmentV>
 viennacl::op_trans > const & d_A,
{
 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));

 y.clear();

 unsigned int thread_num = 128; //k.local_work_size(0);
 k.local_work_size(0, thread_num);
 k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

 viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
 viennacl::traits::opencl_handle(d_A),
 cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
 cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
 cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
 cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
 viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),

}
861 
862 
863 //
864 // ELL Matrix
865 //
866 
// Matrix-vector product for ell_matrix (ELLPACK layout): 128 threads x 256
// groups; kernel "vec_mul" or "vec_mul_alpha_beta" depending on whether
// alpha/beta differ from the plain y = A*x case.
// NOTE(review): signature lines and the layout_x / layout_y declarations are
// missing from this Doxygen extraction; code kept byte-identical.
// NOTE(review): the stringstream 'ss' below is built but never used -- the
// kernel name comes from the get_kernel() call instead; dead code retained.
867 template<typename NumericT, unsigned int AlignmentV>
 NumericT alpha,
 NumericT beta)
{
 assert(A.size1() == y.size());
 assert(A.size2() == x.size());

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // "alpha != 1 || beta != 0" written with </> to avoid float-equality warnings.
 bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);

 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 std::stringstream ss;
 ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
 viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");

 unsigned int thread_num = 128;
 unsigned int group_num = 256;

 k.local_work_size(0, thread_num);
 k.global_work_size(0, thread_num * group_num);

 // handle2: column indices; handle(): values (ELL storage).
 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
 A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 alpha,
 viennacl::traits::opencl_handle(y),
 layout_y,
 beta,
 cl_uint(A.size1()),
 cl_uint(A.size2()),
 cl_uint(A.internal_size1()),
 cl_uint(A.maxnnz()),
 cl_uint(A.internal_maxnnz())
 )
 );
 else
 viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
 A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 viennacl::traits::opencl_handle(y),
 layout_y,
 cl_uint(A.size1()),
 cl_uint(A.size2()),
 cl_uint(A.internal_size1()),
 cl_uint(A.maxnnz()),
 cl_uint(A.internal_maxnnz())
 )
 );


}
937 
// ELL matrix times dense matrix product.
// NOTE(review): signature lines, the kernel lookup for 'k', and the dense
// internal_size arguments (orig. 973, 978) are missing from this Doxygen
// extraction; code kept byte-identical.
947 template<typename NumericT, unsigned int AlignmentV>

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());

 //unsigned int thread_num = 128;
 //unsigned int group_num = 256;
 //
 //k.local_work_size(0, thread_num);
 //k.global_work_size(0, thread_num * group_num);

 viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
 cl_uint(sp_A.size1()),
 cl_uint(sp_A.size2()),
 cl_uint(sp_A.internal_size1()),
 cl_uint(sp_A.maxnnz()),
 cl_uint(sp_A.internal_maxnnz()),
 viennacl::traits::opencl_handle(d_A),
 cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
 cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
 cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
 )
 );
}
982 
// ELL matrix times TRANSPOSED dense matrix product; kernel chosen via
// detail::sparse_dense_matmult_kernel_name(true, ...).
// NOTE(review): signature lines and the 'ctx.get_kernel(...' part of the
// lookup are missing from this Doxygen extraction; code kept byte-identical.
992 template<typename NumericT, unsigned int AlignmentV>
 viennacl::op_trans > const & d_A,

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
 detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));

 //unsigned int thread_num = 128;
 //unsigned int group_num = 256;
 //
 //k.local_work_size(0, thread_num);
 //k.global_work_size(0, thread_num * group_num);

 viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
 cl_uint(sp_A.size1()),
 cl_uint(sp_A.size2()),
 cl_uint(sp_A.internal_size1()),
 cl_uint(sp_A.maxnnz()),
 cl_uint(sp_A.internal_maxnnz()),
 viennacl::traits::opencl_handle(d_A.lhs()),
 cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
 cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
 cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
 cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
 viennacl::traits::opencl_handle(y),
 cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
 cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
 cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
 )
 );
}
1029 
1030 //
1031 // SELL-C-\sigma Matrix
1032 //
1033 
// Matrix-vector product for sliced_ell_matrix (SELL-C-sigma layout).
// NOTE(review): signature lines and the layout_x / layout_y declarations are
// missing from this Doxygen extraction; code kept byte-identical. The
// stringstream 'ss' is built but never used (kernel name comes from
// get_kernel()); dead code retained.
1034 template<typename ScalarT, typename IndexT>
 ScalarT alpha,
 ScalarT beta)
{
 assert(A.size1() == y.size());
 assert(A.size2() == x.size());

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // "alpha != 1 || beta != 0" written with </> to avoid float-equality warnings.
 bool with_alpha_beta = (alpha < ScalarT(1) || alpha > ScalarT(1)) || (beta < 0 || beta > 0);

 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 std::stringstream ss;
 ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
 viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::sliced_ell_matrix<ScalarT, IndexT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");

 // Work-group size: at least 128, at least rows_per_block; 256 on NVIDIA.
 vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
 unsigned int group_num = 256;

 if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
 thread_num = 256;

 k.local_work_size(0, thread_num);
 k.global_work_size(0, thread_num * group_num);

 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
 A.handle2().opencl_handle(),
 A.handle3().opencl_handle(),
 A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 alpha,
 viennacl::traits::opencl_handle(y),
 layout_y,
 beta,
 cl_uint(A.rows_per_block()))
 );
 else
 viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
 A.handle2().opencl_handle(),
 A.handle3().opencl_handle(),
 A.handle().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 viennacl::traits::opencl_handle(y),
 layout_y,
 cl_uint(A.rows_per_block()))
 );
}
1099 
1100 
1101 //
1102 // Hybrid Matrix
1103 //
1104 
// Matrix-vector product for hyb_matrix (hybrid ELL + CSR overflow storage):
// handle2/handle() carry the ELL part, handle3/handle4/handle5 the CSR
// overflow part.
// NOTE(review): signature lines and the layout_x / layout_y declarations are
// missing from this Doxygen extraction; code kept byte-identical.
1105 template<typename NumericT, unsigned int AlignmentV>
 NumericT alpha,
 NumericT beta)
{
 assert(A.size1() == y.size());
 assert(A.size2() == x.size());

 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

 // "alpha != 1 || beta != 0" written with </> to avoid float-equality warnings.
 bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);

 layout_x.start = cl_uint(viennacl::traits::start(x));
 layout_x.stride = cl_uint(viennacl::traits::stride(x));
 layout_x.size = cl_uint(viennacl::traits::size(x));
 layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));

 layout_y.start = cl_uint(viennacl::traits::start(y));
 layout_y.stride = cl_uint(viennacl::traits::stride(y));
 layout_y.size = cl_uint(viennacl::traits::size(y));
 layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));

 viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");

 if (with_alpha_beta)
 viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
 A.handle().opencl_handle(),
 A.handle3().opencl_handle(),
 A.handle4().opencl_handle(),
 A.handle5().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 alpha,
 viennacl::traits::opencl_handle(y),
 layout_y,
 beta,
 cl_uint(A.size1()),
 cl_uint(A.internal_size1()),
 cl_uint(A.ell_nnz()),
 cl_uint(A.internal_ellnnz())
 )
 );
 else
 viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
 A.handle().opencl_handle(),
 A.handle3().opencl_handle(),
 A.handle4().opencl_handle(),
 A.handle5().opencl_handle(),
 viennacl::traits::opencl_handle(x),
 layout_x,
 viennacl::traits::opencl_handle(y),
 layout_y,
 cl_uint(A.size1()),
 cl_uint(A.internal_size1()),
 cl_uint(A.ell_nnz()),
 cl_uint(A.internal_ellnnz())
 )
 );
}
1169 
// Sparse-times-dense matrix product with a hybrid (ELL + CSR) left operand:
// enqueues one OpenCL kernel computing the dense result y from A and d_A.
// NOTE(review): the extraction lost the full signature (Doxygen lines 1171,
// 1173 -- presumably hyb_matrix<NumericT, AlignmentV> const & A and
// matrix_base<NumericT> & y), the kernel-program init, and the
// ctx.get_kernel(...) statement that defines `k` (lines 1176-1178), as well
// as the internal_size1/internal_size2 argument lines for d_A and y
// (lines 1193, 1198) -- confirm against the original ViennaCL header.
1170 template<typename NumericT, unsigned int AlignmentV>
1172  viennacl::matrix_base<NumericT> const & d_A,
1174 {
1175  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
1179 
      // Arguments: the five hyb_matrix buffers, the matrix geometry (rows,
      // padded rows, ELL nonzeros, padded ELL nonzeros), then
      // start/stride/size layout metadata for the dense factor d_A and the
      // destination y.
1180  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
1181  A.handle().opencl_handle(),
1182  A.handle3().opencl_handle(),
1183  A.handle4().opencl_handle(),
1184  A.handle5().opencl_handle(),
1185  cl_uint(A.size1()),
1186  cl_uint(A.internal_size1()),
1187  cl_uint(A.ell_nnz()),
1188  cl_uint(A.internal_ellnnz()),
1189  viennacl::traits::opencl_handle(d_A),
1190  cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
1191  cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
1192  cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
1194  viennacl::traits::opencl_handle(y),
1195  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
1196  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
1197  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
1199  )
1200  );
1201 }
1202 
// Sparse-times-transposed-dense matrix product: same as the overload above,
// but the dense factor is a matrix_expression wrapping trans(B); the kernel
// name is selected via sparse_dense_matmult_kernel_name with B_transposed ==
// true and the row-major flags of d_A.lhs() and y, and all layout metadata is
// taken from the untransposed operand d_A.lhs().
// NOTE(review): the extraction lost parts of the signature (Doxygen lines
// 1204-1206, 1208 -- presumably hyb_matrix<NumericT, AlignmentV> const & A,
// the matrix_expression operand types, and matrix_base<NumericT> & y) and the
// first part of the ctx.get_kernel(...) statement defining `k`
// (lines 1211-1212), plus the internal_size argument line for y (line 1233)
// -- confirm against the original ViennaCL header.
1203 template<typename NumericT, unsigned int AlignmentV>
1207  viennacl::op_trans > const & d_A,
1209 {
1210  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
1213  detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
1214 
      // Arguments mirror the non-transposed overload, but use d_A.lhs() for
      // buffer, start/stride/size and internal sizes of the dense factor.
1215  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
1216  A.handle().opencl_handle(),
1217  A.handle3().opencl_handle(),
1218  A.handle4().opencl_handle(),
1219  A.handle5().opencl_handle(),
1220  cl_uint(A.size1()),
1221  cl_uint(A.internal_size1()),
1222  cl_uint(A.ell_nnz()),
1223  cl_uint(A.internal_ellnnz()),
1224  viennacl::traits::opencl_handle(d_A.lhs()),
1225  cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
1226  cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
1227  cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
1228  cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
1229  viennacl::traits::opencl_handle(y),
1230  cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
1231  cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
1232  cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
1234  )
1235  );
1236 }
1237 
1238 
1239 } // namespace opencl
1240 } //namespace linalg
1241 } //namespace viennacl
1242 
1243 
1244 #endif
const vcl_size_t & size2() const
Returns the number of columns.
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:406
cl_uint stride
Increment between integers.
Definition: kernel.hpp:50
static void init(viennacl::ocl::context &ctx)
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:112
Helper class implementing an array on the host. Default case: No conversion necessary.
Definition: util.hpp:92
Helper class for packing four cl_uint numbers into a uint4 type for access inside an OpenCL kernel...
Definition: kernel.hpp:45
void memory_write(mem_handle &dst_buffer, vcl_size_t dst_offset, vcl_size_t bytes_to_write, const void *ptr, bool async=false)
Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'.
Definition: memory.hpp:220
handle_type & handle2()
Definition: ell_matrix.hpp:103
Represents an OpenCL device within ViennaCL.
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
Implementations of NMF operations using OpenCL.
void prod_impl(const matrix_base< NumericT > &mat, bool trans_A, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
cl_uint start
Starting value of the integer stride.
Definition: kernel.hpp:48
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:386
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Definition: common.hpp:49
A tag class representing a lower triangular matrix.
Definition: forwards.h:849
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
Main kernel class for generating OpenCL kernels for coordinate_matrix.
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:394
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:341
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM.
Definition: memory.hpp:261
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
cl_uint vendor_id() const
A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID...
Definition: device.hpp:917
vcl_size_t internal_size(vector_base< NumericT > const &vec)
Helper routine for obtaining the buffer length of a ViennaCL vector.
Definition: size.hpp:375
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
vcl_size_t rows_per_block() const
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
cl_uint internal_size
Internal length of the buffer. Might be larger than 'size' due to padding.
Definition: kernel.hpp:54
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
Common implementations shared by OpenCL-based operations.
const vcl_size_t & nnz() const
Returns the number of nonzero entries.
float NumericT
Definition: bisect.cpp:40
Main kernel class for generating OpenCL kernels for ell_matrix.
Definition: ell_matrix.hpp:181
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33
Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers).
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:605
OpenCL kernel file for compressed_matrix operations.
A tag class representing an upper triangular matrix.
Definition: forwards.h:854
OpenCL kernel file for ell_matrix operations.
Sparse matrix class using the sliced ELLPACK format with parameters C, sigma.
Definition: forwards.h:403
void clear()
Resets all entries to zero.
Definition: matrix.hpp:634
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
Implementation of a smart-pointer-like class for handling OpenCL handles.
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows c...
void av(vector_base< T > &vec1, vector_base< T > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
vcl_size_t size2() const
Definition: hyb_matrix.hpp:99
std::size_t vcl_size_t
Definition: forwards.h:75
Main kernel class for triangular solver OpenCL kernels for compressed_matrix.
Main kernel class for generating OpenCL kernels for ell_matrix.
Common routines for single-threaded or OpenMP-enabled execution on CPU.
OpenCL kernel file for sliced_ell_matrix operations.
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
OpenCL kernel file for hyb_matrix operations.
handle_type & handle()
Definition: ell_matrix.hpp:100
void reserve(vcl_size_t new_nonzeros, bool preserve=true)
Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved...
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT)
Direct inplace solver for dense triangular systems. Matlab notation: A \ B.
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:875
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Representation of an OpenCL kernel in ViennaCL.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:248
static void init(viennacl::ocl::context &ctx)
OpenCL kernel file for vector operations.
void set(vcl_size_t index, U value)
Definition: util.hpp:115
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:859
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
OpenCL kernel file for coordinate_matrix operations.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
Definition: mem_handle.hpp:89
A tag class representing transposed matrices.
Definition: forwards.h:220
vcl_size_t raw_size() const
Returns the number of bytes of the currently active buffer.
Definition: mem_handle.hpp:230
A sparse square matrix in compressed sparse rows format.
void exclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an exclusive scan.
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &x, viennacl::linalg::unit_lower_tag)
static void init(viennacl::ocl::context &ctx)
Definition: hyb_matrix.hpp:208
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve=true)
Resize the matrix.
const handle_type & handle() const
Returns the memory handle.
Definition: vector_def.hpp:128
static void init(viennacl::ocl::context &ctx)
Definition: ell_matrix.hpp:188
size_t max_work_group_size() const
Maximum number of work-items in a work-group executing a kernel using the data parallel execution mod...
Definition: device.hpp:483
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:864
Main kernel class for generating OpenCL kernels for compressed_compressed_matrix. ...
cl_uint size
Number of values in the stride.
Definition: kernel.hpp:52
Main kernel class for generating OpenCL kernels for hyb_matrix.
Definition: hyb_matrix.hpp:201
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
void switch_memory_context(viennacl::context new_ctx)
Definition: vector.hpp:1064
void row_info(compressed_matrix< NumericT, AlignmentV > const &A, vector_base< NumericT > &x, viennacl::linalg::detail::row_info_types info_selector)