ViennaCL - The Vienna Computing Library 1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/cuda/vector_operations.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
 #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
//#else
// #include "viennacl/linalg/cuda/spgemm.hpp"
//#endif

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

namespace detail
{

  template<typename NumericT>
  __global__ void csr_row_info_extractor_kernel(
            const unsigned int * row_indices,
            const unsigned int * column_indices,
            const NumericT * elements,
            NumericT * result,
            unsigned int size,
            unsigned int option)
  {
    // one thread per row, grid-stride loop over all rows:
    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                      row  < size;
                      row += gridDim.x * blockDim.x)
    {
      NumericT value = 0;
      unsigned int row_end = row_indices[row+1];

      switch (option)
      {
        case 0: //inf-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value = max(value, fabs(elements[i]));
          break;

        case 1: //1-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += fabs(elements[i]);
          break;

        case 2: //2-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += elements[i] * elements[i];
          value = sqrt(value);
          break;

        case 3: //diagonal entry
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
          {
            if (column_indices[i] == row)
            {
              value = elements[i];
              break;
            }
          }
          break;

        default:
          break;
      }
      result[row] = value;
    }
  }


  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.size1()),
                                                static_cast<unsigned int>(info_selector)
                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
  }

  struct spmv_pure
  {
    template<typename NumericT>
    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = Ax; }
  };

  struct spmv_alpha_beta
  {
    template<typename NumericT>
    __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = alpha * Ax + ((beta != 0) ? beta * result : 0); }
  };

} //namespace detail
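
// Note on the (beta != 0) guard in detail::spmv_alpha_beta::apply(): when beta is zero,
// the destination entry may hold uninitialized data (possibly NaN or Inf), and IEEE 754
// gives 0 * NaN == NaN, so "beta * result" must be skipped rather than multiplied out.
// A minimal host-side sketch of the same update rule (hypothetical helper, not part of ViennaCL):
//
//   double spmv_update(double result, double alpha, double Ax, double beta)
//   {
//     return alpha * Ax + ((beta != 0) ? beta * result : 0);  // avoids 0 * NaN
//   }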


template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  __shared__ NumericT shared_elements[512];

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start = blockIdx.x * block_increment;
  const unsigned int block_stop  = min(block_start + block_increment, size_result);

  for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
                    row < block_stop;
                    row += blockDim.x / SubWarpSizeV)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];

    // butterfly (XOR) reduction within each sub-warp:
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    if (id_in_row == 0)
      AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
  }
}
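
// The five conditional additions above form a butterfly (XOR) reduction: after the k-th
// step each thread's shared_elements entry holds the partial sum over its 2^(k+1)-thread
// group, so after log2(SubWarpSizeV) steps every thread of a sub-warp holds the full row sum.
// A scalar sketch of one sub-warp of size W (hypothetical host-side illustration):
//
//   for (unsigned int offset = 1; offset < W; offset *= 2)
//   {
//     for (unsigned int t = 0; t < W; ++t)
//       tmp[t] = vals[t] + vals[t ^ offset];  // all reads precede all writes
//     std::swap(vals, tmp);
//   }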


template<typename AlphaBetaHandlerT, typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,
          const NumericT * elements,
          unsigned int num_blocks,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream with one thread per row
    {
      // load to shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // use one thread per row to sum:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
      }
    }
    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
    else  // CSR vector for a single row
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain final result
      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
      }

      if (threadIdx.x == 0)
        AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
    }

    __syncthreads();  // avoid race conditions
  }
}
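
// row_blocks (handle3 of compressed_matrix) partitions the rows so that each block either
// covers several short rows whose nonzeros fit into the 1024-entry shared buffer (CSR-stream
// case above) or exactly one long row (CSR-vector case). A host-side construction sketch
// under that assumption (hypothetical names; the actual blocking is computed when the
// compressed_matrix is set up, not in this file):
//
//   std::vector<unsigned int> row_blocks(1, 0);
//   unsigned int nnz_in_block = 0;
//   for (unsigned int row = 0; row < num_rows; ++row)
//   {
//     unsigned int nnz_in_row = row_ptr[row+1] - row_ptr[row];
//     if (nnz_in_block + nnz_in_row > 1024)       // shared buffer capacity exceeded
//     {
//       if (nnz_in_block > 0)
//         row_blocks.push_back(row);              // close the streamed block before this row
//       nnz_in_block = 0;
//       if (nnz_in_row > 1024)                    // long row: gets a block of its own
//       {
//         row_blocks.push_back(row + 1);
//         continue;
//       }
//     }
//     nnz_in_block += nnz_in_row;
//   }
//   if (row_blocks.back() != num_rows)
//     row_blocks.push_back(num_rows);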




// Carries out sparse-matrix-vector multiplication: result = alpha * mat * vec + beta * result
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  static bool first = true;
  static bool is_maxwell = false;

  // Check whether the CUDA device is from the Maxwell family (compute capability 5.x) or newer.
  // Only run once, because the query to the backend takes about the same time as a kernel launch (~15us), thus being too expensive to query each time.
  //
  // Note: This might result in non-optimal kernels being selected if multiple Maxwell- and non-Maxwell GPUs are available in the system and devices are switched at runtime.
  //       However, this situation is certainly rare, hence the benefits of this singleton outweigh the disadvantages encountered in such a corner case.
  if (first)
  {
    cudaDeviceProp prop;
    int device_index = 0;

    cudaError_t err_flag = cudaGetDevice(&device_index);
    if (err_flag == cudaSuccess)
    {
      err_flag = cudaGetDeviceProperties(&prop, device_index);
      if (err_flag == cudaSuccess && prop.major >= 5)
        is_maxwell = true;
    }
    first = false;
  }

  if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4)  // less than 10% of threads expected to idle
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0, written without floating-point equality comparisons
      compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(  // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    else
      compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>(  // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0)  // less than 25% of threads expected to idle
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
      compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>(  // Fermi and Kepler prefer 16 threads per row (half-warp)
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    else
      compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>(  // Fermi and Kepler prefer 16 threads per row (half-warp)
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else  // matrices with few nonzeros per row: use the CSR-adaptive kernel
  {
    if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)  // alpha != 1 or beta != 0
      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<unsigned int>(mat.handle3()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          static_cast<unsigned int>(mat.blocks1()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    else
      compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(
          viennacl::cuda_arg<unsigned int>(mat.handle1()),
          viennacl::cuda_arg<unsigned int>(mat.handle2()),
          viennacl::cuda_arg<unsigned int>(mat.handle3()),
          viennacl::cuda_arg<NumericT>(mat.handle()),
          static_cast<unsigned int>(mat.blocks1()),
          viennacl::cuda_arg(vec),
          static_cast<unsigned int>(vec.start()),
          static_cast<unsigned int>(vec.stride()),
          alpha,
          viennacl::cuda_arg(result),
          static_cast<unsigned int>(result.start()),
          static_cast<unsigned int>(result.stride()),
          static_cast<unsigned int>(result.size()),
          beta
         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
  }
}
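
// Minimal usage sketch (assumes the public ViennaCL API and a CUDA-enabled build,
// i.e. VIENNACL_WITH_CUDA; m, n are hypothetical dimensions):
//
//   viennacl::compressed_matrix<float> A(m, n);
//   viennacl::vector<float> x(n), y(m);
//   y = viennacl::linalg::prod(A, x);  // dispatched to the prod_impl() above on the CUDA backend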

// Helper for translating a logical (i, j) index into the linear memory index of a
// (possibly strided) dense matrix. The generic template covers the row-major case:
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

// Specialization for column-major storage:
template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
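
// Example: for a matrix with internal (padded) size m_int x n_int, zero offsets and unit strides,
// mat_mult_matrix_index<viennacl::row_major>::apply(i, j, 0, 1, 0, 1, m_int, n_int)
// evaluates to i * n_int + j, while the column-major specialization yields i + j * m_int.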

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  // one block per row of the sparse matrix, one thread per column of the result:
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}


// Carries out sparse-matrix * dense-matrix multiplication: result = sp_mat * d_mat
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
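
// Minimal usage sketch for the sparse-times-dense product (assumes the public ViennaCL API;
// m, k, n are hypothetical dimensions):
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);  // dispatched to the prod_impl() above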


template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(col, j,    // transposed access to d_mat
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}

// Carries out sparse-matrix * transposed dense-matrix multiplication: result = sp_mat * trans(d_mat)
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{

  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),

       viennacl::cuda_arg(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       viennacl::cuda_arg(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}


//
// triangular solves for compressed_matrix
//

template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}


// Inplace solve of L x = b for a unit lower triangular sparse matrix L (unit diagonal assumed).
// vec holds the right-hand side b and is overwritten with the solution x.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                         viennacl::cuda_arg<NumericT>(mat.handle()),
                                         viennacl::cuda_arg(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}


// Inplace solve of L x = b for a lower triangular sparse matrix L (diagonal taken from the matrix).
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                    viennacl::cuda_arg<NumericT>(mat.handle()),
                                    viennacl::cuda_arg(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}



// Inplace solve of U x = b for a unit upper triangular sparse matrix U (unit diagonal assumed).
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}


// Inplace solve of U x = b for an upper triangular sparse matrix U (diagonal taken from the matrix).
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                     viennacl::cuda_arg(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
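
// Minimal usage sketch for the triangular solves above (assumes the public ViennaCL API;
// A is a hypothetical n x n matrix holding the respective triangular data):
//
//   viennacl::compressed_matrix<double> A(n, n);
//   viennacl::vector<double> b(n);
//   viennacl::linalg::inplace_solve(A, b, viennacl::linalg::lower_tag());  // b <- A^{-1} b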



// transposed

// Inplace solve of trans(L) x = b for a unit lower triangular sparse matrix L.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                               viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                               viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                               viennacl::cuda_arg(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}


// Inplace solve of trans(L) x = b for a lower triangular sparse matrix L;
// the diagonal is extracted into a temporary vector first.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                          viennacl::cuda_arg(diagonal),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}


// Inplace solve of trans(U) x = b for a unit upper triangular sparse matrix U.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}


// Inplace solve of trans(U) x = b for an upper triangular sparse matrix U;
// the diagonal is extracted into a temporary vector first.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                           viennacl::cuda_arg(diagonal),
                                           viennacl::cuda_arg(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}

namespace detail
{
  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
                                                         viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
                                                         viennacl::cuda_arg<NumericT>(L.lhs().handle()),
                                                         viennacl::cuda_arg<unsigned int>(block_indices),
                                                         viennacl::cuda_arg(vec),
                                                         static_cast<unsigned int>(L.lhs().size1())
                                                        );
  }


  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
                                                     viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
                                                     viennacl::cuda_arg<NumericT>(U.lhs().handle()),
                                                     viennacl::cuda_arg(U_diagonal),
                                                     viennacl::cuda_arg<unsigned int>(block_indices),
                                                     viennacl::cuda_arg(vec),
                                                     static_cast<unsigned int>(U.lhs().size1())
                                                    );
  }


} //namespace detail


//
// Compressed Compressed Matrix
//

template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT alpha,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result,
          NumericT beta)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];

    unsigned int index = row_indices[i] * inc_result + start_result;
    if (beta != 0) result[index] += alpha * dot_prod;
    else           result[index]  = alpha * dot_prod;
  }
}
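
// Note: the kernel above writes only the rows listed in row_indices (the rows that carry
// nonzeros), so prod_impl() below must first bring all of result to beta * result,
// either by scaling with av() or by clearing it when beta == 0.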

// Carries out matrix-vector multiplication with a compressed_compressed_matrix,
// i.e. a CSR matrix storing only its nonzero rows: result = alpha * mat * vec + beta * result
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  if (beta < 0 || beta > 0)
    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
  else
    result.clear();

  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            alpha,
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size()),
                                                            beta
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}


//
// Coordinate Matrix
//


namespace detail
{

  template<typename NumericT>
  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
                                          const NumericT * elements,
                                          const unsigned int * group_boundaries,
                                          NumericT * result,
                                          unsigned int option)
  {
    __shared__ unsigned int shared_rows[128];
    __shared__ NumericT inter_results[128];

    uint2 tmp;
    NumericT val;
    unsigned int last_index  = blockDim.x - 1;
    unsigned int group_start = group_boundaries[blockIdx.x];
    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

    unsigned int local_index = 0;

    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[last_index])
        {
          switch (option)
          {
            case 0: //inf-norm
            case 3: //diagonal entry
              val = max(val, fabs(inter_results[last_index]));
              break;

            case 1: //1-norm
              val = fabs(val) + inter_results[last_index];
              break;

            case 2: //2-norm
              val = sqrt(val * val + inter_results[last_index]);
              break;

            default:
              break;
          }
        }
        else
        {
          switch (option)
          {
            case 0: //inf-norm
            case 1: //1-norm
            case 3: //diagonal entry
              result[shared_rows[last_index]] = inter_results[last_index];
              break;

            case 2: //2-norm
              result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
            default:
              break;
          }
        }
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      switch (option)
      {
        case 0:
        case 3:
          inter_results[threadIdx.x] = val;
          break;
        case 1:
          inter_results[threadIdx.x] = fabs(val);
          break;
        case 2:
          inter_results[threadIdx.x] = val * val;
        default:
          break;
      }
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        switch (option)
        {
          case 0: //inf-norm
          case 3: //diagonal entry
            inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
            break;

          case 1: //1-norm
            inter_results[threadIdx.x] += left;
            break;

          case 2: //2-norm
            inter_results[threadIdx.x] += left;
            break;

          default:
            break;
        }
        __syncthreads();
      }
      //segmented parallel reduction end

      if (threadIdx.x != last_index &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
          inter_results[threadIdx.x] != 0)
      {
        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
  }

  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                        viennacl::cuda_arg<NumericT>(mat.handle()),
                                        viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                        viennacl::cuda_arg(vec),
                                        static_cast<unsigned int>(info_selector)
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
  }

} //namespace detail


template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT alpha,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 NumericT beta)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else if (beta != 0)
        result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result]  = alpha * inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
      else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end) {
    if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
    else           result[tmp.x * inc_result + start_result]  = alpha * inter_results[threadIdx.x];
  }
}
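
// The stride loop above is a segmented parallel reduction: partial sums are only combined
// across threads whose entries belong to the same row (matching shared_rows). A scalar
// sketch of one block of n entries (hypothetical host-side illustration):
//
//   for (unsigned int stride = 1; stride < n; stride *= 2)
//     for (unsigned int t = n; t-- > stride; )       // downward, so reads see pre-sweep values
//       if (rows[t] == rows[t - stride])
//         vals[t] += vals[t - stride];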


// Carries out matrix-vector multiplication with a coordinate_matrix: result = alpha * mat * vec + beta * result
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               NumericT alpha,
               viennacl::vector_base<NumericT> & result,
               NumericT beta)
{
  // the kernel only touches rows that actually carry entries, so pre-scale (or clear) result:
  if (beta < 0 || beta > 0)
    viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
  else
    result.clear();

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                alpha,
                                                viennacl::cuda_arg(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride()),
                                                beta
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}




template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}


// Carries out sparse-matrix * dense-matrix multiplication for a coordinate_matrix: result = sp_mat * d_mat
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
1459  if (d_mat.row_major() && result.row_major())
1460  {
1461  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1462  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1463  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1464  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1465 
1466  viennacl::cuda_arg(d_mat),
1467  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1468  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1469  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1470  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1471 
1472  viennacl::cuda_arg(result),
1473  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1474  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1475  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1476  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1477  );
1478  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
1479  }
1480  else if (d_mat.row_major() && !result.row_major())
1481  {
1482  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1483  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1484  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1485  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1486 
1487  viennacl::cuda_arg(d_mat),
1488  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1489  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1490  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1491  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1492 
1493  viennacl::cuda_arg(result),
1494  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1495  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1496  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1497  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1498  );
1499  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
1500  }
1501  else if (!d_mat.row_major() && result.row_major())
1502  {
1503  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1504  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1505  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1506  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1507 
1508  viennacl::cuda_arg(d_mat),
1509  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1510  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1511  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1512  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1513 
1514  viennacl::cuda_arg(result),
1515  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1516  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1517  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1518  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1519  );
1520  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
1521  }
1522  else
1523  {
1524  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1525  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1526  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1527  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1528 
1529  viennacl::cuda_arg(d_mat),
1530  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1531  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1532  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1533  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1534 
1535  viennacl::cuda_arg(result),
1536  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1537  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1538  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1539  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1540  );
1541  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
1542  }
1543 
1544 }
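
For orientation: this dispatch backs the convenience expression C = viennacl::linalg::prod(A, B) for a coordinate_matrix A and a dense matrix B; the four branches only select the template instantiation matching the row-/column-major layouts of B and C. A minimal host-side sketch (assuming a CUDA-enabled ViennaCL build; the sizes and the fill of A are illustrative):

#include "viennacl/coordinate_matrix.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

void coo_times_dense_example()
{
  viennacl::coordinate_matrix<float> A(1000, 1000);  // COO operand, filled elsewhere
  viennacl::matrix<float> B(1000, 16);               // dense right-hand side
  viennacl::matrix<float> C(1000, 16);               // dense result

  C = viennacl::linalg::prod(A, B);  // ends up in coordinate_matrix_d_mat_mul_kernel
}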
1545 
1546 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1547 __global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
1548  const NumericT * elements,
1549  const unsigned int * group_boundaries,
1550  const NumericT * d_mat,
1551  unsigned int d_mat_row_start,
1552  unsigned int d_mat_col_start,
1553  unsigned int d_mat_row_inc,
1554  unsigned int d_mat_col_inc,
1555  unsigned int d_mat_row_size,
1556  unsigned int d_mat_col_size,
1557  unsigned int d_mat_internal_rows,
1558  unsigned int d_mat_internal_cols,
1559  NumericT * result,
1560  unsigned int result_row_start,
1561  unsigned int result_col_start,
1562  unsigned int result_row_inc,
1563  unsigned int result_col_inc,
1564  unsigned int result_row_size,
1565  unsigned int result_col_size,
1566  unsigned int result_internal_rows,
1567  unsigned int result_internal_cols)
1568 {
1569  __shared__ unsigned int shared_rows[128];
1570  __shared__ NumericT inter_results[128];
1571 
1572  uint2 tmp;
1573  NumericT val;
1574  unsigned int group_start = group_boundaries[blockIdx.x];
1575  unsigned int group_end = group_boundaries[blockIdx.x + 1];
1576  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
1577 
1578  unsigned int local_index = 0;
1579 
1580  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1581  {
1582  for (unsigned int k = 0; k < k_end; ++k)
1583  {
1584  local_index = group_start + k * blockDim.x + threadIdx.x;
1585 
1586  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1587  val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
1588  d_mat_row_start, d_mat_row_inc,
1589  d_mat_col_start, d_mat_col_inc,
1590  d_mat_internal_rows, d_mat_internal_cols)] : 0;
1591 
1592  //check for carry from previous loop run:
1593  if (threadIdx.x == 0 && k > 0)
1594  {
1595  if (tmp.x == shared_rows[blockDim.x-1])
1596  val += inter_results[blockDim.x-1];
1597  else
1598  result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1599  result_row_start, result_row_inc,
1600  result_col_start, result_col_inc,
1601  result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
1602  }
1603 
1604  //segmented parallel reduction begin
1605  __syncthreads();
1606  shared_rows[threadIdx.x] = tmp.x;
1607  inter_results[threadIdx.x] = val;
1608  NumericT left = 0;
1609  __syncthreads();
1610 
1611  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
1612  {
1613  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
1614  __syncthreads();
1615  inter_results[threadIdx.x] += left;
1616  __syncthreads();
1617  }
1618  //segmented parallel reduction end
1619 
1620  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1621  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1622  {
1623  result[ ResultIndexT::apply(tmp.x, result_col,
1624  result_row_start, result_row_inc,
1625  result_col_start, result_col_inc,
1626  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
1627  }
1628 
1629  __syncthreads();
1630  } //for k
1631 
1632  if (local_index + 1 == group_end)
1633  result[ ResultIndexT::apply(tmp.x, result_col,
1634  result_row_start, result_row_inc,
1635  result_col_start, result_col_inc,
1636  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
1637  }
1638 }
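
The bookkeeping in this kernel is a segmented reduction: every nonzero contributes a (row, value * B-entry) pair, consecutive pairs sharing a row index form a segment, and shared_rows/inter_results realize a parallel scan over those segments, flushing a partial sum into result whenever the row index changes (or carrying it across block-sized chunks). A sequential model of the same idea; segmented_reduce is a hypothetical helper, not part of ViennaCL:

#include <cstddef>
#include <utility>
#include <vector>

// Sequential model of the segmented reduction in the kernel above:
// a running sum is flushed whenever the row index changes.
void segmented_reduce(const std::vector<std::pair<unsigned int, float> > & entries,
                      std::vector<float> & row_sums)  // row_sums pre-sized to #rows
{
  if (entries.empty()) return;
  unsigned int current_row = entries[0].first;
  float accumulator = 0;
  for (std::size_t i = 0; i < entries.size(); ++i)
  {
    if (entries[i].first != current_row)  // segment boundary: flush
    {
      row_sums[current_row] += accumulator;
      current_row = entries[i].first;
      accumulator = 0;
    }
    accumulator += entries[i].second;
  }
  row_sums[current_row] += accumulator;   // flush the final segment
}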
1639 
1648 template<typename NumericT, unsigned int AlignmentV>
1649 void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
1650  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1651  const viennacl::matrix_base<NumericT>,
1652  viennacl::op_trans > & d_mat,
1653  viennacl::matrix_base<NumericT> & result)
1654 {
1655  if (d_mat.lhs().row_major() && result.row_major())
1656  {
1657  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1658  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1659  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1660  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1661 
1662  viennacl::cuda_arg(d_mat.lhs()),
1663  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1664  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1665  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1666  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1667 
1668  viennacl::cuda_arg(result),
1669  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1670  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1671  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1672  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1673  );
1674  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1675  }
1676  else if (d_mat.lhs().row_major() && !result.row_major())
1677  {
1678  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1679  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1680  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1681  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1682 
1683  viennacl::cuda_arg(d_mat.lhs()),
1684  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1685  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1686  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1687  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1688 
1689  viennacl::cuda_arg(result),
1690  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1691  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1692  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1693  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1694  );
1695  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1696  }
1697  else if (!d_mat.lhs().row_major() && result.row_major())
1698  {
1699  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
1700  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1701  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1702  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1703 
1704  viennacl::cuda_arg(d_mat.lhs()),
1705  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1706  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1707  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1708  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1709 
1710  viennacl::cuda_arg(result),
1711  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1712  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1713  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1714  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1715  );
1716  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1717  }
1718  else
1719  {
1720  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1721  (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
1722  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1723  viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
1724 
1725  viennacl::cuda_arg(d_mat.lhs()),
1726  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1727  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1728  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1729  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1730 
1731  viennacl::cuda_arg(result),
1732  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1733  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1734  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1735  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1736  );
1737  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1738  }
1739 }
1740 
1741 
1742 //
1743 // ELL Matrix
1744 //
1745 
1746 template<typename AlphaBetaHandlerT, typename NumericT>
1747 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1748  const NumericT * elements,
1749  const NumericT * x,
1750  unsigned int start_x,
1751  unsigned int inc_x,
1752  NumericT alpha,
1753  NumericT * result,
1754  unsigned int start_result,
1755  unsigned int inc_result,
1756  NumericT beta,
1757  unsigned int row_num,
1758  unsigned int col_num,
1759  unsigned int internal_row_num,
1760  unsigned int items_per_row,
1761  unsigned int aligned_items_per_row
1762  )
1763 {
1764  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1765  unsigned int glb_sz = gridDim.x * blockDim.x;
1766 
1767  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1768  {
1769  NumericT sum = 0;
1770 
1771  unsigned int offset = row_id;
1772  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1773  {
1774  NumericT val = elements[offset];
1775 
1776  if (val != NumericT(0))
1777  {
1778  int col = coords[offset];
1779  sum += x[col * inc_x + start_x] * val;
1780  }
1781  }
1782 
1783  AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
1784  }
1785 }
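
ELL pads every row to items_per_row entries (padding stored as zeros) and lays the slots out column-by-column over internal_row_num rows, which is why the loop advances offset by internal_row_num: consecutive threads then touch consecutive addresses. A CPU reference of the same traversal; ell_spmv_reference is a hypothetical helper for illustration:

// CPU reference of the ELL traversal above; coords/elements hold items_per_row
// slots, each internal_row_num entries long, with zero-padded rows.
void ell_spmv_reference(const unsigned int * coords, const float * elements,
                        const float * x, float * y,
                        unsigned int row_num, unsigned int internal_row_num,
                        unsigned int items_per_row)
{
  for (unsigned int row = 0; row < row_num; ++row)
  {
    float sum = 0;
    for (unsigned int item = 0; item < items_per_row; ++item)
    {
      unsigned int offset = item * internal_row_num + row;  // column-major padded storage
      float val = elements[offset];
      if (val != 0.0f)                                      // padding entries are zero
        sum += x[coords[offset]] * val;
    }
    y[row] = sum;
  }
}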
1786 
1787 
1796 template<typename NumericT, unsigned int AlignmentV>
1797 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
1798  const viennacl::vector_base<NumericT> & vec,
1799  NumericT alpha,
1800  viennacl::vector_base<NumericT> & result,
1801  NumericT beta)
1802 {
1803  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0) // i.e. alpha != 1 or beta != 0, written without == to avoid float-equality warnings
1804  ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
1805  viennacl::cuda_arg<NumericT>(mat.handle()),
1806  viennacl::cuda_arg(vec),
1807  static_cast<unsigned int>(vec.start()),
1808  static_cast<unsigned int>(vec.stride()),
1809  alpha,
1810  viennacl::cuda_arg(result),
1811  static_cast<unsigned int>(result.start()),
1812  static_cast<unsigned int>(result.stride()),
1813  beta,
1814  static_cast<unsigned int>(mat.size1()),
1815  static_cast<unsigned int>(mat.size2()),
1816  static_cast<unsigned int>(mat.internal_size1()),
1817  static_cast<unsigned int>(mat.maxnnz()),
1818  static_cast<unsigned int>(mat.internal_maxnnz())
1819  );
1820  else
1821  ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
1822  viennacl::cuda_arg<NumericT>(mat.handle()),
1823  viennacl::cuda_arg(vec),
1824  static_cast<unsigned int>(vec.start()),
1825  static_cast<unsigned int>(vec.stride()),
1826  alpha,
1827  viennacl::cuda_arg(result),
1828  static_cast<unsigned int>(result.start()),
1829  static_cast<unsigned int>(result.stride()),
1830  beta,
1831  static_cast<unsigned int>(mat.size1()),
1832  static_cast<unsigned int>(mat.size2()),
1833  static_cast<unsigned int>(mat.internal_size1()),
1834  static_cast<unsigned int>(mat.maxnnz()),
1835  static_cast<unsigned int>(mat.internal_maxnnz())
1836  );
1837  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1838 }
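
At the API level this overload sits behind viennacl::linalg::prod; with the defaults alpha == 1 and beta == 0 the guard above fails and the cheaper spmv_pure instantiation is launched. A minimal usage sketch, assuming a CUDA-enabled build and a matrix populated beforehand:

#include "viennacl/ell_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void ell_spmv_example()
{
  viennacl::ell_matrix<float> A;         // assumed populated via viennacl::copy()
  viennacl::vector<float> x(A.size2());  // input, sized to match A
  viennacl::vector<float> y(A.size1());  // output

  y = viennacl::linalg::prod(A, x);      // alpha == 1, beta == 0: spmv_pure path
}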
1839 
1840 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1841 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1842  const NumericT * sp_mat_elements,
1843  unsigned int sp_mat_row_num,
1844  unsigned int sp_mat_col_num,
1845  unsigned int sp_mat_internal_row_num,
1846  unsigned int sp_mat_items_per_row,
1847  unsigned int sp_mat_aligned_items_per_row,
1848  const NumericT * d_mat,
1849  unsigned int d_mat_row_start,
1850  unsigned int d_mat_col_start,
1851  unsigned int d_mat_row_inc,
1852  unsigned int d_mat_col_inc,
1853  unsigned int d_mat_row_size,
1854  unsigned int d_mat_col_size,
1855  unsigned int d_mat_internal_rows,
1856  unsigned int d_mat_internal_cols,
1857  NumericT * result,
1858  unsigned int result_row_start,
1859  unsigned int result_col_start,
1860  unsigned int result_row_inc,
1861  unsigned int result_col_inc,
1862  unsigned int result_row_size,
1863  unsigned int result_col_size,
1864  unsigned int result_internal_rows,
1865  unsigned int result_internal_cols)
1866 {
1867  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1868  unsigned int glb_sz = gridDim.x * blockDim.x;
1869 
1870  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1871  {
1872  unsigned int row = rc % sp_mat_row_num;
1873  unsigned int col = rc / sp_mat_row_num;
1874 
1875  unsigned int offset = row;
1876  NumericT r = (NumericT)0;
1877 
1878  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1879  {
1880  unsigned int j = sp_mat_coords[offset];
1881  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1882 
1883  if (x != (NumericT)0)
1884  {
1885  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1886  d_mat_row_start, d_mat_row_inc,
1887  d_mat_col_start, d_mat_col_inc,
1888  d_mat_internal_rows, d_mat_internal_cols) ];
1889 
1890  r += x*y;
1891  }
1892  }
1893  result [ ResultIndexT::apply(row, col,
1894  result_row_start, result_row_inc,
1895  result_col_start, result_col_inc,
1896  result_internal_rows, result_internal_cols) ] = r;
1897  }
1898 
1899 }
1900 
1910 template<typename NumericT, unsigned int AlignmentV>
1911 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1912  const viennacl::matrix_base<NumericT> & d_mat,
1913  viennacl::matrix_base<NumericT> & result)
1914 {
1915  if (d_mat.row_major() && result.row_major())
1916  {
1917  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1918  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1919  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1920  static_cast<unsigned int>(sp_mat.size1()),
1921  static_cast<unsigned int>(sp_mat.size2()),
1922  static_cast<unsigned int>(sp_mat.internal_size1()),
1923  static_cast<unsigned int>(sp_mat.maxnnz()),
1924  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1925  viennacl::cuda_arg(d_mat),
1926  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1927  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1928  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1929  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1930 
1931  viennacl::cuda_arg(result),
1932  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1933  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1934  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1935  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1936  );
1937  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1938  }
1939  else if (d_mat.row_major() && !result.row_major())
1940  {
1941  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1942  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1943  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1944  static_cast<unsigned int>(sp_mat.size1()),
1945  static_cast<unsigned int>(sp_mat.size2()),
1946  static_cast<unsigned int>(sp_mat.internal_size1()),
1947  static_cast<unsigned int>(sp_mat.maxnnz()),
1948  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1949  viennacl::cuda_arg(d_mat),
1950  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1951  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1952  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1953  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1954 
1955  viennacl::cuda_arg(result),
1956  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1957  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1958  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1959  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1960  );
1961  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1962  }
1963  else if (!d_mat.row_major() && result.row_major())
1964  {
1965  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1966  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1967  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1968  static_cast<unsigned int>(sp_mat.size1()),
1969  static_cast<unsigned int>(sp_mat.size2()),
1970  static_cast<unsigned int>(sp_mat.internal_size1()),
1971  static_cast<unsigned int>(sp_mat.maxnnz()),
1972  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1973  viennacl::cuda_arg(d_mat),
1974  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1975  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1976  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1977  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1978 
1979  viennacl::cuda_arg(result),
1980  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1981  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1982  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1983  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1984  );
1985  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1986  }
1987  else
1988  {
1989  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1990  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
1991  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
1992  static_cast<unsigned int>(sp_mat.size1()),
1993  static_cast<unsigned int>(sp_mat.size2()),
1994  static_cast<unsigned int>(sp_mat.internal_size1()),
1995  static_cast<unsigned int>(sp_mat.maxnnz()),
1996  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1997  viennacl::cuda_arg(d_mat),
1998  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1999  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2000  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2001  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2002 
2003  viennacl::cuda_arg(result),
2004  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2005  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2006  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2007  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2008  );
2009  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
2010  }
2011 }
2012 
2013 template<typename DMatIndexT, typename ResultIndexT, typename NumericT >
2014 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
2015  const NumericT * sp_mat_elements,
2016  unsigned int sp_mat_row_num,
2017  unsigned int sp_mat_col_num,
2018  unsigned int sp_mat_internal_row_num,
2019  unsigned int sp_mat_items_per_row,
2020  unsigned int sp_mat_aligned_items_per_row,
2021  const NumericT * d_mat,
2022  unsigned int d_mat_row_start,
2023  unsigned int d_mat_col_start,
2024  unsigned int d_mat_row_inc,
2025  unsigned int d_mat_col_inc,
2026  unsigned int d_mat_row_size,
2027  unsigned int d_mat_col_size,
2028  unsigned int d_mat_internal_rows,
2029  unsigned int d_mat_internal_cols,
2030  NumericT * result,
2031  unsigned int result_row_start,
2032  unsigned int result_col_start,
2033  unsigned int result_row_inc,
2034  unsigned int result_col_inc,
2035  unsigned int result_row_size,
2036  unsigned int result_col_size,
2037  unsigned int result_internal_rows,
2038  unsigned int result_internal_cols)
2039 {
2040  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2041  unsigned int glb_sz = gridDim.x * blockDim.x;
2042 
2043  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
2044  {
2045  unsigned int row = rc % sp_mat_row_num;
2046  unsigned int col = rc / sp_mat_row_num;
2047 
2048  unsigned int offset = row;
2049  NumericT r = (NumericT)0;
2050 
2051  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
2052  {
2053  unsigned int j = sp_mat_coords[offset];
2054  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
2055 
2056  if (x != (NumericT)0)
2057  {
2058  NumericT y = d_mat[ DMatIndexT::apply(col, j,
2059  d_mat_row_start, d_mat_row_inc,
2060  d_mat_col_start, d_mat_col_inc,
2061  d_mat_internal_rows, d_mat_internal_cols) ];
2062 
2063  r += x*y;
2064  }
2065  }
2066  result [ ResultIndexT::apply(row, col,
2067  result_row_start, result_row_inc,
2068  result_col_start, result_col_inc,
2069  result_internal_rows, result_internal_cols) ] = r;
2070  }
2071 
2072 }
2073 
2083 template<typename NumericT, unsigned int AlignmentV>
2084 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
2085  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2086  const viennacl::matrix_base<NumericT>,
2087  viennacl::op_trans > & d_mat,
2088  viennacl::matrix_base<NumericT> & result)
2089 {
2090  if (d_mat.lhs().row_major() && result.row_major())
2091  {
2092  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
2093  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
2094  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
2095  static_cast<unsigned int>(sp_mat.size1()),
2096  static_cast<unsigned int>(sp_mat.size2()),
2097  static_cast<unsigned int>(sp_mat.internal_size1()),
2098  static_cast<unsigned int>(sp_mat.maxnnz()),
2099  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
2100 
2101  viennacl::cuda_arg(d_mat.lhs()),
2102  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2103  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2104  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2105  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2106 
2107  viennacl::cuda_arg(result),
2108  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2109  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2110  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2111  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2112  );
2113  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2114  }
2115  else if (d_mat.lhs().row_major() && !result.row_major())
2116  {
2117  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
2118  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
2119  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
2120  static_cast<unsigned int>(sp_mat.size1()),
2121  static_cast<unsigned int>(sp_mat.size2()),
2122  static_cast<unsigned int>(sp_mat.internal_size1()),
2123  static_cast<unsigned int>(sp_mat.maxnnz()),
2124  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
2125 
2126  viennacl::cuda_arg(d_mat.lhs()),
2127  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2128  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2129  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2130  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2131 
2132  viennacl::cuda_arg(result),
2133  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2134  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2135  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2136  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2137  );
2138  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2139  }
2140  else if (!d_mat.lhs().row_major() && result.row_major())
2141  {
2142  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
2143  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
2144  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
2145  static_cast<unsigned int>(sp_mat.size1()),
2146  static_cast<unsigned int>(sp_mat.size2()),
2147  static_cast<unsigned int>(sp_mat.internal_size1()),
2148  static_cast<unsigned int>(sp_mat.maxnnz()),
2149  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
2150 
2151  viennacl::cuda_arg(d_mat.lhs()),
2152  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2153  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2154  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2155  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2156 
2157  viennacl::cuda_arg(result),
2158  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2159  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2160  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2161  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2162  );
2163  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2164  }
2165  else
2166  {
2167  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
2168  (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
2169  viennacl::cuda_arg<NumericT>(sp_mat.handle()),
2170  static_cast<unsigned int>(sp_mat.size1()),
2171  static_cast<unsigned int>(sp_mat.size2()),
2172  static_cast<unsigned int>(sp_mat.internal_size1()),
2173  static_cast<unsigned int>(sp_mat.maxnnz()),
2174  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
2175 
2176  viennacl::cuda_arg(d_mat.lhs()),
2177  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2178  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2179  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2180  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2181 
2182  viennacl::cuda_arg(result),
2183  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2184  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2185  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2186  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2187  );
2188  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
2189  }
2190 }
2191 
2192 //
2193 // SELL-C-\sigma Matrix
2194 //
2195 
2196 template<typename AlphaBetaHandlerT, typename NumericT>
2197 __global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
2198  const unsigned int * column_indices,
2199  const unsigned int * block_start,
2200  const NumericT * elements,
2201  const NumericT * x,
2202  unsigned int start_x,
2203  unsigned int inc_x,
2204  unsigned int size_x,
2205  NumericT alpha,
2206  NumericT * result,
2207  unsigned int start_result,
2208  unsigned int inc_result,
2209  unsigned int size_result,
2210  NumericT beta,
2211  unsigned int block_size)
2212 {
2213  unsigned int blocks_per_threadblock = blockDim.x / block_size;
2214  unsigned int id_in_block = threadIdx.x % block_size;
2215  unsigned int num_blocks = (size_result - 1) / block_size + 1;
2216  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
2217  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
2218 
2219  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
2220  {
2221  unsigned int row = block_idx * block_size + id_in_block;
2222  unsigned int offset = block_start[block_idx];
2223  unsigned int num_columns = columns_per_block[block_idx];
2224 
2225  NumericT sum = 0;
2226  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
2227  {
2228  unsigned int index = offset + item_id * block_size + id_in_block;
2229  NumericT val = elements[index];
2230 
2231  sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
2232  }
2233 
2234  if (row < size_result)
2235  AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, sum, beta);
2236  }
2237 }
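
In SELL-C-sigma, rows are grouped into blocks of block_size (the "C") rows, each block is padded only to its own longest row, block_start holds the offset of each block's slice, and columns_per_block its width; one group of block_size threads then walks one block, whose entries are stored interleaved (item-major, row-minor). A CPU reference of that traversal; sliced_ell_spmv_reference is a hypothetical helper:

// CPU reference of the sliced-ELL traversal above (names as in the kernel).
void sliced_ell_spmv_reference(const unsigned int * columns_per_block,
                               const unsigned int * column_indices,
                               const unsigned int * block_start,
                               const float * elements,
                               const float * x, float * y,
                               unsigned int num_rows, unsigned int block_size)
{
  unsigned int num_blocks = (num_rows - 1) / block_size + 1;
  for (unsigned int block = 0; block < num_blocks; ++block)
    for (unsigned int local_row = 0; local_row < block_size; ++local_row)
    {
      unsigned int row = block * block_size + local_row;
      if (row >= num_rows) break;                 // last block may be partially filled
      float sum = 0;
      for (unsigned int item = 0; item < columns_per_block[block]; ++item)
      {
        // entries of one block are interleaved: item-major, row-minor
        unsigned int index = block_start[block] + item * block_size + local_row;
        float val = elements[index];
        if (val != 0.0f)                          // padding entries are zero
          sum += x[column_indices[index]] * val;
      }
      y[row] = sum;
    }
}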
2238 
2247 template<typename NumericT, typename IndexT>
2248 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
2249  const viennacl::vector_base<NumericT> & vec,
2250  NumericT alpha,
2251  viennacl::vector_base<NumericT> & result,
2252  NumericT beta)
2253 {
2254  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0) // alpha != 1 or beta != 0 (same dispatch idiom as for ell_matrix)
2255  sliced_ell_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
2256  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2257  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2258  viennacl::cuda_arg<NumericT>(mat.handle()),
2259  viennacl::cuda_arg(vec),
2260  static_cast<unsigned int>(vec.start()),
2261  static_cast<unsigned int>(vec.stride()),
2262  static_cast<unsigned int>(vec.size()),
2263  alpha,
2264  viennacl::cuda_arg(result),
2265  static_cast<unsigned int>(result.start()),
2266  static_cast<unsigned int>(result.stride()),
2267  static_cast<unsigned int>(result.size()),
2268  beta,
2269  static_cast<unsigned int>(mat.rows_per_block())
2270  );
2271  else
2272  sliced_ell_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
2273  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2274  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2275  viennacl::cuda_arg<NumericT>(mat.handle()),
2276  viennacl::cuda_arg(vec),
2277  static_cast<unsigned int>(vec.start()),
2278  static_cast<unsigned int>(vec.stride()),
2279  static_cast<unsigned int>(vec.size()),
2280  alpha,
2281  viennacl::cuda_arg(result),
2282  static_cast<unsigned int>(result.start()),
2283  static_cast<unsigned int>(result.stride()),
2284  static_cast<unsigned int>(result.size()),
2285  beta,
2286  static_cast<unsigned int>(mat.rows_per_block())
2287  );
2288  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
2289 }
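
The alpha/beta branch of this dispatch is reached from scaled expressions rather than a plain product; a sketch of the kind of statement that routes through the spmv_alpha_beta handler, assuming the release's expression templates accept this form:

#include "viennacl/sliced_ell_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void sliced_ell_spmv_axpby_example()
{
  viennacl::sliced_ell_matrix<float> A;  // assumed populated via viennacl::copy()
  viennacl::vector<float> x(A.size2());
  viennacl::vector<float> y(A.size1());
  float alpha = 2.0f, beta = 0.5f;

  // y = alpha * A * x + beta * y -- routed through the spmv_alpha_beta handler:
  y = alpha * viennacl::linalg::prod(A, x) + beta * y;
}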
2290 
2291 
2292 //
2293 // Hybrid Matrix
2294 //
2295 
2296 
2297 template<typename AlphaBetaHandlerT, typename NumericT>
2298 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
2299  const NumericT * ell_elements,
2300  const unsigned int * csr_rows,
2301  const unsigned int * csr_cols,
2302  const NumericT * csr_elements,
2303  const NumericT * x,
2304  unsigned int start_x,
2305  unsigned int inc_x,
2306  NumericT alpha,
2307  NumericT * result,
2308  unsigned int start_result,
2309  unsigned int inc_result,
2310  NumericT beta,
2311  unsigned int row_num,
2312  unsigned int internal_row_num,
2313  unsigned int items_per_row,
2314  unsigned int aligned_items_per_row
2315  )
2316 {
2317  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2318  unsigned int glb_sz = gridDim.x * blockDim.x;
2319 
2320  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2321  {
2322  NumericT sum = 0;
2323 
2324  unsigned int offset = row_id;
2325  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2326  {
2327  NumericT val = ell_elements[offset];
2328 
2329 
2330  if (val != NumericT(0))
2331  {
2332  int col = ell_coords[offset];
2333  sum += (x[col * inc_x + start_x] * val);
2334  }
2335  }
2336 
2337  unsigned int col_begin = csr_rows[row_id];
2338  unsigned int col_end = csr_rows[row_id + 1];
2339 
2340  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2341  sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2342 
2343  AlphaBetaHandlerT::apply(result[row_id * inc_result + start_result], alpha, sum, beta);
2344  }
2345 }
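
The hybrid format keeps the regular part of each row in an ELL block and spills overflow entries into a small CSR part, so the kernel performs an ELL pass followed by a CSR pass per row. A CPU model of one row's dot product; hyb_row_dot is a hypothetical helper:

// CPU model of the hybrid traversal above: ELL part plus CSR remainder.
float hyb_row_dot(const unsigned int * ell_coords, const float * ell_elements,
                  const unsigned int * csr_rows, const unsigned int * csr_cols,
                  const float * csr_elements, const float * x,
                  unsigned int row, unsigned int internal_row_num,
                  unsigned int items_per_row)
{
  float sum = 0;
  for (unsigned int item = 0; item < items_per_row; ++item)        // regular (ELL) part
  {
    float val = ell_elements[item * internal_row_num + row];
    if (val != 0.0f)                                               // padding entries are zero
      sum += x[ell_coords[item * internal_row_num + row]] * val;
  }
  for (unsigned int i = csr_rows[row]; i < csr_rows[row + 1]; ++i) // overflow (CSR) part
    sum += x[csr_cols[i]] * csr_elements[i];
  return sum;
}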
2346 
2347 
2348 
2357 template<typename NumericT, unsigned int AlignmentV>
2358 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2359  const viennacl::vector_base<NumericT> & vec,
2360  NumericT alpha,
2361  viennacl::vector_base<NumericT> & result,
2362  NumericT beta)
2363 {
2364  if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0) // alpha != 1 or beta != 0 (same dispatch idiom as for ell_matrix)
2365  hyb_matrix_vec_mul_kernel<detail::spmv_alpha_beta><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
2366  viennacl::cuda_arg<NumericT>(mat.handle()),
2367  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2368  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2369  viennacl::cuda_arg<NumericT>(mat.handle5()),
2370  viennacl::cuda_arg(vec),
2371  static_cast<unsigned int>(vec.start()),
2372  static_cast<unsigned int>(vec.stride()),
2373  alpha,
2374  viennacl::cuda_arg(result),
2375  static_cast<unsigned int>(result.start()),
2376  static_cast<unsigned int>(result.stride()),
2377  beta,
2378  static_cast<unsigned int>(mat.size1()),
2379  static_cast<unsigned int>(mat.internal_size1()),
2380  static_cast<unsigned int>(mat.ell_nnz()),
2381  static_cast<unsigned int>(mat.internal_ellnnz())
2382  );
2383  else
2384  hyb_matrix_vec_mul_kernel<detail::spmv_pure><<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
2385  viennacl::cuda_arg<NumericT>(mat.handle()),
2386  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2387  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2388  viennacl::cuda_arg<NumericT>(mat.handle5()),
2389  viennacl::cuda_arg(vec),
2390  static_cast<unsigned int>(vec.start()),
2391  static_cast<unsigned int>(vec.stride()),
2392  alpha,
2393  viennacl::cuda_arg(result),
2394  static_cast<unsigned int>(result.start()),
2395  static_cast<unsigned int>(result.stride()),
2396  beta,
2397  static_cast<unsigned int>(mat.size1()),
2398  static_cast<unsigned int>(mat.internal_size1()),
2399  static_cast<unsigned int>(mat.ell_nnz()),
2400  static_cast<unsigned int>(mat.internal_ellnnz())
2401  );
2402  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
2403 }
2404 
2405 
2406 
2407 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2408 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
2409  const NumericT * ell_elements,
2410  const unsigned int * csr_rows,
2411  const unsigned int * csr_cols,
2412  const NumericT * csr_elements,
2413  unsigned int row_num,
2414  unsigned int internal_row_num,
2415  unsigned int items_per_row,
2416  unsigned int aligned_items_per_row,
2417  const NumericT * d_mat,
2418  unsigned int d_mat_row_start,
2419  unsigned int d_mat_col_start,
2420  unsigned int d_mat_row_inc,
2421  unsigned int d_mat_col_inc,
2422  unsigned int d_mat_row_size,
2423  unsigned int d_mat_col_size,
2424  unsigned int d_mat_internal_rows,
2425  unsigned int d_mat_internal_cols,
2426  NumericT * result,
2427  unsigned int result_row_start,
2428  unsigned int result_col_start,
2429  unsigned int result_row_inc,
2430  unsigned int result_col_inc,
2431  unsigned int result_row_size,
2432  unsigned int result_col_size,
2433  unsigned int result_internal_rows,
2434  unsigned int result_internal_cols)
2435 {
2436  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2437  unsigned int glb_sz = gridDim.x * blockDim.x;
2438 
2439  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2440  {
2441  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2442  {
2443  NumericT sum = 0;
2444 
2445  unsigned int offset = row_id;
2446  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2447  {
2448  NumericT val = ell_elements[offset];
2449 
2450  if (val != NumericT(0))
2451  {
2452  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2453  d_mat_row_start, d_mat_row_inc,
2454  d_mat_col_start, d_mat_col_inc,
2455  d_mat_internal_rows, d_mat_internal_cols)] * val;
2456  }
2457  }
2458 
2459  unsigned int col_begin = csr_rows[row_id];
2460  unsigned int col_end = csr_rows[row_id + 1];
2461 
2462  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2463  {
2464  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2465  d_mat_row_start, d_mat_row_inc,
2466  d_mat_col_start, d_mat_col_inc,
2467  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2468  }
2469 
2470  result[ResultIndexT::apply(row_id, result_col,
2471  result_row_start, result_row_inc,
2472  result_col_start, result_col_inc,
2473  result_internal_rows, result_internal_cols)] = sum;
2474  }
2475  }
2476 }
2477 
2478 
2479 
2488 template<typename NumericT, unsigned int AlignmentV>
2489 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2490  const viennacl::matrix_base<NumericT> & d_mat,
2491  viennacl::matrix_base<NumericT> & result)
2492 {
2493  if (d_mat.row_major() && result.row_major())
2494  {
2495  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2496  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2497  viennacl::cuda_arg<NumericT>(mat.handle()),
2498  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2499  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2500  viennacl::cuda_arg<NumericT>(mat.handle5()),
2501  static_cast<unsigned int>(mat.size1()),
2502  static_cast<unsigned int>(mat.internal_size1()),
2503  static_cast<unsigned int>(mat.ell_nnz()),
2504  static_cast<unsigned int>(mat.internal_ellnnz()),
2505 
2506  viennacl::cuda_arg(d_mat),
2507  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2508  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2509  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2510  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2511 
2512  viennacl::cuda_arg(result),
2513  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2514  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2515  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2516  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2517  );
2518  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2519  }
2520  else if (d_mat.row_major() && !result.row_major())
2521  {
2522  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2523  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2524  viennacl::cuda_arg<NumericT>(mat.handle()),
2525  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2526  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2527  viennacl::cuda_arg<NumericT>(mat.handle5()),
2528  static_cast<unsigned int>(mat.size1()),
2529  static_cast<unsigned int>(mat.internal_size1()),
2530  static_cast<unsigned int>(mat.ell_nnz()),
2531  static_cast<unsigned int>(mat.internal_ellnnz()),
2532 
2533  viennacl::cuda_arg(d_mat),
2534  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2535  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2536  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2537  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2538 
2539  viennacl::cuda_arg(result),
2540  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2541  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2542  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2543  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2544  );
2545  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2546  }
2547  else if (!d_mat.row_major() && result.row_major())
2548  {
2549  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2550  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2551  viennacl::cuda_arg<NumericT>(mat.handle()),
2552  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2553  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2554  viennacl::cuda_arg<NumericT>(mat.handle5()),
2555  static_cast<unsigned int>(mat.size1()),
2556  static_cast<unsigned int>(mat.internal_size1()),
2557  static_cast<unsigned int>(mat.ell_nnz()),
2558  static_cast<unsigned int>(mat.internal_ellnnz()),
2559 
2560  viennacl::cuda_arg(d_mat),
2561  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2562  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2563  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2564  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2565 
2566  viennacl::cuda_arg(result),
2567  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2568  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2569  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2570  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2571  );
2572  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2573  }
2574  else
2575  {
2576  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2577  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2578  viennacl::cuda_arg<NumericT>(mat.handle()),
2579  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2580  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2581  viennacl::cuda_arg<NumericT>(mat.handle5()),
2582  static_cast<unsigned int>(mat.size1()),
2583  static_cast<unsigned int>(mat.internal_size1()),
2584  static_cast<unsigned int>(mat.ell_nnz()),
2585  static_cast<unsigned int>(mat.internal_ellnnz()),
2586 
2587  viennacl::cuda_arg(d_mat),
2588  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2589  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2590  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2591  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2592 
2593  viennacl::cuda_arg(result),
2594  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2595  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2596  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2597  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2598  );
2599  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2600  }
2601 }
2602 
2603 
2604 
2605 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2606 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
2607  const NumericT * ell_elements,
2608  const unsigned int * csr_rows,
2609  const unsigned int * csr_cols,
2610  const NumericT * csr_elements,
2611  unsigned int row_num,
2612  unsigned int internal_row_num,
2613  unsigned int items_per_row,
2614  unsigned int aligned_items_per_row,
2615  const NumericT * d_mat,
2616  unsigned int d_mat_row_start,
2617  unsigned int d_mat_col_start,
2618  unsigned int d_mat_row_inc,
2619  unsigned int d_mat_col_inc,
2620  unsigned int d_mat_row_size,
2621  unsigned int d_mat_col_size,
2622  unsigned int d_mat_internal_rows,
2623  unsigned int d_mat_internal_cols,
2624  NumericT * result,
2625  unsigned int result_row_start,
2626  unsigned int result_col_start,
2627  unsigned int result_row_inc,
2628  unsigned int result_col_inc,
2629  unsigned int result_row_size,
2630  unsigned int result_col_size,
2631  unsigned int result_internal_rows,
2632  unsigned int result_internal_cols)
2633 {
2634  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2635  unsigned int glb_sz = gridDim.x * blockDim.x;
2636 
2637  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2638  {
2639  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2640  {
2641  NumericT sum = 0;
2642 
2643  unsigned int offset = row_id;
2644  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2645  {
2646  NumericT val = ell_elements[offset];
2647 
2648  if (val != NumericT(0))
2649  {
2650  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2651  d_mat_row_start, d_mat_row_inc,
2652  d_mat_col_start, d_mat_col_inc,
2653  d_mat_internal_rows, d_mat_internal_cols)] * val;
2654  }
2655  }
2656 
2657  unsigned int col_begin = csr_rows[row_id];
2658  unsigned int col_end = csr_rows[row_id + 1];
2659 
2660  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2661  {
2662  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2663  d_mat_row_start, d_mat_row_inc,
2664  d_mat_col_start, d_mat_col_inc,
2665  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2666  }
2667 
2668  result[ResultIndexT::apply(row_id, result_col,
2669  result_row_start, result_row_inc,
2670  result_col_start, result_col_inc,
2671  result_internal_rows, result_internal_cols)] = sum;
2672  }
2673  }
2674 }
2675 
2676 
2677 
2686 template<typename NumericT, unsigned int AlignmentV>
2687 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2688  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2689  const viennacl::matrix_base<NumericT>,
2690  viennacl::op_trans > & d_mat,
2691  viennacl::matrix_base<NumericT> & result)
2692 {
2693  if (d_mat.lhs().row_major() && result.row_major())
2694  {
2695  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2696  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2697  viennacl::cuda_arg<NumericT>(mat.handle()),
2698  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2699  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2700  viennacl::cuda_arg<NumericT>(mat.handle5()),
2701  static_cast<unsigned int>(mat.size1()),
2702  static_cast<unsigned int>(mat.internal_size1()),
2703  static_cast<unsigned int>(mat.ell_nnz()),
2704  static_cast<unsigned int>(mat.internal_ellnnz()),
2705 
2706  viennacl::cuda_arg(d_mat.lhs()),
2707  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2708  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2709  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2710  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2711 
2712  viennacl::cuda_arg(result),
2713  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2714  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2715  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2716  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2717  );
2718  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2719  }
2720  else if (d_mat.lhs().row_major() && !result.row_major())
2721  {
2722  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2723  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2724  viennacl::cuda_arg<NumericT>(mat.handle()),
2725  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2726  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2727  viennacl::cuda_arg<NumericT>(mat.handle5()),
2728  static_cast<unsigned int>(mat.size1()),
2729  static_cast<unsigned int>(mat.internal_size1()),
2730  static_cast<unsigned int>(mat.ell_nnz()),
2731  static_cast<unsigned int>(mat.internal_ellnnz()),
2732 
2733  viennacl::cuda_arg(d_mat.lhs()),
2734  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2735  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2736  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2737  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2738 
2739  viennacl::cuda_arg(result),
2740  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2741  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2742  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2743  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2744  );
2745  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2746  }
2747  else if (!d_mat.lhs().row_major() && result.row_major())
2748  {
2749  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2750  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2751  viennacl::cuda_arg<NumericT>(mat.handle()),
2752  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2753  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2754  viennacl::cuda_arg<NumericT>(mat.handle5()),
2755  static_cast<unsigned int>(mat.size1()),
2756  static_cast<unsigned int>(mat.internal_size1()),
2757  static_cast<unsigned int>(mat.ell_nnz()),
2758  static_cast<unsigned int>(mat.internal_ellnnz()),
2759 
2760  viennacl::cuda_arg(d_mat.lhs()),
2761  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2762  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2763  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2764  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2765 
2766  viennacl::cuda_arg(result),
2767  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2768  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2769  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2770  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2771  );
2772  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2773  }
2774  else
2775  {
2776  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2777  viennacl::cuda_arg<unsigned int>(mat.handle2()),
2778  viennacl::cuda_arg<NumericT>(mat.handle()),
2779  viennacl::cuda_arg<unsigned int>(mat.handle3()),
2780  viennacl::cuda_arg<unsigned int>(mat.handle4()),
2781  viennacl::cuda_arg<NumericT>(mat.handle5()),
2782  static_cast<unsigned int>(mat.size1()),
2783  static_cast<unsigned int>(mat.internal_size1()),
2784  static_cast<unsigned int>(mat.ell_nnz()),
2785  static_cast<unsigned int>(mat.internal_ellnnz()),
2786 
2787  viennacl::cuda_arg(d_mat.lhs()),
2788  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2789  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2790  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2791  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2792 
2793  viennacl::cuda_arg(result),
2794  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2795  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2796  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2797  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2798  );
2799  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2800  }
2801 }
2802 
2803 
2804 } // namespace cuda
2805 } //namespace linalg
2806 } //namespace viennacl
2807 
2808 
2809 #endif