ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
29 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/traits/size.hpp"
37 
39 
45 
46 namespace viennacl
47 {
48 namespace linalg
49 {
50 namespace cuda
51 {
52 //
53 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
54 //
55 
56 template<typename DestNumericT, typename SrcNumericT>
58 {
59  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
60 
61  if (mat1.row_major())
62  {
63  convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
64  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
65  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
66  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
67  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
68 
69  viennacl::cuda_arg(mat2),
70  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
71  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
72  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
73  );
74  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
75  }
76  else
77  {
78  convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
79  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
80  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
81  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
82  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
83 
84  viennacl::cuda_arg(mat2),
85  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
86  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
87  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
88  );
89  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
90  }
91 }
92 
93 template<typename NumericT, typename SizeT, typename DistanceT>
95  matrix_base<NumericT> & temp_trans)
96 {
97  trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
98  static_cast<unsigned int>(proxy.lhs().start1()), static_cast<unsigned int>(proxy.lhs().start2()),
99  static_cast<unsigned int>(proxy.lhs().internal_size1()), static_cast<unsigned int>(proxy.lhs().internal_size2()),
100  static_cast<unsigned int>(proxy.lhs().size1()), static_cast<unsigned int>(proxy.lhs().size2()),
101  static_cast<unsigned int>(proxy.lhs().stride1()), static_cast<unsigned int>(proxy.lhs().stride2()),
102 
103  viennacl::cuda_arg(temp_trans),
104  static_cast<unsigned int>(temp_trans.start1()), static_cast<unsigned int>(temp_trans.start2()),
105  static_cast<unsigned int>(temp_trans.internal_size1()), static_cast<unsigned int>(temp_trans.internal_size2()),
106  static_cast<unsigned int>(temp_trans.stride1()), static_cast<unsigned int>(temp_trans.stride2()),
107  static_cast<bool>(proxy.lhs().row_major()));
108  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
109 }
110 
111 
112 template<typename NumericT, typename ScalarT>
114  matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
115 {
116  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
117 
118  typedef NumericT value_type;
119 
120  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
121 
122  value_type temporary_alpha = 0;
124  temporary_alpha = alpha;
125 
126  if (mat1.row_major())
127  {
128  am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
129  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
130  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
131  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
132  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
133 
134  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
135  options_alpha,
136  viennacl::cuda_arg(mat2),
137  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
138  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
139  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
140  );
141  VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
142  }
143  else
144  {
145  am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
146  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
147  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
148  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
149  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
150 
151  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
152  options_alpha,
153  viennacl::cuda_arg(mat2),
154  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
155  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
156  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
157  );
158  VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
159  }
160 }
161 
162 
163 template<typename NumericT, typename ScalarT1, typename ScalarT2>
165  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
166  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
167 {
168  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
169 
170  typedef NumericT value_type;
171 
172  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
173 
174  value_type temporary_alpha = 0;
176  temporary_alpha = alpha;
177 
178 
179  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
180 
181  value_type temporary_beta = 0;
183  temporary_beta = beta;
184 
185 
186  if (mat1.row_major())
187  {
188  ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
189  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
190  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
191  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
192  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
193 
194  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
195  options_alpha,
196  viennacl::cuda_arg(mat2),
197  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
198  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
199  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
200 
201  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
202  options_beta,
203  viennacl::cuda_arg(mat3),
204  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
205  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
206  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
207  );
208  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
209  }
210  else
211  {
212  ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
213  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
214  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
215  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
216  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
217 
218  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
219  options_alpha,
220  viennacl::cuda_arg(mat2),
221  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
222  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
223  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
224 
225  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
226  options_beta,
227  viennacl::cuda_arg(mat3),
228  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
229  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
230  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
231  );
232  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
233  }
234 
235 }
236 
237 
238 template<typename NumericT, typename ScalarT1, typename ScalarT2>
240  matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
241  matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
242 {
243  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
244 
245  typedef NumericT value_type;
246 
247  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
248 
249  value_type temporary_alpha = 0;
251  temporary_alpha = alpha;
252 
253 
254  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
255 
256  value_type temporary_beta = 0;
258  temporary_beta = beta;
259 
260 
261  if (mat1.row_major())
262  {
263  ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
264  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
265  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
266  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
267  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
268 
269  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
270  options_alpha,
271  viennacl::cuda_arg(mat2),
272  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
273  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
274  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
275 
276  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
277  options_beta,
278  viennacl::cuda_arg(mat3),
279  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
280  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
281  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
282  );
283  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
284  }
285  else
286  {
287  ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
288  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
289  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
290  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
291  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
292 
293  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
294  options_alpha,
295  viennacl::cuda_arg(mat2),
296  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
297  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
298  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
299 
300  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
301  options_beta,
302  viennacl::cuda_arg(mat3),
303  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
304  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
305  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
306  );
307  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
308  }
309 
310 }
311 
312 
313 
314 
315 template<typename NumericT>
316 void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
317 {
318  typedef NumericT value_type;
319  value_type alpha = s;
320 
323 
324  if (mat.row_major())
325  {
326 
327  matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
328  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
329  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
330  s1, s2,
331  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
332  alpha);
333  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
334  }
335  else
336  {
337  matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
338  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
339  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
340  s1, s2,
341  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
342  alpha);
343  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
344  }
345 }
346 
347 template<typename NumericT>
349 {
350  typedef NumericT value_type;
351  value_type alpha = s;
352 
353  if (mat.row_major())
354  {
355  matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
356  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
357  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
358  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
359  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
360  alpha);
361  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
362  }
363  else
364  {
365  matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
366  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
367  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
368  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
369  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
370  alpha);
371  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
372  }
373 }
374 
375 
376 template<typename NumericT>
378 {
379  typedef NumericT value_type;
380 
381  // Step 1: assign zero matrix:
382  matrix_assign(mat, NumericT(0));
383 
384  // Step 2: Assign diagonal:
385  unsigned int options_alpha = 0;
386 
387  vcl_size_t mat_start = 0;
388  vcl_size_t mat_stride = 0;
389  vcl_size_t mat_size = viennacl::traits::size(vec);
390  if (mat.row_major())
391  {
392  vcl_size_t first_row_index = 0;
393  vcl_size_t first_col_index = 0;
394  if (k < 0)
395  first_row_index = vcl_size_t(-k);
396  else
397  first_col_index = vcl_size_t(k);
398  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
399  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
401  }
402  else
403  {
404  vcl_size_t first_row_index = 0;
405  vcl_size_t first_col_index = 0;
406  if (k < 0)
407  first_row_index = vcl_size_t(-k);
408  else
409  first_col_index = vcl_size_t(k);
410  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
413  }
414 
415  av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
416  static_cast<unsigned int>(mat_start),
417  static_cast<unsigned int>(mat_stride),
418  static_cast<unsigned int>(mat_size),
419 
420  viennacl::cuda_arg<value_type>(NumericT(1)),
421  options_alpha,
422  viennacl::cuda_arg(vec),
423  static_cast<unsigned int>(viennacl::traits::start(vec)),
424  static_cast<unsigned int>(viennacl::traits::stride(vec)) );
425  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
426 }
427 
428 template<typename NumericT>
430 {
431  typedef NumericT value_type;
432 
433  unsigned int options_alpha = 0;
434 
435  vcl_size_t mat_start = 0;
436  vcl_size_t mat_stride = 0;
437  if (mat.row_major())
438  {
439  vcl_size_t first_row_index = 0;
440  vcl_size_t first_col_index = 0;
441  if (k < 0)
442  first_row_index = vcl_size_t(-k);
443  else
444  first_col_index = vcl_size_t(k);
445  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
446  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
448  }
449  else
450  {
451  vcl_size_t first_row_index = 0;
452  vcl_size_t first_col_index = 0;
453  if (k < 0)
454  first_row_index = vcl_size_t(-k);
455  else
456  first_col_index = vcl_size_t(k);
457  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
460  }
461 
462  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
463  static_cast<unsigned int>(viennacl::traits::start(vec)),
464  static_cast<unsigned int>(viennacl::traits::stride(vec)),
465  static_cast<unsigned int>(viennacl::traits::size(vec)),
466 
467  viennacl::cuda_arg<value_type>(NumericT(1)),
468  options_alpha,
469  viennacl::cuda_arg(mat),
470  static_cast<unsigned int>(mat_start),
471  static_cast<unsigned int>(mat_stride));
472  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
473 }
474 
475 template<typename NumericT>
476 void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
477 {
478  typedef NumericT value_type;
479 
480  unsigned int options_alpha = 0;
481 
482  vcl_size_t mat_start = 0;
483  vcl_size_t mat_stride = 0;
484  if (mat.row_major())
485  {
487  mat_stride = viennacl::traits::stride2(mat);
488  }
489  else
490  {
493  }
494 
495  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
496  static_cast<unsigned int>(viennacl::traits::start(vec)),
497  static_cast<unsigned int>(viennacl::traits::stride(vec)),
498  static_cast<unsigned int>(viennacl::traits::size(vec)),
499 
500  viennacl::cuda_arg<value_type>(NumericT(1)),
501  options_alpha,
502  viennacl::cuda_arg(mat),
503  static_cast<unsigned int>(mat_start),
504  static_cast<unsigned int>(mat_stride));
505  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
506 }
507 
508 template<typename NumericT>
509 void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
510 {
511  typedef NumericT value_type;
512 
513  unsigned int options_alpha = 0;
514 
515  vcl_size_t mat_start = 0;
516  vcl_size_t mat_stride = 0;
517  if (mat.row_major())
518  {
521  }
522  else
523  {
525  mat_stride = viennacl::traits::stride2(mat);
526  }
527 
528  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
529  static_cast<unsigned int>(viennacl::traits::start(vec)),
530  static_cast<unsigned int>(viennacl::traits::stride(vec)),
531  static_cast<unsigned int>(viennacl::traits::size(vec)),
532 
533  viennacl::cuda_arg<value_type>(NumericT(1)),
534  options_alpha,
535  viennacl::cuda_arg(mat),
536  static_cast<unsigned int>(mat_start),
537  static_cast<unsigned int>(mat_stride));
538  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
539 }
540 
541 
542 //
544 //
545 
546 
547 template<typename NumericT, typename SizeT, typename OpT>
550 {
551  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
552 
553  typedef NumericT value_type;
554 
555  unsigned int op_type = 2; //0: product, 1: division, 2: power
557  op_type = 1;
559  op_type = 0;
560 
561  if (A.row_major())
562  {
563  element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
564  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
565  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
566  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
567  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
568 
569  viennacl::cuda_arg(proxy.lhs()),
570  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
571  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
572  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
573 
574  viennacl::cuda_arg(proxy.rhs()),
575  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
576  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
577  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
578 
579  op_type
580  );
581  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
582  }
583  else
584  {
585  element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
586  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
587  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
588  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
589  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
590 
591  viennacl::cuda_arg(proxy.lhs()),
592  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
593  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
594  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
595 
596  viennacl::cuda_arg(proxy.rhs()),
597  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
598  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
599  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
600 
601  op_type
602  );
603  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
604  }
605 }
606 
607 template<typename SizeT, typename OpT>
610 {
611  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
612 
613  typedef float value_type;
614 
615  unsigned int op_type = 2; //0: product, 1: division, 2: power
617  op_type = 1;
619  op_type = 0;
620 
621  if (A.row_major())
622  {
623  element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
624  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
625  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
626  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
627  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
628 
629  viennacl::cuda_arg(proxy.lhs()),
630  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
631  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
632  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
633 
634  viennacl::cuda_arg(proxy.rhs()),
635  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
636  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
637  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
638 
639  op_type
640  );
641  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
642  }
643  else
644  {
645  element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
646  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
647  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
648  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
649  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
650 
651  viennacl::cuda_arg(proxy.lhs()),
652  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
653  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
654  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
655 
656  viennacl::cuda_arg(proxy.rhs()),
657  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
658  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
659  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
660 
661  op_type
662  );
663  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
664  }
665 }
666 
667 template<typename SizeT, typename OpT>
670 {
671  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
672 
673  typedef double value_type;
674 
675  unsigned int op_type = 2; //0: product, 1: division, 2: power
677  op_type = 1;
679  op_type = 0;
680 
681  if (A.row_major())
682  {
683  element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
684  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
685  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
686  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
687  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
688 
689  viennacl::cuda_arg(proxy.lhs()),
690  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
691  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
692  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
693 
694  viennacl::cuda_arg(proxy.rhs()),
695  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
696  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
697  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
698 
699  op_type
700  );
701  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
702  }
703  else
704  {
705  element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
706  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
707  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
708  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
709  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
710 
711  viennacl::cuda_arg(proxy.lhs()),
712  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
713  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
714  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
715 
716  viennacl::cuda_arg(proxy.rhs()),
717  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
718  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
719  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
720 
721  op_type
722  );
723  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
724  }
725 }
726 
727 //
729 //
730 
731 // Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
732 // we could not find a more 'automatic' way of generating the overloads below...
733 
734 // abs
735 template<typename NumericT>
738 {
739  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
740 
741  typedef NumericT value_type;
742 
743  if (A.row_major())
744  {
745  matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
746  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
747  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
748  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
749  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
750 
751  viennacl::cuda_arg(proxy.lhs()),
752  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
753  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
754  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
755  );
756  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
757  }
758  else
759  {
760  matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
761  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
762  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
763  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
764  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
765 
766  viennacl::cuda_arg(proxy.lhs()),
767  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
768  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
769  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
770  );
771  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
772  }
773 }
774 
775 
776 // acos
777 template<typename NumericT>
780 {
781  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
782 
783  typedef NumericT value_type;
784 
785  if (A.row_major())
786  {
787  matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
788  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
789  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
790  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
791  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
792 
793  viennacl::cuda_arg(proxy.lhs()),
794  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
795  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
796  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
797  );
798  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
799  }
800  else
801  {
802  matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
803  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
804  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
805  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
806  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
807 
808  viennacl::cuda_arg(proxy.lhs()),
809  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
810  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
811  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
812  );
813  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
814  }
815 }
816 
817 
818 // asin
819 template<typename NumericT>
822 {
823  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
824 
825  typedef NumericT value_type;
826 
827  if (A.row_major())
828  {
829  matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
830  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
831  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
832  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
833  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
834 
835  viennacl::cuda_arg(proxy.lhs()),
836  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
837  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
838  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
839  );
840  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
841  }
842  else
843  {
844  matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
845  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
846  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
847  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
848  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
849 
850  viennacl::cuda_arg(proxy.lhs()),
851  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
852  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
853  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
854  );
855  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
856  }
857 }
858 
859 
860 // atan
861 template<typename NumericT>
864 {
865  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
866 
867  typedef NumericT value_type;
868 
869  if (A.row_major())
870  {
871  matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
872  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
873  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
874  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
875  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
876 
877  viennacl::cuda_arg(proxy.lhs()),
878  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
879  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
880  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
881  );
882  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
883  }
884  else
885  {
886  matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
887  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
888  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
889  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
890  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
891 
892  viennacl::cuda_arg(proxy.lhs()),
893  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
894  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
895  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
896  );
897  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
898  }
899 }
900 
901 
902 // ceil
903 template<typename NumericT>
906 {
907  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
908 
909  typedef NumericT value_type;
910 
911  if (A.row_major())
912  {
913  matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
914  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
915  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
916  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
917  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
918 
919  viennacl::cuda_arg(proxy.lhs()),
920  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
921  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
922  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
923  );
924  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
925  }
926  else
927  {
928  matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
929  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
930  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
931  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
932  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
933 
934  viennacl::cuda_arg(proxy.lhs()),
935  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
936  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
937  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
938  );
939  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
940  }
941 }
942 
943 
944 // cos
945 template<typename NumericT>
948 {
949  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
950 
951  typedef NumericT value_type;
952 
953  if (A.row_major())
954  {
955  matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
956  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
957  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
958  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
959  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
960 
961  viennacl::cuda_arg(proxy.lhs()),
962  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
963  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
964  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
965  );
966  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
967  }
968  else
969  {
970  matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
971  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
972  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
973  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
974  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
975 
976  viennacl::cuda_arg(proxy.lhs()),
977  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
978  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
979  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
980  );
981  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
982  }
983 }
984 
985 
986 // cosh
987 template<typename NumericT>
990 {
991  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
992 
993  typedef NumericT value_type;
994 
995  if (A.row_major())
996  {
997  matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
998  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
999  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1000  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1001  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1002 
1003  viennacl::cuda_arg(proxy.lhs()),
1004  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1005  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1006  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1007  );
1008  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
1009  }
1010  else
1011  {
1012  matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1013  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1014  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1015  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1016  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1017 
1018  viennacl::cuda_arg(proxy.lhs()),
1019  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1020  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1021  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1022  );
1023  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
1024  }
1025 }
1026 
1027 
1028 // exp
1029 template<typename NumericT>
1032 {
1033  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1034 
1035  typedef NumericT value_type;
1036 
1037  if (A.row_major())
1038  {
1039  matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1040  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1041  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1042  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1043  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1044 
1045  viennacl::cuda_arg(proxy.lhs()),
1046  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1047  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1048  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1049  );
1050  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
1051  }
1052  else
1053  {
1054  matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1055  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1056  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1057  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1058  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1059 
1060  viennacl::cuda_arg(proxy.lhs()),
1061  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1062  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1063  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1064  );
1065  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
1066  }
1067 }
1068 
1069 
1070 // fabs
1071 template<typename NumericT>
1074 {
1075  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1076 
1077  typedef NumericT value_type;
1078 
1079  if (A.row_major())
1080  {
1081  matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1082  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1083  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1084  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1085  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1086 
1087  viennacl::cuda_arg(proxy.lhs()),
1088  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1089  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1090  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1091  );
1092  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
1093  }
1094  else
1095  {
1096  matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1097  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1098  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1099  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1100  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1101 
1102  viennacl::cuda_arg(proxy.lhs()),
1103  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1104  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1105  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1106  );
1107  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
1108  }
1109 }
1110 
1111 
1112 // floor
1113 template<typename NumericT>
1116 {
1117  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1118 
1119  typedef NumericT value_type;
1120 
1121  if (A.row_major())
1122  {
1123  matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1124  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1125  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1126  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1127  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1128 
1129  viennacl::cuda_arg(proxy.lhs()),
1130  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1131  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1132  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1133  );
1134  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
1135  }
1136  else
1137  {
1138  matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1139  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1140  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1141  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1142  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1143 
1144  viennacl::cuda_arg(proxy.lhs()),
1145  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1146  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1147  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1148  );
1149  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
1150  }
1151 }
1152 
1153 
1154 // log
1155 template<typename NumericT>
1158 {
1159  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1160 
1161  typedef NumericT value_type;
1162 
1163  if (A.row_major())
1164  {
1165  matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1166  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1167  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1168  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1169  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1170 
1171  viennacl::cuda_arg(proxy.lhs()),
1172  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1173  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1174  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1175  );
1176  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
1177  }
1178  else
1179  {
1180  matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1181  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1182  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1183  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1184  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1185 
1186  viennacl::cuda_arg(proxy.lhs()),
1187  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1188  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1189  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1190  );
1191  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
1192  }
1193 }
1194 
1195 
1196 // log10
1197 template<typename NumericT>
1200 {
1201  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1202 
1203  typedef NumericT value_type;
1204 
1205  if (A.row_major())
1206  {
1207  matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1208  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1209  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1210  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1211  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1212 
1213  viennacl::cuda_arg(proxy.lhs()),
1214  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1215  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1216  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1217  );
1218  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
1219  }
1220  else
1221  {
1222  matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1223  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1224  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1225  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1226  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1227 
1228  viennacl::cuda_arg(proxy.lhs()),
1229  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1230  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1231  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1232  );
1233  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
1234  }
1235 }
1236 
1237 
1238 // sin
1239 template<typename NumericT>
1242 {
1243  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1244 
1245  typedef NumericT value_type;
1246 
1247  if (A.row_major())
1248  {
1249  matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1250  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1251  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1252  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1253  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1254 
1255  viennacl::cuda_arg(proxy.lhs()),
1256  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1257  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1258  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1259  );
1260  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
1261  }
1262  else
1263  {
1264  matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1265  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1266  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1267  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1268  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1269 
1270  viennacl::cuda_arg(proxy.lhs()),
1271  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1272  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1273  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1274  );
1275  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
1276  }
1277 }
1278 
1279 
1280 // sinh
1281 template<typename NumericT>
1284 {
1285  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1286 
1287  typedef NumericT value_type;
1288 
1289  if (A.row_major())
1290  {
1291  matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1292  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1293  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1294  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1295  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1296 
1297  viennacl::cuda_arg(proxy.lhs()),
1298  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1299  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1300  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1301  );
1302  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
1303  }
1304  else
1305  {
1306  matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1307  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1308  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1309  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1310  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1311 
1312  viennacl::cuda_arg(proxy.lhs()),
1313  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1314  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1315  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1316  );
1317  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
1318  }
1319 }
1320 
1321 
1322 // sqrt
1323 template<typename NumericT>
1326 {
1327  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1328 
1329  typedef NumericT value_type;
1330 
1331  if (A.row_major())
1332  {
1333  matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1334  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1335  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1336  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1337  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1338 
1339  viennacl::cuda_arg(proxy.lhs()),
1340  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1341  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1342  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1343  );
1344  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
1345  }
1346  else
1347  {
1348  matrix_col_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1349  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1350  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1351  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1352  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1353 
1354  viennacl::cuda_arg(proxy.lhs()),
1355  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1356  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1357  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1358  );
1359  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
1360  }
1361 }
1362 
1363 
1364 // tan
1365 template<typename NumericT>
1368 {
1369  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1370 
1371  typedef NumericT value_type;
1372 
1373  if (A.row_major())
1374  {
1375  matrix_row_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1376  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1377  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1378  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1379  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1380 
1381  viennacl::cuda_arg(proxy.lhs()),
1382  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1383  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1384  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1385  );
1386  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
1387  }
1388  else
1389  {
1390  matrix_col_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1391  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1392  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1393  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1394  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1395 
1396  viennacl::cuda_arg(proxy.lhs()),
1397  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1398  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1399  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1400  );
1401  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
1402  }
1403 }
1404 
1405 
1406 // tanh
1407 template<typename NumericT>
1410 {
1411  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
1412 
1413  typedef NumericT value_type;
1414 
1415  if (A.row_major())
1416  {
1417  matrix_row_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1418  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1419  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1420  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1421  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1422 
1423  viennacl::cuda_arg(proxy.lhs()),
1424  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1425  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1426  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1427  );
1428  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
1429  }
1430  else
1431  {
1432  matrix_col_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
1433  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1434  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1435  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1436  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1437 
1438  viennacl::cuda_arg(proxy.lhs()),
1439  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1440  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1441  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1442  );
1443  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
1444  }
1445 }
1446 
1447 
1448 //
1450 //
1451 
1452 // A * x
1453 
1463 template<typename NumericT>
1464 void prod_impl(const matrix_base<NumericT> & mat, bool mat_transpose,
1465  const vector_base<NumericT> & vec,
1466  vector_base<NumericT> & result)
1467 {
1468  typedef NumericT value_type;
1469 
1470  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
1471 
1472  if (mat.row_major())
1473  {
1474  if (!mat_transpose)
1475  {
1476  vec_mul_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
1477  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1478  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1479  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1480  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1481 
1482  viennacl::cuda_arg(vec),
1483  static_cast<unsigned int>(viennacl::traits::start(vec)),
1484  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1485  static_cast<unsigned int>(viennacl::traits::size(vec)),
1486 
1487  viennacl::cuda_arg(result),
1488  static_cast<unsigned int>(viennacl::traits::start(result)),
1489  static_cast<unsigned int>(viennacl::traits::stride(result)),
1490  static_cast<unsigned int>(viennacl::traits::size(result))
1491  );
1492  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
1493  }
1494  else
1495  {
1496  trans_vec_mul_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
1497  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1498  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1499  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1500  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1501 
1502  viennacl::cuda_arg(vec),
1503  static_cast<unsigned int>(viennacl::traits::start(vec)),
1504  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1505  static_cast<unsigned int>(viennacl::traits::size(vec)),
1506 
1507  viennacl::cuda_arg(result),
1508  static_cast<unsigned int>(viennacl::traits::start(result)),
1509  static_cast<unsigned int>(viennacl::traits::stride(result)),
1510  static_cast<unsigned int>(viennacl::traits::size(result))
1511  );
1512  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
1513  }
1514  }
1515  else
1516  {
1517  if (!mat_transpose)
1518  {
1519  vec_mul_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
1520  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1521  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1522  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1523  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1524 
1525  viennacl::cuda_arg(vec),
1526  static_cast<unsigned int>(viennacl::traits::start(vec)),
1527  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1528  static_cast<unsigned int>(viennacl::traits::size(vec)),
1529 
1530  viennacl::cuda_arg(result),
1531  static_cast<unsigned int>(viennacl::traits::start(result)),
1532  static_cast<unsigned int>(viennacl::traits::stride(result)),
1533  static_cast<unsigned int>(viennacl::traits::size(result))
1534  );
1535  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
1536  }
1537  else
1538  {
1539  trans_vec_mul_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
1540  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1541  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1542  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1543  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1544 
1545  viennacl::cuda_arg(vec),
1546  static_cast<unsigned int>(viennacl::traits::start(vec)),
1547  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1548  static_cast<unsigned int>(viennacl::traits::size(vec)),
1549 
1550  viennacl::cuda_arg(result),
1551  static_cast<unsigned int>(viennacl::traits::start(result)),
1552  static_cast<unsigned int>(viennacl::traits::stride(result)),
1553  static_cast<unsigned int>(viennacl::traits::size(result))
1554  );
1555  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
1556  }
1557  }
1558 }
1559 
1560 
1561 //
1563 //
1564 
1565 namespace detail
1566 {
1567  // C = A * B and possibly transposed variants
1568  template<typename MatrixT1, typename MatrixT2, typename MatrixT3, typename ScalarT>
1569  void prod_slow_kernel(const MatrixT1 & A, bool transposed_A,
1570  const MatrixT2 & B, bool transposed_B,
1571  MatrixT3 & C,
1572  ScalarT alpha,
1573  ScalarT beta)
1574  {
1576 
1577  cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha);
1578  cpu_value_type converted_beta = static_cast<cpu_value_type>(beta);
1579 
1580  dim3 threads(16, 16);
1581  dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1,
1582  (viennacl::traits::size2(C) - 1) / 16 + 1);
1583 
1584  bool row_major_A = A.row_major();
1585  bool row_major_B = B.row_major();
1586  bool row_major_C = C.row_major();
1587 
1588 
1589  if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1590  {
1591  matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
1592  (converted_alpha,
1593  viennacl::cuda_arg(A),
1594  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1595  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1596  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1597  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1598 
1599  viennacl::cuda_arg(B),
1600  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1601  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1602  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1603  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1604 
1605  converted_beta,
1606  viennacl::cuda_arg(C),
1607  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1608  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1609  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1610  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1611  }
1612  else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1613  {
1614  matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
1615  (converted_alpha,
1616  viennacl::cuda_arg(A),
1617  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1618  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1619  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1620  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1621 
1622  viennacl::cuda_arg(B),
1623  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1624  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1625  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1626  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1627 
1628  converted_beta,
1629  viennacl::cuda_arg(C),
1630  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1631  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1632  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1633  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1634  }
1635  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1636  {
1637  matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
1638  (converted_alpha,
1639  viennacl::cuda_arg(A),
1640  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1641  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1642  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1643  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1644 
1645  viennacl::cuda_arg(B),
1646  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1647  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1648  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1649  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1650 
1651  converted_beta,
1652  viennacl::cuda_arg(C),
1653  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1654  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1655  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1656  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1657  }
1658  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1659  {
1660  matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
1661  (converted_alpha,
1662  viennacl::cuda_arg(A),
1663  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1664  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1665  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1666  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1667 
1668  viennacl::cuda_arg(B),
1669  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1670  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1671  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1672  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1673 
1674  converted_beta,
1675  viennacl::cuda_arg(C),
1676  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1677  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1678  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1679  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1680  }
1682 
1683  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1684  {
1685  matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
1686  (converted_alpha,
1687  viennacl::cuda_arg(A),
1688  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1689  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1690  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1691  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1692 
1693  viennacl::cuda_arg(B),
1694  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1695  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1696  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1697  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1698 
1699  converted_beta,
1700  viennacl::cuda_arg(C),
1701  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1702  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1703  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1704  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1705  }
1706  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
1707  {
1708  matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
1709  (converted_alpha,
1710  viennacl::cuda_arg(A),
1711  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1712  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1713  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1714  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1715 
1716  viennacl::cuda_arg(B),
1717  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1718  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1719  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1720  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1721 
1722  converted_beta,
1723  viennacl::cuda_arg(C),
1724  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1725  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1726  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1727  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1728  }
1729  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
1730  {
1731  matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
1732  (converted_alpha,
1733  viennacl::cuda_arg(A),
1734  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1735  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1736  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1737  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1738 
1739  viennacl::cuda_arg(B),
1740  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1741  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1742  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1743  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1744 
1745  converted_beta,
1746  viennacl::cuda_arg(C),
1747  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1748  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1749  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1750  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1751  }
1752  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
1753  {
1754  matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
1755  (converted_alpha,
1756  viennacl::cuda_arg(A),
1757  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1758  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1759  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1760  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1761 
1762  viennacl::cuda_arg(B),
1763  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1764  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1765  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1766  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1767 
1768  converted_beta,
1769  viennacl::cuda_arg(C),
1770  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1771  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1772  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1773  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1774  }
1776 
1777  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
1778  {
1779  matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
1780  (converted_alpha,
1781  viennacl::cuda_arg(A),
1782  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1783  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1784  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1785  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1786 
1787  viennacl::cuda_arg(B),
1788  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1789  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1790  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1791  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1792 
1793  converted_beta,
1794  viennacl::cuda_arg(C),
1795  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1796  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1797  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1798  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1799  }
1800  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
1801  {
1802  matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
1803  (converted_alpha,
1804  viennacl::cuda_arg(A),
1805  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1806  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1807  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1808  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1809 
1810  viennacl::cuda_arg(B),
1811  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1812  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1813  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1814  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1815 
1816  converted_beta,
1817  viennacl::cuda_arg(C),
1818  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1819  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1820  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1821  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1822  }
1823  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
1824  {
1825  matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
1826  (converted_alpha,
1827  viennacl::cuda_arg(A),
1828  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1829  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1830  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1831  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1832 
1833  viennacl::cuda_arg(B),
1834  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1835  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1836  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1837  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1838 
1839  converted_beta,
1840  viennacl::cuda_arg(C),
1841  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1842  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1843  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1844  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1845  }
1846  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
1847  {
1848  matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
1849  (converted_alpha,
1850  viennacl::cuda_arg(A),
1851  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1852  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1853  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1854  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1855 
1856  viennacl::cuda_arg(B),
1857  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1858  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1859  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1860  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1861 
1862  converted_beta,
1863  viennacl::cuda_arg(C),
1864  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1865  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1866  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1867  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1868  }
1870 
1871  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
1872  {
1873  matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
1874  (converted_alpha,
1875  viennacl::cuda_arg(A),
1876  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1877  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1878  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1879  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1880 
1881  viennacl::cuda_arg(B),
1882  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1883  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1884  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1885  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1886 
1887  converted_beta,
1888  viennacl::cuda_arg(C),
1889  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1890  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1891  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1892  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1893  }
1894  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
1895  {
1896  matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
1897  (converted_alpha,
1898  viennacl::cuda_arg(A),
1899  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1900  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1901  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1902  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1903 
1904  viennacl::cuda_arg(B),
1905  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1906  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1907  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1908  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1909 
1910  converted_beta,
1911  viennacl::cuda_arg(C),
1912  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1913  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1914  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1915  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1916  }
1917  else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
1918  {
1919  matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
1920  (converted_alpha,
1921  viennacl::cuda_arg(A),
1922  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1923  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1924  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1925  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1926 
1927  viennacl::cuda_arg(B),
1928  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1929  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1930  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1931  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1932 
1933  converted_beta,
1934  viennacl::cuda_arg(C),
1935  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1936  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1937  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1938  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1939  }
1940  else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
1941  {
1942  matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
1943  (converted_alpha,
1944  viennacl::cuda_arg(A),
1945  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1946  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1947  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1948  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1949 
1950  viennacl::cuda_arg(B),
1951  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1952  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1953  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1954  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1955 
1956  converted_beta,
1957  viennacl::cuda_arg(C),
1958  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1959  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1960  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1961  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1962  }
1964 
1965  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1966  {
1967  matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
1968  (converted_alpha,
1969  viennacl::cuda_arg(A),
1970  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1971  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1972  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1973  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1974 
1975  viennacl::cuda_arg(B),
1976  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1977  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1978  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1979  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1980 
1981  converted_beta,
1982  viennacl::cuda_arg(C),
1983  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1984  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1985  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1986  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1987  }
1988  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1989  {
1990  matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
1991  (converted_alpha,
1992  viennacl::cuda_arg(A),
1993  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1994  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1995  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1996  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1997 
1998  viennacl::cuda_arg(B),
1999  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2000  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2001  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2002  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2003 
2004  converted_beta,
2005  viennacl::cuda_arg(C),
2006  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2007  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2008  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2009  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2010  }
2011  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
2012  {
2013  matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
2014  (converted_alpha,
2015  viennacl::cuda_arg(A),
2016  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2017  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2018  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2019  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2020 
2021  viennacl::cuda_arg(B),
2022  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2023  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2024  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2025  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2026 
2027  converted_beta,
2028  viennacl::cuda_arg(C),
2029  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2030  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2031  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2032  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2033  }
2034  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
2035  {
2036  matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
2037  (converted_alpha,
2038  viennacl::cuda_arg(A),
2039  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2040  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2041  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2042  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2043 
2044  viennacl::cuda_arg(B),
2045  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2046  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2047  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2048  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2049 
2050  converted_beta,
2051  viennacl::cuda_arg(C),
2052  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2053  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2054  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2055  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2056  }
2058 
2059  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
2060  {
2061  matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
2062  (converted_alpha,
2063  viennacl::cuda_arg(A),
2064  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2065  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2066  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2067  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2068 
2069  viennacl::cuda_arg(B),
2070  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2071  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2072  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2073  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2074 
2075  converted_beta,
2076  viennacl::cuda_arg(C),
2077  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2078  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2079  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2080  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2081  }
2082  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
2083  {
2084  matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
2085  (converted_alpha,
2086  viennacl::cuda_arg(A),
2087  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2088  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2089  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2090  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2091 
2092  viennacl::cuda_arg(B),
2093  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2094  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2095  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2096  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2097 
2098  converted_beta,
2099  viennacl::cuda_arg(C),
2100  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2101  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2102  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2103  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2104  }
2105  else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
2106  {
2107  matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
2108  (converted_alpha,
2109  viennacl::cuda_arg(A),
2110  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2111  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2112  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2113  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2114 
2115  viennacl::cuda_arg(B),
2116  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2117  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2118  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2119  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2120 
2121  converted_beta,
2122  viennacl::cuda_arg(C),
2123  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2124  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2125  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2126  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2127  }
2128  else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
2129  {
2130  matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
2131  (converted_alpha,
2132  viennacl::cuda_arg(A),
2133  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2134  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2135  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2136  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2137 
2138  viennacl::cuda_arg(B),
2139  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2140  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2141  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2142  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2143 
2144  converted_beta,
2145  viennacl::cuda_arg(C),
2146  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2147  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2148  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2149  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2150  }
2152 
2153  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
2154  {
2155  matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
2156  (converted_alpha,
2157  viennacl::cuda_arg(A),
2158  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2159  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2160  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2161  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2162 
2163  viennacl::cuda_arg(B),
2164  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2165  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2166  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2167  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2168 
2169  converted_beta,
2170  viennacl::cuda_arg(C),
2171  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2172  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2173  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2174  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2175  }
2176  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
2177  {
2178  matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
2179  (converted_alpha,
2180  viennacl::cuda_arg(A),
2181  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2182  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2183  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2184  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2185 
2186  viennacl::cuda_arg(B),
2187  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2188  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2189  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2190  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2191 
2192  converted_beta,
2193  viennacl::cuda_arg(C),
2194  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2195  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2196  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2197  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2198  }
2199  else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
2200  {
2201  matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
2202  (converted_alpha,
2203  viennacl::cuda_arg(A),
2204  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2205  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2206  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2207  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2208 
2209  viennacl::cuda_arg(B),
2210  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2211  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2212  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2213  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2214 
2215  converted_beta,
2216  viennacl::cuda_arg(C),
2217  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2218  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2219  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2220  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2221  }
2222  else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
2223  {
2224  matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
2225  (converted_alpha,
2226  viennacl::cuda_arg(A),
2227  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2228  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2229  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2230  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2231 
2232  viennacl::cuda_arg(B),
2233  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2234  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2235  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2236  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2237 
2238  converted_beta,
2239  viennacl::cuda_arg(C),
2240  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2241  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2242  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2243  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2244  }
2245 
2246 
2248 
2249  else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
2250  {
2251  matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
2252  (converted_alpha,
2253  viennacl::cuda_arg(A),
2254  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2255  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2256  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2257  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2258 
2259  viennacl::cuda_arg(B),
2260  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2261  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2262  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2263  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2264 
2265  converted_beta,
2266  viennacl::cuda_arg(C),
2267  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2268  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2269  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2270  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2271  }
2272  else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
2273  {
2274  matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
2275  (converted_alpha,
2276  viennacl::cuda_arg(A),
2277  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2278  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2279  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2280  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2281 
2282  viennacl::cuda_arg(B),
2283  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2284  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2285  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2286  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2287 
2288  converted_beta,
2289  viennacl::cuda_arg(C),
2290  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2291  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2292  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2293  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2294  }
2295  else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
2296  {
2297  matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
2298  (converted_alpha,
2299  viennacl::cuda_arg(A),
2300  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2301  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2302  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2303  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2304 
2305  viennacl::cuda_arg(B),
2306  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2307  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2308  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2309  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2310 
2311  converted_beta,
2312  viennacl::cuda_arg(C),
2313  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2314  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2315  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2316  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2317  }
2318  else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
2319  {
2320  matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
2321  (converted_alpha,
2322  viennacl::cuda_arg(A),
2323  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2324  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2325  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2326  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2327 
2328  viennacl::cuda_arg(B),
2329  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2330  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2331  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2332  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2333 
2334  converted_beta,
2335  viennacl::cuda_arg(C),
2336  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2337  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2338  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2339  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2340  }
2341 
2342  }
2343 
2344 
2345  template<typename MatrixT1, typename MatrixT2, typename MatrixT3, typename ScalarT>
2346  void prod(const MatrixT1 & A, bool transposed_A,
2347  const MatrixT2 & B, bool transposed_B,
2348  MatrixT3 & C,
2349  ScalarT alpha,
2350  ScalarT beta)
2351  {
2352  if ( (viennacl::traits::size1(A) < 64)
2353  || (viennacl::traits::size2(A) < 64)
2354  || (viennacl::traits::size1(B) < 64) ) //there is most likely not enough to compute, rendering kernel launch overhead considerable
2355  {
2356  prod_slow_kernel(A, transposed_A,
2357  B, transposed_B,
2358  C, alpha, beta);
2359  }
2360  /*else if ( (viennacl::traits::size1(A) % 64 == 0)
2361  && (viennacl::traits::size2(A) % 64 == 0)
2362  && (viennacl::traits::size1(B) % 64 == 0) ) // allows the use of the fast kernel only
2363  {
2364  prod_fast_kernel(A, B, C, alpha, beta);
2365  //prod_slow_kernel(A, B, C, slow_kernel_name);
2366  }*/
2367  else //TODO: use four kernels
2368  {
2369  prod_slow_kernel(A, transposed_A,
2370  B, transposed_B,
2371  C, alpha, beta);
2372  }
2373 
2374  }
2375 } // namespace detail
2376 
2377 
2383 template<typename NumericT, typename ScalarT>
2384 void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
2385  const matrix_base<NumericT> & B, bool trans_B,
2387  ScalarT alpha,
2388  ScalarT beta)
2389 {
2390  detail::prod(A, trans_A,
2391  B, trans_B,
2392  C, alpha, beta);
2393 }
2394 
2395 
2396 
2397 
2398 //
2400 //
2401 
2402 
2415 template<typename NumericT, typename ScalarT>
2417  ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
2418  const vector_base<NumericT> & vec1,
2419  const vector_base<NumericT> & vec2)
2420 {
2421  assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
2422  assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
2423 
2424  typedef NumericT value_type;
2425 
2426  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
2427 
2428  value_type temporary_alpha = 0;
2430  temporary_alpha = alpha;
2431 
2432  if (mat1.row_major())
2433  {
2434  scaled_rank1_update_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
2435  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2436  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2437  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2438  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2439 
2440  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2441  options_alpha,
2442 
2443  viennacl::cuda_arg(vec1),
2444  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2445  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2446  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2447 
2448  viennacl::cuda_arg(vec2),
2449  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2450  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2451  static_cast<unsigned int>(viennacl::traits::size(vec2))
2452  );
2453  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel");
2454  }
2455  else
2456  {
2457  scaled_rank1_update_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
2458  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2459  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2460  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2461  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2462 
2463  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2464  options_alpha,
2465 
2466  viennacl::cuda_arg(vec1),
2467  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2468  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2469  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2470 
2471  viennacl::cuda_arg(vec2),
2472  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2473  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2474  static_cast<unsigned int>(viennacl::traits::size(vec2))
2475  );
2476  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel");
2477  }
2478 }
2479 
2480 
2488 template <typename NumericT, typename VectorType>
2490  VectorType & dh,
2491  VectorType & sh
2492  )
2493 {
2494  if (A.row_major())
2495  {
2496  viennacl::linalg::cuda::bidiag_pack_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2497  viennacl::cuda_arg(dh),
2498  viennacl::cuda_arg(sh),
2499  static_cast<unsigned int>(viennacl::traits::size1(A)),
2500  static_cast<unsigned int>(viennacl::traits::size2(A)),
2501  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2502  }
2503  else
2504  {
2505  viennacl::linalg::cuda::bidiag_pack_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2506  viennacl::cuda_arg(dh),
2507  viennacl::cuda_arg(sh),
2508  static_cast<unsigned int>(viennacl::traits::size1(A)),
2509  static_cast<unsigned int>(viennacl::traits::size2(A)),
2510  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2511  }
2512 }
2513 
2514 
2515 
2525 template <typename NumericT>
2528  vcl_size_t row_start,
2529  vcl_size_t col_start,
2530  bool copy_col
2531 )
2532 {
2533  if(copy_col)
2534  {
2535  if (A.row_major())
2536  {
2537  copy_col_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2538  viennacl::cuda_arg(V),
2539  static_cast<unsigned int>(row_start),
2540  static_cast<unsigned int>(col_start),
2541  static_cast<unsigned int>(viennacl::traits::size1(A)),
2542  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2543  }
2544  else
2545  {
2546  copy_col_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2547  viennacl::cuda_arg(V),
2548  static_cast<unsigned int>(row_start),
2549  static_cast<unsigned int>(col_start),
2550  static_cast<unsigned int>(viennacl::traits::size1(A)),
2551  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2552  }
2553 
2554 
2555  }
2556  else
2557  {
2558  if (A.row_major())
2559  {
2560  copy_row_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2561  viennacl::cuda_arg(V),
2562  static_cast<unsigned int>(row_start),
2563  static_cast<unsigned int>(col_start),
2564  static_cast<unsigned int>(viennacl::traits::size2(A)),
2565  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2566  }
2567  else
2568  {
2569  copy_row_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2570  viennacl::cuda_arg(V),
2571  static_cast<unsigned int>(row_start),
2572  static_cast<unsigned int>(col_start),
2573  static_cast<unsigned int>(viennacl::traits::size2(A)),
2574  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2575  }
2576  }
2577 }
2578 
2579 
2586 template <typename NumericT>
2589  vcl_size_t start)
2590 {
2591  if (A.row_major())
2592  {
2593  house_update_A_left_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2594  viennacl::cuda_arg(D),
2595  static_cast<unsigned int>(start + 1),
2596  static_cast<unsigned int>(start),
2597  static_cast<unsigned int>(viennacl::traits::size1(A)),
2598  static_cast<unsigned int>(viennacl::traits::size2(A)),
2599  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2600 
2601 
2602  }
2603  else
2604  {
2605  house_update_A_left_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2606  viennacl::cuda_arg(D),
2607  static_cast<unsigned int>(start + 1),
2608  static_cast<unsigned int>(start),
2609  static_cast<unsigned int>(viennacl::traits::size1(A)),
2610  static_cast<unsigned int>(viennacl::traits::size2(A)),
2611  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2612 
2613 
2614  }
2615 
2616 }
2617 
2618 
2625 template <typename NumericT>
2628 {
2629  if (A.row_major())
2630  {
2631  house_update_A_right_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2632  viennacl::cuda_arg(D),
2633  static_cast<unsigned int>(0),
2634  static_cast<unsigned int>(0),
2635  static_cast<unsigned int>(viennacl::traits::size1(A)),
2636  static_cast<unsigned int>(viennacl::traits::size2(A)),
2637  static_cast<unsigned int>(viennacl::traits::internal_size2(A)));
2638 
2639 
2640  }
2641  else
2642  {
2643  house_update_A_right_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
2644  viennacl::cuda_arg(D),
2645  static_cast<unsigned int>(0),
2646  static_cast<unsigned int>(0),
2647  static_cast<unsigned int>(viennacl::traits::size1(A)),
2648  static_cast<unsigned int>(viennacl::traits::size2(A)),
2649  static_cast<unsigned int>(viennacl::traits::internal_size1(A)));
2650 
2651  }
2652 
2653 }
2654 
2655 
2662 template <typename NumericT>
2665  vcl_size_t A_size1)
2666 
2667 {
2668  if (Q.row_major())
2669  {
2670  house_update_QL_row_major_kernel<<<128, 128>>>(viennacl::cuda_arg(Q),
2671  viennacl::cuda_arg(D),
2672  static_cast<unsigned int>(A_size1),
2673  static_cast<unsigned int>(viennacl::traits::internal_size2(Q)));
2674  }
2675  else
2676  {
2677  house_update_QL_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(Q),
2678  viennacl::cuda_arg(D),
2679  static_cast<unsigned int>(A_size1),
2680  static_cast<unsigned int>(viennacl::traits::internal_size1(Q)));
2681  }
2682 }
2683 
2693 template<typename NumericT>
2695  vector_base<NumericT>& tmp1,
2696  vector_base<NumericT>& tmp2,
2697  int l,
2698  int m)
2699  {
2700  if (Q.row_major())
2702  viennacl::cuda_arg(tmp1),
2703  viennacl::cuda_arg(tmp2),
2704  static_cast<unsigned int>(viennacl::traits::size1(Q)),
2705  static_cast<unsigned int>(viennacl::traits::internal_size2(Q)),
2706  static_cast<unsigned int>(l),
2707  static_cast<unsigned int>(m - 1));
2708 
2709  else
2710  givens_next_column_major_kernel<<<128, 128>>>(viennacl::cuda_arg(Q),
2711  viennacl::cuda_arg(tmp1),
2712  viennacl::cuda_arg(tmp2),
2713  static_cast<unsigned int>(viennacl::traits::size1(Q)),
2714  static_cast<unsigned int>(viennacl::traits::internal_size1(Q)),
2715  static_cast<unsigned int>(l),
2716  static_cast<unsigned int>(m - 1));
2717  }
2718 
2719 
2720 } // namespace cuda
2721 } //namespace linalg
2722 } //namespace viennacl
2723 
2724 
2725 #endif
void house_update_QL(matrix_base< NumericT > &Q, vector_base< NumericT > &D, vcl_size_t A_size1)
This function updates the matrix Q, which is needed for the computation of the eigenvectors.
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:160
void convert(matrix_base< DestNumericT > &mat1, matrix_base< SrcNumericT > const &mat2)
void house_update_A_right(matrix_base< NumericT > &A, vector_base< NumericT > &D)
This function applies a householder transformation to a matrix: A <- A * P with a householder reflect...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
Generic size and resize functionality for different vector and matrix types.
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:386
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:394
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:341
Implementations of row-major dense matrix related operations, including matrix-vector products...
size_type stride2() const
Returns the number of columns.
Definition: matrix_def.hpp:234
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
Definition: clear.hpp:43
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void ambm(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Determines row and column increments for matrices and matrix proxies.
Implementations of column-major dense matrix related operations, including matrix-vector products...
viennacl::scalar< int > s2
viennacl::scalar< float > s1
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
float NumericT
Definition: bisect.cpp:40
void prod_slow_kernel(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:448
void am(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
void matrix_diag_to_vector(matrix_base< NumericT > const &mat, int k, vector_base< NumericT > &vec)
void house_update_A_left(matrix_base< NumericT > &A, vector_base< NumericT > &D, vcl_size_t start)
This function applies a householder transformation to a matrix. A <- P * A with a householder reflect...
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
void scaled_rank_1_update(matrix_base< NumericT > &mat1, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
void ambm_m(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
size_type stride1() const
Returns the number of rows.
Definition: matrix_def.hpp:232
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT > &mat)
void matrix_diagonal_assign(matrix_base< NumericT > &mat, NumericT s)
std::size_t vcl_size_t
Definition: forwards.h:75
Dense matrix-matrix product CUDA kernels reside here.
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
Definition: predicate.hpp:466
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:271
Proxy classes for vectors.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_column(const matrix_base< NumericT > &mat, unsigned int j, vector_base< NumericT > &vec)
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
Common routines for CUDA execution.
void matrix_row(matrix_base< NumericT > const &mat, unsigned int i, vector_base< NumericT > &vec)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
bool row_major() const
Definition: matrix_def.hpp:248
void bidiag_pack(matrix_base< NumericT > &A, VectorType &dh, VectorType &sh)
This function stores the diagonal and the superdiagonal of a matrix in two vectors.
A tag class representing transposed matrices.
Definition: forwards.h:220
size_type start2() const
Returns the number of columns.
Definition: matrix_def.hpp:230
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:30
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:130
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:240
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
Definition: common.hpp:39
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:238
void givens_next(matrix_base< NumericT > &Q, vector_base< NumericT > &tmp1, vector_base< NumericT > &tmp2, int l, int m)
This function updates the matrix Q. It is part of the tql2 algorithm.
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
Definition: handle.hpp:41
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multipl...
Definition: predicate.hpp:436
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Definition: forwards.h:134
Implementation of the ViennaCL scalar class.
Implementations of NMF operations using CUDA.
A collection of compile time type deductions.
void matrix_assign(matrix_base< NumericT > &mat, NumericT s, bool clear=false)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
Definition: common.hpp:188
void copy_vec(matrix_base< NumericT > &A, vector_base< NumericT > &V, vcl_size_t row_start, vcl_size_t col_start, bool copy_col)
This function copies a row or a column from a matrix to a vector.
Simple enable-if variant that uses the SFINAE pattern.
size_type start1() const
Returns the number of rows.
Definition: matrix_def.hpp:228