ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
viennacl/linalg/cuda/vector_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include <cmath>
26 #include "viennacl/forwards.h"
27 #include "viennacl/scalar.hpp"
28 #include "viennacl/tools/tools.hpp"
31 #include "viennacl/traits/size.hpp"
34 
36 
37 namespace viennacl
38 {
39 namespace linalg
40 {
41 namespace cuda
42 {
43 
44 //
45 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
46 //
47 template<typename DestNumericT, typename SrcNumericT>
48 __global__ void convert_kernel(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
49  SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
50 {
51  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
52  i < size_dest;
53  i += gridDim.x * blockDim.x)
54  dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
55 }
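// Addressing convention used by every kernel in this file: an operand is described by the triple
// (start, inc, size) and traversed with a grid-stride loop, so the same kernel serves plain
// vectors, ranges and slices. Usage sketch (assuming the usual proxy types from
// viennacl/vector_proxy.hpp):
//
//   viennacl::vector<float> v(100);
//   viennacl::slice s(2, 3, 20);                                 // first index 2, stride 3, 20 entries
//   viennacl::vector_slice<viennacl::vector<float> > vs(v, s);
//   // the dispatcher passes start = 2, inc = 3, size = 20, and the kernel touches v[2], v[5], v[8], ...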
56 
57 
58 template<typename DestNumericT, typename SrcNumericT>
59 void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
60 {
61  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
62  static_cast<unsigned int>(viennacl::traits::start(dest)),
63  static_cast<unsigned int>(viennacl::traits::stride(dest)),
64  static_cast<unsigned int>(viennacl::traits::size(dest)),
65 
66  viennacl::cuda_arg(src),
67  static_cast<unsigned int>(viennacl::traits::start(src)),
68  static_cast<unsigned int>(viennacl::traits::stride(src)) );
69  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
70 }
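// Usage sketch for convert(): assuming the standard frontend from viennacl/vector.hpp, which allows
// assignment between vectors of different numeric type,
//
//   viennacl::vector<double> vd(1000);
//   viennacl::vector<float>  vf(1000);
//   vf = vd;   // element-wise conversion; ends up in convert(), which launches convert_kernel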
71 
72 
74 
75 // gpu scalar
76 template<typename NumericT>
77 __global__ void av_kernel(NumericT * vec1,
78  unsigned int start1,
79  unsigned int inc1,
80  unsigned int size1,
81 
82  const NumericT * fac2,
83  unsigned int options2,
84  const NumericT * vec2,
85  unsigned int start2,
86  unsigned int inc2)
87 {
88  NumericT alpha = *fac2;
89  if (options2 & (1 << 0))
90  alpha = -alpha;
91 
92  if (options2 & (1 << 1))
93  {
94  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
95  i < size1;
96  i += gridDim.x * blockDim.x)
97  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
98  }
99  else
100  {
101  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
102  i < size1;
103  i += gridDim.x * blockDim.x)
104  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
105  }
106 }
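// Encoding of the 'options' bitfields decoded above and in the kernels below (they are produced by
// detail::make_options() in the host wrappers): bit 0 set means the kernel uses the negated scalar
// (-alpha), bit 1 set means it divides by the scalar instead of multiplying (reciprocal case).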
107 
108 // cpu scalar
109 template<typename NumericT>
110 __global__ void av_kernel(NumericT * vec1,
111  unsigned int start1,
112  unsigned int inc1,
113  unsigned int size1,
114 
115  NumericT fac2,
116  unsigned int options2,
117  const NumericT * vec2,
118  unsigned int start2,
119  unsigned int inc2)
120 {
121  NumericT alpha = fac2;
122  if (options2 & (1 << 0))
123  alpha = -alpha;
124 
125  if (options2 & (1 << 1))
126  {
127  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
128  i < size1;
129  i += gridDim.x * blockDim.x)
130  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
131  }
132  else
133  {
134  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
135  i < size1;
136  i += gridDim.x * blockDim.x)
137  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
138  }
139 }
140 
141 
142 
143 template<typename NumericT, typename ScalarType1>
144 void av(vector_base<NumericT> & vec1,
145  vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
146 {
147  typedef NumericT value_type;
148 
149  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
150 
151  value_type data_alpha = alpha;
152  if (flip_sign_alpha)
153  data_alpha = -data_alpha;
154  if (reciprocal_alpha)
155  data_alpha = static_cast<value_type>(1) / data_alpha;
156 
157  value_type temporary_alpha = 0;
158  if (viennacl::is_cpu_scalar<ScalarType1>::value)
159  temporary_alpha = alpha;
160 
161  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
162  static_cast<unsigned int>(viennacl::traits::start(vec1)),
163  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
164  static_cast<unsigned int>(viennacl::traits::size(vec1)),
165 
166  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
167  options_alpha,
168  viennacl::cuda_arg(vec2),
169  static_cast<unsigned int>(viennacl::traits::start(vec2)),
170  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
171  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
172 }
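// Usage sketch for av(), assuming the standard expression frontend from viennacl/vector.hpp:
//
//   viennacl::vector<float> x(1000), y(1000);
//   viennacl::scalar<float> alpha = 2.0f;
//   x = alpha * y;   // dispatches to cuda::av(), which launches av_kernel
//   x = y / 2.0f;    // same entry point, with the reciprocal flag encoded in options_alpha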
173 
174 
176 
177 // alpha and beta on GPU
178 template<typename NumericT>
179 __global__ void avbv_kernel(NumericT * vec1,
180  unsigned int start1,
181  unsigned int inc1,
182  unsigned int size1,
183 
184  const NumericT * fac2,
185  unsigned int options2,
186  const NumericT * vec2,
187  unsigned int start2,
188  unsigned int inc2,
189 
190  const NumericT * fac3,
191  unsigned int options3,
192  const NumericT * vec3,
193  unsigned int start3,
194  unsigned int inc3)
195 {
196  NumericT alpha = *fac2;
197  if (options2 & (1 << 0))
198  alpha = -alpha;
199 
200  NumericT beta = *fac3;
201  if (options3 & (1 << 0))
202  beta = -beta;
203 
204  if (options2 & (1 << 1))
205  {
206  if (options3 & (1 << 1))
207  {
208  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
209  i < size1;
210  i += gridDim.x * blockDim.x)
211  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
212  }
213  else
214  {
215  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
216  i < size1;
217  i += gridDim.x * blockDim.x)
218  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
219  }
220  }
221  else
222  {
223  if (options3 & (1 << 1))
224  {
225  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
226  i < size1;
227  i += gridDim.x * blockDim.x)
228  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
229  }
230  else
231  {
232  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
233  i < size1;
234  i += gridDim.x * blockDim.x)
235  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
236  }
237  }
238 }
239 
240 // alpha on CPU, beta on GPU
241 template<typename NumericT>
242 __global__ void avbv_kernel(NumericT * vec1,
243  unsigned int start1,
244  unsigned int inc1,
245  unsigned int size1,
246 
247  NumericT fac2,
248  unsigned int options2,
249  const NumericT * vec2,
250  unsigned int start2,
251  unsigned int inc2,
252 
253  const NumericT * fac3,
254  unsigned int options3,
255  const NumericT * vec3,
256  unsigned int start3,
257  unsigned int inc3)
258 {
259  NumericT alpha = fac2;
260  if (options2 & (1 << 0))
261  alpha = -alpha;
262 
263  NumericT beta = *fac3;
264  if (options3 & (1 << 0))
265  beta = -beta;
266 
267  if (options2 & (1 << 1))
268  {
269  if (options3 & (1 << 1))
270  {
271  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
272  i < size1;
273  i += gridDim.x * blockDim.x)
274  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
275  }
276  else
277  {
278  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
279  i < size1;
280  i += gridDim.x * blockDim.x)
281  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
282  }
283  }
284  else
285  {
286  if (options3 & (1 << 1))
287  {
288  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
289  i < size1;
290  i += gridDim.x * blockDim.x)
291  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
292  }
293  else
294  {
295  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
296  i < size1;
297  i += gridDim.x * blockDim.x)
298  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
299  }
300  }
301 }
302 
303 // alpha on GPU, beta on CPU
304 template<typename NumericT>
305 __global__ void avbv_kernel(NumericT * vec1,
306  unsigned int start1,
307  unsigned int inc1,
308  unsigned int size1,
309 
310  const NumericT * fac2,
311  unsigned int options2,
312  const NumericT * vec2,
313  unsigned int start2,
314  unsigned int inc2,
315 
316  NumericT fac3,
317  unsigned int options3,
318  const NumericT * vec3,
319  unsigned int start3,
320  unsigned int inc3)
321 {
322  NumericT alpha = *fac2;
323  if (options2 & (1 << 0))
324  alpha = -alpha;
325 
326  NumericT beta = fac3;
327  if (options3 & (1 << 0))
328  beta = -beta;
329 
330  if (options2 & (1 << 1))
331  {
332  if (options3 & (1 << 1))
333  {
334  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
335  i < size1;
336  i += gridDim.x * blockDim.x)
337  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
338  }
339  else
340  {
341  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
342  i < size1;
343  i += gridDim.x * blockDim.x)
344  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
345  }
346  }
347  else
348  {
349  if (options3 & (1 << 1))
350  {
351  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
352  i < size1;
353  i += gridDim.x * blockDim.x)
354  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
355  }
356  else
357  {
358  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
359  i < size1;
360  i += gridDim.x * blockDim.x)
361  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
362  }
363  }
364 }
365 
366 // alpha and beta on CPU
367 template<typename NumericT>
368 __global__ void avbv_kernel(NumericT * vec1,
369  unsigned int start1,
370  unsigned int inc1,
371  unsigned int size1,
372 
373  NumericT fac2,
374  unsigned int options2,
375  const NumericT * vec2,
376  unsigned int start2,
377  unsigned int inc2,
378 
379  NumericT fac3,
380  unsigned int options3,
381  const NumericT * vec3,
382  unsigned int start3,
383  unsigned int inc3)
384 {
385  NumericT alpha = fac2;
386  if (options2 & (1 << 0))
387  alpha = -alpha;
388 
389  NumericT beta = fac3;
390  if (options3 & (1 << 0))
391  beta = -beta;
392 
393  if (options2 & (1 << 1))
394  {
395  if (options3 & (1 << 1))
396  {
397  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
398  i < size1;
399  i += gridDim.x * blockDim.x)
400  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
401  }
402  else
403  {
404  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
405  i < size1;
406  i += gridDim.x * blockDim.x)
407  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
408  }
409  }
410  else
411  {
412  if (options3 & (1 << 1))
413  {
414  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
415  i < size1;
416  i += gridDim.x * blockDim.x)
417  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
418  }
419  else
420  {
421  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
422  i < size1;
423  i += gridDim.x * blockDim.x)
424  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
425  }
426  }
427 }
428 
429 
430 
431 
432 template<typename NumericT, typename ScalarT1, typename ScalarT2>
433 void avbv(vector_base<NumericT> & vec1,
434  vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
435  vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
436 {
437  typedef NumericT value_type;
438 
439  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
440 
441  value_type data_alpha = alpha;
442  if (flip_sign_alpha)
443  data_alpha = -data_alpha;
444  if (reciprocal_alpha)
445  data_alpha = static_cast<value_type>(1) / data_alpha;
446 
447  value_type temporary_alpha = 0;
448  if (viennacl::is_cpu_scalar<ScalarT1>::value)
449  temporary_alpha = alpha;
450 
451  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
452 
453  value_type temporary_beta = 0;
454  if (viennacl::is_cpu_scalar<ScalarT2>::value)
455  temporary_beta = beta;
456 
457 
458  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
459  static_cast<unsigned int>(viennacl::traits::start(vec1)),
460  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
461  static_cast<unsigned int>(viennacl::traits::size(vec1)),
462 
463  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
464  options_alpha,
465  viennacl::cuda_arg(vec2),
466  static_cast<unsigned int>(viennacl::traits::start(vec2)),
467  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
468 
469  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
470  options_beta,
471  viennacl::cuda_arg(vec3),
472  static_cast<unsigned int>(viennacl::traits::start(vec3)),
473  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
474  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
475 }
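// Usage sketch for avbv(), assuming the standard expression frontend:
//
//   viennacl::vector<float> x(1000), y(1000), z(1000);
//   x = 2.0f * y + 3.0f * z;   // dispatches to cuda::avbv(), which launches avbv_kernel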
476 
477 
479 
480 
481 // alpha and beta on GPU
482 template<typename NumericT>
483 __global__ void avbv_v_kernel(NumericT * vec1,
484  unsigned int start1,
485  unsigned int inc1,
486  unsigned int size1,
487 
488  const NumericT * fac2,
489  unsigned int options2,
490  const NumericT * vec2,
491  unsigned int start2,
492  unsigned int inc2,
493 
494  const NumericT * fac3,
495  unsigned int options3,
496  const NumericT * vec3,
497  unsigned int start3,
498  unsigned int inc3)
499 {
500  NumericT alpha = *fac2;
501  if (options2 & (1 << 0))
502  alpha = -alpha;
503 
504  NumericT beta = *fac3;
505  if (options3 & (1 << 0))
506  beta = -beta;
507 
508  if (options2 & (1 << 1))
509  {
510  if (options3 & (1 << 1))
511  {
512  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
513  i < size1;
514  i += gridDim.x * blockDim.x)
515  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
516  }
517  else
518  {
519  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
520  i < size1;
521  i += gridDim.x * blockDim.x)
522  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
523  }
524  }
525  else
526  {
527  if (options3 & (1 << 1))
528  {
529  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
530  i < size1;
531  i += gridDim.x * blockDim.x)
532  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
533  }
534  else
535  {
536  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
537  i < size1;
538  i += gridDim.x * blockDim.x)
539  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
540  }
541  }
542 }
543 
544 // alpha on CPU, beta on GPU
545 template<typename NumericT>
546 __global__ void avbv_v_kernel(NumericT * vec1,
547  unsigned int start1,
548  unsigned int inc1,
549  unsigned int size1,
550 
551  NumericT fac2,
552  unsigned int options2,
553  const NumericT * vec2,
554  unsigned int start2,
555  unsigned int inc2,
556 
557  const NumericT * fac3,
558  unsigned int options3,
559  const NumericT * vec3,
560  unsigned int start3,
561  unsigned int inc3)
562 {
563  NumericT alpha = fac2;
564  if (options2 & (1 << 0))
565  alpha = -alpha;
566 
567  NumericT beta = *fac3;
568  if (options3 & (1 << 0))
569  beta = -beta;
570 
571  if (options2 & (1 << 1))
572  {
573  if (options3 & (1 << 1))
574  {
575  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
576  i < size1;
577  i += gridDim.x * blockDim.x)
578  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
579  }
580  else
581  {
582  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
583  i < size1;
584  i += gridDim.x * blockDim.x)
585  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
586  }
587  }
588  else
589  {
590  if (options3 & (1 << 1))
591  {
592  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
593  i < size1;
594  i += gridDim.x * blockDim.x)
595  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
596  }
597  else
598  {
599  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
600  i < size1;
601  i += gridDim.x * blockDim.x)
602  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
603  }
604  }
605 }
606 
607 // alpha on GPU, beta on CPU
608 template<typename NumericT>
609 __global__ void avbv_v_kernel(NumericT * vec1,
610  unsigned int start1,
611  unsigned int inc1,
612  unsigned int size1,
613 
614  const NumericT * fac2,
615  unsigned int options2,
616  const NumericT * vec2,
617  unsigned int start2,
618  unsigned int inc2,
619 
620  NumericT fac3,
621  unsigned int options3,
622  const NumericT * vec3,
623  unsigned int start3,
624  unsigned int inc3)
625 {
626  NumericT alpha = *fac2;
627  if (options2 & (1 << 0))
628  alpha = -alpha;
629 
630  NumericT beta = fac3;
631  if (options3 & (1 << 0))
632  beta = -beta;
633 
634  if (options2 & (1 << 1))
635  {
636  if (options3 & (1 << 1))
637  {
638  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
639  i < size1;
640  i += gridDim.x * blockDim.x)
641  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
642  }
643  else
644  {
645  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
646  i < size1;
647  i += gridDim.x * blockDim.x)
648  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
649  }
650  }
651  else
652  {
653  if (options3 & (1 << 1))
654  {
655  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
656  i < size1;
657  i += gridDim.x * blockDim.x)
658  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
659  }
660  else
661  {
662  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
663  i < size1;
664  i += gridDim.x * blockDim.x)
665  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
666  }
667  }
668 }
669 
670 // alpha and beta on CPU
671 template<typename NumericT>
672 __global__ void avbv_v_kernel(NumericT * vec1,
673  unsigned int start1,
674  unsigned int inc1,
675  unsigned int size1,
676 
677  NumericT fac2,
678  unsigned int options2,
679  const NumericT * vec2,
680  unsigned int start2,
681  unsigned int inc2,
682 
683  NumericT fac3,
684  unsigned int options3,
685  const NumericT * vec3,
686  unsigned int start3,
687  unsigned int inc3)
688 {
689  NumericT alpha = fac2;
690  if (options2 & (1 << 0))
691  alpha = -alpha;
692 
693  NumericT beta = fac3;
694  if (options3 & (1 << 0))
695  beta = -beta;
696 
697  if (options2 & (1 << 1))
698  {
699  if (options3 & (1 << 1))
700  {
701  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
702  i < size1;
703  i += gridDim.x * blockDim.x)
704  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
705  }
706  else
707  {
708  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
709  i < size1;
710  i += gridDim.x * blockDim.x)
711  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
712  }
713  }
714  else
715  {
716  if (options3 & (1 << 1))
717  {
718  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
719  i < size1;
720  i += gridDim.x * blockDim.x)
721  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
722  }
723  else
724  {
725  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
726  i < size1;
727  i += gridDim.x * blockDim.x)
728  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
729  }
730  }
731 }
732 
733 
734 template<typename NumericT, typename ScalarT1, typename ScalarT2>
735 void avbv_v(vector_base<NumericT> & vec1,
736  vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
737  vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
738 {
739  typedef NumericT value_type;
740 
741  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
742 
743  value_type data_alpha = alpha;
744  if (flip_sign_alpha)
745  data_alpha = -data_alpha;
746  if (reciprocal_alpha)
747  data_alpha = static_cast<value_type>(1) / data_alpha;
748 
749  value_type temporary_alpha = 0;
750  if (viennacl::is_cpu_scalar<ScalarT1>::value)
751  temporary_alpha = alpha;
752 
753  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
754 
755  value_type temporary_beta = 0;
756  if (viennacl::is_cpu_scalar<ScalarT2>::value)
757  temporary_beta = beta;
758 
759 
760  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
761  static_cast<unsigned int>(viennacl::traits::start(vec1)),
762  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
763  static_cast<unsigned int>(viennacl::traits::size(vec1)),
764 
765  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
766  options_alpha,
767  viennacl::cuda_arg(vec2),
768  static_cast<unsigned int>(viennacl::traits::start(vec2)),
769  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
770 
771  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
772  options_beta,
773  viennacl::cuda_arg(vec3),
774  static_cast<unsigned int>(viennacl::traits::start(vec3)),
775  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
776 }
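// Usage sketch for avbv_v(), assuming the standard expression frontend; the _v variant accumulates
// into vec1 instead of overwriting it:
//
//   x += 2.0f * y + 3.0f * z;   // dispatches to cuda::avbv_v(), which launches avbv_v_kernel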
777 
778 
780 
781 template<typename NumericT>
782 __global__ void vector_assign_kernel(NumericT * vec1,
783  unsigned int start1,
784  unsigned int inc1,
785  unsigned int size1,
786  unsigned int internal_size1,
787 
788  NumericT alpha)
789 {
790  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
791  i < internal_size1;
792  i += gridDim.x * blockDim.x)
793  vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
794 }
795 
802 template<typename NumericT, typename ScalarT1>
803 void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
804 {
805  typedef NumericT value_type;
806 
807  value_type temporary_alpha = 0;
808  if (viennacl::is_cpu_scalar<ScalarT1>::value)
809  temporary_alpha = alpha;
810 
811  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
812 
813  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
814  static_cast<unsigned int>(viennacl::traits::start(vec1)),
815  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
816  size,
817  static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
818 
819  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
820  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
821 }
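// Usage sketch for vector_assign(), assuming the standard frontend:
//
//   viennacl::vector<float> x(1000);
//   x.clear();                                         // assigns 0 to every entry
//   x = viennacl::scalar_vector<float>(1000, 42.0f);   // assigns 42 to every entry
//
// With up_to_internal_size == true the padding entries of padded vectors are overwritten as well.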
822 
824 
825 template<typename NumericT>
826 __global__ void vector_swap_kernel(NumericT * vec1,
827  unsigned int start1,
828  unsigned int inc1,
829  unsigned int size1,
830 
831  NumericT * vec2,
832  unsigned int start2,
833  unsigned int inc2)
834 {
835  NumericT tmp;
836  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
837  i < size1;
838  i += gridDim.x * blockDim.x)
839  {
840  tmp = vec2[i*inc2+start2];
841  vec2[i*inc2+start2] = vec1[i*inc1+start1];
842  vec1[i*inc1+start1] = tmp;
843  }
844 }
845 
846 
852 template<typename NumericT>
853 void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
854 {
855  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
856  static_cast<unsigned int>(viennacl::traits::start(vec1)),
857  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
858  static_cast<unsigned int>(viennacl::traits::size(vec1)),
859 
860  viennacl::cuda_arg(vec2),
861  static_cast<unsigned int>(viennacl::traits::start(vec2)),
862  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
863  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
864 }
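// Usage sketch for vector_swap(), assuming the standard frontend:
//
//   viennacl::vector<float> x(1000), y(1000);
//   viennacl::swap(x, y);   // swaps the contents element-wise via vector_swap_kernel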
865 
867 
868 template<typename NumericT>
869 __global__ void element_op_kernel(NumericT * vec1,
870  unsigned int start1,
871  unsigned int inc1,
872  unsigned int size1,
873 
874  NumericT const * vec2,
875  unsigned int start2,
876  unsigned int inc2,
877 
878  NumericT const * vec3,
879  unsigned int start3,
880  unsigned int inc3,
881 
882  unsigned int op_type
883  )
884 {
885  if (op_type == 2)
886  {
887  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
888  i < size1;
889  i += gridDim.x * blockDim.x)
890  {
891  vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
892  }
893  }
894  else if (op_type == 1)
895  {
896  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
897  i < size1;
898  i += gridDim.x * blockDim.x)
899  {
900  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
901  }
902  }
903  else if (op_type == 0)
904  {
905  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
906  i < size1;
907  i += gridDim.x * blockDim.x)
908  {
909  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
910  }
911  }
912 }
913 
914 template<typename NumericT>
915 __global__ void element_op_int_kernel(NumericT * vec1,
916  unsigned int start1,
917  unsigned int inc1,
918  unsigned int size1,
919 
920  NumericT const * vec2,
921  unsigned int start2,
922  unsigned int inc2,
923 
924  NumericT const * vec3,
925  unsigned int start3,
926  unsigned int inc3,
927 
928  unsigned int op_type
929  )
930 {
931  if (op_type == 1)
932  {
933  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
934  i < size1;
935  i += gridDim.x * blockDim.x)
936  {
937  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
938  }
939  }
940  else if (op_type == 0)
941  {
942  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
943  i < size1;
944  i += gridDim.x * blockDim.x)
945  {
946  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
947  }
948  }
949 }
950 
956 template<typename NumericT, typename OpT>
957 void element_op(vector_base<NumericT> & vec1,
958  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
959 {
960  unsigned int op_type = 2; //0: product, 1: division, 2: power
961  if (viennacl::is_division<OpT>::value)
962  op_type = 1;
963  else if (viennacl::is_product<OpT>::value)
964  op_type = 0;
965 
966  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
967  static_cast<unsigned int>(viennacl::traits::start(vec1)),
968  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
969  static_cast<unsigned int>(viennacl::traits::size(vec1)),
970 
971  viennacl::cuda_arg(proxy.lhs()),
972  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
973  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
974 
975  viennacl::cuda_arg(proxy.rhs()),
976  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
977  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
978 
979  op_type
980  );
981  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
982 }
983 
984 template<typename OpT>
985 void element_op(vector_base<float> & vec1,
986  vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
987 {
988  unsigned int op_type = 2; //0: product, 1: division, 2: power
989  if (viennacl::is_division<OpT>::value)
990  op_type = 1;
991  else if (viennacl::is_product<OpT>::value)
992  op_type = 0;
993 
994  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
995  static_cast<unsigned int>(viennacl::traits::start(vec1)),
996  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
997  static_cast<unsigned int>(viennacl::traits::size(vec1)),
998 
999  viennacl::cuda_arg(proxy.lhs()),
1000  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1001  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
1002 
1003  viennacl::cuda_arg(proxy.rhs()),
1004  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
1005  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
1006 
1007  op_type
1008  );
1009  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
1010 }
1011 
1012 template<typename OpT>
1013 void element_op(vector_base<double> & vec1,
1014  vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
1015 {
1016  unsigned int op_type = 2; //0: product, 1: division, 2: power
1017  if (viennacl::is_division<OpT>::value)
1018  op_type = 1;
1019  else if (viennacl::is_product<OpT>::value)
1020  op_type = 0;
1021 
1022  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1023  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1024  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1025  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1026 
1027  viennacl::cuda_arg(proxy.lhs()),
1028  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1029  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
1030 
1031  viennacl::cuda_arg(proxy.rhs()),
1032  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
1033  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
1034 
1035  op_type
1036  );
1037  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
1038 }
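// Usage sketch for the binary element_op() overloads above, assuming the standard frontend:
//
//   viennacl::vector<float> x(1000), y(1000), z(1000);
//   x = viennacl::linalg::element_prod(y, z);   // op_type 0
//   x = viennacl::linalg::element_div(y, z);    // op_type 1
//   x = viennacl::linalg::element_pow(y, z);    // op_type 2 (float/double overloads only)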
1039 
1041 
1042 // Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
1043 // Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
1044 
1045 // acos
1046 template<typename NumericT>
1047 __global__ void vec_element_acos_kernel(
1048  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1049  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1050 {
1051  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1052  vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
1053 }
1054 
1055 template<typename NumericT>
1056 void element_op(vector_base<NumericT> & vec1,
1057  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
1058 {
1059  typedef NumericT value_type;
1060 
1061  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1062  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1063  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1064  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1065  viennacl::cuda_arg(proxy.lhs()),
1066  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1067  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1068  );
1069  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
1070 }
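// Usage sketch for the unary element_op() wrappers (this one and the analogous ones below),
// assuming the standard frontend:
//
//   viennacl::vector<float> x(1000), y(1000);
//   x = viennacl::linalg::element_acos(y);   // launches vec_element_acos_kernel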
1071 
1072 // asin
1073 template<typename NumericT>
1074 __global__ void vec_element_asin_kernel(
1075  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1076  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1077 {
1078  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1079  vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
1080 }
1081 
1082 template<typename NumericT>
1083 void element_op(vector_base<NumericT> & vec1,
1084  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
1085 {
1086  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1087  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1088  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1089  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1090  viennacl::cuda_arg(proxy.lhs()),
1091  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1092  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1093  );
1094  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
1095 }
1096 
1097 
1098 // atan
1099 template<typename NumericT>
1100 __global__ void vec_element_atan_kernel(
1101  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1102  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1103 {
1104  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1105  vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
1106 }
1107 
1108 template<typename NumericT>
1109 void element_op(vector_base<NumericT> & vec1,
1110  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
1111 {
1112  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1113  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1114  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1115  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1116  viennacl::cuda_arg(proxy.lhs()),
1117  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1118  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1119  );
1120  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
1121 }
1122 
1123 
1124 // ceil
1125 template<typename NumericT>
1126 __global__ void vec_element_ceil_kernel(
1127  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1128  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1129 {
1130  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1131  vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
1132 }
1133 
1134 template<typename NumericT>
1135 void element_op(vector_base<NumericT> & vec1,
1136  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
1137 {
1138  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1139  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1140  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1141  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1142  viennacl::cuda_arg(proxy.lhs()),
1143  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1144  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1145  );
1146  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
1147 }
1148 
1149 
1150 // cos
1151 template<typename NumericT>
1152 __global__ void vec_element_cos_kernel(
1153  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1154  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1155 {
1156  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1157  vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
1158 }
1159 
1160 template<typename NumericT>
1161 void element_op(vector_base<NumericT> & vec1,
1162  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
1163 {
1164  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1165  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1166  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1167  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1168  viennacl::cuda_arg(proxy.lhs()),
1169  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1170  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1171  );
1172  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
1173 }
1174 
1175 
1176 // cosh
1177 template<typename NumericT>
1178 __global__ void vec_element_cosh_kernel(
1179  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1180  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1181 {
1182  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1183  vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
1184 }
1185 
1186 template<typename NumericT>
1187 void element_op(vector_base<NumericT> & vec1,
1188  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
1189 {
1190  vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1191  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1192  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1193  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1194  viennacl::cuda_arg(proxy.lhs()),
1195  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1196  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1197  );
1198  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
1199 }
1200 
1201 
1202 // exp
1203 template<typename NumericT>
1204 __global__ void vec_element_exp_kernel(
1205  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1206  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1207 {
1208  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1209  vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
1210 }
1211 
1212 template<typename NumericT>
1213 void element_op(vector_base<NumericT> & vec1,
1214  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
1215 {
1216  vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1217  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1218  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1219  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1220  viennacl::cuda_arg(proxy.lhs()),
1221  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1222  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1223  );
1224  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
1225 }
1226 
1227 
1228 // fabs
1229 template<typename NumericT>
1230 __global__ void vec_element_fabs_kernel(
1231  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1232  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1233 {
1234  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1235  vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
1236 }
1237 
1238 template<typename NumericT>
1239 void element_op(vector_base<NumericT> & vec1,
1240  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
1241 {
1242  vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1243  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1244  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1245  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1246  viennacl::cuda_arg(proxy.lhs()),
1247  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1248  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1249  );
1250  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
1251 }
1252 
1253 // abs
1254 template<typename NumericT>
1255 __global__ void vec_element_abs_kernel(
1256  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1257  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1258 {
1259  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1260  vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
1261 }
1262 
1263 template<typename NumericT>
1264 void element_op(vector_base<NumericT> & vec1,
1265  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
1266 {
1267  vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1268  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1269  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1270  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1271  viennacl::cuda_arg(proxy.lhs()),
1272  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1273  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1274  );
1275  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
1276 }
1277 
1278 
1279 
1280 // floor
1281 template<typename NumericT>
1282 __global__ void vec_element_floor_kernel(
1283  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1284  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1285 {
1286  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1287  vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
1288 }
1289 
1290 template<typename NumericT>
1291 void element_op(vector_base<NumericT> & vec1,
1292  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
1293 {
1294  vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1295  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1296  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1297  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1298  viennacl::cuda_arg(proxy.lhs()),
1299  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1300  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1301  );
1302  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
1303 }
1304 
1305 
1306 // log
1307 template<typename NumericT>
1308 __global__ void vec_element_log_kernel(
1309  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1310  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1311 {
1312  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1313  vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
1314 }
1315 
1316 template<typename NumericT>
1317 void element_op(vector_base<NumericT> & vec1,
1318  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
1319 {
1320  vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1321  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1322  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1323  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1324  viennacl::cuda_arg(proxy.lhs()),
1325  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1326  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1327  );
1328  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
1329 }
1330 
1331 
1332 // log10
1333 template<typename NumericT>
1334 __global__ void vec_element_log10_kernel(
1335  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1336  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1337 {
1338  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1339  vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
1340 }
1341 
1342 template<typename NumericT>
1343 void element_op(vector_base<NumericT> & vec1,
1344  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
1345 {
1346  vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1347  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1348  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1349  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1350  viennacl::cuda_arg(proxy.lhs()),
1351  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1352  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1353  );
1354  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
1355 }
1356 
1357 
1358 // sin
1359 template<typename NumericT>
1360 __global__ void vec_element_sin_kernel(
1361  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1362  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1363 {
1364  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1365  vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
1366 }
1367 
1368 template<typename NumericT>
1369 void element_op(vector_base<NumericT> & vec1,
1370  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
1371 {
1372  vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1373  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1374  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1375  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1376  viennacl::cuda_arg(proxy.lhs()),
1377  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1378  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1379  );
1380  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
1381 }
1382 
1383 
1384 // sinh
1385 template<typename NumericT>
1386 __global__ void vec_element_sinh_kernel(
1387  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1388  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1389 {
1390  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1391  vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
1392 }
1393 
1394 template<typename NumericT>
1395 void element_op(vector_base<NumericT> & vec1,
1396  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
1397 {
1398  vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1399  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1400  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1401  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1402  viennacl::cuda_arg(proxy.lhs()),
1403  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1404  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1405  );
1406  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
1407 }
1408 
1409 
1410 // sqrt
1411 template<typename NumericT>
1412 __global__ void vec_element_sqrt_kernel(
1413  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1414  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1415 {
1416  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1417  vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
1418 }
1419 
1420 template<typename NumericT>
1421 void element_op(vector_base<NumericT> & vec1,
1422  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
1423 {
1424  vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1425  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1426  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1427  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1428  viennacl::cuda_arg(proxy.lhs()),
1429  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1430  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1431  );
1432  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
1433 }
1434 
1435 
1436 // tan
1437 template<typename NumericT>
1438 __global__ void vec_element_tan_kernel(
1439  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1440  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1441 {
1442  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1443  vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
1444 }
1445 
1446 template<typename NumericT>
1447 void element_op(vector_base<NumericT> & vec1,
1448  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
1449 {
1450  vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1451  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1452  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1453  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1454  viennacl::cuda_arg(proxy.lhs()),
1455  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1456  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1457  );
1458  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
1459 }
1460 
1461 
1462 // tanh
1463 template<typename NumericT>
1464 __global__ void vec_element_tanh_kernel(
1465  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1466  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1467 {
1468  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1469  vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
1470 }
1471 
1472 template<typename NumericT>
1473 void element_op(vector_base<NumericT> & vec1,
1474  vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
1475 {
1476  vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1477  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1478  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1479  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1480  viennacl::cuda_arg(proxy.lhs()),
1481  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1482  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1483  );
1484  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
1485 }
1486 
1487 
1488 
1490 
1491 
1492 template<typename NumericT>
1493 __global__ void inner_prod_kernel(const NumericT * vec1,
1494  unsigned int start1,
1495  unsigned int inc1,
1496  unsigned int size1,
1497  const NumericT * vec2,
1498  unsigned int start2,
1499  unsigned int inc2,
1500  unsigned int size2,
1501  NumericT * group_buffer)
1502 {
1503  __shared__ NumericT tmp_buffer[128];
1504  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
1505  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
1506 
1507  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
1508  - ( blockIdx.x * size1) / (gridDim.x);
1509 
1510 
1511  NumericT tmp = 0;
1512  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
1513  tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
1514  tmp_buffer[threadIdx.x] = tmp;
1515 
1516  // parallel reduction
1517  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1518  {
1519  __syncthreads();
1520  if (threadIdx.x < stride)
1521  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
1522  }
1523 
1524  if (threadIdx.x == 0)
1525  group_buffer[blockIdx.x] = tmp_buffer[0];
1526 
1527 }
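// Reduction strategy used for inner products and norms in this file: a first kernel such as
// inner_prod_kernel lets each block reduce its chunk in shared memory and write one partial result
// per block into a temporary buffer; a second, single-block kernel (the vector_sum_kernel_* family
// below) then reduces those 128 partial results to the final scalar.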
1528 
1529 
1530 
1531 // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
1532 template<typename NumericT>
1533 __global__ void vector_sum_kernel_floats(
1534  const NumericT * vec1,
1535  unsigned int start1,
1536  unsigned int inc1,
1537  unsigned int size1,
1538  unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
1539  NumericT * result)
1540 {
1541  __shared__ NumericT tmp_buffer[128];
1542  NumericT thread_sum = 0;
1543  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1544  {
1545  if (option > 0)
1546  thread_sum += vec1[i*inc1+start1];
1547  else
1548  thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
1549  }
1550 
1551  tmp_buffer[threadIdx.x] = thread_sum;
1552 
1553  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1554  {
1555  __syncthreads();
1556  if (threadIdx.x < stride)
1557  {
1558  if (option > 0)
1559  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1560  else
1561  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
1562  }
1563  }
1564 
1565  if (threadIdx.x == 0)
1566  {
1567  if (option == 2)
1568  *result = sqrt(tmp_buffer[0]);
1569  else
1570  *result = tmp_buffer[0];
1571  }
1572 }
1573 
1574 template<typename NumericT>
1575 __global__ void vector_sum_kernel_integers(
1576  const NumericT * vec1,
1577  unsigned int start1,
1578  unsigned int inc1,
1579  unsigned int size1,
1580  unsigned int option, //0: use max, 1: just sum
1581  NumericT * result)
1582 {
1583  __shared__ NumericT tmp_buffer[128];
1584  NumericT thread_sum = 0;
1585  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1586  {
1587  if (option > 0)
1588  thread_sum += vec1[i*inc1+start1];
1589  else
1590  thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
1591  }
1592 
1593  tmp_buffer[threadIdx.x] = thread_sum;
1594 
1595  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1596  {
1597  __syncthreads();
1598  if (threadIdx.x < stride)
1599  {
1600  if (option > 0)
1601  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1602  else
1603  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1604  }
1605  }
1606 
1607  if (threadIdx.x == 0)
1608  *result = tmp_buffer[0];
1609 }
1610 
1611 template<typename NumericT>
1612 __global__ void vector_sum_kernel_unsigned_integers(
1613  const NumericT * vec1,
1614  unsigned int start1,
1615  unsigned int inc1,
1616  unsigned int size1,
1617  unsigned int option, //0: use max, 1: just sum
1618  NumericT * result)
1619 {
1620  __shared__ NumericT tmp_buffer[128];
1621  NumericT thread_sum = 0;
1622  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1623  {
1624  if (option > 0)
1625  thread_sum += vec1[i*inc1+start1];
1626  else
1627  thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
1628  }
1629 
1630  tmp_buffer[threadIdx.x] = thread_sum;
1631 
1632  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1633  {
1634  __syncthreads();
1635  if (threadIdx.x < stride)
1636  {
1637  if (option > 0)
1638  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1639  else
1640  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1641  }
1642  }
1643 
1644  if (threadIdx.x == 0)
1645  *result = tmp_buffer[0];
1646 }
1647 
1648 namespace detail
1649 {
1651  struct vector_sum_kernel_launcher_integers
1652  {
1653  template<typename NumericT, typename ScalarT>
1654  static void apply(vector_base<NumericT> const & temp,
1655  unsigned int option,
1656  ScalarT & result)
1657  {
1658  typedef NumericT value_type;
1659  vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
1660  static_cast<unsigned int>(viennacl::traits::start(temp)),
1661  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1662  static_cast<unsigned int>(viennacl::traits::size(temp)),
1663  static_cast<unsigned int>(option),
1664  viennacl::cuda_arg(result) );
1665  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1666  }
1667  };
1668 
1669  struct vector_sum_kernel_launcher_unsigned_integers
1670  {
1671  template<typename NumericT, typename ScalarT>
1672  static void apply(vector_base<NumericT> const & temp,
1673  unsigned int option,
1674  ScalarT & result)
1675  {
1676  typedef NumericT value_type;
1677  vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
1678  static_cast<unsigned int>(viennacl::traits::start(temp)),
1679  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1680  static_cast<unsigned int>(viennacl::traits::size(temp)),
1681  static_cast<unsigned int>(option),
1682  viennacl::cuda_arg(result) );
1683  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1684  }
1685  };
1686 
1687  struct vector_sum_kernel_launcher_floats
1688  {
1689  template<typename NumericT, typename ScalarT>
1690  static void apply(vector_base<NumericT> const & temp,
1691  unsigned int option,
1692  ScalarT & result)
1693  {
1694  typedef NumericT value_type;
1695  vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
1696  static_cast<unsigned int>(viennacl::traits::start(temp)),
1697  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1698  static_cast<unsigned int>(viennacl::traits::size(temp)),
1699  static_cast<unsigned int>(option),
1700  viennacl::cuda_arg(result) );
1701  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1702  }
1703  };
1704 
1705  template<typename NumericT>
1706  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
1707 
1708  template<>
1709  struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};
1710 
1711  template<>
1712  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};
1713 
1714  template<>
1715  struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};
1716 
1717  template<>
1718  struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};
1719 
1720  template<>
1721  struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};
1722 
1723  template<>
1724  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
1725 
1727 }
1728 
1729 
1730 //implementation of inner product:
1731 //namespace {
1738 template<typename NumericT, typename ScalarT>
1739 void inner_prod_impl(vector_base<NumericT> const & vec1,
1740  vector_base<NumericT> const & vec2,
1741  ScalarT & result)
1742 {
1743  typedef NumericT value_type;
1744 
1745  static const unsigned int work_groups = 128;
1746  static viennacl::vector<value_type> temp(work_groups);
1747 
1748  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1749  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1750  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1751  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1752  viennacl::cuda_arg(vec2),
1753  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1754  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1755  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1756  viennacl::cuda_arg(temp)
1757  );
1758  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1759 
1760  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
1761 }
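// Usage sketch for inner_prod_impl(), assuming the standard frontend from viennacl/linalg/inner_prod.hpp:
//
//   viennacl::vector<float> x(1000), y(1000);
//   viennacl::scalar<float> s = viennacl::linalg::inner_prod(x, y);   // result stays on the device
//   float h = viennacl::linalg::inner_prod(x, y);                     // host result, via inner_prod_cpu()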
1762 
1763 
1770 template<typename NumericT>
1771 void inner_prod_cpu(vector_base<NumericT> const & vec1,
1772  vector_base<NumericT> const & vec2,
1773  NumericT & result)
1774 {
1775  typedef NumericT value_type;
1776 
1777  const unsigned int work_groups = 128;
1778  viennacl::vector<value_type> temp(work_groups);
1779 
1780  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
1781  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1782  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1783  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1784  viennacl::cuda_arg(vec2),
1785  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1786  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1787  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1788  viennacl::cuda_arg(temp)
1789  );
1790  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1791 
1792  // Now copy partial results from GPU back to CPU and run reduction there:
1793  std::vector<value_type> temp_cpu(work_groups);
1794  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
1795 
1796  result = 0;
1797  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
1798  result += *it;
1799 }
1800 
1802 
1803 #define VIENNACL_MDOT_WORKGROUP_SIZE 128
1804 #define VIENNACL_MDOT_WORKGROUP_NUM 128
1805 // M = 2:
1806 template<typename NumericT>
1807 __global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1808  const NumericT *y0, unsigned int start0, unsigned int stride0,
1809  const NumericT *y1, unsigned int start1, unsigned int stride1,
1810  NumericT *group_results)
1811 {
1812  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
1813  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1814  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1815  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
1816 
1817  NumericT entry_x = 0;
1818  NumericT group_sum0 = 0;
1819  NumericT group_sum1 = 0;
1820  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1821  entry_x = x[i * stridex + startx]; // load only once from global memory!
1822  group_sum0 += entry_x * y0[i * stride0 + start0];
1823  group_sum1 += entry_x * y1[i * stride1 + start1];
1824  }
1825  tmp_buffer[threadIdx.x] = group_sum0;
1826  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1827 
1828  // parallel reduction
1829  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1830  __syncthreads();
1831  if (threadIdx.x < stride) {
1832  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1833  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1834  }
1835  }
1836 
1837  // write result of group to group_results
1838  if (threadIdx.x == 0) {
1839  group_results[blockIdx.x] = tmp_buffer[0];
1840  group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
1841  }
1842 }
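
The result layout of these multi-dot kernels is worth spelling out: block b stores its partial sum for the k-th right-hand-side vector at group_results[b + k * gridDim.x], so the buffer consists of M contiguous segments of gridDim.x partial sums each. A small illustrative helper (not part of ViennaCL) that computes this index:

// Illustration only: position of the partial sum that block 'b' contributes for vector 'k'.
// With the launch configuration used below (gridDim.x == VIENNACL_MDOT_WORKGROUP_NUM == 128)
// and M == 2, entries [0,128) belong to y0 and entries [128,256) belong to y1.
inline unsigned int mdot_partial_index(unsigned int b, unsigned int k, unsigned int num_blocks)
{
  return b + k * num_blocks;
}
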
1843 
1844 // M = 3:
1845 template<typename NumericT>
1846 __global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1847  const NumericT *y0, unsigned int start0, unsigned int stride0,
1848  const NumericT *y1, unsigned int start1, unsigned int stride1,
1849  const NumericT *y2, unsigned int start2, unsigned int stride2,
1850  NumericT *group_results)
1851 {
1852  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
1853  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1854  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1855  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1856 
1857  NumericT entry_x = 0;
1858  NumericT group_sum0 = 0;
1859  NumericT group_sum1 = 0;
1860  NumericT group_sum2 = 0;
1861  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1862  entry_x = x[i * stridex + startx]; // load only once from global memory!
1863  group_sum0 += entry_x * y0[i * stride0 + start0];
1864  group_sum1 += entry_x * y1[i * stride1 + start1];
1865  group_sum2 += entry_x * y2[i * stride2 + start2];
1866  }
1867  tmp_buffer[threadIdx.x] = group_sum0;
1868  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1869  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1870 
1871  // parallel reduction
1872  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1873  __syncthreads();
1874  if (threadIdx.x < stride) {
1875  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1876  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1877  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1878  }
1879  }
1880 
1881  // write result of group to group_results
1882  if (threadIdx.x == 0) {
1883  group_results[blockIdx.x ] = tmp_buffer[0];
1884  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1885  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1886  }
1887 }
1888 
1889 // M = 4:
1890 template<typename NumericT>
1891 __global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1892  const NumericT *y0, unsigned int start0, unsigned int stride0,
1893  const NumericT *y1, unsigned int start1, unsigned int stride1,
1894  const NumericT *y2, unsigned int start2, unsigned int stride2,
1895  const NumericT *y3, unsigned int start3, unsigned int stride3,
1896  NumericT *group_results)
1897 {
1898  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
1899  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1900  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1901  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1902 
1903  NumericT entry_x = 0;
1904  NumericT group_sum0 = 0;
1905  NumericT group_sum1 = 0;
1906  NumericT group_sum2 = 0;
1907  NumericT group_sum3 = 0;
1908  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1909  entry_x = x[i * stridex + startx]; // load only once from global memory!
1910  group_sum0 += entry_x * y0[i * stride0 + start0];
1911  group_sum1 += entry_x * y1[i * stride1 + start1];
1912  group_sum2 += entry_x * y2[i * stride2 + start2];
1913  group_sum3 += entry_x * y3[i * stride3 + start3];
1914  }
1915  tmp_buffer[threadIdx.x] = group_sum0;
1916  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1917  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1918  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1919 
1920  // parallel reduction
1921  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1922  __syncthreads();
1923  if (threadIdx.x < stride) {
1924  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1925  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1926  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1927  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
1928  }
1929  }
1930 
1931  // write result of group to group_results
1932  if (threadIdx.x == 0) {
1933  group_results[blockIdx.x ] = tmp_buffer[0];
1934  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1935  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1936  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
1937  }
1938 }
1939 
1940 // M = 8:
1941 template<typename NumericT>
1942 __global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1943  const NumericT *y0, unsigned int start0, unsigned int stride0,
1944  const NumericT *y1, unsigned int start1, unsigned int stride1,
1945  const NumericT *y2, unsigned int start2, unsigned int stride2,
1946  const NumericT *y3, unsigned int start3, unsigned int stride3,
1947  const NumericT *y4, unsigned int start4, unsigned int stride4,
1948  const NumericT *y5, unsigned int start5, unsigned int stride5,
1949  const NumericT *y6, unsigned int start6, unsigned int stride6,
1950  const NumericT *y7, unsigned int start7, unsigned int stride7,
1951  NumericT *group_results)
1952 {
1953  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
1954  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1955  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1956  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1957 
1958  NumericT entry_x = 0;
1959  NumericT group_sum0 = 0;
1960  NumericT group_sum1 = 0;
1961  NumericT group_sum2 = 0;
1962  NumericT group_sum3 = 0;
1963  NumericT group_sum4 = 0;
1964  NumericT group_sum5 = 0;
1965  NumericT group_sum6 = 0;
1966  NumericT group_sum7 = 0;
1967  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1968  entry_x = x[i * stridex + startx]; // load only once from global memory!
1969  group_sum0 += entry_x * y0[i * stride0 + start0];
1970  group_sum1 += entry_x * y1[i * stride1 + start1];
1971  group_sum2 += entry_x * y2[i * stride2 + start2];
1972  group_sum3 += entry_x * y3[i * stride3 + start3];
1973  group_sum4 += entry_x * y4[i * stride4 + start4];
1974  group_sum5 += entry_x * y5[i * stride5 + start5];
1975  group_sum6 += entry_x * y6[i * stride6 + start6];
1976  group_sum7 += entry_x * y7[i * stride7 + start7];
1977  }
1978  tmp_buffer[threadIdx.x] = group_sum0;
1979  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1980  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1981  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1982  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
1983  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
1984  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
1985  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
1986 
1987  // parallel reduction
1988  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1989  __syncthreads();
1990  if (threadIdx.x < stride) {
1991  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1992  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1993  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1994  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
1995  tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
1996  tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
1997  tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
1998  tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
1999  }
2000  }
2001 
2002  // write result of group to group_results
2003  if (threadIdx.x == 0) {
2004  group_results[blockIdx.x ] = tmp_buffer[0];
2005  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
2006  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
2007  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
2008  group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
2009  group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
2010  group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
2011  group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
2012  }
2013 }
2014 
2015 // Sums the array 'vec1' and writes the reduced values to 'result': each block reduces one chunk of VIENNACL_MDOT_WORKGROUP_SIZE consecutive entries and writes a single value per block.
2016 template<typename NumericT>
2017 __global__ void vector_multi_sum_kernel(
2018  NumericT const * vec1,
2019  NumericT * result,
2020  unsigned int start_result,
2021  unsigned int inc_result)
2022 {
2023  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
2024 
2025  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
2026 
2027  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2028  {
2029  __syncthreads();
2030  if (threadIdx.x < stride)
2031  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
2032  }
2033 
2034  if (threadIdx.x == 0)
2035  result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
2036 }
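
vector_multi_sum_kernel is the matching second reduction stage: launched with <<<M, VIENNACL_MDOT_WORKGROUP_NUM>>> (as in inner_prod_impl below), block b sums the b-th segment of 128 partial results and writes one final inner product to result[start_result + inc_result * b]. A plain CPU reference of what such a launch computes, for illustration only:

// CPU reference of a vector_multi_sum_kernel<<<M, 128>>> launch (illustration only).
inline void multi_sum_reference(float const * partial_sums,   // M segments of 128 entries each
                                float       * result,
                                unsigned int  start_result,
                                unsigned int  inc_result,
                                unsigned int  M)
{
  for (unsigned int b = 0; b < M; ++b)          // one CUDA block per segment
  {
    float s = 0;
    for (unsigned int t = 0; t < 128; ++t)      // 128 threads' partial sums per segment
      s += partial_sums[b * 128 + t];
    result[start_result + inc_result * b] = s;
  }
}
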
2037 
2038 template<typename NumericT>
2039 void inner_prod_impl(vector_base<NumericT> const & x,
2040  vector_tuple<NumericT> const & vec_tuple,
2041  vector_base<NumericT> & result)
2042 {
2043  typedef NumericT value_type;
2044 
2045  viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
2046 
2047  vcl_size_t current_index = 0;
2048  while (vec_tuple.const_size() > current_index)
2049  {
2050  switch (vec_tuple.const_size() - current_index)
2051  {
2052  case 7:
2053  case 6:
2054  case 5:
2055  case 4:
2056  {
2057  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2058  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2059  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2060  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
2061 
2062  inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2063  VIENNACL_MDOT_WORKGROUP_SIZE>>>(viennacl::cuda_arg(x),
2064  static_cast<unsigned int>(viennacl::traits::start(x)),
2065  static_cast<unsigned int>(viennacl::traits::stride(x)),
2066  static_cast<unsigned int>(viennacl::traits::size(x)),
2067  viennacl::cuda_arg(y0),
2068  static_cast<unsigned int>(viennacl::traits::start(y0)),
2069  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2070  viennacl::cuda_arg(y1),
2071  static_cast<unsigned int>(viennacl::traits::start(y1)),
2072  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2073  viennacl::cuda_arg(y2),
2074  static_cast<unsigned int>(viennacl::traits::start(y2)),
2075  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2076  viennacl::cuda_arg(y3),
2077  static_cast<unsigned int>(viennacl::traits::start(y3)),
2078  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2079  viennacl::cuda_arg(temp)
2080  );
2081  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
2082  vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),
2083  viennacl::cuda_arg(result),
2084  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2085  static_cast<unsigned int>(viennacl::traits::stride(result))
2086  );
2087  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2088  }
2089  current_index += 4;
2090  break;
2091  case 3:
2092  {
2093  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2094  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2095  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2096 
2097  inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2098  VIENNACL_MDOT_WORKGROUP_SIZE>>>(viennacl::cuda_arg(x),
2099  static_cast<unsigned int>(viennacl::traits::start(x)),
2100  static_cast<unsigned int>(viennacl::traits::stride(x)),
2101  static_cast<unsigned int>(viennacl::traits::size(x)),
2102  viennacl::cuda_arg(y0),
2103  static_cast<unsigned int>(viennacl::traits::start(y0)),
2104  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2105  viennacl::cuda_arg(y1),
2106  static_cast<unsigned int>(viennacl::traits::start(y1)),
2107  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2108  viennacl::cuda_arg(y2),
2109  static_cast<unsigned int>(viennacl::traits::start(y2)),
2110  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2111  viennacl::cuda_arg(temp)
2112  );
2113  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
2114  vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),
2115  viennacl::cuda_arg(result),
2116  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2117  static_cast<unsigned int>(viennacl::traits::stride(result))
2118  );
2119  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2120  }
2121  current_index += 3;
2122  break;
2123  case 2:
2124  {
2125  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2126  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2127 
2128  inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2129  VIENNACL_MDOT_WORKGROUP_SIZE>>>(viennacl::cuda_arg(x),
2130  static_cast<unsigned int>(viennacl::traits::start(x)),
2131  static_cast<unsigned int>(viennacl::traits::stride(x)),
2132  static_cast<unsigned int>(viennacl::traits::size(x)),
2133  viennacl::cuda_arg(y0),
2134  static_cast<unsigned int>(viennacl::traits::start(y0)),
2135  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2136  viennacl::cuda_arg(y1),
2137  static_cast<unsigned int>(viennacl::traits::start(y1)),
2138  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2139  viennacl::cuda_arg(temp)
2140  );
2141  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
2142  vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),
2143  viennacl::cuda_arg(result),
2144  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2145  static_cast<unsigned int>(viennacl::traits::stride(result))
2146  );
2147  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2148  }
2149  current_index += 2;
2150  break;
2151  case 1:
2152  {
2153  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2154  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(x),
2155  static_cast<unsigned int>(viennacl::traits::start(x)),
2156  static_cast<unsigned int>(viennacl::traits::stride(x)),
2157  static_cast<unsigned int>(viennacl::traits::size(x)),
2158  viennacl::cuda_arg(y0),
2159  static_cast<unsigned int>(viennacl::traits::start(y0)),
2160  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2161  static_cast<unsigned int>(viennacl::traits::size(y0)),
2162  viennacl::cuda_arg(temp)
2163  );
2164  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
2165 
2166  vector_multi_sum_kernel<<<1, 128>>>(viennacl::cuda_arg(temp),
2167  viennacl::cuda_arg(result),
2168  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2169  static_cast<unsigned int>(viennacl::traits::stride(result))
2170  );
2171  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2172  }
2173  current_index += 1;
2174  break;
2175 
2176  default:
2177  {
2178  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2179  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2180  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2181  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
2182  vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
2183  vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
2184  vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
2185  vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);
2186 
2187  inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2188  VIENNACL_MDOT_WORKGROUP_SIZE>>>(viennacl::cuda_arg(x),
2189  static_cast<unsigned int>(viennacl::traits::start(x)),
2190  static_cast<unsigned int>(viennacl::traits::stride(x)),
2191  static_cast<unsigned int>(viennacl::traits::size(x)),
2192  viennacl::cuda_arg(y0),
2193  static_cast<unsigned int>(viennacl::traits::start(y0)),
2194  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2195  viennacl::cuda_arg(y1),
2196  static_cast<unsigned int>(viennacl::traits::start(y1)),
2197  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2198  viennacl::cuda_arg(y2),
2199  static_cast<unsigned int>(viennacl::traits::start(y2)),
2200  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2201  viennacl::cuda_arg(y3),
2202  static_cast<unsigned int>(viennacl::traits::start(y3)),
2203  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2204  viennacl::cuda_arg(y4),
2205  static_cast<unsigned int>(viennacl::traits::start(y4)),
2206  static_cast<unsigned int>(viennacl::traits::stride(y4)),
2207  viennacl::cuda_arg(y5),
2208  static_cast<unsigned int>(viennacl::traits::start(y5)),
2209  static_cast<unsigned int>(viennacl::traits::stride(y5)),
2210  viennacl::cuda_arg(y6),
2211  static_cast<unsigned int>(viennacl::traits::start(y6)),
2212  static_cast<unsigned int>(viennacl::traits::stride(y6)),
2213  viennacl::cuda_arg(y7),
2214  static_cast<unsigned int>(viennacl::traits::start(y7)),
2215  static_cast<unsigned int>(viennacl::traits::stride(y7)),
2216  viennacl::cuda_arg(temp)
2217  );
2218  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
2219  vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp),
2220  viennacl::cuda_arg(result),
2221  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2222  static_cast<unsigned int>(viennacl::traits::stride(result))
2223  );
2224  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2225  }
2226  current_index += 8;
2227  break;
2228  }
2229  }
2230 }
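
This overload chunks an arbitrary number of right-hand-side vectors into groups of 8, 4, 3, 2 and 1, so the load of x is reused across each group. A hedged usage sketch through the public interface; viennacl::tie() for building the vector_tuple and the include paths follow the ViennaCL manual and are assumptions here:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

void multi_inner_prod_sketch()
{
  viennacl::vector<float> x(1000), y0(1000), y1(1000), y2(1000);
  viennacl::vector<float> results(3);
  // ... fill x, y0, y1, y2 ...

  // Ends up in the inner_prod_impl(x, vec_tuple, result) overload above,
  // here processed as one group of 3 vectors (inner_prod_3_kernel).
  results = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));
}
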
2231 
2232 #undef VIENNACL_MDOT_WORKGROUP_NUM
2233 #undef VIENNACL_MDOT_WORKGROUP_SIZE
2234 
2236 
2237 template<typename NumericT>
2238 __global__ void norm_kernel_floats(
2239  const NumericT * vec,
2240  unsigned int start1,
2241  unsigned int inc1,
2242  unsigned int size1,
2243  unsigned int norm_selector,
2244  NumericT * group_buffer)
2245 {
2246  __shared__ NumericT tmp_buffer[128];
2247 
2248  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2249  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2250  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2251  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2252  group_stop = (group_stop > size1) ? size1 : group_stop;
2253 
2254  if (norm_selector == 1) //norm_1
2255  {
2256  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2257  tmp += fabs(vec[i*inc1 + start1]);
2258  }
2259  else if (norm_selector == 2) //norm_2
2260  {
2261  NumericT vec_entry = 0;
2262  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2263  {
2264  vec_entry = vec[i*inc1 + start1];
2265  tmp += vec_entry * vec_entry;
2266  }
2267  }
2268  else if (norm_selector == 0) //norm_inf
2269  {
2270  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2271  tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
2272  }
2273  else if (norm_selector == 3) //min
2274  {
2275  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2276  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2277  }
2278  else if (norm_selector == 4) //max
2279  {
2280  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2281  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2282  }
2283 
2284  tmp_buffer[threadIdx.x] = tmp;
2285 
2286  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2287  {
2288  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2289  {
2290  __syncthreads();
2291  if (threadIdx.x < stride)
2292  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2293  }
2294  }
2295  else if (norm_selector == 3)
2296  {
2297  //min:
2298  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2299  {
2300  __syncthreads();
2301  if (threadIdx.x < stride)
2302  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2303  }
2304  }
2305  else if (norm_selector == 4)
2306  {
2307  //max:
2308  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2309  {
2310  __syncthreads();
2311  if (threadIdx.x < stride)
2312  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2313  }
2314  }
2315  else
2316  {
2317  //norm_inf:
2318  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2319  {
2320  __syncthreads();
2321  if (threadIdx.x < stride)
2322  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
2323  }
2324  }
2325 
2326  if (threadIdx.x == 0)
2327  group_buffer[blockIdx.x] = tmp_buffer[0];
2328 }
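
For reference, the norm_selector argument used by this kernel (and by the integer variants below) encodes the requested reduction as follows; the enum is purely illustrative and not part of ViennaCL:

// Meaning of 'norm_selector', read off the branches above (illustrative enum only):
enum norm_selector_t
{
  NORM_INF = 0,  // partial result: maximum of |x_i| (plain maximum in the unsigned kernel)
  NORM_1   = 1,  // partial result: sum of |x_i|
  NORM_2   = 2,  // partial result: sum of x_i * x_i (floats only; the square root is taken later)
  MIN_VAL  = 3,  // partial result: minimum of x_i
  MAX_VAL  = 4   // partial result: maximum of x_i
};
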
2329 
2330 template<typename NumericT>
2331 __global__ void norm_kernel_integers(
2332  const NumericT * vec,
2333  unsigned int start1,
2334  unsigned int inc1,
2335  unsigned int size1,
2336  unsigned int norm_selector,
2337  NumericT * group_buffer)
2338 {
2339  __shared__ NumericT tmp_buffer[128];
2340 
2341  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2342  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2343  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2344  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2345  group_stop = (group_stop > size1) ? size1 : group_stop;
2346 
2347  if (norm_selector == 1) //norm_1
2348  {
2349  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2350  tmp += abs(vec[i*inc1 + start1]);
2351  }
2352  else if (norm_selector == 0) //norm_inf
2353  {
2354  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2355  tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
2356  }
2357  else if (norm_selector == 3) //min
2358  {
2359  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2360  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2361  }
2362  else if (norm_selector == 4) //max
2363  {
2364  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2365  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2366  }
2367 
2368  tmp_buffer[threadIdx.x] = tmp;
2369 
2370  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2371  {
2372  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2373  {
2374  __syncthreads();
2375  if (threadIdx.x < stride)
2376  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2377  }
2378  }
2379  else if (norm_selector == 3)
2380  {
2381  //min:
2382  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2383  {
2384  __syncthreads();
2385  if (threadIdx.x < stride)
2386  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2387  }
2388  }
2389  else if (norm_selector == 4)
2390  {
2391  //max:
2392  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2393  {
2394  __syncthreads();
2395  if (threadIdx.x < stride)
2396  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2397  }
2398  }
2399  else
2400  {
2401  //norm_inf:
2402  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2403  {
2404  __syncthreads();
2405  if (threadIdx.x < stride)
2406  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2407  }
2408  }
2409 
2410  if (threadIdx.x == 0)
2411  group_buffer[blockIdx.x] = tmp_buffer[0];
2412 }
2413 
2414 template<typename NumericT>
2415 __global__ void norm_kernel_unsigned_integers(
2416  const NumericT * vec,
2417  unsigned int start1,
2418  unsigned int inc1,
2419  unsigned int size1,
2420  unsigned int norm_selector,
2421  NumericT * group_buffer)
2422 {
2423  __shared__ NumericT tmp_buffer[128];
2424 
2425  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2426  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2427  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2428  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2429  group_stop = (group_stop > size1) ? size1 : group_stop;
2430 
2431  if (norm_selector == 1) //norm_1
2432  {
2433  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2434  tmp += vec[i*inc1 + start1];
2435  }
2436  else if (norm_selector == 0) //norm_inf
2437  {
2438  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2439  tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
2440  }
2441  else if (norm_selector == 3) //min
2442  {
2443  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2444  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2445  }
2446  else if (norm_selector == 4) //max
2447  {
2448  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2449  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2450  }
2451 
2452  tmp_buffer[threadIdx.x] = tmp;
2453 
2454  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2455  {
2456  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2457  {
2458  __syncthreads();
2459  if (threadIdx.x < stride)
2460  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2461  }
2462  }
2463  else if (norm_selector == 3)
2464  {
2465  //min:
2466  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2467  {
2468  __syncthreads();
2469  if (threadIdx.x < stride)
2470  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2471  }
2472  }
2473  else if (norm_selector == 4)
2474  {
2475  //max:
2476  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2477  {
2478  __syncthreads();
2479  if (threadIdx.x < stride)
2480  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2481  }
2482  }
2483  else
2484  {
2485  //norm_inf:
2486  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2487  {
2488  __syncthreads();
2489  if (threadIdx.x < stride)
2490  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2491  }
2492  }
2493 
2494  if (threadIdx.x == 0)
2495  group_buffer[blockIdx.x] = tmp_buffer[0];
2496 }
2497 
2499 namespace detail
2500 {
2501  struct norm_kernel_launcher_integers
2502  {
2503  template<typename NumericT>
2504  static void apply(vector_base<NumericT> const & vec1,
2505  vector_base<NumericT> & temp,
2506  unsigned int option)
2507  {
2508  norm_kernel_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
2509  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2510  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2511  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2512  static_cast<unsigned int>(option),
2513  viennacl::cuda_arg(temp)
2514  );
2515  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2516  }
2517  };
2518 
2519  struct norm_kernel_launcher_unsigned_integers
2520  {
2521  template<typename NumericT>
2522  static void apply(vector_base<NumericT> const & vec1,
2523  vector_base<NumericT> & temp,
2524  unsigned int option)
2525  {
2526  norm_kernel_unsigned_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
2527  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2528  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2529  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2530  static_cast<unsigned int>(option),
2531  viennacl::cuda_arg(temp)
2532  );
2533  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2534  }
2535  };
2536 
2537 
2538  struct norm_kernel_launcher_floats
2539  {
2540  template<typename NumericT>
2541  static void apply(vector_base<NumericT> const & vec1,
2542  vector_base<NumericT> & temp,
2543  unsigned int option)
2544  {
2545  norm_kernel_floats<<<128, 128>>>(viennacl::cuda_arg(vec1),
2546  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2547  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2548  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2549  static_cast<unsigned int>(option),
2550  viennacl::cuda_arg(temp)
2551  );
2552  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2553  }
2554  };
2555 
2556  template<typename NumericT>
2557  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};
2558 
2559  template<>
2560  struct norm_kernel_launcher<unsigned char> : public norm_kernel_launcher_unsigned_integers {};
2561 
2562  template<>
2563  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};
2564 
2565  template<>
2566  struct norm_kernel_launcher<unsigned int> : public norm_kernel_launcher_unsigned_integers {};
2567 
2568  template<>
2569  struct norm_kernel_launcher<unsigned long> : public norm_kernel_launcher_unsigned_integers {};
2570 
2571  template<>
2572  struct norm_kernel_launcher<float> : public norm_kernel_launcher_floats {};
2573 
2574  template<>
2575  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
2576 
2577 }
2586 template<typename NumericT>
2587 void norm_1_impl(vector_base<NumericT> const & vec1,
2588  scalar<NumericT> & result)
2589 {
2590  typedef NumericT value_type;
2591 
2592  vcl_size_t work_groups = 128;
2593  viennacl::vector<value_type> temp(work_groups);
2594 
2595  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
2596  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
2597 }
2598 
2604 template<typename NumericT>
2605 void norm_1_cpu(vector_base<NumericT> const & vec1,
2606  NumericT & result)
2607 {
2608  typedef NumericT value_type;
2609 
2610  vcl_size_t work_groups = 128;
2611  viennacl::vector<value_type> temp(work_groups);
2612 
2613  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
2614 
2615  // Now copy partial results from GPU back to CPU and run reduction there:
2616  std::vector<value_type> temp_cpu(work_groups);
2617  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2618 
2619  result = 0;
2620  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2621  result += *it;
2622 }
2623 
2625 
2631 template<typename NumericT>
2632 void norm_2_impl(vector_base<NumericT> const & vec1,
2633  scalar<NumericT> & result)
2634 {
2635  typedef NumericT value_type;
2636 
2637  vcl_size_t work_groups = 128;
2638  viennacl::vector<value_type> temp(work_groups);
2639 
2640  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
2641 
2642  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 2, result);
2643 }
2644 
2650 template<typename NumericT>
2651 void norm_2_cpu(vector_base<NumericT> const & vec1,
2652  NumericT & result)
2653 {
2654  typedef NumericT value_type;
2655 
2656  vcl_size_t work_groups = 128;
2657  viennacl::vector<value_type> temp(work_groups);
2658 
2659  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
2660 
2661  std::vector<value_type> temp_cpu(work_groups);
2662  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2663 
2664  result = 0;
2665  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2666  result += *it;
2667  result = std::sqrt(result);
2668 }
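
A tiny numeric check makes the division of labour explicit: with norm_selector == 2 the first stage returns partial sums of squares, so the final square root is taken either on the GPU (vector_sum_kernel with option == 2) or, as in norm_2_cpu above, on the host. Illustration only:

#include <cmath>
#include <cassert>

// For v = (3, 4, 0, ..., 0) the 128 partial sums of squares add up to 25,
// and the host-side square root yields the l^2-norm 5.
inline void norm_2_reference_check()
{
  float partial_sums_of_squares[2] = {25.0f, 0.0f};  // stand-in for the 128 GPU partial results
  float result = std::sqrt(partial_sums_of_squares[0] + partial_sums_of_squares[1]);
  assert(result == 5.0f);
  (void)result;
}
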
2669 
2670 
2672 
2678 template<typename NumericT>
2679 void norm_inf_impl(vector_base<NumericT> const & vec1,
2680  scalar<NumericT> & result)
2681 {
2682  typedef NumericT value_type;
2683 
2684  vcl_size_t work_groups = 128;
2685  viennacl::vector<value_type> temp(work_groups);
2686 
2687  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
2688  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 0, result);
2689 }
2690 
2691 
2692 
2698 template<typename NumericT>
2699 void norm_inf_cpu(vector_base<NumericT> const & vec1,
2700  NumericT & result)
2701 {
2702  typedef NumericT value_type;
2703 
2704  vcl_size_t work_groups = 128;
2705  viennacl::vector<value_type> temp(work_groups);
2706 
2707  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
2708 
2709  std::vector<value_type> temp_cpu(work_groups);
2710  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2711 
2712  result = 0;
2713  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2714  result = std::max(result, *it);
2715 }
2716 
2717 
2719 
2720 // second reduction stage for min() and max()
2721 template<typename NumericT>
2722 __global__ void vector_maxmin_kernel(
2723  const NumericT * vec1,
2724  unsigned int start1,
2725  unsigned int inc1,
2726  unsigned int size1,
2727  unsigned int option, //0: use max, 1: use min
2728  NumericT * result)
2729 {
2730  __shared__ NumericT tmp_buffer[128];
2731  NumericT thread_minmax = vec1[start1];
2732  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
2733  {
2734  if (option > 0) //min
2735  thread_minmax = (vec1[i*inc1+start1] < thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
2736  else
2737  thread_minmax = (vec1[i*inc1+start1] > thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
2738  }
2739 
2740  tmp_buffer[threadIdx.x] = thread_minmax;
2741 
2742  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2743  {
2744  __syncthreads();
2745  if (threadIdx.x < stride)
2746  {
2747  if (option > 0) //min
2748  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
2749  else
2750  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
2751  }
2752  }
2753 
2754  if (threadIdx.x == 0)
2755  *result = tmp_buffer[0];
2756 }
2757 
2758 
2764 template<typename NumericT>
2765 void max_impl(vector_base<NumericT> const & vec1,
2766  scalar<NumericT> & result)
2767 {
2768  typedef NumericT value_type;
2769 
2770  vcl_size_t work_groups = 128;
2771  viennacl::vector<value_type> temp(work_groups);
2772 
2773  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);
2774 
2775  vector_maxmin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
2776  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2777  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2778  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2779  static_cast<unsigned int>(0),
2780  viennacl::cuda_arg(result)
2781  );
2782  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
2783 }
2784 
2785 
2786 
2792 template<typename NumericT>
2793 void max_cpu(vector_base<NumericT> const & vec1,
2794  NumericT & result)
2795 {
2796  typedef NumericT value_type;
2797 
2798  vcl_size_t work_groups = 128;
2799  viennacl::vector<value_type> temp(work_groups);
2800 
2801  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);
2802 
2803  std::vector<value_type> temp_cpu(work_groups);
2804  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2805 
2806  result = temp[0];
2807  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2808  result = std::max(result, *it);
2809 }
2810 
2812 
2818 template<typename NumericT>
2819 void min_impl(vector_base<NumericT> const & vec1,
2820  scalar<NumericT> & result)
2821 {
2822  typedef NumericT value_type;
2823 
2824  vcl_size_t work_groups = 128;
2825  viennacl::vector<value_type> temp(work_groups);
2826 
2827  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);
2828 
2829  vector_maxmin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
2830  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2831  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2832  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2833  static_cast<unsigned int>(1),
2834  viennacl::cuda_arg(result)
2835  );
2836  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
2837 }
2838 
2839 
2840 
2846 template<typename NumericT>
2847 void min_cpu(vector_base<NumericT> const & vec1,
2848  NumericT & result)
2849 {
2850  typedef NumericT value_type;
2851 
2852  vcl_size_t work_groups = 128;
2853  viennacl::vector<value_type> temp(work_groups);
2854 
2855  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);
2856 
2857  std::vector<value_type> temp_cpu(work_groups);
2858  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2859 
2860  result = temp[0];
2861  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2862  result = std::min(result, *it);
2863 }
2864 
2865 
2867 
2873 template<typename NumericT>
2874 void sum_impl(vector_base<NumericT> const & vec1,
2875  scalar<NumericT> & result)
2876 {
2877  typedef NumericT value_type;
2878 
2879  viennacl::vector<value_type> all_ones = viennacl::scalar_vector<value_type>(viennacl::traits::size(vec1), value_type(1), viennacl::traits::context(vec1));
2880  viennacl::linalg::cuda::inner_prod_impl(vec1, all_ones, result);
2881 }
2882 
2883 
2884 
2890 template<typename NumericT>
2891 void sum_cpu(vector_base<NumericT> const & vec1,
2892  NumericT & result)
2893 {
2894  typedef NumericT value_type;
2895 
2896  viennacl::vector<value_type> all_ones = viennacl::scalar_vector<value_type>(viennacl::traits::size(vec1), value_type(1), viennacl::traits::context(vec1));
2897  viennacl::linalg::cuda::inner_prod_cpu(vec1, all_ones, result);
2898 }
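
Both sum variants reduce the problem to an inner product with an all-ones vector, i.e. sum(v) = <v, (1, ..., 1)>. A plain host-side reference of the same identity, for illustration only:

#include <vector>
#include <numeric>

// sum(v) computed as inner_prod(v, ones), mirroring what sum_impl()/sum_cpu() do on the GPU.
inline float sum_reference(std::vector<float> const & v)
{
  std::vector<float> ones(v.size(), 1.0f);
  return std::inner_product(v.begin(), v.end(), ones.begin(), 0.0f);
}
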
2899 
2900 
2901 
2903 
2904 
2905 
2906 //index_norm_inf:
2907 
2908 // fixes the problem of not having (f)abs available in a consistent manner
2909 template<typename NumericT>
2910 __device__ NumericT cuda_abs(NumericT val) { return (val < 0) ? -val : val; }
2911 __device__ inline unsigned long cuda_abs(unsigned long val) { return val; }
2912 __device__ inline unsigned int cuda_abs(unsigned int val) { return val; }
2913 __device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
2914 __device__ inline unsigned char cuda_abs(unsigned char val) { return val; }
2915 
2916 template<typename NumericT>
2917 __global__ void index_norm_inf_kernel(const NumericT * vec,
2918  unsigned int start1,
2919  unsigned int inc1,
2920  unsigned int size1,
2921  unsigned int * result)
2922 {
2923  __shared__ NumericT float_buffer[128];
2924  __shared__ unsigned int index_buffer[128];
2925 
2926  float_buffer[threadIdx.x] = 0;
2927  index_buffer[threadIdx.x] = 0;
2928 
2929  //step 1: fill buffer:
2930  NumericT cur_max = NumericT(0);
2931  NumericT tmp;
2932  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
2933  {
2934  tmp = vec[i*inc1+start1];
2935  tmp = cuda_abs(tmp);
2936  if (cur_max < tmp)
2937  {
2938  float_buffer[threadIdx.x] = tmp;
2939  index_buffer[threadIdx.x] = i;
2940  cur_max = tmp;
2941  }
2942  }
2943 
2944  //step 2: parallel reduction:
2945  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2946  {
2947  __syncthreads();
2948  if (threadIdx.x < stride)
2949  {
2950  //find the first occurring index
2951  if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
2952  {
2953  index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
2954  float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
2955  }
2956  }
2957  }
2958 
2959  if (threadIdx.x == 0)
2960  *result = index_buffer[0];
2961 }
2962 
2963 //This function should return a CPU scalar, otherwise statements like
2964 // vcl_rhs[index_norm_inf(vcl_rhs)]
2965 // are ambiguous
2971 template<typename NumericT>
2972 vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
2973 {
2974  typedef NumericT value_type;
2975 
2977  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));
2978 
2979  index_norm_inf_kernel<<<1, 128>>>(viennacl::cuda_arg(vec1),
2980  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2981  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2982  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2983  viennacl::cuda_arg<unsigned int>(h)
2984  //reinterpret_cast<unsigned int *>(h.cuda_handle().get())
2985  );
2986  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");
2987 
2988  unsigned int ret = 0;
2989  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
2990  return static_cast<vcl_size_t>(ret);
2991 }
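
Because the result is returned as a plain host-side index, it can be used directly for element access, which is exactly the vcl_rhs[index_norm_inf(vcl_rhs)] use case mentioned in the comment above. A hedged usage sketch; the include path of the public dispatcher is an assumption:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"   // assumed location of the index_norm_inf() dispatcher

void index_norm_inf_sketch(viennacl::vector<float> const & v)
{
  viennacl::vcl_size_t idx = viennacl::linalg::index_norm_inf(v);  // index of the largest entry in modulus
  float largest = v[idx];   // entry proxy read back to the host
  (void)largest;
}
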
2992 
2994 
2995 template<typename NumericT>
2996 __global__ void plane_rotation_kernel(
2997  NumericT * vec1,
2998  unsigned int start1,
2999  unsigned int inc1,
3000  unsigned int size1,
3001  NumericT * vec2,
3002  unsigned int start2,
3003  unsigned int inc2,
3004  unsigned int size2,
3005  NumericT alpha,
3006  NumericT beta)
3007 {
3008  NumericT tmp1 = 0;
3009  NumericT tmp2 = 0;
3010 
3011  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
3012  {
3013  tmp1 = vec1[i*inc1+start1];
3014  tmp2 = vec2[i*inc2+start2];
3015 
3016  vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
3017  vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
3018  }
3019 
3020 }
3021 
3031 template<typename NumericT>
3032 void plane_rotation(vector_base<NumericT> & vec1,
3033  vector_base<NumericT> & vec2,
3034  NumericT alpha, NumericT beta)
3035 {
3036  typedef NumericT value_type;
3037 
3038  value_type temporary_alpha = 0;
3039  if (viennacl::is_cpu_scalar<NumericT>::value)
3040  temporary_alpha = alpha;
3041 
3042  value_type temporary_beta = 0;
3043  if (viennacl::is_cpu_scalar<NumericT>::value)
3044  temporary_beta = beta;
3045 
3046  plane_rotation_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
3047  static_cast<unsigned int>(viennacl::traits::start(vec1)),
3048  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
3049  static_cast<unsigned int>(viennacl::traits::size(vec1)),
3050  viennacl::cuda_arg(vec2),
3051  static_cast<unsigned int>(viennacl::traits::start(vec2)),
3052  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
3053  static_cast<unsigned int>(viennacl::traits::size(vec2)),
3054  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
3055  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
3056  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
3057 }
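
The kernel applies a Givens rotation to each pair of entries: x_i <- alpha*x_i + beta*y_i and y_i <- alpha*y_i - beta*x_i. A hedged usage sketch through the public viennacl::linalg::plane_rotation() dispatcher; the include path and helper name are assumptions:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"   // assumed location of the plane_rotation() dispatcher

void plane_rotation_sketch(viennacl::vector<float> & v1, viennacl::vector<float> & v2)
{
  float alpha = 0.8f, beta = 0.6f;                 // cos/sin of the rotation angle (0.8^2 + 0.6^2 = 1)
  viennacl::linalg::plane_rotation(v1, v2, alpha, beta);
  // each pair (x_i, y_i) now holds (0.8*x_i + 0.6*y_i, 0.8*y_i - 0.6*x_i)
}
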
3058 
3060 
3061 
3062 template<typename NumericT>
3063 __global__ void scan_kernel_1(NumericT const *X,
3064  unsigned int startX,
3065  unsigned int incX,
3066  unsigned int sizeX,
3067 
3068  NumericT *Y,
3069  unsigned int startY,
3070  unsigned int incY,
3071 
3072  unsigned int scan_offset, // 0 for inclusive scan, 1 for exclusive scan
3073  NumericT *carries) // one carry entry per block
3074 {
3075  __shared__ NumericT shared_buffer[256];
3076  NumericT my_value;
3077 
3078  unsigned int work_per_thread = (sizeX - 1) / (gridDim.x * blockDim.x) + 1;
3079  unsigned int block_start = work_per_thread * blockDim.x * blockIdx.x;
3080  unsigned int block_stop = work_per_thread * blockDim.x * (blockIdx.x + 1);
3081  unsigned int block_offset = 0;
3082 
3083  // run scan on each section
3084  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
3085  {
3086  // load data:
3087  my_value = (i < sizeX) ? X[i * incX + startX] : 0;
3088 
3089  // inclusive scan in shared buffer:
3090  for(unsigned int stride = 1; stride < blockDim.x; stride *= 2)
3091  {
3092  __syncthreads();
3093  shared_buffer[threadIdx.x] = my_value;
3094  __syncthreads();
3095  if (threadIdx.x >= stride)
3096  my_value += shared_buffer[threadIdx.x - stride];
3097  }
3098  __syncthreads();
3099  shared_buffer[threadIdx.x] = my_value;
3100  __syncthreads();
3101 
3102  // exclusive scan requires us to write a zero value at the beginning of each block
3103  if (scan_offset > 0)
3104  my_value = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;
3105 
3106  // write to output array
3107  if (i < sizeX)
3108  Y[i * incY + startY] = block_offset + my_value;
3109 
3110  block_offset += shared_buffer[blockDim.x-1];
3111  }
3112 
3113  // write carry:
3114  if (threadIdx.x == 0)
3115  carries[blockIdx.x] = block_offset;
3116 
3117 }
3118 
3119 // exclusive-scan of carries
3120 template<typename NumericT>
3121 __global__ void scan_kernel_2(NumericT *carries)
3122 {
3123  __shared__ NumericT shared_buffer[256];
3124 
3125  // load data:
3126  NumericT my_carry = carries[threadIdx.x];
3127 
3128  // exclusive scan in shared buffer:
3129 
3130  for(unsigned int stride = 1; stride < blockDim.x; stride *= 2)
3131  {
3132  __syncthreads();
3133  shared_buffer[threadIdx.x] = my_carry;
3134  __syncthreads();
3135  if (threadIdx.x >= stride)
3136  my_carry += shared_buffer[threadIdx.x - stride];
3137  }
3138  __syncthreads();
3139  shared_buffer[threadIdx.x] = my_carry;
3140  __syncthreads();
3141 
3142  // write to output array
3143  carries[threadIdx.x] = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;
3144 }
3145 
3146 template<typename NumericT>
3147 __global__ void scan_kernel_3(NumericT *Y,
3148  unsigned int startY,
3149  unsigned int incY,
3150  unsigned int sizeY,
3151 
3152  NumericT const *carries)
3153 {
3154  unsigned int work_per_thread = (sizeY - 1) / (gridDim.x * blockDim.x) + 1;
3155  unsigned int block_start = work_per_thread * blockDim.x * blockIdx.x;
3156  unsigned int block_stop = work_per_thread * blockDim.x * (blockIdx.x + 1);
3157 
3158  __shared__ NumericT shared_offset;
3159 
3160  if (threadIdx.x == 0)
3161  shared_offset = carries[blockIdx.x];
3162 
3163  __syncthreads();
3164 
3165  // add offset to each element in the block:
3166  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
3167  if (i < sizeY)
3168  Y[i * incY + startY] += shared_offset;
3169 }
3170 
3171 
3172 
3173 namespace detail
3174 {
3180  template<typename NumericT>
3181  void scan_impl(vector_base<NumericT> const & input,
3182  vector_base<NumericT> & output,
3183  bool is_inclusive)
3184  {
3185  vcl_size_t block_num = 128;
3186  vcl_size_t threads_per_block = 128;
3187 
3188  viennacl::backend::mem_handle cuda_carries;
3189  viennacl::backend::memory_create(cuda_carries, sizeof(NumericT)*block_num, viennacl::traits::context(input));
3190 
3191  // First step: Scan within each thread group and write carries
3192  scan_kernel_1<<<block_num, threads_per_block>>>(viennacl::cuda_arg(input),
3193  static_cast<unsigned int>(viennacl::traits::start(input)),
3194  static_cast<unsigned int>(viennacl::traits::stride(input)),
3195  static_cast<unsigned int>(viennacl::traits::size(input)),
3196 
3197  viennacl::cuda_arg(output),
3198  static_cast<unsigned int>(viennacl::traits::start(output)),
3199  static_cast<unsigned int>(viennacl::traits::stride(output)),
3200 
3201  static_cast<unsigned int>(is_inclusive ? 0 : 1),
3202  viennacl::cuda_arg<NumericT>(cuda_carries)
3203  );
3204 
3205  // Second step: Compute offset for each thread group (exclusive scan for each thread group)
3206  scan_kernel_2<<<1, block_num>>>(viennacl::cuda_arg<NumericT>(cuda_carries));
3207 
3208  // Third step: Offset each thread group accordingly
3209  scan_kernel_3<<<block_num, threads_per_block>>>(viennacl::cuda_arg(output),
3210  static_cast<unsigned int>(viennacl::traits::start(output)),
3211  static_cast<unsigned int>(viennacl::traits::stride(output)),
3212  static_cast<unsigned int>(viennacl::traits::size(output)),
3213 
3214  viennacl::cuda_arg<NumericT>(cuda_carries)
3215  );
3216  }
3217 }
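
To see how the three kernels cooperate, here is a worked example of the carry scheme in scan_impl, shown with deliberately tiny block sizes (the actual launch uses 128 blocks of 128 threads each):

// input (6 elements, 2 "blocks" of 3):       [1, 2, 3 | 4, 5, 6]
// step 1, scan_kernel_1: per-block inclusive [1, 3, 6 | 4, 9, 15],  carries = [6, 15]
// step 2, scan_kernel_2: exclusive scan of the carries -> [0, 6]
// step 3, scan_kernel_3: add carries[b] to every element of block b -> [1, 3, 6 | 10, 15, 21]
// which is the inclusive scan of the input; the exclusive variant only shifts values within step 1.
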
3218 
3219 
3225 template<typename NumericT>
3226 void inclusive_scan(vector_base<NumericT> const & input,
3227  vector_base<NumericT> & output)
3228 {
3229  detail::scan_impl(input, output, true);
3230 }
3231 
3232 
3238 template<typename NumericT>
3239 void exclusive_scan(vector_base<NumericT> const & input,
3240  vector_base<NumericT> & output)
3241 {
3242  detail::scan_impl(input, output, false);
3243 }
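
A short usage sketch of the two scan entry points; the exact header providing the viennacl::linalg::inclusive_scan()/exclusive_scan() dispatchers is an assumption here:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"   // assumed location of the scan dispatchers

void scan_sketch()
{
  viennacl::vector<int> v(4), out(4);
  // with v = (1, 2, 3, 4):
  viennacl::linalg::inclusive_scan(v, out);   // out = (1, 3, 6, 10)
  viennacl::linalg::exclusive_scan(v, out);   // out = (0, 1, 3, 6)
}
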
3244 
3245 
3246 
3247 } //namespace cuda
3248 } //namespace linalg
3249 } //namespace viennacl
3250 
3251 
3252 #endif
vcl_size_t const_size() const
Definition: vector.hpp:1143
__global__ void convert_kernel(DestNumericT *dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, SrcNumericT const *src, unsigned int start_src, unsigned int inc_src)
__global__ void vec_element_abs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:160
void vector_assign(vector_base< NumericT > &vec1, ScalarT1 const &alpha, bool up_to_internal_size=false)
Assign a constant value to a vector (-range/-slice)
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:227
__global__ void vector_sum_kernel_unsigned_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
void norm_2_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^2-norm of a vector - implementation.
void convert(matrix_base< DestNumericT > &mat1, matrix_base< SrcNumericT > const &mat2)
__global__ void norm_kernel_floats(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, NumericT *group_results)
__global__ void vec_element_asin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Generic size and resize functionality for different vector and matrix types.
__global__ void plane_rotation_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT alpha, NumericT beta)
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, NumericT *group_results)
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:386
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
void av(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
__global__ void vector_multi_sum_kernel(NumericT const *vec1, NumericT *result, unsigned int start_result, unsigned int inc_result)
__global__ void vector_maxmin_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_fabs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
void max_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the maximum of a vector, both reduction stages run on the GPU.
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void norm_kernel_unsigned_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
Determines row and column increments for matrices and matrix proxies.
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM.
Definition: memory.hpp:261
void exclusive_scan(vector_base< NumericT > const &input, vector_base< NumericT > &output)
This function implements an exclusive scan using CUDA.
void norm_1_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^1-norm of a vector.
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
#define VIENNACL_MDOT_WORKGROUP_SIZE
An expression template class that represents a binary operation that yields a vector.
Definition: forwards.h:239
__global__ void scan_kernel_2(NumericT *carries)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
__global__ void vec_element_sqrt_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void avbv_v_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
void max_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the maximum of a vector, first reduction stage on the GPU, second stage on the CPU...
__global__ void vec_element_tanh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
float NumericT
Definition: bisect.cpp:40
#define VIENNACL_MDOT_WORKGROUP_NUM
void vector_swap(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
Swaps the contents of two vectors, data is copied.
__global__ void index_norm_inf_kernel(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int *result)
void scan_impl(vector_base< NumericT > const &input, vector_base< NumericT > &output, bool is_inclusive)
Worker routine for scan routines.
__global__ void inner_prod_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT *group_buffer)
__global__ void vec_element_tan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
void inclusive_scan(vector_base< NumericT > const &input, vector_base< NumericT > &output)
This function implements an inclusive scan using CUDA.
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:448
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)
Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from vie...
Definition: forwards.h:269
void norm_inf_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the supremum-norm of a vector.
void inner_prod_cpu(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, NumericT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1...
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
__global__ void vec_element_atan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void sum_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the sum of a vector, first reduction stage on the GPU, second stage on the CPU.
__global__ void scan_kernel_3(NumericT *Y, unsigned int startY, unsigned int incY, unsigned int sizeY, NumericT const *carries)
Common base class for dense vectors, vector ranges, and vector slices.
Definition: vector_def.hpp:104
__global__ void vector_sum_kernel_floats(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void scan_kernel_1(NumericT const *X, unsigned int startX, unsigned int incX, unsigned int sizeX, NumericT *Y, unsigned int startY, unsigned int incY, unsigned int scan_offset, NumericT *carries)
void sum_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the sum of a vector, both reduction stages run on the GPU.
void avbv_v(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
std::size_t vcl_size_t
Definition: forwards.h:75
__global__ void vec_element_log_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_cosh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division).
Definition: predicate.hpp:466
__global__ void av_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_acos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_ceil_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void min_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the minimum of a vector, first reduction stage on the GPU, second stage on the CPU.
__global__ void norm_kernel_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
void norm_1_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^1-norm of a vector.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vcl_size_t index_norm_inf(vector_base< NumericT > const &vec1)
Computes the index of the first entry that is equal to the supremum-norm in modulus.
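A sketch of the corresponding user-level reductions; the norm headers are standard ViennaCL includes, and index_norm_inf() is assumed to be reachable through the vector operations pulled in by viennacl/vector.hpp:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/norm_1.hpp"
#include "viennacl/linalg/norm_2.hpp"
#include "viennacl/linalg/norm_inf.hpp"

void norm_example(viennacl::vector<float> const & v)
{
  float n1   = viennacl::linalg::norm_1(v);    // sum of absolute values
  float n2   = viennacl::linalg::norm_2(v);    // Euclidean norm
  float ninf = viennacl::linalg::norm_inf(v);  // largest absolute value
  viennacl::vcl_size_t idx = viennacl::linalg::index_norm_inf(v);  // index of that entry
  (void)n1; (void)n2; (void)ninf; (void)idx;
}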
__global__ void element_op_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
__global__ void vec_element_exp_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
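These element-wise kernels back the element_* convenience functions at the library level; a hedged sketch, assuming the wrappers live in namespace viennacl::linalg and are pulled in by viennacl/vector.hpp:

#include "viennacl/vector.hpp"

void elementwise_example(viennacl::vector<float> const & x,
                         viennacl::vector<float> const & y,
                         viennacl::vector<float> & z)
{
  // x, y, z are assumed to have the same size
  z = viennacl::linalg::element_exp(x);       // unary:  z_i = exp(x_i)
  z = viennacl::linalg::element_prod(x, y);   // binary: z_i = x_i * y_i
  z = viennacl::linalg::element_div(x, y);    // binary: z_i = x_i / y_i
}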
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void plane_rotation(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2, NumericT alpha, NumericT beta)
Computes a plane rotation of two vectors.
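User-level sketch; the sign convention of the BLAS-rot-style update (x, y) <- (alpha*x + beta*y, alpha*y - beta*x) is assumed from the standard Givens rotation:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/vector_operations.hpp"

void rotation_example(viennacl::vector<float> & x, viennacl::vector<float> & y)
{
  float alpha = 0.8f, beta = 0.6f;   // cosine/sine of the rotation angle
  viennacl::linalg::plane_rotation(x, y, alpha, beta);
}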
void min_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the minimum of a vector, both reduction stages run on the GPU.
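The user-level reductions viennacl::linalg::max(), min() and sum() are thin layers over these *_impl / *_cpu routines; a sketch with assumed header locations:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/maxmin.hpp"   // assumed home of max()/min() for ViennaCL vectors
#include "viennacl/linalg/sum.hpp"      // assumed home of sum()

void reduction_example(viennacl::vector<float> const & v)
{
  float vmax = viennacl::linalg::max(v);   // dispatches to max_impl / max_cpu
  float vmin = viennacl::linalg::min(v);   // dispatches to min_impl / min_cpu
  float vsum = viennacl::linalg::sum(v);   // dispatches to sum_impl / sum_cpu
  (void)vmax; (void)vmin; (void)vsum;
}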
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
Common routines for CUDA execution.
void inner_prod_impl(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, ScalarT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slice.
Definition: vector_def.hpp:87
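Example of the intended use as an initializer (a minimal sketch):

#include "viennacl/vector.hpp"

void init_example()
{
  // 10-entry device vector with every entry set to 3.0f
  viennacl::vector<float> v = viennacl::scalar_vector<float>(10, 3.0f);
  (void)v;
}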
__global__ void vector_swap_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2)
void norm_inf_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the supremum-norm of a vector.
__global__ void avbv_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, NumericT *group_results)
__global__ void vec_element_sin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
Definition: mem_handle.hpp:89
VectorType const & const_at(vcl_size_t i) const
Definition: vector.hpp:1146
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:30
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices.
Definition: forwards.h:130
__global__ void vec_element_cos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_assign_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1, NumericT alpha)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If the host pointer argument is provided, the buffer is initialized with data from that pointer.
Definition: memory.hpp:87
__global__ void vec_element_log10_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
Definition: common.hpp:39
T min(const T &lhs, const T &rhs)
Minimum.
Definition: util.hpp:45
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padding.
Definition: vector_def.hpp:120
iterator end()
Returns an iterator pointing to the end of the vector (STL like)
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplications).
Definition: predicate.hpp:436
__global__ void vec_element_floor_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
A tag class representing element-wise unary operations (like sin()) on vectors or matrices.
Definition: forwards.h:134
Implementation of the ViennaCL scalar class.
void norm_2_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^2-norm of a vector - implementation.
void avbv(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
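At the user level, fused expressions of the form x = alpha*y + beta*z are what end up in the av/avbv/avbv_v kernel family; a sketch:

#include "viennacl/vector.hpp"

void axpby_example(viennacl::vector<float> & x,
                   viennacl::vector<float> const & y,
                   viennacl::vector<float> const & z)
{
  x  = 2.0f * y - 0.5f * z;   // one fused avbv-style pass instead of two separate scalings
  x += 2.0f * y - 0.5f * z;   // accumulating form, handled by the avbv_v flavor
}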
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, const NumericT *y4, unsigned int start4, unsigned int stride4, const NumericT *y5, unsigned int start5, unsigned int stride5, const NumericT *y6, unsigned int start6, unsigned int stride6, const NumericT *y7, unsigned int start7, unsigned int stride7, NumericT *group_results)
__global__ void vec_element_sinh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
Definition: common.hpp:188
__device__ NumericT cuda_abs(NumericT val)
Simple enable-if variant that uses the SFINAE pattern.
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91
__global__ void element_op_int_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)