ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
vector_operations.hpp
1 #ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 /** @file viennacl/linalg/opencl/vector_operations.hpp
22  @brief Implementations of vector operations using OpenCL
23 */
24 
25 #include <cmath>
26 
27 #include "viennacl/forwards.h"
28 #include "viennacl/detail/vector_def.hpp"
29 #include "viennacl/ocl/device.hpp"
30 #include "viennacl/ocl/handle.hpp"
31 #include "viennacl/ocl/kernel.hpp"
32 #include "viennacl/scalar.hpp"
33 #include "viennacl/tools/tools.hpp"
34 #include "viennacl/linalg/opencl/common.hpp"
35 #include "viennacl/linalg/opencl/kernels/vector.hpp"
36 #include "viennacl/linalg/opencl/kernels/vector_element.hpp"
37 #include "viennacl/linalg/opencl/kernels/scan.hpp"
38 #include "viennacl/meta/predicate.hpp"
39 #include "viennacl/meta/enable_if.hpp"
40 #include "viennacl/traits/size.hpp"
41 #include "viennacl/traits/start.hpp"
42 #include "viennacl/traits/handle.hpp"
43 #include "viennacl/traits/stride.hpp"
44 
45 namespace viennacl
46 {
47 namespace linalg
48 {
49 namespace opencl
50 {
51 
52 //
53 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
54 //
55 template<typename DestNumericT, typename SrcNumericT>
56 void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
57 {
58  assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
59 
60  std::string kernel_name("convert_");
61  kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
62  kernel_name += "_";
63  kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
64 
65  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(dest).context());
66  viennacl::linalg::opencl::kernels::vector_convert::init(ctx);
67  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_convert::program_name(), kernel_name);
68 
69  viennacl::ocl::enqueue(k( dest, cl_uint(dest.start()), cl_uint(dest.stride()), cl_uint(dest.size()),
70  src, cl_uint( src.start()), cl_uint( src.stride())
71  ) );
72 
73 }
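// Usage sketch (an assumption, not taken from this file): convert() performs the element-wise
// precision conversion behind mixed-type vector assignment in ViennaCL 1.7, selecting a kernel
// whose name is built from the destination and source scalar types:
//
//   viennacl::vector<double> xd(1000);
//   viennacl::vector<float>  xf(1000);
//   xf = xd;   // double -> float conversion carried out on the device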
74 
75 template <typename T, typename ScalarType1>
76 void av(vector_base<T> & vec1,
77  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
78 {
79  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
80 
81  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
82  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
83 
84  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
85 
86  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
87  (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
88  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
89  viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
90 
91  viennacl::ocl::packed_cl_uint size_vec1;
92  size_vec1.start = cl_uint(viennacl::traits::start(vec1));
93  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
94  size_vec1.size = cl_uint(viennacl::traits::size(vec1));
95  size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
96 
97  viennacl::ocl::packed_cl_uint size_vec2;
98  size_vec2.start = cl_uint(viennacl::traits::start(vec2));
99  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
100  size_vec2.size = cl_uint(viennacl::traits::size(vec2));
101  size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
102 
103 
104  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
105  size_vec1,
106 
107  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
108  options_alpha,
109  viennacl::traits::opencl_handle(vec2),
110  size_vec2 )
111  );
112 }
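// Usage sketch (assuming the usual ViennaCL operator frontend dispatches here): av() implements
// plain scaling, x = alpha * y, with alpha either on the host ("av_cpu") or on the device ("av_gpu"):
//
//   viennacl::vector<float> x(1000), y(1000);
//   x = 2.0f * y;                        // host scalar   -> "av_cpu"
//   viennacl::scalar<float> a = 2.0f;
//   x = a * y;                           // device scalar -> "av_gpu"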
113 
114 
115 template <typename T, typename ScalarType1, typename ScalarType2>
116 void avbv(vector_base<T> & vec1,
117  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
118  vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
119 {
120  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
121  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
122 
123  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
124  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
125 
126  std::string kernel_name;
127  if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
128  kernel_name = "avbv_cpu_cpu";
129  else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
130  kernel_name = "avbv_cpu_gpu";
131  else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
132  kernel_name = "avbv_gpu_cpu";
133  else
134  kernel_name = "avbv_gpu_gpu";
135 
136  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
137  cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
138 
139  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
140  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
141  viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
142 
143  viennacl::ocl::packed_cl_uint size_vec1;
144  size_vec1.start = cl_uint(viennacl::traits::start(vec1));
145  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
146  size_vec1.size = cl_uint(viennacl::traits::size(vec1));
147  size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
148 
149  viennacl::ocl::packed_cl_uint size_vec2;
150  size_vec2.start = cl_uint(viennacl::traits::start(vec2));
151  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
152  size_vec2.size = cl_uint(viennacl::traits::size(vec2));
153  size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
154 
155  viennacl::ocl::packed_cl_uint size_vec3;
156  size_vec3.start = cl_uint(viennacl::traits::start(vec3));
157  size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
158  size_vec3.size = cl_uint(viennacl::traits::size(vec3));
159  size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));
160 
161  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
162  size_vec1,
163 
164  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
165  options_alpha,
166  viennacl::traits::opencl_handle(vec2),
167  size_vec2,
168 
169  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
170  options_beta,
171  viennacl::traits::opencl_handle(vec3),
172  size_vec3 )
173  );
174 }
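// Usage sketch (assuming the standard expression-template frontend): avbv() fuses
// x = alpha * y + beta * z into a single kernel launch, e.g.
//
//   viennacl::vector<float> x(1000), y(1000), z(1000);
//   x = 2.0f * y + 3.0f * z;   // one "avbv_cpu_cpu" launch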
175 
176 
177 template <typename T, typename ScalarType1, typename ScalarType2>
178 void avbv_v(vector_base<T> & vec1,
179  vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
180  vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
181 {
182  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
183  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
184 
185  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
186  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
187 
188  std::string kernel_name;
189  if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
190  kernel_name = "avbv_v_cpu_cpu";
191  else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
192  kernel_name = "avbv_v_cpu_gpu";
193  else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
194  kernel_name = "avbv_v_gpu_cpu";
195  else
196  kernel_name = "avbv_v_gpu_gpu";
197 
198  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
199  cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
200 
201  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
202  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
203  viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
204 
205  viennacl::ocl::packed_cl_uint size_vec1;
206  size_vec1.start = cl_uint(viennacl::traits::start(vec1));
207  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
208  size_vec1.size = cl_uint(viennacl::traits::size(vec1));
209  size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
210 
211  viennacl::ocl::packed_cl_uint size_vec2;
212  size_vec2.start = cl_uint(viennacl::traits::start(vec2));
213  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
214  size_vec2.size = cl_uint(viennacl::traits::size(vec2));
215  size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
216 
217  viennacl::ocl::packed_cl_uint size_vec3;
218  size_vec3.start = cl_uint(viennacl::traits::start(vec3));
219  size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
220  size_vec3.size = cl_uint(viennacl::traits::size(vec3));
221  size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));
222 
223  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
224  size_vec1,
225 
226  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
227  options_alpha,
228  viennacl::traits::opencl_handle(vec2),
229  size_vec2,
230 
231  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
232  options_beta,
233  viennacl::traits::opencl_handle(vec3),
234  size_vec3 )
235  );
236 }
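// Usage sketch (assuming the standard frontend): avbv_v() is the accumulating variant,
// i.e. x += alpha * y + beta * z:
//
//   x += 2.0f * y + 3.0f * z;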
237 
238 
239 /** @brief Assign a constant value to a vector (-range/-slice)
240 *
241 * @param vec1                The vector to which the value should be assigned
242 * @param alpha               The value to be assigned
243 * @param up_to_internal_size Whether to fill the padded memory beyond the logical size as well
244 */
245 template <typename T>
246 void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
247 {
248  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
249  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
250 
251  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
252  k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
253  viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
254 
255  cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
256  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
257  cl_uint(viennacl::traits::start(vec1)),
258  cl_uint(viennacl::traits::stride(vec1)),
259  size,
260  cl_uint(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
261  viennacl::traits::opencl_handle(T(alpha)) )
262  );
263 }
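// Usage sketch (hedged; the exact frontend entry points are not shown in this file):
// vector_assign() fills a vector, range or slice with a single value, typically reached via
//
//   viennacl::vector<float> x(1000);
//   x.clear();                                            // all entries set to 0
//   x = viennacl::scalar_vector<float>(x.size(), 1.5f);   // all entries set to 1.5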
264 
265 
266 /** @brief Swaps the contents of two vectors, data is copied
267 *
268 * @param vec1   The first vector
269 * @param vec2   The second vector
270 */
271 template <typename T>
272 void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
273 {
274  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
275 
276  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
277  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
278 
279  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");
280 
281  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
282  cl_uint(viennacl::traits::start(vec1)),
283  cl_uint(viennacl::traits::stride(vec1)),
284  cl_uint(viennacl::traits::size(vec1)),
285  viennacl::traits::opencl_handle(vec2),
286  cl_uint(viennacl::traits::start(vec2)),
287  cl_uint(viennacl::traits::stride(vec2)),
288  cl_uint(viennacl::traits::size(vec2)))
289  );
290 }
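// Usage sketch (assuming viennacl::swap() forwards to this backend):
//
//   viennacl::vector<double> a(100), b(100);
//   viennacl::swap(a, b);   // exchanges the contents by copying on the device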
291 
292 ///////////////////// Elementwise operations /////////////////
293 
294 /** @brief Implementation of the element-wise operations v1 = v2 .* v3 and v1 = v2 ./ v3 (entry-wise product, division and power)
295 *
296 * @param vec1   The result vector (overwritten)
297 * @param proxy  The proxy object holding v2, v3 and the operation tag
298 */
299 template <typename T, typename OP>
300 void element_op(vector_base<T> & vec1,
301  vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
302 {
303  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
304  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
305 
306  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
307  viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
308 
309  std::string kernel_name = "element_pow";
310  cl_uint op_type = 2; //0: product, 1: division, 2: power
311  if (viennacl::is_division<OP>::value)
312  {
313  op_type = 1;
314  kernel_name = "element_div";
315  }
316  else if (viennacl::is_product<OP>::value)
317  {
318  op_type = 0;
319  kernel_name = "element_prod";
320  }
321 
322  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), kernel_name);
323 
324  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
325  cl_uint(viennacl::traits::start(vec1)),
326  cl_uint(viennacl::traits::stride(vec1)),
327  cl_uint(viennacl::traits::size(vec1)),
328 
329  viennacl::traits::opencl_handle(proxy.lhs()),
330  cl_uint(viennacl::traits::start(proxy.lhs())),
331  cl_uint(viennacl::traits::stride(proxy.lhs())),
332 
333  viennacl::traits::opencl_handle(proxy.rhs()),
334  cl_uint(viennacl::traits::start(proxy.rhs())),
335  cl_uint(viennacl::traits::stride(proxy.rhs())),
336 
337  op_type)
338  );
339 }
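// Usage sketch: the binary element_op() backs the element-wise frontend functions, e.g.
//
//   viennacl::vector<float> x(1000), y(1000), z(1000);
//   x = viennacl::linalg::element_prod(y, z);   // x_i = y_i * z_i   ("element_prod" kernel)
//   x = viennacl::linalg::element_div(y, z);    // x_i = y_i / z_i   ("element_div" kernel)
//   x = viennacl::linalg::element_pow(y, z);    // x_i = pow(y_i, z_i)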
340 
341 ///////////////////// Unary elementwise operations /////////////////
342 
343 /** @brief Implementation of unary element-wise operations v1 = OP(v2)
344 *
345 * @param vec1   The result vector (overwritten)
346 * @param proxy  The proxy object holding v2 and the unary operation tag
347 */
348 template <typename T, typename OP>
349 void element_op(vector_base<T> & vec1,
350  vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
351 {
352  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
353  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
354 
355  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
356  viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
357 
358  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");
359 
360  viennacl::ocl::packed_cl_uint size_vec1;
361  size_vec1.start = cl_uint(viennacl::traits::start(vec1));
362  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
363  size_vec1.size = cl_uint(viennacl::traits::size(vec1));
364  size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
365 
366  viennacl::ocl::packed_cl_uint size_vec2;
367  size_vec2.start = cl_uint(viennacl::traits::start(proxy.lhs()));
368  size_vec2.stride = cl_uint(viennacl::traits::stride(proxy.lhs()));
369  size_vec2.size = cl_uint(viennacl::traits::size(proxy.lhs()));
370  size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(proxy.lhs()));
371 
372  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
373  size_vec1,
374  viennacl::traits::opencl_handle(proxy.lhs()),
375  size_vec2)
376  );
377 }
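// Usage sketch: the unary element_op() serves element-wise math functions such as
//
//   viennacl::vector<float> x(1000), y(1000);
//   x = viennacl::linalg::element_sqrt(y);   // x_i = sqrt(y_i)
//   x = viennacl::linalg::element_exp(y);    // x_i = exp(y_i)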
378 
379 ///////////////////// Inner products and norms /////////////////
380 
381 /** @brief Computes the partial inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
382 *
383 * @param vec1            The first vector
384 * @param vec2            The second vector
385 * @param partial_result  The partial result of each work group
386 */
387 template <typename T>
388 void inner_prod_impl(vector_base<T> const & vec1,
389  vector_base<T> const & vec2,
390  vector_base<T> & partial_result)
391 {
392  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
393  assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
394 
395  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
396  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
397 
398  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
399  && bool("Incompatible vector sizes in inner_prod_impl()!"));
400 
401  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
402 
403  assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );
404 
405  viennacl::ocl::packed_cl_uint size_vec1;
406  size_vec1.start = cl_uint(viennacl::traits::start(vec1));
407  size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
408  size_vec1.size = cl_uint(viennacl::traits::size(vec1));
409  size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
410 
411  viennacl::ocl::packed_cl_uint size_vec2;
412  size_vec2.start = cl_uint(viennacl::traits::start(vec2));
413  size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
414  size_vec2.size = cl_uint(viennacl::traits::size(vec2));
415  size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
416 
417  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
418  size_vec1,
419  viennacl::traits::opencl_handle(vec2),
420  size_vec2,
421  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
422  viennacl::traits::opencl_handle(partial_result)
423  )
424  );
425 }
426 
427 
428 //implementation of inner product:
429 //namespace {
430 /** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
431 *
432 * @param vec1    The first vector
433 * @param vec2    The second vector
434 * @param result  The result scalar (on the device)
435 */
436 template <typename T>
437 void inner_prod_impl(vector_base<T> const & vec1,
438  vector_base<T> const & vec2,
439  scalar<T> & result)
440 {
441  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
442  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
443 
444  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
445 
446  vcl_size_t work_groups = 128;
447  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
448  temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
449 
450  // Step 1: Compute partial inner products for each work group:
451  inner_prod_impl(vec1, vec2, temp);
452 
453  // Step 2: Sum partial results:
454  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
455 
456  ksum.global_work_size(0, ksum.local_work_size(0));
457  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
458  cl_uint(viennacl::traits::start(temp)),
459  cl_uint(viennacl::traits::stride(temp)),
460  cl_uint(viennacl::traits::size(temp)),
461  cl_uint(1),
462  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
463  viennacl::traits::opencl_handle(result) )
464  );
465 }
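// Usage sketch: library users call the inner_prod() frontend; the result either stays on the
// device (this overload) or is reduced on the host (inner_prod_cpu() below):
//
//   viennacl::vector<float> x(1000), y(1000);
//   viennacl::scalar<float> s = viennacl::linalg::inner_prod(x, y);   // result remains on the device
//   float s_host = viennacl::linalg::inner_prod(x, y);                // result transferred to the host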
466 
467 namespace detail
468 {
469  template<typename NumericT>
470  viennacl::ocl::packed_cl_uint make_layout(vector_base<NumericT> const & vec)
471  {
472  viennacl::ocl::packed_cl_uint ret;
473  ret.start = cl_uint(viennacl::traits::start(vec));
474  ret.stride = cl_uint(viennacl::traits::stride(vec));
475  ret.size = cl_uint(viennacl::traits::size(vec));
476  ret.internal_size = cl_uint(viennacl::traits::internal_size(vec));
477  return ret;
478  }
479 }
480 
481 /** @brief Computes multiple inner products <x, y1>, <x, y2>, ..., sharing the common vector x.
482 *
483 * @param x          The common vector
484 * @param vec_tuple  The tuple of vectors y1, y2, ..., yN
485 * @param result     The result vector receiving the N inner products
486 */
487 template <typename NumericT>
488 void inner_prod_impl(vector_base<NumericT> const & x,
489  vector_tuple<NumericT> const & vec_tuple,
490  vector_base<NumericT> & result)
491 {
492  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
493 
494  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
495  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
496  viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::init(ctx);
497 
498  viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);
499 
500  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "sum_inner_prod");
501  viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "inner_prod1");
502  viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod2");
503  viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod3");
504  viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod4");
505  viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod8");
506 
507  vcl_size_t work_groups = inner_prod_kernel_8.global_work_size(0) / inner_prod_kernel_8.local_work_size(0);
508  viennacl::vector<NumericT> temp(8 * work_groups, viennacl::traits::context(x));
509 
510  vcl_size_t current_index = 0;
511  while (current_index < vec_tuple.const_size())
512  {
513  switch (vec_tuple.const_size() - current_index)
514  {
515  case 7:
516  case 6:
517  case 5:
518  case 4:
519  {
520  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
521  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
522  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
523  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
524  viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
525  viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
526  viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
527  viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
528  viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
529  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 4 * inner_prod_kernel_4.local_work_size()),
530  viennacl::traits::opencl_handle(temp)
531  ) );
532 
533  ksum.global_work_size(0, 4 * ksum.local_work_size(0));
534  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
535  cl_uint(work_groups),
536  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * ksum.local_work_size()),
537  viennacl::traits::opencl_handle(result),
538  cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
539  cl_uint(viennacl::traits::stride(result))
540  )
541  );
542  }
543  current_index += 4;
544  break;
545 
546  case 3:
547  {
548  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
549  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
550  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
551  viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
552  viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
553  viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
554  viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
555  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 3 * inner_prod_kernel_3.local_work_size()),
556  viennacl::traits::opencl_handle(temp)
557  ) );
558 
559  ksum.global_work_size(0, 3 * ksum.local_work_size(0));
560  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
561  cl_uint(work_groups),
562  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * ksum.local_work_size()),
563  viennacl::traits::opencl_handle(result),
564  cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
565  cl_uint(viennacl::traits::stride(result))
566  )
567  );
568  }
569  current_index += 3;
570  break;
571 
572  case 2:
573  {
574  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
575  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
576  viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
577  viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
578  viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
579  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 2 * inner_prod_kernel_2.local_work_size()),
580  viennacl::traits::opencl_handle(temp)
581  ) );
582 
583  ksum.global_work_size(0, 2 * ksum.local_work_size(0));
584  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
585  cl_uint(work_groups),
586  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * ksum.local_work_size()),
587  viennacl::traits::opencl_handle(result),
588  cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
589  cl_uint(viennacl::traits::stride(result))
590  )
591  );
592  }
593  current_index += 2;
594  break;
595 
596  case 1:
597  {
598  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
599  viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
600  viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
601  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 1 * inner_prod_kernel_1.local_work_size()),
602  viennacl::traits::opencl_handle(temp)
603  ) );
604 
605  ksum.global_work_size(0, 1 * ksum.local_work_size(0));
606  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
607  cl_uint(work_groups),
608  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * ksum.local_work_size()),
609  viennacl::traits::opencl_handle(result),
610  cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
611  cl_uint(viennacl::traits::stride(result))
612  )
613  );
614  }
615  current_index += 1;
616  break;
617 
618  default: //8 or more vectors
619  {
620  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
621  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
622  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
623  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
624  vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
625  vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
626  vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
627  vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);
628  viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
629  viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
630  viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
631  viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
632  viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
633  viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
634  viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
635  viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
636  viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
637  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 8 * inner_prod_kernel_8.local_work_size()),
638  viennacl::traits::opencl_handle(temp)
639  ) );
640 
641  ksum.global_work_size(0, 8 * ksum.local_work_size(0));
642  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
643  cl_uint(work_groups),
644  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * ksum.local_work_size()),
645  viennacl::traits::opencl_handle(result),
646  cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
647  cl_uint(viennacl::traits::stride(result))
648  )
649  );
650  }
651  current_index += 8;
652  break;
653  }
654  }
655 
656 }
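// Usage sketch (assuming viennacl::tie() for building the vector_tuple, as described in the
// ViennaCL manual): several inner products sharing the left operand are evaluated in one sweep:
//
//   viennacl::vector<float> x(1000), y0(1000), y1(1000), y2(1000), r(3);
//   r = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));   // r[i] = <x, y_i>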
657 
658 
659 
660 //implementation of inner product:
661 //namespace {
662 /** @brief Computes the inner product of two vectors with the final reduction step on the CPU. Library users should call inner_prod(vec1, vec2).
663 *
664 * @param vec1    The first vector
665 * @param vec2    The second vector
666 * @param result  The result scalar (on the host)
667 */
668 template <typename T>
669 void inner_prod_cpu(vector_base<T> const & vec1,
670  vector_base<T> const & vec2,
671  T & result)
672 {
673  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
674 
675  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
676 
677  vcl_size_t work_groups = 128;
678  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
679  temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
680 
681  // Step 1: Compute partial inner products for each work group:
682  inner_prod_impl(vec1, vec2, temp);
683 
684  // Step 2: Sum partial results:
685 
686  // Now copy partial results from GPU back to CPU and run reduction there:
687  std::vector<T> temp_cpu(work_groups);
688  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
689 
690  result = 0;
691  for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
692  result += *it;
693 }
694 
695 
697 
698 /** @brief Computes the partial work group results for vector norms
699 *
700 * @param vec             The vector
701 * @param partial_result  The partial results of each work group
702 * @param norm_id         Norm selector: 0: supremum norm, 1: 1-norm, 2: 2-norm
703 */
704 template <typename T>
705 void norm_reduction_impl(vector_base<T> const & vec,
706  vector_base<T> & partial_result,
707  cl_uint norm_id)
708 {
709  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
710 
711  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
712  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
713 
714  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");
715 
716  assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );
717 
718  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
719  cl_uint(viennacl::traits::start(vec)),
720  cl_uint(viennacl::traits::stride(vec)),
721  cl_uint(viennacl::traits::size(vec)),
722  cl_uint(norm_id),
723  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
724  viennacl::traits::opencl_handle(partial_result) )
725  );
726 }
727 
728 
730 
736 template <typename T>
737 void norm_1_impl(vector_base<T> const & vec,
738  scalar<T> & result)
739 {
740  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
741 
742  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
743 
744  vcl_size_t work_groups = 128;
745  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
746 
747  // Step 1: Compute the partial work group results
748  norm_reduction_impl(vec, temp, 1);
749 
750  // Step 2: Compute the partial reduction using OpenCL
751  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
752 
753  ksum.global_work_size(0, ksum.local_work_size(0));
754  viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
755  cl_uint(viennacl::traits::start(temp)),
756  cl_uint(viennacl::traits::stride(temp)),
757  cl_uint(viennacl::traits::size(temp)),
758  cl_uint(1),
759  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
760  result)
761  );
762 }
763 
769 template <typename T>
770 void norm_1_cpu(vector_base<T> const & vec,
771  T & result)
772 {
773  vcl_size_t work_groups = 128;
774  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
775 
776  // Step 1: Compute the partial work group results
777  norm_reduction_impl(vec, temp, 1);
778 
779  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
780  typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
781 
782  CPUVectorType temp_cpu(work_groups);
783  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
784 
785  result = 0;
786  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
787  result += static_cast<T>(*it);
788 }
789 
790 
791 
793 
794 
800 template <typename T>
801 void norm_2_impl(vector_base<T> const & vec,
802  scalar<T> & result)
803 {
804  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
805 
806  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
807 
808  vcl_size_t work_groups = 128;
809  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
810 
811  // Step 1: Compute the partial work group results
812  norm_reduction_impl(vec, temp, 2);
813 
814  // Step 2: Reduction via OpenCL
815  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
816 
817  ksum.global_work_size(0, ksum.local_work_size(0));
818  viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
819  cl_uint(viennacl::traits::start(temp)),
820  cl_uint(viennacl::traits::stride(temp)),
821  cl_uint(viennacl::traits::size(temp)),
822  cl_uint(2),
823  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
824  result)
825  );
826 }
827 
833 template <typename T>
834 void norm_2_cpu(vector_base<T> const & vec,
835  T & result)
836 {
837  vcl_size_t work_groups = 128;
838  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
839 
840  // Step 1: Compute the partial work group results
841  norm_reduction_impl(vec, temp, 2);
842 
843  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
844  typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
845 
846  CPUVectorType temp_cpu(work_groups);
847  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
848 
849  result = 0;
850  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
851  result += static_cast<T>(*it);
852  result = std::sqrt(result);
853 }
854 
855 
856 
858 
864 template <typename T>
865 void norm_inf_impl(vector_base<T> const & vec,
866  scalar<T> & result)
867 {
868  assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
869 
870  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
871 
872  vcl_size_t work_groups = 128;
873  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
874 
875  // Step 1: Compute the partial work group results
876  norm_reduction_impl(vec, temp, 0);
877 
878  //part 2: parallel reduction of reduced kernel:
879  viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
880 
881  ksum.global_work_size(0, ksum.local_work_size(0));
882  viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
883  cl_uint(viennacl::traits::start(temp)),
884  cl_uint(viennacl::traits::stride(temp)),
885  cl_uint(viennacl::traits::size(temp)),
886  cl_uint(0),
887  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
888  result)
889  );
890 }
891 
897 template <typename T>
898 void norm_inf_cpu(vector_base<T> const & vec,
899  T & result)
900 {
901  vcl_size_t work_groups = 128;
902  viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
903 
904  // Step 1: Compute the partial work group results
905  norm_reduction_impl(vec, temp, 0);
906 
907  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
908  typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
909 
910  CPUVectorType temp_cpu(work_groups);
911  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
912 
913  result = 0;
914  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
915  result = std::max(result, static_cast<T>(*it));
916 }
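// Usage sketch: the norm backends above are reached through the usual frontend calls, which
// either keep the result on the device or perform the final reduction on the host:
//
//   viennacl::vector<double> x(1000);
//   double n1   = viennacl::linalg::norm_1(x);     // sum of absolute values
//   double n2   = viennacl::linalg::norm_2(x);     // Euclidean norm
//   double ninf = viennacl::linalg::norm_inf(x);   // largest absolute value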
917 
918 
920 
921 //This function should return a CPU scalar, otherwise statements like
922 // vcl_rhs[index_norm_inf(vcl_rhs)]
923 // are ambiguous
929 template <typename T>
930 cl_uint index_norm_inf(vector_base<T> const & vec)
931 {
932  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
933  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
934 
935  viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
936 
937  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
938  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
939 
940  //TODO: Use multi-group kernel for large vector sizes
941 
942  k.global_work_size(0, k.local_work_size());
943  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
944  cl_uint(viennacl::traits::start(vec)),
945  cl_uint(viennacl::traits::stride(vec)),
946  cl_uint(viennacl::traits::size(vec)),
947  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
948  viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
949 
950  //read value:
951  cl_uint result;
952  cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
953  VIENNACL_ERR_CHECK(err);
954  return result;
955 }
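// Usage sketch: because the index is returned as a plain host integer, it can be used directly
// for element access, which is exactly the ambiguity the comment above refers to:
//
//   viennacl::vector<float> x(1000);
//   cl_uint idx = viennacl::linalg::index_norm_inf(x);
//   float largest_in_modulus = x[idx];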
956 
957 
959 
965 template<typename NumericT>
966 void max_impl(vector_base<NumericT> const & x,
967  scalar<NumericT> & result)
968 {
969  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
970 
971  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
972  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
973 
974  vcl_size_t work_groups = 128;
975  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
976 
977  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
978 
979  k.global_work_size(0, work_groups * k.local_work_size(0));
980  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
981  cl_uint(viennacl::traits::start(x)),
982  cl_uint(viennacl::traits::stride(x)),
983  cl_uint(viennacl::traits::size(x)),
984  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
985  viennacl::traits::opencl_handle(temp)
986  ));
987 
988  k.global_work_size(0, k.local_work_size(0));
989  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
990  cl_uint(viennacl::traits::start(temp)),
991  cl_uint(viennacl::traits::stride(temp)),
992  cl_uint(viennacl::traits::size(temp)),
993  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
994  viennacl::traits::opencl_handle(result)
995  ));
996 }
997 
1003 template<typename NumericT>
1004 void max_cpu(vector_base<NumericT> const & x,
1005  NumericT & result)
1006 {
1007  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
1008  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
1009 
1010  vcl_size_t work_groups = 128;
1011  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
1012 
1013  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
1014 
1015  k.global_work_size(0, work_groups * k.local_work_size(0));
1016  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
1017  cl_uint(viennacl::traits::start(x)),
1018  cl_uint(viennacl::traits::stride(x)),
1019  cl_uint(viennacl::traits::size(x)),
1020  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
1021  viennacl::traits::opencl_handle(temp)
1022  ));
1023 
1024  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
1025  typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type> CPUVectorType;
1026 
1027  CPUVectorType temp_cpu(work_groups);
1028  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
1029 
1030  result = static_cast<NumericT>(temp_cpu[0]);
1031  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
1032  result = std::max(result, static_cast<NumericT>(*it));
1033 
1034 }
1035 
1036 
1038 
1044 template<typename NumericT>
1045 void min_impl(vector_base<NumericT> const & x,
1046  scalar<NumericT> & result)
1047 {
1048  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
1049 
1050  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
1051  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
1052 
1053  vcl_size_t work_groups = 128;
1054  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
1055 
1056  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
1057 
1058  k.global_work_size(0, work_groups * k.local_work_size(0));
1059  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
1060  cl_uint(viennacl::traits::start(x)),
1061  cl_uint(viennacl::traits::stride(x)),
1062  cl_uint(viennacl::traits::size(x)),
1063  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
1064  viennacl::traits::opencl_handle(temp)
1065  ));
1066 
1067  k.global_work_size(0, k.local_work_size(0));
1068  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
1069  cl_uint(viennacl::traits::start(temp)),
1070  cl_uint(viennacl::traits::stride(temp)),
1071  cl_uint(viennacl::traits::size(temp)),
1072  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
1073  viennacl::traits::opencl_handle(result)
1074  ));
1075 }
1076 
1082 template<typename NumericT>
1083 void min_cpu(vector_base<NumericT> const & x,
1084  NumericT & result)
1085 {
1086  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
1087  viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
1088 
1089  vcl_size_t work_groups = 128;
1090  viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
1091 
1092  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
1093 
1094  k.global_work_size(0, work_groups * k.local_work_size(0));
1095  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
1096  cl_uint(viennacl::traits::start(x)),
1097  cl_uint(viennacl::traits::stride(x)),
1098  cl_uint(viennacl::traits::size(x)),
1099  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
1100  viennacl::traits::opencl_handle(temp)
1101  ));
1102 
1103  // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
1104  typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type> CPUVectorType;
1105 
1106  CPUVectorType temp_cpu(work_groups);
1107  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
1108 
1109  result = static_cast<NumericT>(temp_cpu[0]);
1110  for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
1111  result = std::min(result, static_cast<NumericT>(*it));
1112 }
1113 
1115 
1121 template<typename NumericT>
1122 void sum_impl(vector_base<NumericT> const & x,
1123  scalar<NumericT> & result)
1124 {
1125  assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
1126 
1126 
1127  viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(x.size(), NumericT(1), viennacl::traits::context(x));
1128  viennacl::linalg::opencl::inner_prod_impl(x, all_ones, result);
1129 }
1130 
1136 template<typename NumericT>
1137 void sum_cpu(vector_base<NumericT> const & x, NumericT & result)
1138 {
1139  viennacl::scalar<NumericT> tmp(0, viennacl::traits::context(x));
1140  sum_impl(x, tmp);
1141  result = tmp;
1142 }
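// Usage sketch (assuming the viennacl::linalg::sum/max/min frontends that these backends serve;
// the frontend names are not defined in this file):
//
//   viennacl::vector<float> x(1000);
//   float s  = viennacl::linalg::sum(x);   // sum of all entries (inner product with a vector of ones)
//   float mx = viennacl::linalg::max(x);   // largest entry
//   float mn = viennacl::linalg::min(x);   // smallest entry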
1143 
1144 
1145 //TODO: Special case vec1 == vec2 allows improvement!!
1155 template <typename T>
1156 void plane_rotation(vector_base<T> & vec1,
1157  vector_base<T> & vec2,
1158  T alpha, T beta)
1159 {
1160  assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
1161 
1162  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
1163  viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
1164 
1165  assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
1166  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");
1167 
1168  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
1169  cl_uint(viennacl::traits::start(vec1)),
1170  cl_uint(viennacl::traits::stride(vec1)),
1171  cl_uint(viennacl::traits::size(vec1)),
1172  viennacl::traits::opencl_handle(vec2),
1173  cl_uint(viennacl::traits::start(vec2)),
1174  cl_uint(viennacl::traits::stride(vec2)),
1175  cl_uint(viennacl::traits::size(vec2)),
1176  viennacl::traits::opencl_handle(alpha),
1177  viennacl::traits::opencl_handle(beta))
1178  );
1179 }
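// Usage sketch: plane_rotation() applies a Givens rotation to both vectors in place,
// (x, y) <- (alpha*x + beta*y, -beta*x + alpha*y):
//
//   viennacl::vector<float> x(1000), y(1000);
//   viennacl::linalg::plane_rotation(x, y, 0.8f, 0.6f);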
1180 
1181 
1183 
1184 
1185 namespace detail
1186 {
1192  template<typename NumericT>
1193  void scan_impl(vector_base<NumericT> const & input,
1194  vector_base<NumericT> & output,
1195  bool is_inclusive)
1196  {
1197  vcl_size_t local_worksize = 128;
1198  vcl_size_t workgroups = 128;
1199 
1200  viennacl::backend::mem_handle opencl_carries;
1201  viennacl::backend::memory_create(opencl_carries, sizeof(NumericT)*workgroups, viennacl::traits::context(input));
1202 
1203  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
1204  viennacl::linalg::opencl::kernels::scan<NumericT>::init(ctx);
1205  viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_1");
1206  viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_2");
1207  viennacl::ocl::kernel & k3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_3");
1208 
1209  // First step: Scan within each thread group and write carries
1210  k1.local_work_size(0, local_worksize);
1211  k1.global_work_size(0, workgroups * local_worksize);
1212  viennacl::ocl::enqueue(k1( input, cl_uint( input.start()), cl_uint( input.stride()), cl_uint(input.size()),
1213  output, cl_uint(output.start()), cl_uint(output.stride()),
1214  cl_uint(is_inclusive ? 0 : 1), opencl_carries.opencl_handle())
1215  );
1216 
1217  // Second step: Compute offset for each thread group (exclusive scan for each thread group)
1218  k2.local_work_size(0, workgroups);
1219  k2.global_work_size(0, workgroups);
1220  viennacl::ocl::enqueue(k2(opencl_carries.opencl_handle()));
1221 
1222  // Third step: Offset each thread group accordingly
1223  k3.local_work_size(0, local_worksize);
1224  k3.global_work_size(0, workgroups * local_worksize);
1225  viennacl::ocl::enqueue(k3(output, cl_uint(output.start()), cl_uint(output.stride()), cl_uint(output.size()),
1226  opencl_carries.opencl_handle())
1227  );
1228  }
1229 }
1230 
1231 
1237 template<typename NumericT>
1238 void inclusive_scan(vector_base<NumericT> const & input,
1239  vector_base<NumericT> & output)
1240 {
1241  detail::scan_impl(input, output, true);
1242 }
1243 
1244 
1250 template<typename NumericT>
1251 void exclusive_scan(vector_base<NumericT> const & input,
1252  vector_base<NumericT> & output)
1253 {
1254  detail::scan_impl(input, output, false);
1255 }
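// Usage sketch: both frontends forward to detail::scan_impl() above; the inclusive scan keeps
// the current element in each prefix sum, the exclusive scan shifts the result by one entry:
//
//   viennacl::vector<float> v = viennacl::scalar_vector<float>(8, 1.0f), w(8);
//   viennacl::linalg::inclusive_scan(v, w);   // w = (1, 2, 3, 4, 5, 6, 7, 8)
//   viennacl::linalg::exclusive_scan(v, w);   // w = (0, 1, 2, 3, 4, 5, 6, 7)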
1256 
1257 
1258 } //namespace opencl
1259 } //namespace linalg
1260 } //namespace viennacl
1261 
1262 
1263 #endif