ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
vector.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 
26 
27 #include "viennacl/ocl/kernel.hpp"
29 #include "viennacl/ocl/utils.hpp"
30 
31 
32 
35 namespace viennacl
36 {
37 namespace linalg
38 {
39 namespace opencl
40 {
41 namespace kernels
42 {
43 
45 
/** @brief Kind of a scalar operand in the generated av/avbv kernels.
 *
 * NOTE(review): the extracted source dropped the enum/struct declaration lines;
 * names and members below are reconstructed from their uses in generate_avbv_impl*().
 */
enum avbv_scalar_type
{
  VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
  VIENNACL_AVBV_CPU,      // scalar factor is passed by value from the host (fac2/fac3)
  VIENNACL_AVBV_GPU       // scalar factor is read from a GPU buffer (fac2[0]/fac3[0])
};

/** @brief Configuration for generating one kernel of the family vec1 (=|+=) vec2 * alpha [+ vec3 * beta]. */
struct avbv_config
{
  avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_NONE), b(VIENNACL_AVBV_NONE) {}

  bool             with_stride_and_range; // if true, index as i*stride + start (uint4 .y/.x), else plain i
  std::string      assign_op;             // assignment operator emitted between vec1 and the right-hand side ("=" or "+=")
  avbv_scalar_type a;                     // kind of the first scalar factor (alpha)
  avbv_scalar_type b;                     // kind of the second scalar factor (beta); NONE drops the vec3 term entirely
};
64 
65 // just returns the for-loop
66 template <typename StringType>
67 void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
68 {
69  source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
70  if (cfg.with_stride_and_range)
71  {
72  source.append(" vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
73  if (mult_alpha)
74  source.append("* alpha ");
75  else
76  source.append("/ alpha ");
77  if (cfg.b != VIENNACL_AVBV_NONE)
78  {
79  source.append("+ vec3[i*size3.y+size3.x] ");
80  if (mult_beta)
81  source.append("* beta");
82  else
83  source.append("/ beta");
84  }
85  }
86  else
87  {
88  source.append(" vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
89  if (mult_alpha)
90  source.append("* alpha ");
91  else
92  source.append("/ alpha ");
93  if (cfg.b != VIENNACL_AVBV_NONE)
94  {
95  source.append("+ vec3[i] ");
96  if (mult_beta)
97  source.append("* beta");
98  else
99  source.append("/ beta");
100  }
101  }
102  source.append("; \n");
103 }
104 
105 template <typename StringType>
106 void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
107 {
108  source.append("__kernel void av");
109  if (cfg.b != VIENNACL_AVBV_NONE)
110  source.append("bv");
111  if (cfg.assign_op != "=")
112  source.append("_v");
113 
114  if (cfg.a == VIENNACL_AVBV_CPU)
115  source.append("_cpu");
116  else if (cfg.a == VIENNACL_AVBV_GPU)
117  source.append("_gpu");
118 
119  if (cfg.b == VIENNACL_AVBV_CPU)
120  source.append("_cpu");
121  else if (cfg.b == VIENNACL_AVBV_GPU)
122  source.append("_gpu");
123  source.append("( \n");
124  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
125  source.append(" uint4 size1, \n");
126  source.append(" \n");
127  if (cfg.a == VIENNACL_AVBV_CPU)
128  {
129  source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
130  }
131  else if (cfg.a == VIENNACL_AVBV_GPU)
132  {
133  source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
134  }
135  source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
136  source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
137  source.append(" uint4 size2");
138 
139  if (cfg.b != VIENNACL_AVBV_NONE)
140  {
141  source.append(", \n\n");
142  if (cfg.b == VIENNACL_AVBV_CPU)
143  {
144  source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
145  }
146  else if (cfg.b == VIENNACL_AVBV_GPU)
147  {
148  source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
149  }
150  source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
151  source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n");
152  source.append(" uint4 size3 \n");
153  }
154  source.append(") { \n");
155 
156  if (cfg.a == VIENNACL_AVBV_CPU)
157  {
158  source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
159  }
160  else if (cfg.a == VIENNACL_AVBV_GPU)
161  {
162  source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
163  }
164  source.append(" if (options2 & (1 << 0)) \n");
165  source.append(" alpha = -alpha; \n");
166  source.append(" \n");
167 
168  if (cfg.b == VIENNACL_AVBV_CPU)
169  {
170  source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
171  }
172  else if (cfg.b == VIENNACL_AVBV_GPU)
173  {
174  source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
175  }
176  if (cfg.b != VIENNACL_AVBV_NONE)
177  {
178  source.append(" if (options3 & (1 << 0)) \n");
179  source.append(" beta = -beta; \n");
180  source.append(" \n");
181  }
182  source.append(" if (options2 & (1 << 1)) { \n");
183  if (cfg.b != VIENNACL_AVBV_NONE)
184  {
185  source.append(" if (options3 & (1 << 1)) {\n");
186  generate_avbv_impl2(source, numeric_string, cfg, false, false);
187  source.append(" } else {\n");
188  generate_avbv_impl2(source, numeric_string, cfg, false, true);
189  source.append(" } \n");
190  }
191  else
192  generate_avbv_impl2(source, numeric_string, cfg, false, true);
193  source.append(" } else { \n");
194  if (cfg.b != VIENNACL_AVBV_NONE)
195  {
196  source.append(" if (options3 & (1 << 1)) {\n");
197  generate_avbv_impl2(source, numeric_string, cfg, true, false);
198  source.append(" } else {\n");
199  generate_avbv_impl2(source, numeric_string, cfg, true, true);
200  source.append(" } \n");
201  }
202  else
203  generate_avbv_impl2(source, numeric_string, cfg, true, true);
204  source.append(" } \n");
205  source.append("} \n");
206 }
207 
208 template <typename StringType>
209 void generate_avbv(StringType & source, std::string const & numeric_string)
210 {
211  avbv_config cfg;
212  cfg.assign_op = "=";
213  cfg.with_stride_and_range = true;
214 
215  // av
216  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
217  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
218 
219  // avbv
220  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
221  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
222  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
223  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
224 
225  // avbv
226  cfg.assign_op = "+=";
227 
228  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
229  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
230  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
231  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
232 }
233 
// Generates the 'plane_rotation' kernel applying a Givens rotation:
// (vec1, vec2) <- (alpha*vec1 + beta*vec2, alpha*vec2 - beta*vec1), element-wise.
template <typename StringType>
void generate_plane_rotation(StringType & source, std::string const & numeric_string)
{
  std::string code;
  code += "__kernel void plane_rotation( \n";
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " __global " + numeric_string + " * vec2, \n";
  code += " unsigned int start2, \n";
  code += " unsigned int inc2, \n";
  code += " unsigned int size2, \n";
  code += " " + numeric_string + " alpha, \n";
  code += " " + numeric_string + " beta) \n";
  code += "{ \n";
  code += " " + numeric_string + " tmp1 = 0; \n";
  code += " " + numeric_string + " tmp2 = 0; \n";
  code += " \n";
  code += " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n";
  code += " { \n";
  code += " tmp1 = vec1[i*inc1+start1]; \n";
  code += " tmp2 = vec2[i*inc2+start2]; \n";
  code += " \n";
  code += " vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n";
  code += " vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n";
  code += " } \n";
  code += " \n";
  code += "} \n";
  source.append(code);
}
263 
// Generates the 'swap' kernel exchanging the entries of two strided sub-vectors element-wise.
template <typename StringType>
void generate_vector_swap(StringType & source, std::string const & numeric_string)
{
  std::string code;
  code += "__kernel void swap( \n";
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " __global " + numeric_string + " * vec2, \n";
  code += " unsigned int start2, \n";
  code += " unsigned int inc2, \n";
  code += " unsigned int size2 \n";
  code += " ) \n";
  code += "{ \n";
  code += " " + numeric_string + " tmp; \n";
  code += " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n";
  code += " { \n";
  code += " tmp = vec2[i*inc2+start2]; \n";
  code += " vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n";
  code += " vec1[i*inc1+start1] = tmp; \n";
  code += " } \n";
  code += "} \n";
  source.append(code);
}
287 
// Generates the 'assign_cpu' kernel: writes a host-provided scalar to the logical range
// [0, size1) and zero-pads the remainder up to internal_size1.
template <typename StringType>
void generate_assign_cpu(StringType & source, std::string const & numeric_string)
{
  std::string code;
  code += "__kernel void assign_cpu( \n";
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " unsigned int internal_size1, \n";
  code += " " + numeric_string + " alpha) \n";
  code += "{ \n";
  code += " for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n";
  code += " vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n";
  code += "} \n";
  source.append(code);

}
304 
305 template <typename StringType>
306 void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
307 {
308  std::stringstream ss;
309  ss << vector_num;
310  std::string vector_num_string = ss.str();
311 
312  source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
313  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
314  source.append(" uint4 params_x, \n");
315  for (vcl_size_t i=0; i<vector_num; ++i)
316  {
317  ss.str("");
318  ss << i;
319  source.append(" __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
320  source.append(" uint4 params_y"); source.append(ss.str()); source.append(", \n");
321  }
322  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
323  source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
324  source.append("{ \n");
325  source.append(" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
326  source.append(" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
327  source.append(" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
328 
329  // compute partial results within group:
330  for (vcl_size_t i=0; i<vector_num; ++i)
331  {
332  ss.str("");
333  ss << i;
334  source.append(" "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
335  }
336  source.append(" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
337  source.append(" "); source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
338  for (vcl_size_t i=0; i<vector_num; ++i)
339  {
340  ss.str("");
341  ss << i;
342  source.append(" tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
343  }
344  source.append(" } \n");
345  for (vcl_size_t i=0; i<vector_num; ++i)
346  {
347  ss.str("");
348  ss << i;
349  source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
350  }
351 
352  // now run reduction:
353  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
354  source.append(" { \n");
355  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
356  source.append(" if (get_local_id(0) < stride) { \n");
357  for (vcl_size_t i=0; i<vector_num; ++i)
358  {
359  ss.str("");
360  ss << i;
361  source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
362  }
363  source.append(" } \n");
364  source.append(" } \n");
365  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
366 
367  source.append(" if (get_local_id(0) == 0) { \n");
368  for (vcl_size_t i=0; i<vector_num; ++i)
369  {
370  ss.str("");
371  ss << i;
372  source.append(" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
373  }
374  source.append(" } \n");
375  source.append("} \n");
376 
377 }
378 
// Generates the device helper 'impl_norm' (norm_1, norm_2, or norm_inf over one work group's
// sub-range, chosen at runtime by 'norm_selector') plus the '__kernel norm' entry point which
// lets every work group reduce its share of the vector into group_buffer.
template <typename StringType>
void generate_norm(StringType & source, std::string const & numeric_string)
{
  bool const fp_type = (numeric_string == "float" || numeric_string == "double");
  std::string code;

  code += numeric_string + " impl_norm( \n";
  code += " __global const " + numeric_string + " * vec, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " unsigned int norm_selector, \n";
  code += " __local " + numeric_string + " * tmp_buffer) \n";
  code += "{ \n";
  code += " " + numeric_string + " tmp = 0; \n";
  code += " if (norm_selector == 1) \n"; //norm_1
  code += " { \n";
  code += " for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n";
  if (fp_type)
    code += " tmp += fabs(vec[i*inc1 + start1]); \n";
  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
    code += " tmp += vec[i*inc1 + start1]; \n";
  else
    code += " tmp += abs(vec[i*inc1 + start1]); \n";
  code += " } \n";
  code += " else if (norm_selector == 2) \n"; //norm_2
  code += " { \n";
  code += " " + numeric_string + " vec_entry = 0; \n";
  code += " for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n";
  code += " { \n";
  code += " vec_entry = vec[i*inc1 + start1]; \n";
  code += " tmp += vec_entry * vec_entry; \n";
  code += " } \n";
  code += " } \n";
  code += " else if (norm_selector == 0) \n"; //norm_inf
  code += " { \n";
  code += " for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n";
  if (fp_type)
    code += " tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n";
  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
    code += " tmp = max(vec[i*inc1 + start1], tmp); \n";
  else
    code += " tmp = max((" + numeric_string + ")abs(vec[i*inc1 + start1]), tmp); \n";
  code += " } \n";

  code += " tmp_buffer[get_local_id(0)] = tmp; \n";

  // norm_1 / norm_2: additive tree reduction in local memory
  code += " if (norm_selector > 0) \n";
  code += " { \n";
  code += " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) < stride) \n";
  code += " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n";
  code += " } \n";
  code += " return tmp_buffer[0]; \n";
  code += " } \n";

  // norm_inf: max-based tree reduction
  code += " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) < stride) \n";
  if (fp_type)
    code += " tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n";
  else
    code += " tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n";
  code += " } \n";

  code += " return tmp_buffer[0]; \n";
  code += "}; \n"; // trailing ';' after the brace is preserved from the original generator output

  code += "__kernel void norm( \n";
  code += " __global const " + numeric_string + " * vec, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " unsigned int norm_selector, \n";
  code += " __local " + numeric_string + " * tmp_buffer, \n";
  code += " __global " + numeric_string + " * group_buffer) \n";
  code += "{ \n";
  code += " " + numeric_string + " tmp = impl_norm(vec, \n";
  code += " ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n";
  code += " inc1, \n";
  code += " ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n";
  code += " - ( get_group_id(0) * size1) / get_num_groups(0), \n";
  code += " norm_selector, \n";
  code += " tmp_buffer); \n";

  code += " if (get_local_id(0) == 0) \n";
  code += " group_buffer[get_group_id(0)] = tmp; \n";
  code += "} \n";

  source.append(code);
}
474 
// Generates 'sum_inner_prod': each work group sums a contiguous chunk of 'vec1'
// (size_per_workgroup entries) via a local-memory tree reduction and writes one value
// to result[start_result + inc_result * group_id].
template <typename StringType>
void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
{
  std::string code;
  code += "__kernel void sum_inner_prod( \n";
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int size_per_workgroup, \n";
  code += " __local " + numeric_string + " * tmp_buffer, \n";
  code += " __global " + numeric_string + " * result, \n";
  code += " unsigned int start_result, \n";
  code += " unsigned int inc_result) \n";
  code += "{ \n";
  code += " " + numeric_string + " thread_sum = 0; \n";
  code += " for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n";
  code += " thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n";
  code += " tmp_buffer[get_local_id(0)] = thread_sum; \n";
  code += " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) < stride) \n";
  code += " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n";
  code += " } \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) == 0) \n";
  code += " result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n";
  code += "} \n";
  source.append(code);

}
506 
// Generates 'sum': a single-work-group reduction of a strided sub-vector.
// 'option' selects the operation: 0 = max of absolute values, 1 = plain sum,
// 2 = sum followed by sqrt (floating-point types only).
template <typename StringType>
void generate_sum(StringType & source, std::string const & numeric_string)
{
  std::string code;
  code += "__kernel void sum( \n";
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " unsigned int option, \n"; //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
  code += " __local " + numeric_string + " * tmp_buffer, \n";
  code += " __global " + numeric_string + " * result) \n";
  code += "{ \n";
  code += " " + numeric_string + " thread_sum = 0; \n";
  code += " " + numeric_string + " tmp = 0; \n";
  code += " for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n";
  code += " { \n";
  code += " if (option > 0) \n";
  code += " thread_sum += vec1[i*inc1+start1]; \n";
  code += " else \n";
  code += " { \n";
  code += " tmp = vec1[i*inc1+start1]; \n";
  code += " tmp = (tmp < 0) ? -tmp : tmp; \n";
  code += " thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n";
  code += " } \n";
  code += " } \n";

  code += " tmp_buffer[get_local_id(0)] = thread_sum; \n";

  // tree reduction: additive for option > 0, max-based otherwise
  code += " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) < stride) \n";
  code += " { \n";
  code += " if (option > 0) \n";
  code += " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n";
  code += " else \n";
  code += " tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n";
  code += " } \n";
  code += " } \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";

  code += " if (get_global_id(0) == 0) \n";
  code += " { \n";
  if (numeric_string == "float" || numeric_string == "double")
  {
    // floating-point types additionally support option 2 (sqrt of the sum, used for norm_2)
    code += " if (option == 2) \n";
    code += " *result = sqrt(tmp_buffer[0]); \n";
    code += " else \n";
  }
  code += " *result = tmp_buffer[0]; \n";
  code += " } \n";
  code += "} \n";
  source.append(code);

}
562 
// Generates 'index_norm_inf' (index of the entry with largest absolute value) together with
// its helper 'index_norm_inf_impl', which tracks per-thread maxima in local buffers and
// then reduces them pairwise.
template <typename StringType>
void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
{
  bool const fp_type = (numeric_string == "float" || numeric_string == "double");
  std::string code;

  code += "unsigned int index_norm_inf_impl( \n";
  code += " __global const " + numeric_string + " * vec, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " __local " + numeric_string + " * entry_buffer, \n";
  code += " __local unsigned int * index_buffer) \n";
  code += "{ \n";
  //step 1: fill buffer:
  code += " " + numeric_string + " cur_max = 0; \n";
  code += " " + numeric_string + " tmp; \n";
  code += " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n";
  code += " { \n";
  if (fp_type)
    code += " tmp = fabs(vec[i*inc1+start1]); \n";
  else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
    code += " tmp = vec[i*inc1+start1]; \n";
  else
    code += " tmp = abs(vec[i*inc1+start1]); \n";
  code += " if (cur_max < tmp) \n";
  code += " { \n";
  code += " entry_buffer[get_global_id(0)] = tmp; \n";
  code += " index_buffer[get_global_id(0)] = i; \n";
  code += " cur_max = tmp; \n";
  code += " } \n";
  code += " } \n";

  //step 2: parallel reduction:
  code += " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_global_id(0) < stride) \n";
  code += " { \n";
  //find the first occurring index
  code += " if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n";
  code += " { \n";
  code += " index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n";
  code += " entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n";
  code += " } \n";
  code += " } \n";
  code += " } \n";
  code += " \n";
  code += " return index_buffer[0]; \n";
  code += "} \n";

  // kernel wrapper: zero-initializes the buffers and lets thread 0 publish the result
  code += "__kernel void index_norm_inf( \n";
  code += " __global " + numeric_string + " * vec, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " __local " + numeric_string + " * entry_buffer, \n";
  code += " __local unsigned int * index_buffer, \n";
  code += " __global unsigned int * result) \n";
  code += "{ \n";
  code += " entry_buffer[get_global_id(0)] = 0; \n";
  code += " index_buffer[get_global_id(0)] = 0; \n";
  code += " unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n";
  code += " if (get_global_id(0) == 0) *result = tmp; \n";
  code += "} \n";

  source.append(code);
}
628 
// Generates 'max_kernel' or 'min_kernel': each thread scans its strided share of vec1,
// then a local-memory tree reduction yields one max/min per work group in result[group_id].
template <typename StringType>
void generate_maxmin(StringType & source, std::string const & numeric_string, bool is_max)
{
  char const * const head = is_max ? "__kernel void max_kernel( \n"
                                   : "__kernel void min_kernel( \n";
  char const * const thread_cmp = is_max
    ? " thread_result = thread_result > tmp ? thread_result : tmp; \n"
    : " thread_result = thread_result < tmp ? thread_result : tmp; \n";
  char const * const reduce_cmp = is_max
    ? " tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n"
    : " tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n";

  std::string code;
  code += head;
  code += " __global " + numeric_string + " * vec1, \n";
  code += " unsigned int start1, \n";
  code += " unsigned int inc1, \n";
  code += " unsigned int size1, \n";
  code += " __local " + numeric_string + " * tmp_buffer, \n";
  code += " __global " + numeric_string + " * result) \n";
  code += "{ \n";
  // seed with the first entry so the comparison chain is well-defined
  code += " " + numeric_string + " thread_result = vec1[start1]; \n";
  code += " for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n";
  code += " { \n";
  code += " " + numeric_string + " tmp = vec1[i*inc1+start1]; \n";
  code += thread_cmp;
  code += " } \n";

  code += " tmp_buffer[get_local_id(0)] = thread_result; \n";

  code += " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n";
  code += " { \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";
  code += " if (get_local_id(0) < stride) \n";
  code += " { \n";
  code += reduce_cmp;
  code += " } \n";
  code += " } \n";
  code += " barrier(CLK_LOCAL_MEM_FENCE); \n";

  code += " if (get_local_id(0) == 0) \n";
  code += " result[get_group_id(0)] = tmp_buffer[0]; \n";
  code += "} \n";
  source.append(code);
}
673 
675 
676 // main kernel class
678 template<typename NumericT>
679 struct vector
680 {
681  static std::string program_name()
682  {
684  }
685 
686  static void init(viennacl::ocl::context & ctx)
687  {
689  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
690 
691  static std::map<cl_context, bool> init_done;
692  if (!init_done[ctx.handle().get()])
693  {
694  std::string source;
695  source.reserve(8192);
696 
697  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
698 
699  // fully parametrized kernels:
700  generate_avbv(source, numeric_string);
701 
702  // kernels with mostly predetermined skeleton:
703  generate_plane_rotation(source, numeric_string);
704  generate_vector_swap(source, numeric_string);
705  generate_assign_cpu(source, numeric_string);
706 
707  generate_inner_prod(source, numeric_string, 1);
708  generate_norm(source, numeric_string);
709  generate_sum(source, numeric_string);
710  generate_index_norm_inf(source, numeric_string);
711  generate_maxmin(source, numeric_string, true);
712  generate_maxmin(source, numeric_string, false);
713 
714  std::string prog_name = program_name();
715  #ifdef VIENNACL_BUILD_INFO
716  std::cout << "Creating program " << prog_name << std::endl;
717  #endif
718  ctx.add_program(source, prog_name);
719  init_done[ctx.handle().get()] = true;
720  } //if
721  } //init
722 };
723 
724 // class with kernels for multiple inner products.
726 template<typename NumericT>
728 {
729  static std::string program_name()
730  {
731  return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_multi";
732  }
733 
734  static void init(viennacl::ocl::context & ctx)
735  {
737  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
738 
739  static std::map<cl_context, bool> init_done;
740  if (!init_done[ctx.handle().get()])
741  {
742  std::string source;
743  source.reserve(8192);
744 
745  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
746 
747  generate_inner_prod(source, numeric_string, 2);
748  generate_inner_prod(source, numeric_string, 3);
749  generate_inner_prod(source, numeric_string, 4);
750  generate_inner_prod(source, numeric_string, 8);
751 
752  generate_inner_prod_sum(source, numeric_string);
753 
754  std::string prog_name = program_name();
755  #ifdef VIENNACL_BUILD_INFO
756  std::cout << "Creating program " << prog_name << std::endl;
757  #endif
758  ctx.add_program(source, prog_name);
759  init_done[ctx.handle().get()] = true;
760  } //if
761  } //init
762 };
763 
764 
// Generates the kernel 'convert_<dest>_<src>' performing an element-wise,
// type-converting copy between two strided sub-vectors.
template<typename StringT>
void generate_vector_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
{
  std::string code;
  code += " __kernel void convert_" + dest_type + "_" + src_type + "( \n";
  code += " __global " + dest_type + " * dest, \n";
  code += " unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n";
  code += " __global const " + src_type + " * src, \n";
  code += " unsigned int start_src, unsigned int inc_src) \n";
  code += " { \n";
  code += " for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n";
  code += " dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n";
  code += " } \n";
  source.append(code);
}
778 
781 {
782 
783 public:
784  static std::string program_name()
785  {
786  return "vector_convert";
787  }
788 
789  static void init(viennacl::ocl::context & ctx)
790  {
791  static std::map<cl_context, bool> init_done;
792  if (!init_done[ctx.handle().get()])
793  {
794  std::string source;
795  source.reserve(4096);
796 
797  // int
803 
804  // unsigned int
810 
811  // long
817 
818  // unsigned long
824 
825  // float
831 
832  if (ctx.current_device().double_support())
833  {
835 
841 
848  }
849 
850  std::string prog_name = program_name();
851  #ifdef VIENNACL_BUILD_INFO
852  std::cout << "Creating program " << prog_name << std::endl;
853  #endif
854  ctx.add_program(source, prog_name);
855  init_done[ctx.handle().get()] = true;
856  } //if
857  } //init
858 
859 };
860 
861 
862 } // namespace kernels
863 } // namespace opencl
864 } // namespace linalg
865 } // namespace viennacl
866 #endif
867 
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:112
Implements an OpenCL platform within ViennaCL.
void generate_inner_prod(StringType &source, std::string const &numeric_string, vcl_size_t vector_num)
Definition: vector.hpp:306
void generate_index_norm_inf(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:564
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
Definition: utils.hpp:78
Various little tools used here and there in ViennaCL.
void generate_vector_swap(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:265
Some helper routines for reading/writing/printing scheduler expressions.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<...
Definition: vector.hpp:727
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:279
Provides OpenCL-related utilities.
void generate_maxmin(StringType &source, std::string const &numeric_string, bool is_max)
Definition: vector.hpp:630
Main kernel class for vector conversion routines (e.g. convert vector to vector).
Definition: vector.hpp:780
void generate_plane_rotation(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:235
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
void generate_avbv(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:209
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context.
Definition: context.hpp:368
const OCL_TYPE & get() const
Definition: handle.hpp:191
void generate_sum(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:508
static void init(viennacl::ocl::context &ctx)
Definition: vector.hpp:686
Configuration struct for generating OpenCL kernels for linear combinations of vectors.
Definition: vector.hpp:55
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Definition: device.hpp:956
std::size_t vcl_size_t
Definition: forwards.h:75
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static void init(viennacl::ocl::context &ctx)
Definition: vector.hpp:734
void generate_avbv_impl(StringType &source, std::string const &numeric_string, avbv_config const &cfg)
Definition: vector.hpp:106
void generate_avbv_impl2(StringType &source, std::string const &, avbv_config const &cfg, bool mult_alpha, bool mult_beta)
Definition: vector.hpp:67
void generate_inner_prod_sum(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:476
Representation of an OpenCL kernel in ViennaCL.
void generate_norm(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:380
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_vector_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
Definition: vector.hpp:766
static void init(viennacl::ocl::context &ctx)
Definition: vector.hpp:789
avbv_scalar_type
Enumeration for the scalar type in avbv-like operations.
Definition: vector.hpp:47
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: vector.hpp:679