1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
66 template <
typename StringType>
69 source.append(
" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
72 source.append(
" vec1[i*size1.y+size1.x] "); source.append(cfg.
assign_op); source.append(
" vec2[i*size2.y+size2.x] ");
74 source.append(
"* alpha ");
76 source.append(
"/ alpha ");
79 source.append(
"+ vec3[i*size3.y+size3.x] ");
81 source.append(
"* beta");
83 source.append(
"/ beta");
88 source.append(
" vec1[i] "); source.append(cfg.
assign_op); source.append(
" vec2[i] ");
90 source.append(
"* alpha ");
92 source.append(
"/ alpha ");
95 source.append(
"+ vec3[i] ");
97 source.append(
"* beta");
99 source.append(
"/ beta");
102 source.append(
"; \n");
105 template <
typename StringType>
108 source.append(
"__kernel void av");
115 source.append(
"_cpu");
117 source.append(
"_gpu");
120 source.append(
"_cpu");
122 source.append(
"_gpu");
123 source.append(
"( \n");
124 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
125 source.append(
" uint4 size1, \n");
126 source.append(
" \n");
129 source.append(
" "); source.append(numeric_string); source.append(
" fac2, \n");
133 source.append(
" __global "); source.append(numeric_string); source.append(
" * fac2, \n");
135 source.append(
" unsigned int options2, \n");
136 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
137 source.append(
" uint4 size2");
141 source.append(
", \n\n");
144 source.append(
" "); source.append(numeric_string); source.append(
" fac3, \n");
148 source.append(
" __global "); source.append(numeric_string); source.append(
" * fac3, \n");
150 source.append(
" unsigned int options3, \n");
151 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec3, \n");
152 source.append(
" uint4 size3 \n");
154 source.append(
") { \n");
158 source.append(
" "); source.append(numeric_string); source.append(
" alpha = fac2; \n");
162 source.append(
" "); source.append(numeric_string); source.append(
" alpha = fac2[0]; \n");
164 source.append(
" if (options2 & (1 << 0)) \n");
165 source.append(
" alpha = -alpha; \n");
166 source.append(
" \n");
170 source.append(
" "); source.append(numeric_string); source.append(
" beta = fac3; \n");
174 source.append(
" "); source.append(numeric_string); source.append(
" beta = fac3[0]; \n");
178 source.append(
" if (options3 & (1 << 0)) \n");
179 source.append(
" beta = -beta; \n");
180 source.append(
" \n");
182 source.append(
" if (options2 & (1 << 1)) { \n");
185 source.append(
" if (options3 & (1 << 1)) {\n");
187 source.append(
" } else {\n");
189 source.append(
" } \n");
193 source.append(
" } else { \n");
196 source.append(
" if (options3 & (1 << 1)) {\n");
198 source.append(
" } else {\n");
200 source.append(
" } \n");
204 source.append(
" } \n");
205 source.append(
"} \n");
208 template <
typename StringType>
234 template <
typename StringType>
237 source.append(
"__kernel void plane_rotation( \n");
238 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
239 source.append(
" unsigned int start1, \n");
240 source.append(
" unsigned int inc1, \n");
241 source.append(
" unsigned int size1, \n");
242 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec2, \n");
243 source.append(
" unsigned int start2, \n");
244 source.append(
" unsigned int inc2, \n");
245 source.append(
" unsigned int size2, \n");
246 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
247 source.append(
" "); source.append(numeric_string); source.append(
" beta) \n");
248 source.append(
"{ \n");
249 source.append(
" "); source.append(numeric_string); source.append(
" tmp1 = 0; \n");
250 source.append(
" "); source.append(numeric_string); source.append(
" tmp2 = 0; \n");
251 source.append(
" \n");
252 source.append(
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
253 source.append(
" { \n");
254 source.append(
" tmp1 = vec1[i*inc1+start1]; \n");
255 source.append(
" tmp2 = vec2[i*inc2+start2]; \n");
256 source.append(
" \n");
257 source.append(
" vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
258 source.append(
" vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
259 source.append(
" } \n");
260 source.append(
" \n");
261 source.append(
"} \n");
264 template <
typename StringType>
267 source.append(
"__kernel void swap( \n");
268 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
269 source.append(
" unsigned int start1, \n");
270 source.append(
" unsigned int inc1, \n");
271 source.append(
" unsigned int size1, \n");
272 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec2, \n");
273 source.append(
" unsigned int start2, \n");
274 source.append(
" unsigned int inc2, \n");
275 source.append(
" unsigned int size2 \n");
276 source.append(
" ) \n");
277 source.append(
"{ \n");
278 source.append(
" "); source.append(numeric_string); source.append(
" tmp; \n");
279 source.append(
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
280 source.append(
" { \n");
281 source.append(
" tmp = vec2[i*inc2+start2]; \n");
282 source.append(
" vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
283 source.append(
" vec1[i*inc1+start1] = tmp; \n");
284 source.append(
" } \n");
285 source.append(
"} \n");
288 template <
typename StringType>
291 source.append(
"__kernel void assign_cpu( \n");
292 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
293 source.append(
" unsigned int start1, \n");
294 source.append(
" unsigned int inc1, \n");
295 source.append(
" unsigned int size1, \n");
296 source.append(
" unsigned int internal_size1, \n");
297 source.append(
" "); source.append(numeric_string); source.append(
" alpha) \n");
298 source.append(
"{ \n");
299 source.append(
" for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
300 source.append(
" vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
301 source.append(
"} \n");
305 template <
typename StringType>
308 std::stringstream ss;
310 std::string vector_num_string = ss.str();
312 source.append(
"__kernel void inner_prod"); source.append(vector_num_string); source.append(
"( \n");
313 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
314 source.append(
" uint4 params_x, \n");
319 source.append(
" __global const "); source.append(numeric_string); source.append(
" * y"); source.append(ss.str()); source.append(
", \n");
320 source.append(
" uint4 params_y"); source.append(ss.str()); source.append(
", \n");
322 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
323 source.append(
" __global "); source.append(numeric_string); source.append(
" * group_buffer) \n");
324 source.append(
"{ \n");
325 source.append(
" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
326 source.append(
" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
327 source.append(
" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
334 source.append(
" "); source.append(numeric_string); source.append(
" tmp"); source.append(ss.str()); source.append(
" = 0; \n");
336 source.append(
" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
337 source.append(
" "); source.append(numeric_string); source.append(
" val_x = x[i*params_x.y + params_x.x]; \n");
342 source.append(
" tmp"); source.append(ss.str()); source.append(
" += val_x * y"); source.append(ss.str()); source.append(
"[i * params_y"); source.append(ss.str()); source.append(
".y + params_y"); source.append(ss.str()); source.append(
".x]; \n");
344 source.append(
" } \n");
349 source.append(
" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append(
"; \n");
353 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
354 source.append(
" { \n");
355 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
356 source.append(
" if (get_local_id(0) < stride) { \n");
361 source.append(
" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(
" * get_local_size(0) + stride]; \n");
363 source.append(
" } \n");
364 source.append(
" } \n");
365 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
367 source.append(
" if (get_local_id(0) == 0) { \n");
372 source.append(
" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(
" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(
" * get_local_size(0)]; \n");
374 source.append(
" } \n");
375 source.append(
"} \n");
379 template <
typename StringType>
382 bool is_float_or_double = (numeric_string ==
"float" || numeric_string ==
"double");
384 source.append(numeric_string); source.append(
" impl_norm( \n");
385 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec, \n");
386 source.append(
" unsigned int start1, \n");
387 source.append(
" unsigned int inc1, \n");
388 source.append(
" unsigned int size1, \n");
389 source.append(
" unsigned int norm_selector, \n");
390 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer) \n");
391 source.append(
"{ \n");
392 source.append(
" "); source.append(numeric_string); source.append(
" tmp = 0; \n");
393 source.append(
" if (norm_selector == 1) \n");
394 source.append(
" { \n");
395 source.append(
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
396 if (is_float_or_double)
397 source.append(
" tmp += fabs(vec[i*inc1 + start1]); \n");
398 else if (numeric_string[0] ==
'u')
399 source.append(
" tmp += vec[i*inc1 + start1]; \n");
401 source.append(
" tmp += abs(vec[i*inc1 + start1]); \n");
402 source.append(
" } \n");
403 source.append(
" else if (norm_selector == 2) \n");
404 source.append(
" { \n");
405 source.append(
" "); source.append(numeric_string); source.append(
" vec_entry = 0; \n");
406 source.append(
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
407 source.append(
" { \n");
408 source.append(
" vec_entry = vec[i*inc1 + start1]; \n");
409 source.append(
" tmp += vec_entry * vec_entry; \n");
410 source.append(
" } \n");
411 source.append(
" } \n");
412 source.append(
" else if (norm_selector == 0) \n");
413 source.append(
" { \n");
414 source.append(
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
415 if (is_float_or_double)
416 source.append(
" tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
417 else if (numeric_string[0] ==
'u')
418 source.append(
" tmp = max(vec[i*inc1 + start1], tmp); \n");
421 source.append(
" tmp = max(("); source.append(numeric_string); source.append(
")abs(vec[i*inc1 + start1]), tmp); \n");
423 source.append(
" } \n");
425 source.append(
" tmp_buffer[get_local_id(0)] = tmp; \n");
427 source.append(
" if (norm_selector > 0) \n");
428 source.append(
" { \n");
429 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
430 source.append(
" { \n");
431 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
432 source.append(
" if (get_local_id(0) < stride) \n");
433 source.append(
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
434 source.append(
" } \n");
435 source.append(
" return tmp_buffer[0]; \n");
436 source.append(
" } \n");
439 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
440 source.append(
" { \n");
441 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
442 source.append(
" if (get_local_id(0) < stride) \n");
443 if (is_float_or_double)
444 source.append(
" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
446 source.append(
" tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
447 source.append(
" } \n");
449 source.append(
" return tmp_buffer[0]; \n");
450 source.append(
"}; \n");
452 source.append(
"__kernel void norm( \n");
453 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec, \n");
454 source.append(
" unsigned int start1, \n");
455 source.append(
" unsigned int inc1, \n");
456 source.append(
" unsigned int size1, \n");
457 source.append(
" unsigned int norm_selector, \n");
458 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
459 source.append(
" __global "); source.append(numeric_string); source.append(
" * group_buffer) \n");
460 source.append(
"{ \n");
461 source.append(
" "); source.append(numeric_string); source.append(
" tmp = impl_norm(vec, \n");
462 source.append(
" ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n");
463 source.append(
" inc1, \n");
464 source.append(
" ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
465 source.append(
" - ( get_group_id(0) * size1) / get_num_groups(0), \n");
466 source.append(
" norm_selector, \n");
467 source.append(
" tmp_buffer); \n");
469 source.append(
" if (get_local_id(0) == 0) \n");
470 source.append(
" group_buffer[get_group_id(0)] = tmp; \n");
471 source.append(
"} \n");
475 template <
typename StringType>
479 source.append(
"__kernel void sum_inner_prod( \n");
480 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
481 source.append(
" unsigned int size_per_workgroup, \n");
482 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
483 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
484 source.append(
" unsigned int start_result, \n");
485 source.append(
" unsigned int inc_result) \n");
486 source.append(
"{ \n");
487 source.append(
" "); source.append(numeric_string); source.append(
" thread_sum = 0; \n");
488 source.append(
" for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n");
489 source.append(
" thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n");
491 source.append(
" tmp_buffer[get_local_id(0)] = thread_sum; \n");
493 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
494 source.append(
" { \n");
495 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
496 source.append(
" if (get_local_id(0) < stride) \n");
497 source.append(
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
498 source.append(
" } \n");
499 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
501 source.append(
" if (get_local_id(0) == 0) \n");
502 source.append(
" result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
503 source.append(
"} \n");
507 template <
typename StringType>
508 void generate_sum(StringType & source, std::string
const & numeric_string)
511 source.append(
"__kernel void sum( \n");
512 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
513 source.append(
" unsigned int start1, \n");
514 source.append(
" unsigned int inc1, \n");
515 source.append(
" unsigned int size1, \n");
516 source.append(
" unsigned int option, \n");
517 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
518 source.append(
" __global "); source.append(numeric_string); source.append(
" * result) \n");
519 source.append(
"{ \n");
520 source.append(
" "); source.append(numeric_string); source.append(
" thread_sum = 0; \n");
521 source.append(
" "); source.append(numeric_string); source.append(
" tmp = 0; \n");
522 source.append(
" for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
523 source.append(
" { \n");
524 source.append(
" if (option > 0) \n");
525 source.append(
" thread_sum += vec1[i*inc1+start1]; \n");
526 source.append(
" else \n");
527 source.append(
" { \n");
528 source.append(
" tmp = vec1[i*inc1+start1]; \n");
529 source.append(
" tmp = (tmp < 0) ? -tmp : tmp; \n");
530 source.append(
" thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
531 source.append(
" } \n");
532 source.append(
" } \n");
534 source.append(
" tmp_buffer[get_local_id(0)] = thread_sum; \n");
536 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
537 source.append(
" { \n");
538 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
539 source.append(
" if (get_local_id(0) < stride) \n");
540 source.append(
" { \n");
541 source.append(
" if (option > 0) \n");
542 source.append(
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
543 source.append(
" else \n");
544 source.append(
" tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
545 source.append(
" } \n");
546 source.append(
" } \n");
547 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
549 source.append(
" if (get_global_id(0) == 0) \n");
550 source.append(
" { \n");
551 if (numeric_string ==
"float" || numeric_string ==
"double")
553 source.append(
" if (option == 2) \n");
554 source.append(
" *result = sqrt(tmp_buffer[0]); \n");
555 source.append(
" else \n");
557 source.append(
" *result = tmp_buffer[0]; \n");
558 source.append(
" } \n");
559 source.append(
"} \n");
563 template <
typename StringType>
567 source.append(
"unsigned int index_norm_inf_impl( \n");
568 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec, \n");
569 source.append(
" unsigned int start1, \n");
570 source.append(
" unsigned int inc1, \n");
571 source.append(
" unsigned int size1, \n");
572 source.append(
" __local "); source.append(numeric_string); source.append(
" * entry_buffer, \n");
573 source.append(
" __local unsigned int * index_buffer) \n");
574 source.append(
"{ \n");
576 source.append(
" "); source.append(numeric_string); source.append(
" cur_max = 0; \n");
577 source.append(
" "); source.append(numeric_string); source.append(
" tmp; \n");
578 source.append(
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
579 source.append(
" { \n");
580 if (numeric_string ==
"float" || numeric_string ==
"double")
581 source.append(
" tmp = fabs(vec[i*inc1+start1]); \n");
582 else if (numeric_string[0] ==
'u')
583 source.append(
" tmp = vec[i*inc1+start1]; \n");
585 source.append(
" tmp = abs(vec[i*inc1+start1]); \n");
586 source.append(
" if (cur_max < tmp) \n");
587 source.append(
" { \n");
588 source.append(
" entry_buffer[get_global_id(0)] = tmp; \n");
589 source.append(
" index_buffer[get_global_id(0)] = i; \n");
590 source.append(
" cur_max = tmp; \n");
591 source.append(
" } \n");
592 source.append(
" } \n");
595 source.append(
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
596 source.append(
" { \n");
597 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
598 source.append(
" if (get_global_id(0) < stride) \n");
599 source.append(
" { \n");
601 source.append(
" if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
602 source.append(
" { \n");
603 source.append(
" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
604 source.append(
" entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
605 source.append(
" } \n");
606 source.append(
" } \n");
607 source.append(
" } \n");
608 source.append(
" \n");
609 source.append(
" return index_buffer[0]; \n");
610 source.append(
"} \n");
612 source.append(
"__kernel void index_norm_inf( \n");
613 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec, \n");
614 source.append(
" unsigned int start1, \n");
615 source.append(
" unsigned int inc1, \n");
616 source.append(
" unsigned int size1, \n");
617 source.append(
" __local "); source.append(numeric_string); source.append(
" * entry_buffer, \n");
618 source.append(
" __local unsigned int * index_buffer, \n");
619 source.append(
" __global unsigned int * result) \n");
620 source.append(
"{ \n");
621 source.append(
" entry_buffer[get_global_id(0)] = 0; \n");
622 source.append(
" index_buffer[get_global_id(0)] = 0; \n");
623 source.append(
" unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
624 source.append(
" if (get_global_id(0) == 0) *result = tmp; \n");
625 source.append(
"} \n");
629 template <
typename StringType>
630 void generate_maxmin(StringType & source, std::string
const & numeric_string,
bool is_max)
634 source.append(
"__kernel void max_kernel( \n");
636 source.append(
"__kernel void min_kernel( \n");
637 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec1, \n");
638 source.append(
" unsigned int start1, \n");
639 source.append(
" unsigned int inc1, \n");
640 source.append(
" unsigned int size1, \n");
641 source.append(
" __local "); source.append(numeric_string); source.append(
" * tmp_buffer, \n");
642 source.append(
" __global "); source.append(numeric_string); source.append(
" * result) \n");
643 source.append(
"{ \n");
644 source.append(
" "); source.append(numeric_string); source.append(
" thread_result = vec1[start1]; \n");
645 source.append(
" for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n");
646 source.append(
" { \n");
647 source.append(
" "); source.append(numeric_string); source.append(
" tmp = vec1[i*inc1+start1]; \n");
649 source.append(
" thread_result = thread_result > tmp ? thread_result : tmp; \n");
651 source.append(
" thread_result = thread_result < tmp ? thread_result : tmp; \n");
652 source.append(
" } \n");
654 source.append(
" tmp_buffer[get_local_id(0)] = thread_result; \n");
656 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
657 source.append(
" { \n");
658 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
659 source.append(
" if (get_local_id(0) < stride) \n");
660 source.append(
" { \n");
662 source.append(
" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
664 source.append(
" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
665 source.append(
" } \n");
666 source.append(
" } \n");
667 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
669 source.append(
" if (get_local_id(0) == 0) \n");
670 source.append(
" result[get_group_id(0)] = tmp_buffer[0]; \n");
671 source.append(
"} \n");
678 template<
typename NumericT>
691 static std::map<cl_context, bool> init_done;
695 source.reserve(8192);
697 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
715 #ifdef VIENNACL_BUILD_INFO
716 std::cout <<
"Creating program " << prog_name << std::endl;
718 ctx.add_program(source, prog_name);
719 init_done[ctx.handle().get()] =
true;
726 template<
typename NumericT>
739 static std::map<cl_context, bool> init_done;
743 source.reserve(8192);
745 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
755 #ifdef VIENNACL_BUILD_INFO
756 std::cout <<
"Creating program " << prog_name << std::endl;
758 ctx.add_program(source, prog_name);
759 init_done[ctx.handle().get()] =
true;
765 template<
typename StringT>
768 source.append(
" __kernel void convert_" + dest_type +
"_" + src_type +
"( \n");
769 source.append(
" __global " + dest_type +
" * dest, \n");
770 source.append(
" unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n");
771 source.append(
" __global const " + src_type +
" * src, \n");
772 source.append(
" unsigned int start_src, unsigned int inc_src) \n");
773 source.append(
" { \n");
774 source.append(
" for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n");
775 source.append(
" dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n");
776 source.append(
" } \n");
786 return "vector_convert";
791 static std::map<cl_context, bool> init_done;
795 source.reserve(4096);
851 #ifdef VIENNACL_BUILD_INFO
852 std::cout <<
"Creating program " << prog_name << std::endl;
viennacl::ocl::device const & current_device() const
Returns the current device.
void generate_inner_prod(StringType &source, std::string const &numeric_string, vcl_size_t vector_num)
void generate_index_norm_inf(StringType &source, std::string const &numeric_string)
static std::string program_name()
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
void generate_vector_swap(StringType &source, std::string const &numeric_string)
Some helper routines for reading/writing/printing scheduler expressions.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<...
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Provides OpenCL-related utilities.
void generate_maxmin(StringType &source, std::string const &numeric_string, bool is_max)
Main kernel class for vector conversion routines (e.g. convert vector to vector).
void generate_plane_rotation(StringType &source, std::string const &numeric_string)
static std::string program_name()
bool with_stride_and_range
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
static void apply(viennacl::ocl::context const &)
void generate_avbv(StringType &source, std::string const &numeric_string)
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context.
const OCL_TYPE & get() const
static std::string program_name()
void generate_sum(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Configuration struct for generating OpenCL kernels for linear combinations of vectors.
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static void init(viennacl::ocl::context &ctx)
void generate_avbv_impl(StringType &source, std::string const &numeric_string, avbv_config const &cfg)
void generate_avbv_impl2(StringType &source, std::string const &, avbv_config const &cfg, bool mult_alpha, bool mult_beta)
void generate_inner_prod_sum(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_norm(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_vector_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
static void init(viennacl::ocl::context &ctx)
avbv_scalar_type
Enumeration for the scalar type in avbv-like operations.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...