1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
2 #define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
49 template <
typename StringType>
53 source.append(
" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n");
54 source.append(
" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n");
55 source.append(
" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n");
56 source.append(
" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n");
65 template <
typename StringType>
69 source.append(
" inline int \n");
70 source.append(
" floorPow2(int n) \n");
71 source.append(
" { \n");
72 source.append(
" uint glb_id = get_global_id(0); \n");
73 source.append(
" uint grp_id = get_group_id(0); \n");
74 source.append(
" uint grp_nm = get_num_groups(0); \n");
75 source.append(
" uint lcl_id = get_local_id(0); \n");
76 source.append(
" uint lcl_sz = get_local_size(0); \n");
80 source.append(
" if (0 == (n & (n-1))) \n");
81 source.append(
" { \n");
82 source.append(
" return n; \n");
83 source.append(
" } \n");
85 source.append(
" int exp; \n");
86 source.append(
" frexp(( "); source.append(numeric_string); source.append(
" )n, &exp); \n");
87 source.append(
" return (1 << (exp - 1)); \n");
88 source.append(
" } \n");
98 template <
typename StringType>
101 source.append(
" \n");
102 source.append(
" inline int \n");
103 source.append(
" ceilPow2(int n) \n");
104 source.append(
" { \n");
105 source.append(
" uint glb_id = get_global_id(0); \n");
106 source.append(
" uint grp_id = get_group_id(0); \n");
107 source.append(
" uint grp_nm = get_num_groups(0); \n");
108 source.append(
" uint lcl_id = get_local_id(0); \n");
109 source.append(
" uint lcl_sz = get_local_size(0); \n");
113 source.append(
" if (0 == (n & (n-1))) \n");
114 source.append(
" { \n");
115 source.append(
" return n; \n");
116 source.append(
" } \n");
118 source.append(
" int exp; \n");
119 source.append(
" frexp(( "); source.append(numeric_string); source.append(
" )n, &exp); \n");
120 source.append(
" return (1 << exp); \n");
121 source.append(
" } \n");
132 template <
typename StringType>
135 source.append(
" \n");
136 source.append(
" inline "); source.append(numeric_string); source.append(
" \n");
137 source.append(
" computeMidpoint(const "); source.append(numeric_string); source.append(
" left,\n");
138 source.append(
" const "); source.append(numeric_string); source.append(
" right) \n");
139 source.append(
" { \n");
140 source.append(
" uint glb_id = get_global_id(0); \n");
141 source.append(
" uint grp_id = get_group_id(0); \n");
142 source.append(
" uint grp_nm = get_num_groups(0); \n");
143 source.append(
" uint lcl_id = get_local_id(0); \n");
144 source.append(
" uint lcl_sz = get_local_size(0); \n");
145 source.append(
" "); source.append(numeric_string); source.append(
" mid; \n");
147 source.append(
" if (sign(left) == sign(right)) \n");
148 source.append(
" { \n");
149 source.append(
" mid = left + (right - left) * 0.5f; \n");
150 source.append(
" } \n");
151 source.append(
" else \n");
152 source.append(
" { \n");
153 source.append(
" mid = (left + right) * 0.5f; \n");
154 source.append(
" } \n");
156 source.append(
" return mid; \n");
157 source.append(
" } \n");
177 template<
typename StringType>
180 source.append(
" \n");
181 source.append(
" void \n");
182 source.append(
" storeInterval(unsigned int addr, \n");
183 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_left, \n");
184 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_right, \n");
185 source.append(
" __local unsigned int * s_left_count, \n");
186 source.append(
" __local unsigned int * s_right_count, \n");
187 source.append(
" "); source.append(numeric_string); source.append(
" left, \n");
188 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
189 source.append(
" unsigned int left_count, \n");
190 source.append(
" unsigned int right_count, \n");
191 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
192 source.append(
" { \n");
193 source.append(
" uint glb_id = get_global_id(0); \n");
194 source.append(
" uint grp_id = get_group_id(0); \n");
195 source.append(
" uint grp_nm = get_num_groups(0); \n");
196 source.append(
" uint lcl_id = get_local_id(0); \n");
197 source.append(
" uint lcl_sz = get_local_size(0); \n");
199 source.append(
" s_left_count[addr] = left_count; \n");
200 source.append(
" s_right_count[addr] = right_count; \n");
203 source.append(
" "); source.append(numeric_string); source.append(
" t0 = fabs(right - left); \n");
204 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision; \n");
206 source.append(
" if (t0 <= max(( "); source.append(numeric_string); source.append(
" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
207 source.append(
" { \n");
209 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
212 source.append(
" s_left[addr] = lambda; \n");
213 source.append(
" s_right[addr] = lambda; \n");
214 source.append(
" } \n");
215 source.append(
" else \n");
216 source.append(
" { \n");
219 source.append(
" s_left[addr] = left; \n");
220 source.append(
" s_right[addr] = right; \n");
221 source.append(
" } \n");
223 source.append(
" } \n");
227 template<
typename StringType>
230 source.append(
" \n");
231 source.append(
" void \n");
232 source.append(
" storeIntervalShort(unsigned int addr, \n");
233 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_left, \n");
234 source.append(
" __local "); source.append(numeric_string); source.append(
" * s_right, \n");
235 source.append(
" __local unsigned short * s_left_count, \n");
236 source.append(
" __local unsigned short * s_right_count, \n");
237 source.append(
" "); source.append(numeric_string); source.append(
" left, \n");
238 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
239 source.append(
" unsigned int left_count, \n");
240 source.append(
" unsigned int right_count, \n");
241 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
242 source.append(
" { \n");
243 source.append(
" uint glb_id = get_global_id(0); \n");
244 source.append(
" uint grp_id = get_group_id(0); \n");
245 source.append(
" uint grp_nm = get_num_groups(0); \n");
246 source.append(
" uint lcl_id = get_local_id(0); \n");
247 source.append(
" uint lcl_sz = get_local_size(0); \n");
249 source.append(
" s_left_count[addr] = left_count; \n");
250 source.append(
" s_right_count[addr] = right_count; \n");
253 source.append(
" "); source.append(numeric_string); source.append(
" t0 = fabs(right - left); \n");
254 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(left), fabs(right)) * precision; \n");
256 source.append(
" if (t0 <= max(( "); source.append(numeric_string); source.append(
" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
257 source.append(
" { \n");
259 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
262 source.append(
" s_left[addr] = lambda; \n");
263 source.append(
" s_right[addr] = lambda; \n");
264 source.append(
" } \n");
265 source.append(
" else \n");
266 source.append(
" { \n");
269 source.append(
" s_left[addr] = left; \n");
270 source.append(
" s_right[addr] = right; \n");
271 source.append(
" } \n");
273 source.append(
" } \n");
294 template <
typename StringType>
297 source.append(
" \n");
298 source.append(
" inline unsigned int \n");
299 source.append(
" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
300 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
301 source.append(
" const unsigned int n, \n");
302 source.append(
" const "); source.append(numeric_string); source.append(
" x, \n");
303 source.append(
" const unsigned int tid, \n");
304 source.append(
" const unsigned int num_intervals_active, \n");
305 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_d, \n");
306 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_s, \n");
307 source.append(
" unsigned int converged \n");
308 source.append(
" ) \n");
309 source.append(
" { \n");
310 source.append(
" uint glb_id = get_global_id(0); \n");
311 source.append(
" uint grp_id = get_group_id(0); \n");
312 source.append(
" uint grp_nm = get_num_groups(0); \n");
313 source.append(
" uint lcl_id = get_local_id(0); \n");
314 source.append(
" uint lcl_sz = get_local_size(0); \n");
317 source.append(
" "); source.append(numeric_string); source.append(
" delta = 1.0f; \n");
318 source.append(
" unsigned int count = 0; \n");
320 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
323 source.append(
" if (lcl_id < n) \n");
324 source.append(
" { \n");
325 source.append(
" s_d[lcl_id] = *(g_d + lcl_id); \n");
326 source.append(
" s_s[lcl_id] = *(g_s + lcl_id - 1); \n");
327 source.append(
" } \n");
329 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
332 source.append(
" if ((tid < num_intervals_active) && (0 == converged)) \n");
333 source.append(
" { \n");
337 source.append(
" for (unsigned int k = 0; k < n; ++k) \n");
338 source.append(
" { \n");
339 source.append(
" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
340 source.append(
" count += (delta < 0) ? 1 : 0; \n");
341 source.append(
" } \n");
343 source.append(
" } \n");
345 source.append(
" return count; \n");
346 source.append(
" } \n");
366 template <
typename StringType>
369 source.append(
" \n");
370 source.append(
" inline unsigned int \n");
371 source.append(
" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
372 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
373 source.append(
" const unsigned int n, \n");
374 source.append(
" const "); source.append(numeric_string); source.append(
" x, \n");
375 source.append(
" const unsigned int tid, \n");
376 source.append(
" const unsigned int num_intervals_active, \n");
377 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_d, \n");
378 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_s, \n");
379 source.append(
" unsigned int converged \n");
380 source.append(
" ) \n");
381 source.append(
" { \n");
382 source.append(
" uint glb_id = get_global_id(0); \n");
383 source.append(
" uint grp_id = get_group_id(0); \n");
384 source.append(
" uint grp_nm = get_num_groups(0); \n");
385 source.append(
" uint lcl_id = get_local_id(0); \n");
386 source.append(
" uint lcl_sz = get_local_size(0); \n");
388 source.append(
" "); source.append(numeric_string); source.append(
" delta = 1.0f; \n");
389 source.append(
" unsigned int count = 0; \n");
391 source.append(
" unsigned int rem = n; \n");
394 source.append(
" for (unsigned int i = 0; i < n; i += lcl_sz) \n");
395 source.append(
" { \n");
397 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
400 source.append(
" if ((i + lcl_id) < n) \n");
401 source.append(
" { \n");
403 source.append(
" s_d[lcl_id] = *(g_d + i + lcl_id); \n");
404 source.append(
" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n");
405 source.append(
" } \n");
407 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
410 source.append(
" if (tid < num_intervals_active) \n");
411 source.append(
" { \n");
415 source.append(
" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n");
416 source.append(
" { \n");
417 source.append(
" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
419 source.append(
" count += (delta < 0) ? 1 : 0; \n");
420 source.append(
" } \n");
422 source.append(
" } \n");
424 source.append(
" rem -= lcl_sz; \n");
425 source.append(
" } \n");
427 source.append(
" return count; \n");
428 source.append(
" } \n");
455 template<
typename StringType>
458 source.append(
" \n");
459 source.append(
" void \n");
460 source.append(
" storeNonEmptyIntervals(unsigned int addr, \n");
461 source.append(
" const unsigned int num_threads_active, \n");
462 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
463 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
464 source.append(
" __local unsigned int *s_left_count, \n");
465 source.append(
" __local unsigned int *s_right_count, \n");
466 source.append(
" "); source.append(numeric_string); source.append(
" left, \n ");
467 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
468 source.append(
" "); source.append(numeric_string); source.append(
" right,\n");
469 source.append(
" const unsigned int left_count, \n");
470 source.append(
" const unsigned int mid_count, \n");
471 source.append(
" const unsigned int right_count, \n");
472 source.append(
" "); source.append(numeric_string); source.append(
" precision, \n");
473 source.append(
" __local unsigned int *compact_second_chunk, \n");
474 source.append(
" __local unsigned int *s_compaction_list_exc, \n");
475 source.append(
" unsigned int *is_active_second) \n");
476 source.append(
" { \n");
477 source.append(
" uint glb_id = get_global_id(0); \n");
478 source.append(
" uint grp_id = get_group_id(0); \n");
479 source.append(
" uint grp_nm = get_num_groups(0); \n");
480 source.append(
" uint lcl_id = get_local_id(0); \n");
481 source.append(
" uint lcl_sz = get_local_size(0); \n");
484 source.append(
" \n");
485 source.append(
" if ((left_count != mid_count) && (mid_count != right_count)) \n");
486 source.append(
" { \n");
489 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
490 source.append(
" left, mid, left_count, mid_count, precision); \n");
494 source.append(
" *is_active_second = 1; \n");
495 source.append(
" s_compaction_list_exc[lcl_id] = 1; \n");
496 source.append(
" *compact_second_chunk = 1; \n");
497 source.append(
" } \n");
498 source.append(
" else \n");
499 source.append(
" { \n");
504 source.append(
" *is_active_second = 0; \n");
505 source.append(
" s_compaction_list_exc[lcl_id] = 0; \n");
508 source.append(
" if (left_count != mid_count) \n");
509 source.append(
" { \n");
510 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
511 source.append(
" left, mid, left_count, mid_count, precision); \n");
512 source.append(
" } \n");
513 source.append(
" else \n");
514 source.append(
" { \n");
515 source.append(
" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
516 source.append(
" mid, right, mid_count, right_count, precision); \n");
517 source.append(
" } \n");
519 source.append(
" } \n");
520 source.append(
" } \n");
530 template <
typename StringType>
533 source.append(
" \n");
534 source.append(
" void \n");
535 source.append(
" storeNonEmptyIntervalsLarge(unsigned int addr, \n");
536 source.append(
" const unsigned int num_threads_active, \n");
537 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
538 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
539 source.append(
" __local unsigned short *s_left_count, \n");
540 source.append(
" __local unsigned short *s_right_count, \n");
541 source.append(
" "); source.append(numeric_string); source.append(
" left, \n ");
542 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
543 source.append(
" "); source.append(numeric_string); source.append(
" right,\n");
544 source.append(
" const unsigned int left_count, \n");
545 source.append(
" const unsigned int mid_count, \n");
546 source.append(
" const unsigned int right_count, \n");
547 source.append(
" "); source.append(numeric_string); source.append(
" epsilon, \n");
548 source.append(
" __local unsigned int *compact_second_chunk, \n");
549 source.append(
" __local unsigned short *s_compaction_list, \n");
550 source.append(
" unsigned int *is_active_second) \n");
551 source.append(
" { \n");
552 source.append(
" uint glb_id = get_global_id(0); \n");
553 source.append(
" uint grp_id = get_group_id(0); \n");
554 source.append(
" uint grp_nm = get_num_groups(0); \n");
555 source.append(
" uint lcl_id = get_local_id(0); \n");
556 source.append(
" uint lcl_sz = get_local_size(0); \n");
559 source.append(
" if ((left_count != mid_count) && (mid_count != right_count)) \n");
560 source.append(
" { \n");
562 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
563 source.append(
" left, mid, left_count, mid_count, epsilon); \n");
565 source.append(
" *is_active_second = 1; \n");
566 source.append(
" s_compaction_list[lcl_id] = 1; \n");
567 source.append(
" *compact_second_chunk = 1; \n");
568 source.append(
" } \n");
569 source.append(
" else \n");
570 source.append(
" { \n");
575 source.append(
" *is_active_second = 0; \n");
576 source.append(
" s_compaction_list[lcl_id] = 0; \n");
579 source.append(
" if (left_count != mid_count) \n");
580 source.append(
" { \n");
581 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
582 source.append(
" left, mid, left_count, mid_count, epsilon); \n");
583 source.append(
" } \n");
584 source.append(
" else \n");
585 source.append(
" { \n");
586 source.append(
" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
587 source.append(
" mid, right, mid_count, right_count, epsilon); \n");
588 source.append(
" } \n");
589 source.append(
" } \n");
590 source.append(
" } \n");
603 template<
typename StringType>
606 source.append(
" \n");
607 source.append(
" void \n");
608 source.append(
" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n");
609 source.append(
" unsigned int num_threads_compaction) \n");
610 source.append(
" { \n");
611 source.append(
" uint glb_id = get_global_id(0); \n");
612 source.append(
" uint grp_id = get_group_id(0); \n");
613 source.append(
" uint grp_nm = get_num_groups(0); \n");
614 source.append(
" uint lcl_id = get_local_id(0); \n");
615 source.append(
" uint lcl_sz = get_local_size(0); \n");
618 source.append(
" unsigned int offset = 1; \n");
619 source.append(
" const unsigned int tid = lcl_id; \n");
624 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
625 source.append(
" { \n");
627 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
629 source.append(
" if (tid < d) \n");
630 source.append(
" { \n");
632 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
633 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
634 source.append(
" \n");
635 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
636 source.append(
" + s_compaction_list_exc[ai]; \n");
637 source.append(
" } \n");
639 source.append(
" offset <<= 1; \n");
640 source.append(
" } \n");
643 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
644 source.append(
" { \n");
646 source.append(
" offset >>= 1; \n");
647 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
649 source.append(
" if (tid < (d-1)) \n");
650 source.append(
" { \n");
652 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
653 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
655 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
656 source.append(
" + s_compaction_list_exc[ai]; \n");
657 source.append(
" } \n");
658 source.append(
" } \n");
660 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
662 source.append(
" } \n");
666 template<
typename StringType>
669 source.append(
" \n");
670 source.append(
" void \n");
671 source.append(
" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
672 source.append(
" unsigned int num_threads_compaction) \n");
673 source.append(
" { \n");
674 source.append(
" uint glb_id = get_global_id(0); \n");
675 source.append(
" uint grp_id = get_group_id(0); \n");
676 source.append(
" uint grp_nm = get_num_groups(0); \n");
677 source.append(
" uint lcl_id = get_local_id(0); \n");
678 source.append(
" uint lcl_sz = get_local_size(0); \n");
681 source.append(
" unsigned int offset = 1; \n");
682 source.append(
" const unsigned int tid = lcl_id; \n");
685 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
686 source.append(
" { \n");
688 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
690 source.append(
" if (tid < d) \n");
691 source.append(
" { \n");
693 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
694 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
695 source.append(
" \n");
696 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
697 source.append(
" + s_compaction_list_exc[ai]; \n");
698 source.append(
" } \n");
700 source.append(
" offset <<= 1; \n");
701 source.append(
" } \n");
704 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
705 source.append(
" { \n");
707 source.append(
" offset >>= 1; \n");
708 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
710 source.append(
" if (tid < (d-1)) \n");
711 source.append(
" { \n");
713 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
714 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
716 source.append(
" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
717 source.append(
" + s_compaction_list_exc[ai]; \n");
718 source.append(
" } \n");
719 source.append(
" } \n");
721 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
723 source.append(
" } \n");
742 template<
typename StringType>
745 source.append(
" \n");
746 source.append(
" void \n");
747 source.append(
" compactIntervals(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
748 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
749 source.append(
" __local unsigned int *s_left_count, \n");
750 source.append(
" __local unsigned int *s_right_count, \n");
751 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
752 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
753 source.append(
" unsigned int mid_count, unsigned int right_count, \n");
754 source.append(
" __local unsigned int *s_compaction_list, \n");
755 source.append(
" unsigned int num_threads_active, \n");
756 source.append(
" unsigned int is_active_second) \n");
757 source.append(
" { \n");
758 source.append(
" uint glb_id = get_global_id(0); \n");
759 source.append(
" uint grp_id = get_group_id(0); \n");
760 source.append(
" uint grp_nm = get_num_groups(0); \n");
761 source.append(
" uint lcl_id = get_local_id(0); \n");
762 source.append(
" uint lcl_sz = get_local_size(0); \n");
764 source.append(
" const unsigned int tid = lcl_id; \n");
768 source.append(
" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
769 source.append(
" { \n");
770 source.append(
" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
771 source.append(
" s_left[addr_w] = mid; \n");
772 source.append(
" s_right[addr_w] = right; \n");
773 source.append(
" s_left_count[addr_w] = mid_count; \n");
774 source.append(
" s_right_count[addr_w] = right_count; \n");
775 source.append(
" } \n");
776 source.append(
" } \n");
782 template<
typename StringType>
785 source.append(
" \n");
786 source.append(
" void \n");
787 source.append(
" compactIntervalsShort(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
788 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
789 source.append(
" __local unsigned short *s_left_count, \n");
790 source.append(
" __local unsigned short *s_right_count, \n");
791 source.append(
" "); source.append(numeric_string); source.append(
" mid, \n");
792 source.append(
" "); source.append(numeric_string); source.append(
" right, \n");
793 source.append(
" unsigned int mid_count, unsigned int right_count, \n");
794 source.append(
" __local unsigned short *s_compaction_list, \n");
795 source.append(
" unsigned int num_threads_active, \n");
796 source.append(
" unsigned int is_active_second) \n");
797 source.append(
" { \n");
798 source.append(
" uint glb_id = get_global_id(0); \n");
799 source.append(
" uint grp_id = get_group_id(0); \n");
800 source.append(
" uint grp_nm = get_num_groups(0); \n");
801 source.append(
" uint lcl_id = get_local_id(0); \n");
802 source.append(
" uint lcl_sz = get_local_size(0); \n");
804 source.append(
" const unsigned int tid = lcl_id; \n");
808 source.append(
" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
809 source.append(
" { \n");
810 source.append(
" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
811 source.append(
" s_left[addr_w] = mid; \n");
812 source.append(
" s_right[addr_w] = right; \n");
813 source.append(
" s_left_count[addr_w] = mid_count; \n");
814 source.append(
" s_right_count[addr_w] = right_count; \n");
815 source.append(
" } \n");
816 source.append(
" } \n");
821 template<
typename StringType>
824 source.append(
" \n");
825 source.append(
" void \n");
826 source.append(
" storeIntervalConverged( __local "); source.append(numeric_string); source.append(
" *s_left, \n");
827 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
828 source.append(
" __local unsigned int *s_left_count, \n");
829 source.append(
" __local unsigned int *s_right_count, \n");
830 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
831 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
832 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
833 source.append(
" unsigned int *left_count, \n");
834 source.append(
" unsigned int *mid_count, \n");
835 source.append(
" unsigned int *right_count, \n");
836 source.append(
" __local unsigned int *s_compaction_list_exc, \n");
837 source.append(
" __local unsigned int *compact_second_chunk, \n");
838 source.append(
" const unsigned int num_threads_active, \n");
839 source.append(
" unsigned int *is_active_second) \n");
840 source.append(
" { \n");
841 source.append(
" uint glb_id = get_global_id(0); \n");
842 source.append(
" uint grp_id = get_group_id(0); \n");
843 source.append(
" uint grp_nm = get_num_groups(0); \n");
844 source.append(
" uint lcl_id = get_local_id(0); \n");
845 source.append(
" uint lcl_sz = get_local_size(0); \n");
847 source.append(
" const unsigned int tid = lcl_id; \n");
848 source.append(
" const unsigned int multiplicity = *right_count - *left_count; \n");
850 source.append(
" if (1 == multiplicity) \n");
851 source.append(
" { \n");
854 source.append(
" s_left[tid] = *left; \n");
855 source.append(
" s_right[tid] = *right; \n");
856 source.append(
" s_left_count[tid] = *left_count; \n");
857 source.append(
" s_right_count[tid] = *right_count; \n");
858 source.append(
" \n");
861 source.append(
" *is_active_second = 0; \n");
862 source.append(
" s_compaction_list_exc[tid] = 0; \n");
863 source.append(
" } \n");
864 source.append(
" else \n");
865 source.append(
" { \n");
868 source.append(
" *mid_count = *left_count + (multiplicity >> 1); \n");
871 source.append(
" s_left[tid] = *left; \n");
872 source.append(
" s_right[tid] = *right; \n");
873 source.append(
" s_left_count[tid] = *left_count; \n");
874 source.append(
" s_right_count[tid] = *mid_count; \n");
875 source.append(
" *mid = *left; \n");
878 source.append(
" *is_active_second = 1; \n");
879 source.append(
" s_compaction_list_exc[tid] = 1; \n");
880 source.append(
" *compact_second_chunk = 1; \n");
881 source.append(
" } \n");
882 source.append(
" } \n");
889 template<
typename StringType>
892 source.append(
" \n");
893 source.append(
" void \n");
894 source.append(
" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(
" *s_left, \n");
895 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
896 source.append(
" __local unsigned short *s_left_count, \n");
897 source.append(
" __local unsigned short *s_right_count, \n");
898 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
899 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
900 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
901 source.append(
" unsigned int *left_count, \n");
902 source.append(
" unsigned int *mid_count, \n");
903 source.append(
" unsigned int *right_count, \n");
904 source.append(
" __local unsigned short *s_compaction_list_exc, \n");
905 source.append(
" __local unsigned int *compact_second_chunk, \n");
906 source.append(
" const unsigned int num_threads_active, \n");
907 source.append(
" unsigned int *is_active_second) \n");
908 source.append(
" { \n");
909 source.append(
" uint glb_id = get_global_id(0); \n");
910 source.append(
" uint grp_id = get_group_id(0); \n");
911 source.append(
" uint grp_nm = get_num_groups(0); \n");
912 source.append(
" uint lcl_id = get_local_id(0); \n");
913 source.append(
" uint lcl_sz = get_local_size(0); \n");
915 source.append(
" const unsigned int tid = lcl_id; \n");
916 source.append(
" const unsigned int multiplicity = *right_count - *left_count; \n");
918 source.append(
" if (1 == multiplicity) \n");
919 source.append(
" { \n");
922 source.append(
" s_left[tid] = *left; \n");
923 source.append(
" s_right[tid] = *right; \n");
924 source.append(
" s_left_count[tid] = *left_count; \n");
925 source.append(
" s_right_count[tid] = *right_count; \n");
926 source.append(
" \n");
929 source.append(
" *is_active_second = 0; \n");
930 source.append(
" s_compaction_list_exc[tid] = 0; \n");
931 source.append(
" } \n");
932 source.append(
" else \n");
933 source.append(
" { \n");
936 source.append(
" *mid_count = *left_count + (multiplicity >> 1); \n");
939 source.append(
" s_left[tid] = *left; \n");
940 source.append(
" s_right[tid] = *right; \n");
941 source.append(
" s_left_count[tid] = *left_count; \n");
942 source.append(
" s_right_count[tid] = *mid_count; \n");
943 source.append(
" *mid = *left; \n");
946 source.append(
" *is_active_second = 1; \n");
947 source.append(
" s_compaction_list_exc[tid] = 1; \n");
948 source.append(
" *compact_second_chunk = 1; \n");
949 source.append(
" } \n");
950 source.append(
" } \n");
970 template<
typename StringType>
973 source.append(
" \n");
974 source.append(
" void \n");
975 source.append(
" subdivideActiveIntervalMulti(const unsigned int tid, \n");
976 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
977 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
978 source.append(
" __local unsigned int *s_left_count, \n");
979 source.append(
" __local unsigned int *s_right_count, \n");
980 source.append(
" const unsigned int num_threads_active, \n");
981 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
982 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
983 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
984 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
985 source.append(
" __local unsigned int *all_threads_converged) \n");
986 source.append(
" { \n");
987 source.append(
" uint glb_id = get_global_id(0); \n");
988 source.append(
" uint grp_id = get_group_id(0); \n");
989 source.append(
" uint grp_nm = get_num_groups(0); \n");
990 source.append(
" uint lcl_id = get_local_id(0); \n");
991 source.append(
" uint lcl_sz = get_local_size(0); \n");
994 source.append(
" if (tid < num_threads_active) \n");
995 source.append(
" { \n");
997 source.append(
" *left = s_left[tid]; \n");
998 source.append(
" *right = s_right[tid]; \n");
999 source.append(
" *left_count = s_left_count[tid]; \n");
1000 source.append(
" *right_count = s_right_count[tid]; \n");
1003 source.append(
" if (*left != *right) \n");
1004 source.append(
" { \n");
1006 source.append(
" *mid = computeMidpoint(*left, *right); \n");
1007 source.append(
" *all_threads_converged = 0; \n");
1008 source.append(
" } \n");
1009 source.append(
" else if ((*right_count - *left_count) > 1) \n");
1010 source.append(
" { \n");
1013 source.append(
" *all_threads_converged = 0; \n");
1014 source.append(
" } \n");
1016 source.append(
" } \n");
1018 source.append(
" } \n");
1022 template<
typename StringType>
1025 source.append(
" \n");
1026 source.append(
" void \n");
1027 source.append(
" subdivideActiveIntervalShort(const unsigned int tid, \n");
1028 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1029 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1030 source.append(
" __local unsigned short *s_left_count, \n");
1031 source.append(
" __local unsigned short *s_right_count, \n");
1032 source.append(
" const unsigned int num_threads_active, \n");
1033 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
1034 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
1035 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
1036 source.append(
" "); source.append(numeric_string); source.append(
" *mid, \n");
1037 source.append(
" __local unsigned int *all_threads_converged) \n");
1038 source.append(
" { \n");
1039 source.append(
" uint glb_id = get_global_id(0); \n");
1040 source.append(
" uint grp_id = get_group_id(0); \n");
1041 source.append(
" uint grp_nm = get_num_groups(0); \n");
1042 source.append(
" uint lcl_id = get_local_id(0); \n");
1043 source.append(
" uint lcl_sz = get_local_size(0); \n");
1046 source.append(
" if (tid < num_threads_active) \n");
1047 source.append(
" { \n");
1049 source.append(
" *left = s_left[tid]; \n");
1050 source.append(
" *right = s_right[tid]; \n");
1051 source.append(
" *left_count = s_left_count[tid]; \n");
1052 source.append(
" *right_count = s_right_count[tid]; \n");
1055 source.append(
" if (*left != *right) \n");
1056 source.append(
" { \n");
1058 source.append(
" *mid = computeMidpoint(*left, *right); \n");
1059 source.append(
" *all_threads_converged = 0; \n");
1060 source.append(
" } \n");
1062 source.append(
" } \n");
1064 source.append(
" } \n");
1084 template <
typename StringType>
1087 source.append(
" __kernel \n");
1088 source.append(
" void \n");
1089 source.append(
" bisectKernelSmall(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1090 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1091 source.append(
" const unsigned int n, \n");
1092 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1093 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1094 source.append(
" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
1095 source.append(
" const "); source.append(numeric_string); source.append(
" lg, \n");
1096 source.append(
" const "); source.append(numeric_string); source.append(
" ug, \n");
1097 source.append(
" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
1098 source.append(
" "); source.append(numeric_string); source.append(
" epsilon \n");
1099 source.append(
" ) \n");
1100 source.append(
" { \n");
1101 source.append(
" g_s = g_s + 1; \n");
1102 source.append(
" uint glb_id = get_global_id(0); \n");
1103 source.append(
" uint grp_id = get_group_id(0); \n");
1104 source.append(
" uint grp_nm = get_num_groups(0); \n");
1105 source.append(
" uint lcl_id = get_local_id(0); \n");
1106 source.append(
" uint lcl_sz = get_local_size(0); \n");
1110 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1111 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1115 source.append(
" __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1116 source.append(
" __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1119 source.append(
" __local unsigned int \n");
1120 source.append(
" s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");
1125 source.append(
" __local unsigned int compact_second_chunk; \n");
1126 source.append(
" __local unsigned int all_threads_converged; \n");
1129 source.append(
" __local unsigned int num_threads_active; \n");
1132 source.append(
" __local unsigned int num_threads_compaction; \n");
1135 source.append(
" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1140 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
1141 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
1142 source.append(
" unsigned int left_count = 0; \n");
1143 source.append(
" unsigned int right_count = 0; \n");
1145 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1147 source.append(
" unsigned int mid_count = 0; \n");
1149 source.append(
" unsigned int is_active_second = 0; \n");
1151 source.append(
" s_compaction_list[lcl_id] = 0; \n");
1152 source.append(
" s_left[lcl_id] = 0.0; \n");
1153 source.append(
" s_right[lcl_id] = 0.0; \n");
1154 source.append(
" s_left_count[lcl_id] = 0; \n");
1155 source.append(
" s_right_count[lcl_id] = 0; \n");
1157 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1160 source.append(
" if (0 == lcl_id) \n");
1161 source.append(
" { \n");
1162 source.append(
" s_left[0] = lg; \n");
1163 source.append(
" s_right[0] = ug; \n");
1164 source.append(
" s_left_count[0] = lg_eig_count; \n");
1165 source.append(
" s_right_count[0] = ug_eig_count; \n");
1167 source.append(
" compact_second_chunk = 0; \n");
1168 source.append(
" num_threads_active = 1; \n");
1170 source.append(
" num_threads_compaction = 1; \n");
1171 source.append(
" } \n");
1176 source.append(
" while (true) \n");
1177 source.append(
" { \n");
1179 source.append(
" all_threads_converged = 1; \n");
1180 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1182 source.append(
" is_active_second = 0; \n");
1183 source.append(
" subdivideActiveIntervalMulti(lcl_id, \n");
1184 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1185 source.append(
" num_threads_active, \n");
1186 source.append(
" &left, &right, &left_count, &right_count, \n");
1187 source.append(
" &mid, &all_threads_converged); \n");
1189 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1192 source.append(
" if (1 == all_threads_converged) \n");
1193 source.append(
" { \n");
1194 source.append(
" break; \n");
1195 source.append(
" } \n");
1197 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1204 source.append(
" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
1205 source.append(
" lcl_id, num_threads_active, \n");
1206 source.append(
" s_left, s_right, \n");
1207 source.append(
" (left == right)); \n");
1209 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1221 source.append(
" if (lcl_id < num_threads_active) \n");
1222 source.append(
" { \n");
1224 source.append(
" if (left != right) \n");
1225 source.append(
" { \n");
1228 source.append(
" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
1229 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1230 source.append(
" left, mid, right, \n");
1231 source.append(
" left_count, mid_count, right_count, \n");
1232 source.append(
" epsilon, &compact_second_chunk, \n");
1233 source.append(
" s_compaction_list_exc, \n");
1234 source.append(
" &is_active_second); \n");
1235 source.append(
" } \n");
1236 source.append(
" else \n");
1237 source.append(
" { \n");
1239 source.append(
" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1240 source.append(
" &left, &mid, &right, \n");
1241 source.append(
" &left_count, &mid_count, &right_count, \n");
1242 source.append(
" s_compaction_list_exc, &compact_second_chunk, \n");
1243 source.append(
" num_threads_active, \n");
1244 source.append(
" &is_active_second); \n");
1245 source.append(
" } \n");
1246 source.append(
" } \n");
1249 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1254 source.append(
" if (compact_second_chunk > 0) \n");
1255 source.append(
" { \n");
1257 source.append(
" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1259 source.append(
" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1260 source.append(
" mid, right, mid_count, right_count, \n");
1261 source.append(
" s_compaction_list, num_threads_active, \n");
1262 source.append(
" is_active_second); \n");
1263 source.append(
" } \n");
1265 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1267 source.append(
" if (0 == lcl_id) \n");
1268 source.append(
" { \n");
1271 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
1273 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1275 source.append(
" compact_second_chunk = 0; \n");
1276 source.append(
" } \n");
1278 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1280 source.append(
" } \n");
1282 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1289 source.append(
" if (lcl_id < n) \n");
1290 source.append(
" { \n");
1292 source.append(
" g_left[lcl_id] = s_left[lcl_id]; \n");
1294 source.append(
" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
1295 source.append(
" } \n");
1296 source.append(
" } \n");
1316 template <
typename StringType>
1319 source.append(
" __kernel \n");
1320 source.append(
" void \n");
1321 source.append(
" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1322 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1323 source.append(
" const unsigned int n, \n");
1324 source.append(
" __global unsigned int *blocks_mult, \n");
1325 source.append(
" __global unsigned int *blocks_mult_sum, \n");
1326 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1327 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1328 source.append(
" __global unsigned int *g_left_count, \n");
1329 source.append(
" __global unsigned int *g_right_count, \n");
1330 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_lambda, \n");
1331 source.append(
" __global unsigned int *g_pos, \n");
1332 source.append(
" "); source.append(numeric_string); source.append(
" precision \n");
1333 source.append(
" ) \n");
1334 source.append(
" { \n");
1335 source.append(
" g_s = g_s + 1; \n");
1336 source.append(
" uint glb_id = get_global_id(0); \n");
1337 source.append(
" uint grp_id = get_group_id(0); \n");
1338 source.append(
" uint grp_nm = get_num_groups(0); \n");
1339 source.append(
" uint lcl_id = get_local_id(0); \n");
1340 source.append(
" uint lcl_sz = get_local_size(0); \n");
1342 source.append(
" const unsigned int tid = lcl_id; \n");
1345 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1346 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1349 source.append(
" __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1350 source.append(
" __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1353 source.append(
" __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
1355 source.append(
" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1358 source.append(
" __local unsigned int all_threads_converged; \n");
1360 source.append(
" __local unsigned int num_threads_active; \n");
1362 source.append(
" __local unsigned int num_threads_compaction; \n");
1364 source.append(
" __local unsigned int compact_second_chunk; \n");
1367 source.append(
" __local unsigned int c_block_start; \n");
1368 source.append(
" __local unsigned int c_block_end; \n");
1369 source.append(
" __local unsigned int c_block_offset_output; \n");
1372 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1374 source.append(
" unsigned int mid_count = 0; \n");
1376 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
1377 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
1378 source.append(
" unsigned int left_count = 0; \n");
1379 source.append(
" unsigned int right_count = 0; \n");
1381 source.append(
" unsigned int is_active_second = 0; \n");
1383 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1386 source.append(
" if (0 == tid) \n");
1387 source.append(
" { \n");
1389 source.append(
" c_block_start = blocks_mult[grp_id]; \n");
1390 source.append(
" c_block_end = blocks_mult[grp_id + 1]; \n");
1391 source.append(
" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
1392 source.append(
" \n");
1394 source.append(
" num_threads_active = c_block_end - c_block_start; \n");
1395 source.append(
" s_compaction_list[0] = 0; \n");
1396 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1398 source.append(
" all_threads_converged = 1; \n");
1399 source.append(
" compact_second_chunk = 0; \n");
1400 source.append(
" } \n");
1401 source.append(
" s_left_count [tid] = 42; \n");
1402 source.append(
" s_right_count[tid] = 42; \n");
1403 source.append(
" s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1404 source.append(
" s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1405 source.append(
" \n");
1406 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1407 source.append(
" \n");
1410 source.append(
" if (tid < num_threads_active) \n");
1411 source.append(
" { \n");
1413 source.append(
" s_left[tid] = g_left[c_block_start + tid]; \n");
1414 source.append(
" s_right[tid] = g_right[c_block_start + tid]; \n");
1415 source.append(
" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
1416 source.append(
" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
1417 source.append(
" \n");
1418 source.append(
" } \n");
1419 source.append(
" \n");
1420 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1421 source.append(
" unsigned int iter = 0; \n");
1423 source.append(
" while (true) \n");
1424 source.append(
" { \n");
1425 source.append(
" iter++; \n");
1427 source.append(
" s_compaction_list[lcl_id] = 0; \n");
1428 source.append(
" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
1429 source.append(
" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1432 source.append(
" subdivideActiveIntervalMulti(tid, s_left, s_right, \n");
1433 source.append(
" s_left_count, s_right_count, \n");
1434 source.append(
" num_threads_active, \n");
1435 source.append(
" &left, &right, &left_count, &right_count, \n");
1436 source.append(
" &mid, &all_threads_converged); \n");
1437 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1440 source.append(
" if (1 == all_threads_converged) \n");
1441 source.append(
" { \n");
1442 source.append(
" \n");
1443 source.append(
" break; \n");
1444 source.append(
" } \n");
1450 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1451 source.append(
" mid, tid, num_threads_active, \n");
1452 source.append(
" s_left, s_right, \n");
1453 source.append(
" (left == right)); \n");
1454 source.append(
" \n");
1455 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1457 source.append(
" if (tid < num_threads_active) \n");
1458 source.append(
" { \n");
1459 source.append(
" \n");
1461 source.append(
" if (left != right) \n");
1462 source.append(
" { \n");
1464 source.append(
" storeNonEmptyIntervals(tid, num_threads_active, \n");
1465 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
1466 source.append(
" left, mid, right, \n");
1467 source.append(
" left_count, mid_count, right_count, \n");
1468 source.append(
" precision, &compact_second_chunk, \n");
1469 source.append(
" s_compaction_list_exc, \n");
1470 source.append(
" &is_active_second); \n");
1471 source.append(
" \n");
1472 source.append(
" } \n");
1473 source.append(
" else \n");
1474 source.append(
" { \n");
1476 source.append(
" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1477 source.append(
" &left, &mid, &right, \n");
1478 source.append(
" &left_count, &mid_count, &right_count, \n");
1479 source.append(
" s_compaction_list_exc, &compact_second_chunk, \n");
1480 source.append(
" num_threads_active, \n");
1481 source.append(
" &is_active_second); \n");
1482 source.append(
" \n");
1483 source.append(
" } \n");
1484 source.append(
" } \n");
1486 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1490 source.append(
" if (1 == compact_second_chunk) \n");
1491 source.append(
" { \n");
1493 source.append(
" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1494 source.append(
" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1495 source.append(
" mid, right, mid_count, right_count, \n");
1496 source.append(
" s_compaction_list, num_threads_active, \n");
1497 source.append(
" is_active_second); \n");
1498 source.append(
" } \n");
1500 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1503 source.append(
" if (0 == tid) \n");
1504 source.append(
" { \n");
1505 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
1506 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
1508 source.append(
" compact_second_chunk = 0; \n");
1509 source.append(
" all_threads_converged = 1; \n");
1510 source.append(
" } \n");
1512 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1515 source.append(
" s_compaction_list_exc[lcl_id] = 0; \n");
1516 source.append(
" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
1517 source.append(
" \n");
1518 source.append(
" if (num_threads_compaction > lcl_sz) \n");
1519 source.append(
" { \n");
1520 source.append(
" break; \n");
1521 source.append(
" } \n");
1524 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1526 source.append(
" } \n");
1529 source.append(
" if (tid < num_threads_active) \n");
1530 source.append(
" { \n");
1532 source.append(
" unsigned int addr = c_block_offset_output + tid; \n");
1533 source.append(
" \n");
1534 source.append(
" g_lambda[addr] = s_left[tid]; \n");
1535 source.append(
" g_pos[addr] = s_right_count[tid]; \n");
1536 source.append(
" } \n");
1537 source.append(
" } \n");
1554 template <
typename StringType>
1557 source.append(
" __kernel \n");
1558 source.append(
" void \n");
1559 source.append(
" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
1560 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
1561 source.append(
" const unsigned int n, \n");
1562 source.append(
" unsigned int num_intervals, \n");
1563 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left, \n");
1564 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right, \n");
1565 source.append(
" __global unsigned int *g_pos, \n");
1566 source.append(
" "); source.append(numeric_string); source.append(
" precision) \n");
1567 source.append(
" { \n");
1568 source.append(
" g_s = g_s + 1; \n");
1569 source.append(
" uint glb_id = get_global_id(0); \n");
1570 source.append(
" uint grp_id = get_group_id(0); \n");
1571 source.append(
" uint grp_nm = get_num_groups(0); \n");
1572 source.append(
" uint lcl_id = get_local_id(0); \n");
1573 source.append(
" uint lcl_sz = get_local_size(0); \n");
1574 source.append(
" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n");
1575 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1576 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1579 source.append(
" "); source.append(numeric_string); source.append(
" left, right; \n");
1582 source.append(
" unsigned int right_count; \n");
1584 source.append(
" unsigned int converged = 0; \n");
1586 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
1588 source.append(
" unsigned int mid_count = 0; \n");
1591 source.append(
" if (gtid < num_intervals) \n");
1592 source.append(
" { \n");
1593 source.append(
" left = g_left[gtid]; \n");
1594 source.append(
" right = g_right[gtid]; \n");
1595 source.append(
" right_count = g_pos[gtid]; \n");
1596 source.append(
" } \n");
1598 source.append(
" __local unsigned int converged_all_threads; \n");
1600 source.append(
" if (0 == lcl_id) \n");
1601 source.append(
" { \n");
1602 source.append(
" converged_all_threads = 0; \n");
1603 source.append(
" } \n");
1604 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1606 source.append(
" while (true) \n");
1607 source.append(
" { \n");
1608 source.append(
" converged_all_threads = 1; \n");
1610 source.append(
" if ((gtid < num_intervals) && (0 == converged)) \n");
1611 source.append(
" { \n");
1612 source.append(
" mid = computeMidpoint(left, right); \n");
1613 source.append(
" } \n");
1615 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1616 source.append(
" mid, gtid, num_intervals, \n");
1617 source.append(
" s_left_scratch, \n");
1618 source.append(
" s_right_scratch, \n");
1619 source.append(
" converged); \n");
1620 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1622 source.append(
" if ((gtid < num_intervals) && (0 == converged)) \n");
1623 source.append(
" { \n");
1625 source.append(
" if (right_count == mid_count) \n");
1626 source.append(
" { \n");
1627 source.append(
" right = mid; \n");
1628 source.append(
" } \n");
1629 source.append(
" else \n");
1630 source.append(
" { \n");
1631 source.append(
" left = mid; \n");
1632 source.append(
" } \n");
1634 source.append(
" "); source.append(numeric_string); source.append(
" t0 = right - left; \n");
1635 source.append(
" "); source.append(numeric_string); source.append(
" t1 = max(fabs(right), fabs(left)) * precision; \n");
1637 source.append(
" if (t0 < min(precision, t1)) \n");
1638 source.append(
" { \n");
1639 source.append(
" "); source.append(numeric_string); source.append(
" lambda = computeMidpoint(left, right); \n");
1640 source.append(
" left = lambda; \n");
1641 source.append(
" right = lambda; \n");
1643 source.append(
" converged = 1; \n");
1644 source.append(
" } \n");
1645 source.append(
" else \n");
1646 source.append(
" { \n");
1647 source.append(
" converged_all_threads = 0; \n");
1648 source.append(
" } \n");
1649 source.append(
" } \n");
1650 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1651 source.append(
" if (1 == converged_all_threads) \n");
1652 source.append(
" { \n");
1653 source.append(
" break; \n");
1654 source.append(
" } \n");
1655 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1656 source.append(
" } \n");
1658 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1659 source.append(
" if (gtid < num_intervals) \n");
1660 source.append(
" { \n");
1663 source.append(
" g_left[gtid] = left; \n");
1664 source.append(
" } \n");
1665 source.append(
" } \n");
1672 template <
typename StringType>
1675 source.append(
" \n");
1676 source.append(
" void writeToGmem(const unsigned int tid, const unsigned int tid_2, \n");
1677 source.append(
" const unsigned int num_threads_active, \n");
1678 source.append(
" const unsigned int num_blocks_mult, \n");
1679 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_one, \n");
1680 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_one, \n");
1681 source.append(
" __global unsigned int *g_pos_one, \n");
1682 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_mult, \n");
1683 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_mult, \n");
1684 source.append(
" __global unsigned int *g_left_count_mult, \n");
1685 source.append(
" __global unsigned int *g_right_count_mult, \n");
1686 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1687 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1688 source.append(
" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1689 source.append(
" __global unsigned int *g_blocks_mult, \n");
1690 source.append(
" __global unsigned int *g_blocks_mult_sum, \n");
1691 source.append(
" __local unsigned short *s_compaction_list, \n");
1692 source.append(
" __local unsigned short *s_cl_helper, \n");
1693 source.append(
" unsigned int offset_mult_lambda \n");
1694 source.append(
" ) \n");
1695 source.append(
" { \n");
1696 source.append(
" uint glb_id = get_global_id(0); \n");
1697 source.append(
" uint grp_id = get_group_id(0); \n");
1698 source.append(
" uint grp_nm = get_num_groups(0); \n");
1699 source.append(
" uint lcl_id = get_local_id(0); \n");
1700 source.append(
" uint lcl_sz = get_local_size(0); \n");
1703 source.append(
" if (tid < offset_mult_lambda) \n");
1704 source.append(
" { \n");
1706 source.append(
" g_left_one[tid] = s_left[tid]; \n");
1707 source.append(
" g_right_one[tid] = s_right[tid]; \n");
1709 source.append(
" g_pos_one[tid] = s_right_count[tid]; \n");
1710 source.append(
" } \n");
1711 source.append(
" else \n");
1712 source.append(
" { \n");
1714 source.append(
" \n");
1715 source.append(
" g_left_mult[tid - offset_mult_lambda] = s_left[tid]; \n");
1716 source.append(
" g_right_mult[tid - offset_mult_lambda] = s_right[tid]; \n");
1717 source.append(
" g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; \n");
1718 source.append(
" g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; \n");
1719 source.append(
" } \n");
1721 source.append(
" if (tid_2 < num_threads_active) \n");
1722 source.append(
" { \n");
1724 source.append(
" if (tid_2 < offset_mult_lambda) \n");
1725 source.append(
" { \n");
1727 source.append(
" g_left_one[tid_2] = s_left[tid_2]; \n");
1728 source.append(
" g_right_one[tid_2] = s_right[tid_2]; \n");
1730 source.append(
" g_pos_one[tid_2] = s_right_count[tid_2]; \n");
1731 source.append(
" } \n");
1732 source.append(
" else \n");
1733 source.append(
" { \n");
1735 source.append(
" g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; \n");
1736 source.append(
" g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; \n");
1737 source.append(
" g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; \n");
1738 source.append(
" g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; \n");
1739 source.append(
" } \n");
1741 source.append(
" } \n");
1743 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1747 source.append(
" if (tid <= num_blocks_mult) \n");
1748 source.append(
" { \n");
1749 source.append(
" g_blocks_mult[tid] = s_compaction_list[tid]; \n");
1750 source.append(
" g_blocks_mult_sum[tid] = s_cl_helper[tid]; \n");
1751 source.append(
" } \n");
1752 source.append(
" if (tid_2 <= num_blocks_mult) \n");
1753 source.append(
" { \n");
1754 source.append(
" g_blocks_mult[tid_2] = s_compaction_list[tid_2]; \n");
1755 source.append(
" g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; \n");
1756 source.append(
" } \n");
1757 source.append(
" } \n");
1764 template <
typename StringType>
1767 source.append(
" \n");
1768 source.append(
" void \n");
1769 source.append(
" compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, \n");
1770 source.append(
" const unsigned int num_threads_active, \n");
1771 source.append(
" __local unsigned int *offset_mult_lambda, \n");
1772 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_left, \n");
1773 source.append(
" __local "); source.append(numeric_string); source.append(
" *s_right, \n");
1774 source.append(
" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1775 source.append(
" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
1776 source.append(
" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper, \n");
1777 source.append(
" unsigned int is_one_lambda, unsigned int is_one_lambda_2, \n");
1778 source.append(
" "); source.append(numeric_string); source.append(
" *left, \n");
1779 source.append(
" "); source.append(numeric_string); source.append(
" *right, \n");
1780 source.append(
" "); source.append(numeric_string); source.append(
" *left_2, \n");
1781 source.append(
" "); source.append(numeric_string); source.append(
" *right_2, \n");
1782 source.append(
" unsigned int *left_count, unsigned int *right_count, \n");
1783 source.append(
" unsigned int *left_count_2, unsigned int *right_count_2, \n");
1784 source.append(
" unsigned int c_block_iend, unsigned int c_sum_block, \n");
1785 source.append(
" unsigned int c_block_iend_2, unsigned int c_sum_block_2 \n");
1786 source.append(
" ) \n");
1787 source.append(
" { \n");
1788 source.append(
" uint glb_id = get_global_id(0); \n");
1789 source.append(
" uint grp_id = get_group_id(0); \n");
1790 source.append(
" uint grp_nm = get_num_groups(0); \n");
1791 source.append(
" uint lcl_id = get_local_id(0); \n");
1792 source.append(
" uint lcl_sz = get_local_size(0); \n");
1795 source.append(
" *left = s_left[tid]; \n");
1796 source.append(
" *right = s_right[tid]; \n");
1798 source.append(
" if (tid_2 < num_threads_active) \n");
1799 source.append(
" { \n");
1800 source.append(
" *left_2 = s_left[tid_2]; \n");
1801 source.append(
" *right_2 = s_right[tid_2]; \n");
1802 source.append(
" } \n");
1804 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1808 source.append(
" unsigned int ptr_w = 0; \n");
1809 source.append(
" unsigned int ptr_w_2 = 0; \n");
1810 source.append(
" unsigned int ptr_blocking_w = 0; \n");
1811 source.append(
" unsigned int ptr_blocking_w_2 = 0; \n");
1812 source.append(
" \n");
1813 source.append(
" \n");
1815 source.append(
" ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] \n");
1816 source.append(
" : s_cl_mult[tid] + *offset_mult_lambda; \n");
1818 source.append(
" if (0 != c_block_iend) \n");
1819 source.append(
" { \n");
1820 source.append(
" ptr_blocking_w = s_cl_blocking[tid]; \n");
1821 source.append(
" } \n");
1823 source.append(
" if (tid_2 < num_threads_active) \n");
1824 source.append(
" { \n");
1825 source.append(
" ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2] \n");
1826 source.append(
" : s_cl_mult[tid_2] + *offset_mult_lambda; \n");
1828 source.append(
" if (0 != c_block_iend_2) \n");
1829 source.append(
" { \n");
1830 source.append(
" ptr_blocking_w_2 = s_cl_blocking[tid_2]; \n");
1831 source.append(
" } \n");
1832 source.append(
" } \n");
1833 source.append(
" \n");
1834 source.append(
" \n");
1835 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1836 source.append(
" \n");
1838 source.append(
" if(tid < num_threads_active) \n");
1839 source.append(
" { \n ");
1840 source.append(
" s_left[ptr_w] = *left; \n");
1841 source.append(
" s_right[ptr_w] = *right; \n");
1842 source.append(
" s_left_count[ptr_w] = *left_count; \n");
1843 source.append(
" s_right_count[ptr_w] = *right_count; \n");
1844 source.append(
" } \n ");
1845 source.append(
" \n");
1846 source.append(
" \n");
1847 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1848 source.append(
" if(tid == 1) \n");
1849 source.append(
" { \n");
1850 source.append(
" s_left[ptr_w] = *left; \n");
1851 source.append(
" s_right[ptr_w] = *right; \n");
1852 source.append(
" s_left_count[ptr_w] = *left_count; \n");
1853 source.append(
" s_right_count[ptr_w] = *right_count; \n");
1854 source.append(
" \n");
1855 source.append(
" } \n");
1856 source.append(
" if (0 != c_block_iend) \n");
1857 source.append(
" { \n");
1858 source.append(
" s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; \n");
1859 source.append(
" s_cl_helper[ptr_blocking_w + 1] = c_sum_block; \n");
1860 source.append(
" } \n");
1861 source.append(
" \n");
1862 source.append(
" if (tid_2 < num_threads_active) \n");
1863 source.append(
" { \n");
1865 source.append(
" s_left[ptr_w_2] = *left_2; \n");
1866 source.append(
" s_right[ptr_w_2] = *right_2; \n");
1867 source.append(
" s_left_count[ptr_w_2] = *left_count_2; \n");
1868 source.append(
" s_right_count[ptr_w_2] = *right_count_2; \n");
1870 source.append(
" if (0 != c_block_iend_2) \n");
1871 source.append(
" { \n");
1872 source.append(
" s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; \n");
1873 source.append(
" s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; \n");
1874 source.append(
" } \n");
1875 source.append(
" } \n");
1877 source.append(
" } \n");
1886 template <
typename StringType>
1889 source.append(
" \n");
1890 source.append(
" void \n");
1891 source.append(
" scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, \n");
1892 source.append(
" const unsigned int num_threads_compaction, \n");
1893 source.append(
" __local unsigned short *s_cl_blocking, \n");
1894 source.append(
" __local unsigned short *s_cl_helper \n");
1895 source.append(
" ) \n");
1896 source.append(
" { \n");
1897 source.append(
" uint glb_id = get_global_id(0); \n");
1898 source.append(
" uint grp_id = get_group_id(0); \n");
1899 source.append(
" uint grp_nm = get_num_groups(0); \n");
1900 source.append(
" uint lcl_id = get_local_id(0); \n");
1901 source.append(
" uint lcl_sz = get_local_size(0); \n");
1905 source.append(
" s_cl_blocking[tid] = s_cl_helper[tid]; \n");
1907 source.append(
" if (tid_2 < num_threads_compaction) \n");
1908 source.append(
" { \n");
1909 source.append(
" s_cl_blocking[tid_2] = s_cl_helper[tid_2]; \n");
1910 source.append(
" } \n");
1912 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1919 source.append(
" unsigned int offset = 1; \n");
1922 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
1923 source.append(
" { \n");
1925 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1927 source.append(
" if (tid < d) \n");
1928 source.append(
" { \n");
1930 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
1931 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
1932 source.append(
" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1933 source.append(
" } \n");
1935 source.append(
" offset <<= 1; \n");
1936 source.append(
" } \n");
1939 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
1940 source.append(
" { \n");
1942 source.append(
" offset >>= 1; \n");
1943 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1946 source.append(
" if (tid < (d-1)) \n");
1947 source.append(
" { \n");
1949 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
1950 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
1951 source.append(
" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1952 source.append(
" } \n");
1953 source.append(
" } \n");
1955 source.append(
" } \n");
1965 template <
typename StringType>
1968 source.append(
" \n");
1969 source.append(
" void \n");
1970 source.append(
" scanSumBlocks(const unsigned int tid, const unsigned int tid_2, \n");
1971 source.append(
" const unsigned int num_threads_active, \n");
1972 source.append(
" const unsigned int num_threads_compaction, \n");
1973 source.append(
" __local unsigned short *s_cl_blocking, \n");
1974 source.append(
" __local unsigned short *s_cl_helper) \n");
1975 source.append(
" { \n");
1976 source.append(
" uint glb_id = get_global_id(0); \n");
1977 source.append(
" uint grp_id = get_group_id(0); \n");
1978 source.append(
" uint grp_nm = get_num_groups(0); \n");
1979 source.append(
" uint lcl_id = get_local_id(0); \n");
1980 source.append(
" uint lcl_sz = get_local_size(0); \n");
1982 source.append(
" unsigned int offset = 1; \n");
1986 source.append(
" for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) \n");
1987 source.append(
" { \n");
1989 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1991 source.append(
" if (tid < d) \n");
1992 source.append(
" { \n");
1994 source.append(
" unsigned int ai = offset*(2*tid+1)-1; \n");
1995 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
1997 source.append(
" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
1998 source.append(
" } \n");
2000 source.append(
" offset *= 2; \n");
2001 source.append(
" } \n");
2005 source.append(
" for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) \n");
2006 source.append(
" { \n");
2008 source.append(
" offset >>= 1; \n");
2009 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2011 source.append(
" if (tid < (d-1)) \n");
2012 source.append(
" { \n");
2013 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
2014 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
2015 source.append(
" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
2016 source.append(
" } \n");
2017 source.append(
" } \n");
2018 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2020 source.append(
" if (0 == tid) \n");
2021 source.append(
" { \n");
2026 source.append(
" s_cl_helper[num_threads_active - 1] = \n");
2027 source.append(
" s_cl_helper[num_threads_compaction - 1]; \n");
2028 source.append(
" s_cl_blocking[num_threads_active - 1] = \n");
2029 source.append(
" s_cl_blocking[num_threads_compaction - 1]; \n");
2030 source.append(
" } \n");
2031 source.append(
" } \n");
2041 template <
typename StringType>
2044 source.append(
" \n");
2045 source.append(
" void \n");
2046 source.append(
" scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int n, \n");
2047 source.append(
" const unsigned int num_threads_active, \n");
2048 source.append(
" const unsigned int num_threads_compaction, \n");
2049 source.append(
" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
2050 source.append(
" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper \n");
2051 source.append(
" ) \n");
2052 source.append(
" { \n");
2053 source.append(
" uint glb_id = get_global_id(0); \n");
2054 source.append(
" uint grp_id = get_group_id(0); \n");
2055 source.append(
" uint grp_nm = get_num_groups(0); \n");
2056 source.append(
" uint lcl_id = get_local_id(0); \n");
2057 source.append(
" uint lcl_sz = get_local_size(0); \n");
2065 source.append(
" unsigned int offset = 1; \n");
2068 source.append(
" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
2069 source.append(
" { \n");
2071 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2073 source.append(
" if (tid < d) \n");
2074 source.append(
" { \n");
2076 source.append(
" unsigned int ai = offset*(2*tid+1); \n");
2077 source.append(
" unsigned int bi = offset*(2*tid+2)-1; \n");
2079 source.append(
" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; \n");
2080 source.append(
" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1]; \n");
2088 source.append(
" if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) \n");
2089 source.append(
" { \n");
2092 source.append(
" if (s_cl_helper[ai - 1] == 1) \n");
2093 source.append(
" { \n");
2095 source.append(
" s_cl_helper[bi] = 1; \n");
2096 source.append(
" } \n");
2097 source.append(
" else if (s_cl_helper[bi] == 1) \n");
2098 source.append(
" { \n");
2100 source.append(
" s_cl_helper[ai - 1] = 1; \n");
2101 source.append(
" } \n");
2102 source.append(
" else \n");
2103 source.append(
" { \n");
2105 source.append(
" unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1]; \n");
2107 source.append(
" if (temp > (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2)) \n");
2108 source.append(
" { \n");
2111 source.append(
" s_cl_helper[ai - 1] = 1; \n");
2112 source.append(
" s_cl_helper[bi] = 1; \n");
2113 source.append(
" } \n");
2114 source.append(
" else \n");
2115 source.append(
" { \n");
2117 source.append(
" s_cl_blocking[bi] = temp; \n");
2118 source.append(
" s_cl_blocking[ai - 1] = 0; \n");
2119 source.append(
" } \n");
2120 source.append(
" } \n");
2121 source.append(
" } \n");
2122 source.append(
" } \n");
2123 source.append(
" offset <<= 1; \n");
2124 source.append(
" } \n");
2129 source.append(
" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
2130 source.append(
" { \n");
2131 source.append(
" offset >>= 1; \n");
2132 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2134 source.append(
" if (tid < (d-1)) \n");
2135 source.append(
" { \n");
2136 source.append(
" unsigned int ai = offset*(tid+1) - 1; \n");
2137 source.append(
" unsigned int bi = ai + (offset >> 1); \n");
2138 source.append(
" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; \n");
2139 source.append(
" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; \n");
2140 source.append(
" } \n");
2141 source.append(
" } \n");
2142 source.append(
" } \n");
2158 template <
typename StringType>
2161 source.append(
" __kernel \n");
2162 source.append(
" void \n");
2163 source.append(
" bisectKernelLarge(__global "); source.append(numeric_string); source.append(
" *g_d, \n");
2164 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_s, \n");
2165 source.append(
" const unsigned int n, \n");
2166 source.append(
" const "); source.append(numeric_string); source.append(
" lg, \n");
2167 source.append(
" const "); source.append(numeric_string); source.append(
" ug, \n");
2168 source.append(
" const unsigned int lg_eig_count, \n");
2169 source.append(
" const unsigned int ug_eig_count, \n");
2170 source.append(
" "); source.append(numeric_string); source.append(
" epsilon, \n");
2171 source.append(
" __global unsigned int *g_num_one, \n");
2172 source.append(
" __global unsigned int *g_num_blocks_mult, \n");
2173 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_one, \n");
2174 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_one, \n");
2175 source.append(
" __global unsigned int *g_pos_one, \n");
2176 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_left_mult, \n");
2177 source.append(
" __global "); source.append(numeric_string); source.append(
" *g_right_mult, \n");
2178 source.append(
" __global unsigned int *g_left_count_mult, \n");
2179 source.append(
" __global unsigned int *g_right_count_mult, \n");
2180 source.append(
" __global unsigned int *g_blocks_mult, \n");
2181 source.append(
" __global unsigned int *g_blocks_mult_sum \n");
2182 source.append(
" ) \n");
2183 source.append(
" { \n");
2184 source.append(
" g_s = g_s + 1; \n");
2185 source.append(
" uint glb_id = get_global_id(0); \n");
2186 source.append(
" uint grp_id = get_group_id(0); \n");
2187 source.append(
" uint grp_nm = get_num_groups(0); \n");
2188 source.append(
" uint lcl_id = get_local_id(0); \n");
2189 source.append(
" uint lcl_sz = get_local_size(0); \n");
2191 source.append(
" const unsigned int tid = lcl_id; \n");
2195 source.append(
" __local "); source.append(numeric_string); source.append(
" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2196 source.append(
" __local "); source.append(numeric_string); source.append(
" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2200 source.append(
" __local unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2201 source.append(
" __local unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2204 source.append(
" __local unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2209 source.append(
" __local unsigned int compact_second_chunk; \n");
2211 source.append(
" __local unsigned int all_threads_converged; \n");
2214 source.append(
" __local unsigned int num_threads_active; \n");
2217 source.append(
" __local unsigned int num_threads_compaction; \n");
2220 source.append(
" __local unsigned short *s_compaction_list_exc = s_compaction_list + 1; \n");
2224 source.append(
" "); source.append(numeric_string); source.append(
" left = 0.0f; \n");
2225 source.append(
" "); source.append(numeric_string); source.append(
" right = 0.0f; \n");
2226 source.append(
" unsigned int left_count = 0; \n");
2227 source.append(
" unsigned int right_count = 0; \n");
2229 source.append(
" "); source.append(numeric_string); source.append(
" mid = 0.0f; \n");
2231 source.append(
" unsigned int mid_count = 0; \n");
2233 source.append(
" unsigned int is_active_second = 0; \n");
2236 source.append(
" s_compaction_list[tid] = 0; \n");
2237 source.append(
" s_left[tid] = 0; \n");
2238 source.append(
" s_right[tid] = 0; \n");
2239 source.append(
" s_left_count[tid] = 0; \n");
2240 source.append(
" s_right_count[tid] = 0; \n");
2242 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2245 source.append(
" if (0 == tid) \n");
2246 source.append(
" { \n");
2248 source.append(
" s_left[0] = lg; \n");
2249 source.append(
" s_right[0] = ug; \n");
2250 source.append(
" s_left_count[0] = lg_eig_count; \n");
2251 source.append(
" s_right_count[0] = ug_eig_count; \n");
2253 source.append(
" compact_second_chunk = 0; \n");
2254 source.append(
" num_threads_active = 1; \n");
2256 source.append(
" num_threads_compaction = 1; \n");
2258 source.append(
" all_threads_converged = 1; \n");
2259 source.append(
" } \n");
2261 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2267 source.append(
" while( true ) \n");
2268 source.append(
" { \n");
2269 source.append(
" s_compaction_list[tid] = 0; \n");
2270 source.append(
" s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
2271 source.append(
" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
2272 source.append(
" subdivideActiveIntervalShort(tid, s_left, s_right, s_left_count, s_right_count, \n");
2273 source.append(
" num_threads_active, \n");
2274 source.append(
" &left, &right, &left_count, &right_count, \n");
2275 source.append(
" &mid, &all_threads_converged); \n");
2277 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2280 source.append(
" if (1 == all_threads_converged) \n");
2281 source.append(
" { \n");
2282 source.append(
" break; \n");
2283 source.append(
" } \n");
2290 source.append(
" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
2291 source.append(
" mid, lcl_id, \n");
2292 source.append(
" num_threads_active, \n");
2293 source.append(
" s_left, s_right, \n");
2294 source.append(
" (left == right)); \n");
2296 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2308 source.append(
" if (tid < num_threads_active) \n");
2309 source.append(
" { \n");
2311 source.append(
" if (left != right) \n");
2312 source.append(
" { \n");
2315 source.append(
" storeNonEmptyIntervalsLarge(tid, num_threads_active, \n");
2316 source.append(
" s_left, s_right, \n");
2317 source.append(
" s_left_count, s_right_count, \n");
2318 source.append(
" left, mid, right, \n");
2319 source.append(
" left_count, mid_count, right_count, \n");
2320 source.append(
" epsilon, &compact_second_chunk, \n");
2321 source.append(
" s_compaction_list_exc, \n");
2322 source.append(
" &is_active_second); \n");
2323 source.append(
" } \n");
2324 source.append(
" else \n");
2325 source.append(
" { \n");
2330 source.append(
" s_left[tid] = left; \n");
2331 source.append(
" s_right[tid] = left; \n");
2332 source.append(
" s_left_count[tid] = left_count; \n");
2333 source.append(
" s_right_count[tid] = right_count; \n");
2335 source.append(
" is_active_second = 0; \n");
2336 source.append(
" } \n");
2337 source.append(
" } \n");
2340 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2345 source.append(
" if (compact_second_chunk > 0) \n");
2346 source.append(
" { \n");
2349 source.append(
" createIndicesCompactionShort(s_compaction_list_exc, num_threads_compaction); \n");
2350 source.append(
" } \n");
2351 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2352 source.append(
" \n");
2353 source.append(
" if (compact_second_chunk > 0) \n");
2354 source.append(
" { \n");
2355 source.append(
" compactIntervalsShort(s_left, s_right, s_left_count, s_right_count, \n");
2356 source.append(
" mid, right, mid_count, right_count, \n");
2357 source.append(
" s_compaction_list, num_threads_active, \n");
2358 source.append(
" is_active_second); \n");
2359 source.append(
" } \n");
2361 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2364 source.append(
" if (0 == tid) \n");
2365 source.append(
" { \n");
2368 source.append(
" num_threads_active += s_compaction_list[num_threads_active]; \n");
2369 source.append(
" num_threads_compaction = ceilPow2(num_threads_active); \n");
2371 source.append(
" compact_second_chunk = 0; \n");
2372 source.append(
" all_threads_converged = 1; \n");
2373 source.append(
" } \n");
2374 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2375 source.append(
" if (num_threads_compaction > lcl_sz) \n");
2376 source.append(
" { \n");
2377 source.append(
" break; \n");
2378 source.append(
" } \n");
2379 source.append(
" } \n");
2380 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2388 source.append(
" unsigned int left_count_2; \n");
2389 source.append(
" unsigned int right_count_2; \n");
2391 source.append(
" unsigned int tid_2 = tid + lcl_sz; \n");
2395 source.append(
" left_count = s_left_count[tid]; \n");
2396 source.append(
" right_count = s_right_count[tid]; \n");
2399 source.append(
" if (tid_2 < num_threads_active) \n");
2400 source.append(
" { \n");
2401 source.append(
" left_count_2 = s_left_count[tid_2]; \n");
2402 source.append(
" right_count_2 = s_right_count[tid_2]; \n");
2403 source.append(
" } \n");
2407 source.append(
" __local unsigned short *s_cl_one = s_left_count + 1; \n");
2408 source.append(
" __local unsigned short *s_cl_mult = s_right_count + 1; \n");
2412 source.append(
" __local unsigned short *s_cl_blocking = s_compaction_list_exc; \n");
2414 source.append(
" __local unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2416 source.append(
" if (0 == tid) \n");
2417 source.append(
" { \n");
2419 source.append(
" s_left_count[0] = 0; \n");
2420 source.append(
" s_right_count[0] = 0; \n");
2421 source.append(
" \n");
2422 source.append(
" } \n");
2424 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2427 source.append(
" unsigned int is_one_lambda = 0; \n");
2428 source.append(
" unsigned int is_one_lambda_2 = 0; \n");
2431 source.append(
" unsigned int multiplicity = right_count - left_count; \n");
2432 source.append(
" is_one_lambda = (1 == multiplicity); \n");
2434 source.append(
" s_cl_one[tid] = is_one_lambda; \n");
2435 source.append(
" s_cl_mult[tid] = (! is_one_lambda); \n");
2438 source.append(
" s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; \n");
2439 source.append(
" s_cl_helper[tid] = 0; \n");
2441 source.append(
" if (tid_2 < num_threads_active) \n");
2442 source.append(
" { \n");
2444 source.append(
" unsigned int multiplicity = right_count_2 - left_count_2; \n");
2445 source.append(
" is_one_lambda_2 = (1 == multiplicity); \n");
2447 source.append(
" s_cl_one[tid_2] = is_one_lambda_2; \n");
2448 source.append(
" s_cl_mult[tid_2] = (! is_one_lambda_2); \n");
2451 source.append(
" s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity; \n");
2452 source.append(
" s_cl_helper[tid_2] = 0; \n");
2453 source.append(
" } \n");
2454 source.append(
" else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1)) \n");
2455 source.append(
" { \n");
2458 source.append(
" s_cl_blocking[tid_2] = 0; \n");
2459 source.append(
" s_cl_helper[tid_2] = 0; \n");
2460 source.append(
" } \n");
2463 source.append(
" scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction, \n");
2464 source.append(
" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper); \n");
2465 source.append(
" \n");
2466 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2468 source.append(
" scanSumBlocks(tid, tid_2, num_threads_active, \n");
2469 source.append(
" num_threads_compaction, s_cl_blocking, s_cl_helper); \n");
2472 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2474 source.append(
" unsigned int c_block_iend = 0; \n");
2475 source.append(
" unsigned int c_block_iend_2 = 0; \n");
2476 source.append(
" unsigned int c_sum_block = 0; \n");
2477 source.append(
" unsigned int c_sum_block_2 = 0; \n");
2484 source.append(
" if (1 == s_cl_helper[tid]) \n");
2485 source.append(
" { \n");
2487 source.append(
" c_block_iend = s_cl_mult[tid] + 1; \n");
2488 source.append(
" c_sum_block = s_cl_blocking[tid]; \n");
2489 source.append(
" } \n");
2491 source.append(
" if (1 == s_cl_helper[tid_2]) \n");
2492 source.append(
" { \n");
2494 source.append(
" c_block_iend_2 = s_cl_mult[tid_2] + 1; \n");
2495 source.append(
" c_sum_block_2 = s_cl_blocking[tid_2]; \n");
2496 source.append(
" } \n");
2498 source.append(
" scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, \n");
2499 source.append(
" s_cl_blocking, s_cl_helper); \n");
2503 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2506 source.append(
" __local unsigned int num_blocks_mult; \n");
2507 source.append(
" __local unsigned int num_mult; \n");
2508 source.append(
" __local unsigned int offset_mult_lambda; \n");
2510 source.append(
" if (0 == tid) \n");
2511 source.append(
" { \n");
2513 source.append(
" num_blocks_mult = s_cl_blocking[num_threads_active - 1]; \n");
2514 source.append(
" offset_mult_lambda = s_cl_one[num_threads_active - 1]; \n");
2515 source.append(
" num_mult = s_cl_mult[num_threads_active - 1]; \n");
2517 source.append(
" *g_num_one = offset_mult_lambda; \n");
2518 source.append(
" *g_num_blocks_mult = num_blocks_mult; \n");
2519 source.append(
" } \n");
2521 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2523 source.append(
" "); source.append(numeric_string); source.append(
" left_2, right_2; \n");
2524 source.append(
" --s_cl_one; \n");
2525 source.append(
" --s_cl_mult; \n");
2526 source.append(
" --s_cl_blocking; \n");
2527 source.append(
" \n");
2528 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2529 source.append(
" compactStreamsFinal(tid, tid_2, num_threads_active, &offset_mult_lambda, \n");
2530 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
2531 source.append(
" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, \n");
2532 source.append(
" is_one_lambda, is_one_lambda_2, \n");
2533 source.append(
" &left, &right, &left_2, &right_2, \n");
2534 source.append(
" &left_count, &right_count, &left_count_2, &right_count_2, \n");
2535 source.append(
" c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2 \n");
2536 source.append(
" ); \n");
2538 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2541 source.append(
" if (0 == tid) \n");
2542 source.append(
" { \n");
2543 source.append(
" s_cl_blocking[num_blocks_mult] = num_mult; \n");
2544 source.append(
" s_cl_helper[0] = 0; \n");
2545 source.append(
" } \n");
2547 source.append(
" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2550 source.append(
" writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, \n");
2551 source.append(
" g_left_one, g_right_one, g_pos_one, \n");
2552 source.append(
" g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult, \n");
2553 source.append(
" s_left, s_right, s_left_count, s_right_count, \n");
2554 source.append(
" g_blocks_mult, g_blocks_mult_sum, \n");
2555 source.append(
" s_compaction_list, s_cl_helper, offset_mult_lambda); \n");
2556 source.append(
" \n");
2558 source.append(
" } \n");
2565 template <
class NumericT>
2578 static std::map<cl_context, bool> init_done;
2582 source.reserve(8192);
2584 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
2587 if (numeric_string ==
"float" || numeric_string ==
"double")
2632 #ifdef VIENNACL_BUILD_INFO
2633 std::cout <<
"Creating program " << prog_name << std::endl;
2635 ctx.add_program(source, prog_name);
2636 init_done[ctx.handle().get()] =
true;
2645 #endif // #ifndef _BISECT_KERNEL_LARGE_H_
void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_writeToGmem(StringType &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_bisect_kernel_bisectKernel(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_subdivideActiveInterval(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_floorPow2(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_bisectKernelLarge(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_createIndicesCompactionShort(StringType &source)
void generate_bisect_kernel_compactStreamsFinal(StringType &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_bisect_kernel_compactIntervals(StringType &source, std::string const &numeric_string)
Provides OpenCL-related utilities.
void generate_bisect_kernel_compactIntervalsShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalConvergedShort(StringType &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Common implementations shared by OpenCL-based operations.
static void apply(viennacl::ocl::context const &)
void generate_bisect_kernel_createIndicesCompaction(StringType &source)
static std::string program_name()
const OCL_TYPE & get() const
void generate_bisect_kernel_ceilPow2(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType &source)
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeMidpoint(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeNonEmptyIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_config(StringType &source)
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalShort(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_bisect_kernel_scanSumBlocks(StringType &source)
void generate_bisect_kernel_scanInitial(StringType &source)
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_bisect_kernel_storeInterval(StringType &source, std::string const &numeric_string)
Main kernel class for the generation of the bisection kernels and utilities.
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType &source, std::string const &numeric_string)
void generate_bisect_kernel_storeIntervalConverged(StringType &source, std::string const &numeric_string)