ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
bisect.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
2 #define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
32 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/ocl/kernel.hpp"
35 #include "viennacl/ocl/utils.hpp"
36 
38 
39 // declaration, forward
40 
41 namespace viennacl
42 {
43 namespace linalg
44 {
45 namespace opencl
46 {
47 namespace kernels
48 {
49  template <typename StringType>
50  void generate_bisect_kernel_config(StringType & source)
51  {
52  /* Global configuration parameter */
53  source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n");
54  source.append(" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n");
55  source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n");
56  source.append(" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n");
57 
58  }
59 
61  // Compute the next lower power of two of n
62  // n number for which next higher power of two is seeked
64 
65  template <typename StringType>
66  void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
67  {
68  source.append(" \n");
69  source.append(" inline int \n");
70  source.append(" floorPow2(int n) \n");
71  source.append(" { \n");
72  source.append(" uint glb_id = get_global_id(0); \n");
73  source.append(" uint grp_id = get_group_id(0); \n");
74  source.append(" uint grp_nm = get_num_groups(0); \n");
75  source.append(" uint lcl_id = get_local_id(0); \n");
76  source.append(" uint lcl_sz = get_local_size(0); \n");
77 
78 
79  // early out if already power of two
80  source.append(" if (0 == (n & (n-1))) \n");
81  source.append(" { \n");
82  source.append(" return n; \n");
83  source.append(" } \n");
84 
85  source.append(" int exp; \n");
86  source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
87  source.append(" return (1 << (exp - 1)); \n");
88  source.append(" } \n");
89 
90  }
91 
92 
94  // Compute the next higher power of two of n
95  // n number for which next higher power of two is seeked
97 
98  template <typename StringType>
99  void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
100  {
101  source.append(" \n");
102  source.append(" inline int \n");
103  source.append(" ceilPow2(int n) \n");
104  source.append(" { \n");
105  source.append(" uint glb_id = get_global_id(0); \n");
106  source.append(" uint grp_id = get_group_id(0); \n");
107  source.append(" uint grp_nm = get_num_groups(0); \n");
108  source.append(" uint lcl_id = get_local_id(0); \n");
109  source.append(" uint lcl_sz = get_local_size(0); \n");
110 
111 
112  // early out if already power of two
113  source.append(" if (0 == (n & (n-1))) \n");
114  source.append(" { \n");
115  source.append(" return n; \n");
116  source.append(" } \n");
117 
118  source.append(" int exp; \n");
119  source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
120  source.append(" return (1 << exp); \n");
121  source.append(" } \n");
122  }
123 
124 
126  // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible
127  //
128  // left left / lower limit of interval
129  // right right / upper limit of interval
131 
132  template <typename StringType>
133  void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
134  {
135  source.append(" \n");
136  source.append(" inline "); source.append(numeric_string); source.append(" \n");
137  source.append(" computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n");
138  source.append(" const "); source.append(numeric_string); source.append(" right) \n");
139  source.append(" { \n");
140  source.append(" uint glb_id = get_global_id(0); \n");
141  source.append(" uint grp_id = get_group_id(0); \n");
142  source.append(" uint grp_nm = get_num_groups(0); \n");
143  source.append(" uint lcl_id = get_local_id(0); \n");
144  source.append(" uint lcl_sz = get_local_size(0); \n");
145  source.append(" "); source.append(numeric_string); source.append(" mid; \n");
146 
147  source.append(" if (sign(left) == sign(right)) \n");
148  source.append(" { \n");
149  source.append(" mid = left + (right - left) * 0.5f; \n");
150  source.append(" } \n");
151  source.append(" else \n");
152  source.append(" { \n");
153  source.append(" mid = (left + right) * 0.5f; \n");
154  source.append(" } \n");
155 
156  source.append(" return mid; \n");
157  source.append(" } \n");
158 
159  }
160 
161 
163  // Check if interval converged and store appropriately
164  //
165  // addr address where to store the information of the interval
166  // s_left shared memory storage for left interval limits
167  // s_right shared memory storage for right interval limits
168  // s_left_count shared memory storage for number of eigenvalues less than left interval limits
169  // s_right_count shared memory storage for number of eigenvalues less than right interval limits
170  // left lower limit of interval
171  // right upper limit of interval
172  // left_count eigenvalues less than \a left
173  // right_count eigenvalues less than \a right
174  // precision desired precision for eigenvalues
176 
177  template<typename StringType>
178  void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
179  {
180  source.append(" \n");
181  source.append(" void \n");
182  source.append(" storeInterval(unsigned int addr, \n");
183  source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
184  source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
185  source.append(" __local unsigned int * s_left_count, \n");
186  source.append(" __local unsigned int * s_right_count, \n");
187  source.append(" "); source.append(numeric_string); source.append(" left, \n");
188  source.append(" "); source.append(numeric_string); source.append(" right, \n");
189  source.append(" unsigned int left_count, \n");
190  source.append(" unsigned int right_count, \n");
191  source.append(" "); source.append(numeric_string); source.append(" precision) \n");
192  source.append(" { \n");
193  source.append(" uint glb_id = get_global_id(0); \n");
194  source.append(" uint grp_id = get_group_id(0); \n");
195  source.append(" uint grp_nm = get_num_groups(0); \n");
196  source.append(" uint lcl_id = get_local_id(0); \n");
197  source.append(" uint lcl_sz = get_local_size(0); \n");
198 
199  source.append(" s_left_count[addr] = left_count; \n");
200  source.append(" s_right_count[addr] = right_count; \n");
201 
202  // check if interval converged
203  source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
204  source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");
205 
206  source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
207  source.append(" { \n");
208  // compute mid point
209  source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");
210 
211  // mark as converged
212  source.append(" s_left[addr] = lambda; \n");
213  source.append(" s_right[addr] = lambda; \n");
214  source.append(" } \n");
215  source.append(" else \n");
216  source.append(" { \n");
217 
218  // store current limits
219  source.append(" s_left[addr] = left; \n");
220  source.append(" s_right[addr] = right; \n");
221  source.append(" } \n");
222 
223  source.append(" } \n");
224 
225  }
226 
227  template<typename StringType>
228  void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
229  {
230  source.append(" \n");
231  source.append(" void \n");
232  source.append(" storeIntervalShort(unsigned int addr, \n");
233  source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
234  source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
235  source.append(" __local unsigned short * s_left_count, \n");
236  source.append(" __local unsigned short * s_right_count, \n");
237  source.append(" "); source.append(numeric_string); source.append(" left, \n");
238  source.append(" "); source.append(numeric_string); source.append(" right, \n");
239  source.append(" unsigned int left_count, \n");
240  source.append(" unsigned int right_count, \n");
241  source.append(" "); source.append(numeric_string); source.append(" precision) \n");
242  source.append(" { \n");
243  source.append(" uint glb_id = get_global_id(0); \n");
244  source.append(" uint grp_id = get_group_id(0); \n");
245  source.append(" uint grp_nm = get_num_groups(0); \n");
246  source.append(" uint lcl_id = get_local_id(0); \n");
247  source.append(" uint lcl_sz = get_local_size(0); \n");
248 
249  source.append(" s_left_count[addr] = left_count; \n");
250  source.append(" s_right_count[addr] = right_count; \n");
251 
252  // check if interval converged
253  source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
254  source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");
255 
256  source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
257  source.append(" { \n");
258  // compute mid point
259  source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");
260 
261  // mark as converged
262  source.append(" s_left[addr] = lambda; \n");
263  source.append(" s_right[addr] = lambda; \n");
264  source.append(" } \n");
265  source.append(" else \n");
266  source.append(" { \n");
267 
268  // store current limits
269  source.append(" s_left[addr] = left; \n");
270  source.append(" s_right[addr] = right; \n");
271  source.append(" } \n");
272 
273  source.append(" } \n");
274 
275 
276  }
277 
278 
280  // Compute number of eigenvalues that are smaller than x given a symmetric,
281  // real, and tridiagonal matrix
282  //
283  // g_d diagonal elements stored in global memory
284  // g_s superdiagonal elements stored in global memory
285  // n size of matrix
286  // x value for which the number of eigenvalues that are smaller is sought
287  // tid thread identified (e.g. threadIdx.x or gtid)
288  // num_intervals_active number of active intervals / threads that currently process an interval
289  // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
290  // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
291  // converged flag if the current thread is already converged (that is count does not have to be computed)
293 
294  template <typename StringType>
295  void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
296  {
297  source.append(" \n");
298  source.append(" inline unsigned int \n");
299  source.append(" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
300  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
301  source.append(" const unsigned int n, \n");
302  source.append(" const "); source.append(numeric_string); source.append(" x, \n");
303  source.append(" const unsigned int tid, \n");
304  source.append(" const unsigned int num_intervals_active, \n");
305  source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
306  source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
307  source.append(" unsigned int converged \n");
308  source.append(" ) \n");
309  source.append(" { \n");
310  source.append(" uint glb_id = get_global_id(0); \n");
311  source.append(" uint grp_id = get_group_id(0); \n");
312  source.append(" uint grp_nm = get_num_groups(0); \n");
313  source.append(" uint lcl_id = get_local_id(0); \n");
314  source.append(" uint lcl_sz = get_local_size(0); \n");
315 
316 
317  source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
318  source.append(" unsigned int count = 0; \n");
319 
320  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
321 
322  // read data into shared memory
323  source.append(" if (lcl_id < n) \n");
324  source.append(" { \n");
325  source.append(" s_d[lcl_id] = *(g_d + lcl_id); \n");
326  source.append(" s_s[lcl_id] = *(g_s + lcl_id - 1); \n");
327  source.append(" } \n");
328 
329  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
330 
331  // perform loop only for active threads
332  source.append(" if ((tid < num_intervals_active) && (0 == converged)) \n");
333  source.append(" { \n");
334 
335  // perform (optimized) Gaussian elimination to determine the number
336  // of eigenvalues that are smaller than n
337  source.append(" for (unsigned int k = 0; k < n; ++k) \n");
338  source.append(" { \n");
339  source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
340  source.append(" count += (delta < 0) ? 1 : 0; \n");
341  source.append(" } \n");
342 
343  source.append(" } \n"); // end if thread currently processing an interval
344 
345  source.append(" return count; \n");
346  source.append(" } \n");
347 
348  }
349 
350 
352  // Compute number of eigenvalues that are smaller than x given a symmetric,
353  // real, and tridiagonal matrix
354  //
355  // g_d diagonal elements stored in global memory
356  // g_s superdiagonal elements stored in global memory
357  // n size of matrix
358  // x value for which the number of eigenvalues that are smaller is seeked
359  // tid thread identified (e.g. threadIdx.x or gtid)
360  // num_intervals_active number of active intervals / threads that currently process an interval
361  // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
362  // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
363  // converged flag if the current thread is already converged (that is count does not have to be computed)
365 
366  template <typename StringType>
367  void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
368  {
369  source.append(" \n");
370  source.append(" inline unsigned int \n");
371  source.append(" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n");
372  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
373  source.append(" const unsigned int n, \n");
374  source.append(" const "); source.append(numeric_string); source.append(" x, \n");
375  source.append(" const unsigned int tid, \n");
376  source.append(" const unsigned int num_intervals_active, \n");
377  source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
378  source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
379  source.append(" unsigned int converged \n");
380  source.append(" ) \n");
381  source.append(" { \n");
382  source.append(" uint glb_id = get_global_id(0); \n");
383  source.append(" uint grp_id = get_group_id(0); \n");
384  source.append(" uint grp_nm = get_num_groups(0); \n");
385  source.append(" uint lcl_id = get_local_id(0); \n");
386  source.append(" uint lcl_sz = get_local_size(0); \n");
387 
388  source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
389  source.append(" unsigned int count = 0; \n");
390 
391  source.append(" unsigned int rem = n; \n");
392 
393  // do until whole diagonal and superdiagonal has been loaded and processed
394  source.append(" for (unsigned int i = 0; i < n; i += lcl_sz) \n");
395  source.append(" { \n");
396 
397  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
398 
399  // read new chunk of data into shared memory
400  source.append(" if ((i + lcl_id) < n) \n");
401  source.append(" { \n");
402 
403  source.append(" s_d[lcl_id] = *(g_d + i + lcl_id); \n");
404  source.append(" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n");
405  source.append(" } \n");
406 
407  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
408 
409 
410  source.append(" if (tid < num_intervals_active) \n");
411  source.append(" { \n");
412 
413  // perform (optimized) Gaussian elimination to determine the number
414  // of eigenvalues that are smaller than n
415  source.append(" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n");
416  source.append(" { \n");
417  source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
418  // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
419  source.append(" count += (delta < 0) ? 1 : 0; \n");
420  source.append(" } \n");
421 
422  source.append(" } \n"); // end if thread currently processing an interval
423 
424  source.append(" rem -= lcl_sz; \n");
425  source.append(" } \n");
426 
427  source.append(" return count; \n");
428  source.append(" } \n");
429 
430 
431  }
432 
434  // Store all non-empty intervals resulting from the subdivision of the interval
435  // currently processed by the thread
436  //
437  // addr base address for storing intervals
438  // num_threads_active number of threads / intervals in current sweep
439  // s_left shared memory storage for left interval limits
440  // s_right shared memory storage for right interval limits
441  // s_left_count shared memory storage for number of eigenvalues less than left interval limits
442  // s_right_count shared memory storage for number of eigenvalues less than right interval limits
443  // left lower limit of interval
444  // mid midpoint of interval
445  // right upper limit of interval
446  // left_count eigenvalues less than \a left
447  // mid_count eigenvalues less than \a mid
448  // right_count eigenvalues less than \a right
449  // precision desired precision for eigenvalues
450  // compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction
451  // s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
452  // is_active_interval mark is thread has a second non-empty child interval
454 
455  template<typename StringType>
456  void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
457  {
458  source.append(" \n");
459  source.append(" void \n");
460  source.append(" storeNonEmptyIntervals(unsigned int addr, \n");
461  source.append(" const unsigned int num_threads_active, \n");
462  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
463  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
464  source.append(" __local unsigned int *s_left_count, \n");
465  source.append(" __local unsigned int *s_right_count, \n");
466  source.append(" "); source.append(numeric_string); source.append(" left, \n ");
467  source.append(" "); source.append(numeric_string); source.append(" mid, \n");
468  source.append(" "); source.append(numeric_string); source.append(" right,\n");
469  source.append(" const unsigned int left_count, \n");
470  source.append(" const unsigned int mid_count, \n");
471  source.append(" const unsigned int right_count, \n");
472  source.append(" "); source.append(numeric_string); source.append(" precision, \n");
473  source.append(" __local unsigned int *compact_second_chunk, \n");
474  source.append(" __local unsigned int *s_compaction_list_exc, \n");
475  source.append(" unsigned int *is_active_second) \n");
476  source.append(" { \n");
477  source.append(" uint glb_id = get_global_id(0); \n");
478  source.append(" uint grp_id = get_group_id(0); \n");
479  source.append(" uint grp_nm = get_num_groups(0); \n");
480  source.append(" uint lcl_id = get_local_id(0); \n");
481  source.append(" uint lcl_sz = get_local_size(0); \n");
482 
483  // check if both child intervals are valid
484  source.append(" \n");
485  source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
486  source.append(" { \n");
487 
488  // store the left interval
489  source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
490  source.append(" left, mid, left_count, mid_count, precision); \n");
491 
492  // mark that a second interval has been generated, only stored after
493  // stream compaction of second chunk
494  source.append(" *is_active_second = 1; \n");
495  source.append(" s_compaction_list_exc[lcl_id] = 1; \n");
496  source.append(" *compact_second_chunk = 1; \n");
497  source.append(" } \n");
498  source.append(" else \n");
499  source.append(" { \n");
500 
501  // only one non-empty child interval
502 
503  // mark that no second child
504  source.append(" *is_active_second = 0; \n");
505  source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
506 
507  // store the one valid child interval
508  source.append(" if (left_count != mid_count) \n");
509  source.append(" { \n");
510  source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
511  source.append(" left, mid, left_count, mid_count, precision); \n");
512  source.append(" } \n");
513  source.append(" else \n");
514  source.append(" { \n");
515  source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
516  source.append(" mid, right, mid_count, right_count, precision); \n");
517  source.append(" } \n");
518 
519  source.append(" } \n");
520  source.append(" } \n");
521 
522  }
523 
524 
529 
530  template <typename StringType>
531  void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
532  {
533  source.append(" \n");
534  source.append(" void \n");
535  source.append(" storeNonEmptyIntervalsLarge(unsigned int addr, \n");
536  source.append(" const unsigned int num_threads_active, \n");
537  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
538  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
539  source.append(" __local unsigned short *s_left_count, \n");
540  source.append(" __local unsigned short *s_right_count, \n");
541  source.append(" "); source.append(numeric_string); source.append(" left, \n ");
542  source.append(" "); source.append(numeric_string); source.append(" mid, \n");
543  source.append(" "); source.append(numeric_string); source.append(" right,\n");
544  source.append(" const unsigned int left_count, \n");
545  source.append(" const unsigned int mid_count, \n");
546  source.append(" const unsigned int right_count, \n");
547  source.append(" "); source.append(numeric_string); source.append(" epsilon, \n");
548  source.append(" __local unsigned int *compact_second_chunk, \n");
549  source.append(" __local unsigned short *s_compaction_list, \n");
550  source.append(" unsigned int *is_active_second) \n");
551  source.append(" { \n");
552  source.append(" uint glb_id = get_global_id(0); \n");
553  source.append(" uint grp_id = get_group_id(0); \n");
554  source.append(" uint grp_nm = get_num_groups(0); \n");
555  source.append(" uint lcl_id = get_local_id(0); \n");
556  source.append(" uint lcl_sz = get_local_size(0); \n");
557 
558  // check if both child intervals are valid
559  source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
560  source.append(" { \n");
561 
562  source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
563  source.append(" left, mid, left_count, mid_count, epsilon); \n");
564 
565  source.append(" *is_active_second = 1; \n");
566  source.append(" s_compaction_list[lcl_id] = 1; \n");
567  source.append(" *compact_second_chunk = 1; \n");
568  source.append(" } \n");
569  source.append(" else \n");
570  source.append(" { \n");
571 
572  // only one non-empty child interval
573 
574  // mark that no second child
575  source.append(" *is_active_second = 0; \n");
576  source.append(" s_compaction_list[lcl_id] = 0; \n");
577 
578  // store the one valid child interval
579  source.append(" if (left_count != mid_count) \n");
580  source.append(" { \n");
581  source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
582  source.append(" left, mid, left_count, mid_count, epsilon); \n");
583  source.append(" } \n");
584  source.append(" else \n");
585  source.append(" { \n");
586  source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
587  source.append(" mid, right, mid_count, right_count, epsilon); \n");
588  source.append(" } \n");
589  source.append(" } \n");
590  source.append(" } \n");
591  }
592 
594  // Create indices for compaction, that is process \a s_compaction_list_exc
595  // which is 1 for intervals that generated a second child and 0 otherwise
596  // and create for each of the non-zero elements the index where the new
597  // interval belongs to in a compact representation of all generated second children
598  //
599  // s_compaction_list_exc list containing the flags which threads generated two children
600  // num_threads_compaction number of threads to employ for compaction
602 
603  template<typename StringType>
605  {
606  source.append(" \n");
607  source.append(" void \n");
608  source.append(" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n");
609  source.append(" unsigned int num_threads_compaction) \n");
610  source.append(" { \n");
611  source.append(" uint glb_id = get_global_id(0); \n");
612  source.append(" uint grp_id = get_group_id(0); \n");
613  source.append(" uint grp_nm = get_num_groups(0); \n");
614  source.append(" uint lcl_id = get_local_id(0); \n");
615  source.append(" uint lcl_sz = get_local_size(0); \n");
616 
617 
618  source.append(" unsigned int offset = 1; \n");
619  source.append(" const unsigned int tid = lcl_id; \n");
620  // if(tid == 0)
621  // printf("num_threads_compaction = %u\n", num_threads_compaction);
622 
623  // higher levels of scan tree
624  source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
625  source.append(" { \n");
626 
627  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
628 
629  source.append(" if (tid < d) \n");
630  source.append(" { \n");
631 
632  source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
633  source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
634  source.append(" \n");
635  source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
636  source.append(" + s_compaction_list_exc[ai]; \n");
637  source.append(" } \n");
638 
639  source.append(" offset <<= 1; \n");
640  source.append(" } \n");
641 
642  // traverse down tree: first down to level 2 across
643  source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
644  source.append(" { \n");
645 
646  source.append(" offset >>= 1; \n");
647  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
648 
649  source.append(" if (tid < (d-1)) \n");
650  source.append(" { \n");
651 
652  source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
653  source.append(" unsigned int bi = ai + (offset >> 1); \n");
654 
655  source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
656  source.append(" + s_compaction_list_exc[ai]; \n");
657  source.append(" } \n");
658  source.append(" } \n");
659 
660  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
661 
662  source.append(" } \n");
663  }
664 
665 
666  template<typename StringType>
668  {
669  source.append(" \n");
670  source.append(" void \n");
671  source.append(" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
672  source.append(" unsigned int num_threads_compaction) \n");
673  source.append(" { \n");
674  source.append(" uint glb_id = get_global_id(0); \n");
675  source.append(" uint grp_id = get_group_id(0); \n");
676  source.append(" uint grp_nm = get_num_groups(0); \n");
677  source.append(" uint lcl_id = get_local_id(0); \n");
678  source.append(" uint lcl_sz = get_local_size(0); \n");
679 
680 
681  source.append(" unsigned int offset = 1; \n");
682  source.append(" const unsigned int tid = lcl_id; \n");
683 
684  // higher levels of scan tree
685  source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
686  source.append(" { \n");
687 
688  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
689 
690  source.append(" if (tid < d) \n");
691  source.append(" { \n");
692 
693  source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
694  source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
695  source.append(" \n");
696  source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
697  source.append(" + s_compaction_list_exc[ai]; \n");
698  source.append(" } \n");
699 
700  source.append(" offset <<= 1; \n");
701  source.append(" } \n");
702 
703  // traverse down tree: first down to level 2 across
704  source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
705  source.append(" { \n");
706 
707  source.append(" offset >>= 1; \n");
708  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
709 
710  source.append(" if (tid < (d-1)) \n");
711  source.append(" { \n");
712 
713  source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
714  source.append(" unsigned int bi = ai + (offset >> 1); \n");
715 
716  source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
717  source.append(" + s_compaction_list_exc[ai]; \n");
718  source.append(" } \n");
719  source.append(" } \n");
720 
721  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
722 
723  source.append(" } \n");
724  }
725 
727  // Perform stream compaction for second child intervals
728  //
729  // s_left shared memory storage for left interval limits
730  // s_right shared memory storage for right interval limits
731  // s_left_count shared memory storage for number of eigenvalues less than left interval limits
732  // s_right_count shared memory storage for number of eigenvalues less than right interval limits
733  // mid midpoint of current interval (left of new interval)
734  // right upper limit of interval
735  // mid_count eigenvalues less than \a mid
736  // s_compaction_list list containing the indices where the data has to be stored
737  // num_threads_active number of active threads / intervals
738  // is_active_interval mark is thread has a second non-empty child interval
740 
741 
742  template<typename StringType>
743  void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
744  {
745  source.append(" \n");
746  source.append(" void \n");
747  source.append(" compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left, \n");
748  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
749  source.append(" __local unsigned int *s_left_count, \n");
750  source.append(" __local unsigned int *s_right_count, \n");
751  source.append(" "); source.append(numeric_string); source.append(" mid, \n");
752  source.append(" "); source.append(numeric_string); source.append(" right, \n");
753  source.append(" unsigned int mid_count, unsigned int right_count, \n");
754  source.append(" __local unsigned int *s_compaction_list, \n");
755  source.append(" unsigned int num_threads_active, \n");
756  source.append(" unsigned int is_active_second) \n");
757  source.append(" { \n");
758  source.append(" uint glb_id = get_global_id(0); \n");
759  source.append(" uint grp_id = get_group_id(0); \n");
760  source.append(" uint grp_nm = get_num_groups(0); \n");
761  source.append(" uint lcl_id = get_local_id(0); \n");
762  source.append(" uint lcl_sz = get_local_size(0); \n");
763 
764  source.append(" const unsigned int tid = lcl_id; \n");
765 
766  // perform compaction / copy data for all threads where the second
767  // child is not dead
768  source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
769  source.append(" { \n");
770  source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
771  source.append(" s_left[addr_w] = mid; \n");
772  source.append(" s_right[addr_w] = right; \n");
773  source.append(" s_left_count[addr_w] = mid_count; \n");
774  source.append(" s_right_count[addr_w] = right_count; \n");
775  source.append(" } \n");
776  source.append(" } \n");
777  }
778 
779 
780 
781 
782  template<typename StringType>
783  void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
784  {
785  source.append(" \n");
786  source.append(" void \n");
787  source.append(" compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
788  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
789  source.append(" __local unsigned short *s_left_count, \n");
790  source.append(" __local unsigned short *s_right_count, \n");
791  source.append(" "); source.append(numeric_string); source.append(" mid, \n");
792  source.append(" "); source.append(numeric_string); source.append(" right, \n");
793  source.append(" unsigned int mid_count, unsigned int right_count, \n");
794  source.append(" __local unsigned short *s_compaction_list, \n");
795  source.append(" unsigned int num_threads_active, \n");
796  source.append(" unsigned int is_active_second) \n");
797  source.append(" { \n");
798  source.append(" uint glb_id = get_global_id(0); \n");
799  source.append(" uint grp_id = get_group_id(0); \n");
800  source.append(" uint grp_nm = get_num_groups(0); \n");
801  source.append(" uint lcl_id = get_local_id(0); \n");
802  source.append(" uint lcl_sz = get_local_size(0); \n");
803 
804  source.append(" const unsigned int tid = lcl_id; \n");
805 
806  // perform compaction / copy data for all threads where the second
807  // child is not dead
808  source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
809  source.append(" { \n");
810  source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
811  source.append(" s_left[addr_w] = mid; \n");
812  source.append(" s_right[addr_w] = right; \n");
813  source.append(" s_left_count[addr_w] = mid_count; \n");
814  source.append(" s_right_count[addr_w] = right_count; \n");
815  source.append(" } \n");
816  source.append(" } \n");
817  }
818 
819 
820 
821  template<typename StringType>
822  void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
823  {
824  source.append(" \n");
825  source.append(" void \n");
826  source.append(" storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left, \n");
827  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
828  source.append(" __local unsigned int *s_left_count, \n");
829  source.append(" __local unsigned int *s_right_count, \n");
830  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
831  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
832  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
833  source.append(" unsigned int *left_count, \n");
834  source.append(" unsigned int *mid_count, \n");
835  source.append(" unsigned int *right_count, \n");
836  source.append(" __local unsigned int *s_compaction_list_exc, \n");
837  source.append(" __local unsigned int *compact_second_chunk, \n");
838  source.append(" const unsigned int num_threads_active, \n");
839  source.append(" unsigned int *is_active_second) \n");
840  source.append(" { \n");
841  source.append(" uint glb_id = get_global_id(0); \n");
842  source.append(" uint grp_id = get_group_id(0); \n");
843  source.append(" uint grp_nm = get_num_groups(0); \n");
844  source.append(" uint lcl_id = get_local_id(0); \n");
845  source.append(" uint lcl_sz = get_local_size(0); \n");
846 
847  source.append(" const unsigned int tid = lcl_id; \n");
848  source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
849  // check multiplicity of eigenvalue
850  source.append(" if (1 == multiplicity) \n");
851  source.append(" { \n");
852 
853  // just re-store intervals, simple eigenvalue
854  source.append(" s_left[tid] = *left; \n");
855  source.append(" s_right[tid] = *right; \n");
856  source.append(" s_left_count[tid] = *left_count; \n");
857  source.append(" s_right_count[tid] = *right_count; \n");
858  source.append(" \n");
859 
860  // mark that no second child / clear
861  source.append(" *is_active_second = 0; \n");
862  source.append(" s_compaction_list_exc[tid] = 0; \n");
863  source.append(" } \n");
864  source.append(" else \n");
865  source.append(" { \n");
866 
867  // number of eigenvalues after the split less than mid
868  source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");
869 
870  // store left interval
871  source.append(" s_left[tid] = *left; \n");
872  source.append(" s_right[tid] = *right; \n");
873  source.append(" s_left_count[tid] = *left_count; \n");
874  source.append(" s_right_count[tid] = *mid_count; \n");
875  source.append(" *mid = *left; \n");
876 
877  // mark that second child interval exists
878  source.append(" *is_active_second = 1; \n");
879  source.append(" s_compaction_list_exc[tid] = 1; \n");
880  source.append(" *compact_second_chunk = 1; \n");
881  source.append(" } \n");
882  source.append(" } \n");
883  }
884 
885 
886 
887 
888 
889  template<typename StringType>
890  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
891  {
892  source.append(" \n");
893  source.append(" void \n");
894  source.append(" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
895  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
896  source.append(" __local unsigned short *s_left_count, \n");
897  source.append(" __local unsigned short *s_right_count, \n");
898  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
899  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
900  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
901  source.append(" unsigned int *left_count, \n");
902  source.append(" unsigned int *mid_count, \n");
903  source.append(" unsigned int *right_count, \n");
904  source.append(" __local unsigned short *s_compaction_list_exc, \n");
905  source.append(" __local unsigned int *compact_second_chunk, \n");
906  source.append(" const unsigned int num_threads_active, \n");
907  source.append(" unsigned int *is_active_second) \n");
908  source.append(" { \n");
909  source.append(" uint glb_id = get_global_id(0); \n");
910  source.append(" uint grp_id = get_group_id(0); \n");
911  source.append(" uint grp_nm = get_num_groups(0); \n");
912  source.append(" uint lcl_id = get_local_id(0); \n");
913  source.append(" uint lcl_sz = get_local_size(0); \n");
914 
915  source.append(" const unsigned int tid = lcl_id; \n");
916  source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
917  // check multiplicity of eigenvalue
918  source.append(" if (1 == multiplicity) \n");
919  source.append(" { \n");
920 
921  // just re-store intervals, simple eigenvalue
922  source.append(" s_left[tid] = *left; \n");
923  source.append(" s_right[tid] = *right; \n");
924  source.append(" s_left_count[tid] = *left_count; \n");
925  source.append(" s_right_count[tid] = *right_count; \n");
926  source.append(" \n");
927 
928  // mark that no second child / clear
929  source.append(" *is_active_second = 0; \n");
930  source.append(" s_compaction_list_exc[tid] = 0; \n");
931  source.append(" } \n");
932  source.append(" else \n");
933  source.append(" { \n");
934 
935  // number of eigenvalues after the split less than mid
936  source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");
937 
938  // store left interval
939  source.append(" s_left[tid] = *left; \n");
940  source.append(" s_right[tid] = *right; \n");
941  source.append(" s_left_count[tid] = *left_count; \n");
942  source.append(" s_right_count[tid] = *mid_count; \n");
943  source.append(" *mid = *left; \n");
944 
945  // mark that second child interval exists
946  source.append(" *is_active_second = 1; \n");
947  source.append(" s_compaction_list_exc[tid] = 1; \n");
948  source.append(" *compact_second_chunk = 1; \n");
949  source.append(" } \n");
950  source.append(" } \n");
951  }
952 
954  // Subdivide interval if active and not already converged
955  //
956  // tid id of thread
957  // s_left shared memory storage for left interval limits
958  // s_right shared memory storage for right interval limits
959  // s_left_count shared memory storage for number of eigenvalues less than left interval limits
960  // s_right_count shared memory storage for number of eigenvalues less than right interval limits
961  // num_threads_active number of active threads in warp
962  // left lower limit of interval
963  // right upper limit of interval
964  // left_count eigenvalues less than \a left
965  // right_count eigenvalues less than \a right
966  // all_threads_converged shared memory flag if all threads are converged
968 
969 
970  template<typename StringType>
971  void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
972  {
973  source.append(" \n");
974  source.append(" void \n");
975  source.append(" subdivideActiveIntervalMulti(const unsigned int tid, \n");
976  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
977  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
978  source.append(" __local unsigned int *s_left_count, \n");
979  source.append(" __local unsigned int *s_right_count, \n");
980  source.append(" const unsigned int num_threads_active, \n");
981  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
982  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
983  source.append(" unsigned int *left_count, unsigned int *right_count, \n");
984  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
985  source.append(" __local unsigned int *all_threads_converged) \n");
986  source.append(" { \n");
987  source.append(" uint glb_id = get_global_id(0); \n");
988  source.append(" uint grp_id = get_group_id(0); \n");
989  source.append(" uint grp_nm = get_num_groups(0); \n");
990  source.append(" uint lcl_id = get_local_id(0); \n");
991  source.append(" uint lcl_sz = get_local_size(0); \n");
992 
993  // for all active threads
994  source.append(" if (tid < num_threads_active) \n");
995  source.append(" { \n");
996 
997  source.append(" *left = s_left[tid]; \n");
998  source.append(" *right = s_right[tid]; \n");
999  source.append(" *left_count = s_left_count[tid]; \n");
1000  source.append(" *right_count = s_right_count[tid]; \n");
1001 
1002  // check if thread already converged
1003  source.append(" if (*left != *right) \n");
1004  source.append(" { \n");
1005 
1006  source.append(" *mid = computeMidpoint(*left, *right); \n");
1007  source.append(" *all_threads_converged = 0; \n");
1008  source.append(" } \n");
1009  source.append(" else if ((*right_count - *left_count) > 1) \n");
1010  source.append(" { \n");
1011  // mark as not converged if multiple eigenvalues enclosed
1012  // duplicate interval in storeIntervalsConverged()
1013  source.append(" *all_threads_converged = 0; \n");
1014  source.append(" } \n");
1015 
1016  source.append(" } \n");
1017  // end for all active threads
1018  source.append(" } \n");
1019  }
1020 
1021 
1022  template<typename StringType>
1023  void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
1024  {
1025  source.append(" \n");
1026  source.append(" void \n");
1027  source.append(" subdivideActiveIntervalShort(const unsigned int tid, \n");
1028  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
1029  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
1030  source.append(" __local unsigned short *s_left_count, \n");
1031  source.append(" __local unsigned short *s_right_count, \n");
1032  source.append(" const unsigned int num_threads_active, \n");
1033  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
1034  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
1035  source.append(" unsigned int *left_count, unsigned int *right_count, \n");
1036  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
1037  source.append(" __local unsigned int *all_threads_converged) \n");
1038  source.append(" { \n");
1039  source.append(" uint glb_id = get_global_id(0); \n");
1040  source.append(" uint grp_id = get_group_id(0); \n");
1041  source.append(" uint grp_nm = get_num_groups(0); \n");
1042  source.append(" uint lcl_id = get_local_id(0); \n");
1043  source.append(" uint lcl_sz = get_local_size(0); \n");
1044 
1045  // for all active threads
1046  source.append(" if (tid < num_threads_active) \n");
1047  source.append(" { \n");
1048 
1049  source.append(" *left = s_left[tid]; \n");
1050  source.append(" *right = s_right[tid]; \n");
1051  source.append(" *left_count = s_left_count[tid]; \n");
1052  source.append(" *right_count = s_right_count[tid]; \n");
1053 
1054  // check if thread already converged
1055  source.append(" if (*left != *right) \n");
1056  source.append(" { \n");
1057 
1058  source.append(" *mid = computeMidpoint(*left, *right); \n");
1059  source.append(" *all_threads_converged = 0; \n");
1060  source.append(" } \n");
1061 
1062  source.append(" } \n");
1063  // end for all active threads
1064  source.append(" } \n");
1065  }
1066 
1067  // end of utilities
1068  // start of kernels
1069 
1070 
1072  // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
1073  //
1074  // g_d diagonal elements in global memory
1075  // g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0
1076  // n size of matrix
1077  // lg lower bound of input interval (e.g. Gerschgorin interval)
1078  // ug upper bound of input interval (e.g. Gerschgorin interval)
1079  // lg_eig_count number of eigenvalues that are smaller than \a lg
1080  // lu_eig_count number of eigenvalues that are smaller than \a lu
1081  // epsilon desired accuracy of eigenvalues to compute
1084  template <typename StringType>
1085  void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
1086  {
1087  source.append(" __kernel \n");
1088  source.append(" void \n");
1089  source.append(" bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d, \n");
1090  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
1091  source.append(" const unsigned int n, \n");
1092  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
1093  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
1094  source.append(" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
1095  source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
1096  source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
1097  source.append(" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
1098  source.append(" "); source.append(numeric_string); source.append(" epsilon \n");
1099  source.append(" ) \n");
1100  source.append(" { \n");
1101  source.append(" g_s = g_s + 1; \n");
1102  source.append(" uint glb_id = get_global_id(0); \n");
1103  source.append(" uint grp_id = get_group_id(0); \n");
1104  source.append(" uint grp_nm = get_num_groups(0); \n");
1105  source.append(" uint lcl_id = get_local_id(0); \n");
1106  source.append(" uint lcl_sz = get_local_size(0); \n");
1107 
1108  // intervals (store left and right because the subdivision tree is in general
1109  // not dense
1110  source.append(" __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1111  source.append(" __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1112 
1113  // number of eigenvalues that are smaller than s_left / s_right
1114  // (correspondence is realized via indices)
1115  source.append(" __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1116  source.append(" __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
1117 
1118  // helper for stream compaction
1119  source.append(" __local unsigned int \n");
1120  source.append(" s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");
1121 
1122  // state variables for whole block
1123  // if 0 then compaction of second chunk of child intervals is not necessary
1124  // (because all intervals had exactly one non-dead child)
1125  source.append(" __local unsigned int compact_second_chunk; \n");
1126  source.append(" __local unsigned int all_threads_converged; \n");
1127 
1128  // number of currently active threads
1129  source.append(" __local unsigned int num_threads_active; \n");
1130 
1131  // number of threads to use for stream compaction
1132  source.append(" __local unsigned int num_threads_compaction; \n");
1133 
1134  // helper for exclusive scan
1135  source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1136 
1137 
1138  // variables for currently processed interval
1139  // left and right limit of active interval
1140  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
1141  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
1142  source.append(" unsigned int left_count = 0; \n");
1143  source.append(" unsigned int right_count = 0; \n");
1144  // midpoint of active interval
1145  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
1146  // number of eigenvalues smaller then mid
1147  source.append(" unsigned int mid_count = 0; \n");
1148  // affected from compaction
1149  source.append(" unsigned int is_active_second = 0; \n");
1150 
1151  source.append(" s_compaction_list[lcl_id] = 0; \n");
1152  source.append(" s_left[lcl_id] = 0.0; \n");
1153  source.append(" s_right[lcl_id] = 0.0; \n");
1154  source.append(" s_left_count[lcl_id] = 0; \n");
1155  source.append(" s_right_count[lcl_id] = 0; \n");
1156 
1157  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1158 
1159  // set up initial configuration
1160  source.append(" if (0 == lcl_id) \n");
1161  source.append(" { \n");
1162  source.append(" s_left[0] = lg; \n");
1163  source.append(" s_right[0] = ug; \n");
1164  source.append(" s_left_count[0] = lg_eig_count; \n");
1165  source.append(" s_right_count[0] = ug_eig_count; \n");
1166 
1167  source.append(" compact_second_chunk = 0; \n");
1168  source.append(" num_threads_active = 1; \n");
1169 
1170  source.append(" num_threads_compaction = 1; \n");
1171  source.append(" } \n");
1172 
1173  // for all active threads read intervals from the last level
1174  // the number of (worst case) active threads per level l is 2^l
1175 
1176  source.append(" while (true) \n");
1177  source.append(" { \n");
1178 
1179  source.append(" all_threads_converged = 1; \n");
1180  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1181 
1182  source.append(" is_active_second = 0; \n");
1183  source.append(" subdivideActiveIntervalMulti(lcl_id, \n");
1184  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1185  source.append(" num_threads_active, \n");
1186  source.append(" &left, &right, &left_count, &right_count, \n");
1187  source.append(" &mid, &all_threads_converged); \n");
1188  // source.append(" output[lcl_id] = s_left; \n");
1189  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1190 
1191  // check if done
1192  source.append(" if (1 == all_threads_converged) \n");
1193  source.append(" { \n");
1194  source.append(" break; \n");
1195  source.append(" } \n");
1196 
1197  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1198 
1199  // compute number of eigenvalues smaller than mid
1200  // use all threads for reading the necessary matrix data from global
1201  // memory
1202  // use s_left and s_right as scratch space for diagonal and
1203  // superdiagonal of matrix
1204  source.append(" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
1205  source.append(" lcl_id, num_threads_active, \n");
1206  source.append(" s_left, s_right, \n");
1207  source.append(" (left == right)); \n");
1208 
1209  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1210 
1211  // store intervals
1212  // for all threads store the first child interval in a continuous chunk of
1213  // memory, and the second child interval -- if it exists -- in a second
1214  // chunk; it is likely that all threads reach convergence up to
1215  // \a epsilon at the same level; furthermore, for higher level most / all
1216  // threads will have only one child, storing the first child compactly will
1217  // (first) avoid to perform a compaction step on the first chunk, (second)
1218  // make it for higher levels (when all threads / intervals have
1219  // exactly one child) unnecessary to perform a compaction of the second
1220  // chunk
1221  source.append(" if (lcl_id < num_threads_active) \n");
1222  source.append(" { \n");
1223 
1224  source.append(" if (left != right) \n");
1225  source.append(" { \n");
1226 
1227  // store intervals
1228  source.append(" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
1229  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1230  source.append(" left, mid, right, \n");
1231  source.append(" left_count, mid_count, right_count, \n");
1232  source.append(" epsilon, &compact_second_chunk, \n");
1233  source.append(" s_compaction_list_exc, \n");
1234  source.append(" &is_active_second); \n");
1235  source.append(" } \n");
1236  source.append(" else \n");
1237  source.append(" { \n");
1238 
1239  source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1240  source.append(" &left, &mid, &right, \n");
1241  source.append(" &left_count, &mid_count, &right_count, \n");
1242  source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
1243  source.append(" num_threads_active, \n");
1244  source.append(" &is_active_second); \n");
1245  source.append(" } \n");
1246  source.append(" } \n");
1247 
1248  // necessary so that compact_second_chunk is up-to-date
1249  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1250 
1251  // perform compaction of chunk where second children are stored
1252  // scan of (num_threads_actieigenvaluesve / 2) elements, thus at most
1253  // (num_threads_active / 4) threads are needed
1254  source.append(" if (compact_second_chunk > 0) \n");
1255  source.append(" { \n");
1256 
1257  source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1258 
1259  source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1260  source.append(" mid, right, mid_count, right_count, \n");
1261  source.append(" s_compaction_list, num_threads_active, \n");
1262  source.append(" is_active_second); \n");
1263  source.append(" } \n");
1264 
1265  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1266 
1267  source.append(" if (0 == lcl_id) \n");
1268  source.append(" { \n");
1269 
1270  // update number of active threads with result of reduction
1271  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
1272 
1273  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1274 
1275  source.append(" compact_second_chunk = 0; \n");
1276  source.append(" } \n");
1277 
1278  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1279 
1280  source.append(" } \n");
1281 
1282  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1283 
1284  // write resulting intervals to global mem
1285  // for all threads write if they have been converged to an eigenvalue to
1286  // a separate array
1287 
1288  // at most n valid intervals
1289  source.append(" if (lcl_id < n) \n");
1290  source.append(" { \n");
1291  // intervals converged so left and right limit are identical
1292  source.append(" g_left[lcl_id] = s_left[lcl_id]; \n");
1293  // left count is sufficient to have global order
1294  source.append(" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
1295  source.append(" } \n");
1296  source.append(" } \n");
1297  }
1298 
1300  // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
1301  //
1302  // g_d diagonal elements of symmetric, tridiagonal matrix
1303  // g_s superdiagonal elements of symmetric, tridiagonal matrix
1304  // n matrix size
1305  // blocks_mult start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue
1306  // blocks_mult_sum total number of eigenvalues / singleton intervals in one block of intervals
1307  // g_left left limits of intervals
1308  // g_right right limits of intervals
1309  // g_left_count number of eigenvalues less than left limits
1310  // g_right_count number of eigenvalues less than right limits
1311  // g_lambda final eigenvalue
1312  // g_pos index of eigenvalue (in ascending order)
1313  // precision desired precision of eigenvalues
1315 
1316  template <typename StringType>
1317  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
1318  {
1319  source.append(" __kernel \n");
1320  source.append(" void \n");
1321  source.append(" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
1322  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
1323  source.append(" const unsigned int n, \n");
1324  source.append(" __global unsigned int *blocks_mult, \n");
1325  source.append(" __global unsigned int *blocks_mult_sum, \n");
1326  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
1327  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
1328  source.append(" __global unsigned int *g_left_count, \n");
1329  source.append(" __global unsigned int *g_right_count, \n");
1330  source.append(" __global "); source.append(numeric_string); source.append(" *g_lambda, \n");
1331  source.append(" __global unsigned int *g_pos, \n");
1332  source.append(" "); source.append(numeric_string); source.append(" precision \n");
1333  source.append(" ) \n");
1334  source.append(" { \n");
1335  source.append(" g_s = g_s + 1; \n");
1336  source.append(" uint glb_id = get_global_id(0); \n");
1337  source.append(" uint grp_id = get_group_id(0); \n");
1338  source.append(" uint grp_nm = get_num_groups(0); \n");
1339  source.append(" uint lcl_id = get_local_id(0); \n");
1340  source.append(" uint lcl_sz = get_local_size(0); \n");
1341 
1342  source.append(" const unsigned int tid = lcl_id; \n");
1343 
1344  // left and right limits of interval
1345  source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1346  source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1347 
1348  // number of eigenvalues smaller than interval limits
1349  source.append(" __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1350  source.append(" __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1351 
1352  // helper array for chunk compaction of second chunk
1353  source.append(" __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
1354  // compaction list helper for exclusive scan
1355  source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
1356 
1357  // flag if all threads are converged
1358  source.append(" __local unsigned int all_threads_converged; \n");
1359  // number of active threads
1360  source.append(" __local unsigned int num_threads_active; \n");
1361  // number of threads to employ for compaction
1362  source.append(" __local unsigned int num_threads_compaction; \n");
1363  // flag if second chunk has to be compacted
1364  source.append(" __local unsigned int compact_second_chunk; \n");
1365 
1366  // parameters of block of intervals processed by this block of threads
1367  source.append(" __local unsigned int c_block_start; \n");
1368  source.append(" __local unsigned int c_block_end; \n");
1369  source.append(" __local unsigned int c_block_offset_output; \n");
1370 
1371  // midpoint of currently active interval of the thread
1372  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
1373  // number of eigenvalues smaller than \a mid
1374  source.append(" unsigned int mid_count = 0; \n");
1375  // current interval parameter
1376  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
1377  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
1378  source.append(" unsigned int left_count = 0; \n");
1379  source.append(" unsigned int right_count = 0; \n");
1380  // helper for compaction, keep track which threads have a second child
1381  source.append(" unsigned int is_active_second = 0; \n");
1382 
1383  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1384 
1385  // initialize common start conditions
1386  source.append(" if (0 == tid) \n");
1387  source.append(" { \n");
1388 
1389  source.append(" c_block_start = blocks_mult[grp_id]; \n");
1390  source.append(" c_block_end = blocks_mult[grp_id + 1]; \n");
1391  source.append(" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
1392  source.append(" \n");
1393 
1394  source.append(" num_threads_active = c_block_end - c_block_start; \n");
1395  source.append(" s_compaction_list[0] = 0; \n");
1396  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1397 
1398  source.append(" all_threads_converged = 1; \n");
1399  source.append(" compact_second_chunk = 0; \n");
1400  source.append(" } \n");
1401  source.append(" s_left_count [tid] = 42; \n");
1402  source.append(" s_right_count[tid] = 42; \n");
1403  source.append(" s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1404  source.append(" s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1405  source.append(" \n");
1406  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1407  source.append(" \n");
1408 
1409  // read data into shared memory
1410  source.append(" if (tid < num_threads_active) \n");
1411  source.append(" { \n");
1412 
1413  source.append(" s_left[tid] = g_left[c_block_start + tid]; \n");
1414  source.append(" s_right[tid] = g_right[c_block_start + tid]; \n");
1415  source.append(" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
1416  source.append(" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
1417  source.append(" \n");
1418  source.append(" } \n");
1419  source.append(" \n");
1420  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1421  source.append(" unsigned int iter = 0; \n");
1422  // do until all threads converged
1423  source.append(" while (true) \n");
1424  source.append(" { \n");
1425  source.append(" iter++; \n");
1426  //for (int iter=0; iter < 0; iter++) {
1427  source.append(" s_compaction_list[lcl_id] = 0; \n");
1428  source.append(" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
1429  source.append(" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
1430 
1431  // subdivide interval if currently active and not already converged
1432  source.append(" subdivideActiveIntervalMulti(tid, s_left, s_right, \n");
1433  source.append(" s_left_count, s_right_count, \n");
1434  source.append(" num_threads_active, \n");
1435  source.append(" &left, &right, &left_count, &right_count, \n");
1436  source.append(" &mid, &all_threads_converged); \n");
1437  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1438 
1439  // stop if all eigenvalues have been found
1440  source.append(" if (1 == all_threads_converged) \n");
1441  source.append(" { \n");
1442  source.append(" \n");
1443  source.append(" break; \n");
1444  source.append(" } \n");
1445 
1446  // compute number of eigenvalues smaller than mid for active and not
1447  // converged intervals, use all threads for loading data from gmem and
1448  // s_left and s_right as scratch space to store the data load from gmem
1449  // in shared memory
1450  source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1451  source.append(" mid, tid, num_threads_active, \n");
1452  source.append(" s_left, s_right, \n");
1453  source.append(" (left == right)); \n");
1454  source.append(" \n");
1455  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1456 
1457  source.append(" if (tid < num_threads_active) \n");
1458  source.append(" { \n");
1459  source.append(" \n");
1460  // store intervals
1461  source.append(" if (left != right) \n");
1462  source.append(" { \n");
1463 
1464  source.append(" storeNonEmptyIntervals(tid, num_threads_active, \n");
1465  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
1466  source.append(" left, mid, right, \n");
1467  source.append(" left_count, mid_count, right_count, \n");
1468  source.append(" precision, &compact_second_chunk, \n");
1469  source.append(" s_compaction_list_exc, \n");
1470  source.append(" &is_active_second); \n");
1471  source.append(" \n");
1472  source.append(" } \n");
1473  source.append(" else \n");
1474  source.append(" { \n");
1475 
1476  source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
1477  source.append(" &left, &mid, &right, \n");
1478  source.append(" &left_count, &mid_count, &right_count, \n");
1479  source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
1480  source.append(" num_threads_active, \n");
1481  source.append(" &is_active_second); \n");
1482  source.append(" \n");
1483  source.append(" } \n");
1484  source.append(" } \n");
1485 
1486  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1487 
1488  // compact second chunk of intervals if any of the threads generated
1489  // two child intervals
1490  source.append(" if (1 == compact_second_chunk) \n");
1491  source.append(" { \n");
1492 
1493  source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
1494  source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
1495  source.append(" mid, right, mid_count, right_count, \n");
1496  source.append(" s_compaction_list, num_threads_active, \n");
1497  source.append(" is_active_second); \n");
1498  source.append(" } \n");
1499 
1500  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1501 
1502  // update state variables
1503  source.append(" if (0 == tid) \n");
1504  source.append(" { \n");
1505  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
1506  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
1507 
1508  source.append(" compact_second_chunk = 0; \n");
1509  source.append(" all_threads_converged = 1; \n");
1510  source.append(" } \n");
1511 
1512  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1513 
1514  // clear
1515  source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
1516  source.append(" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
1517  source.append(" \n");
1518  source.append(" if (num_threads_compaction > lcl_sz) \n");
1519  source.append(" { \n");
1520  source.append(" break; \n");
1521  source.append(" } \n");
1522 
1523 
1524  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1525 
1526  source.append(" } \n"); // end until all threads converged
1527 
1528  // write data back to global memory
1529  source.append(" if (tid < num_threads_active) \n");
1530  source.append(" { \n");
1531 
1532  source.append(" unsigned int addr = c_block_offset_output + tid; \n");
1533  source.append(" \n");
1534  source.append(" g_lambda[addr] = s_left[tid]; \n");
1535  source.append(" g_pos[addr] = s_right_count[tid]; \n");
1536  source.append(" } \n");
1537  source.append(" } \n");
1538  }
1539 
1540 
1542  // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
1543  //
1544  // g_d diagonal elements of symmetric, tridiagonal matrix
1545  // g_s superdiagonal elements of symmetric, tridiagonal matrix
1546  // n matrix size
1547  // num_intervals total number of intervals containing one eigenvalue after the first step
1548  // g_left left interval limits
1549  // g_right right interval limits
1550  // g_pos index of interval / number of intervals that are smaller than right interval limit
1551  // precision desired precision of eigenvalues
1553 
1554  template <typename StringType>
1555  void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
1556  {
1557  source.append(" __kernel \n");
1558  source.append(" void \n");
1559  source.append(" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
1560  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
1561  source.append(" const unsigned int n, \n");
1562  source.append(" unsigned int num_intervals, \n");
1563  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
1564  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
1565  source.append(" __global unsigned int *g_pos, \n");
1566  source.append(" "); source.append(numeric_string); source.append(" precision) \n");
1567  source.append(" { \n");
1568  source.append(" g_s = g_s + 1; \n");
1569  source.append(" uint glb_id = get_global_id(0); \n");
1570  source.append(" uint grp_id = get_group_id(0); \n");
1571  source.append(" uint grp_nm = get_num_groups(0); \n");
1572  source.append(" uint lcl_id = get_local_id(0); \n");
1573  source.append(" uint lcl_sz = get_local_size(0); \n");
1574  source.append(" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n");
1575  source.append(" __local "); source.append(numeric_string); source.append(" s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1576  source.append(" __local "); source.append(numeric_string); source.append(" s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
1577  // active interval of thread
1578  // left and right limit of current interval
1579  source.append(" "); source.append(numeric_string); source.append(" left, right; \n");
1580  // number of threads smaller than the right limit (also corresponds to the
1581  // global index of the eigenvalues contained in the active interval)
1582  source.append(" unsigned int right_count; \n");
1583  // flag if current thread converged
1584  source.append(" unsigned int converged = 0; \n");
1585  // midpoint when current interval is subdivided
1586  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
1587  // number of eigenvalues less than mid
1588  source.append(" unsigned int mid_count = 0; \n");
1589 
1590  // read data from global memory
1591  source.append(" if (gtid < num_intervals) \n");
1592  source.append(" { \n");
1593  source.append(" left = g_left[gtid]; \n");
1594  source.append(" right = g_right[gtid]; \n");
1595  source.append(" right_count = g_pos[gtid]; \n");
1596  source.append(" } \n");
1597  // flag to determine if all threads converged to eigenvalue
1598  source.append(" __local unsigned int converged_all_threads; \n");
1599  // initialized shared flag
1600  source.append(" if (0 == lcl_id) \n");
1601  source.append(" { \n");
1602  source.append(" converged_all_threads = 0; \n");
1603  source.append(" } \n");
1604  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1605  // process until all threads converged to an eigenvalue
1606  source.append(" while (true) \n");
1607  source.append(" { \n");
1608  source.append(" converged_all_threads = 1; \n");
1609  // update midpoint for all active threads
1610  source.append(" if ((gtid < num_intervals) && (0 == converged)) \n");
1611  source.append(" { \n");
1612  source.append(" mid = computeMidpoint(left, right); \n");
1613  source.append(" } \n");
1614  // find number of eigenvalues that are smaller than midpoint
1615  source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
1616  source.append(" mid, gtid, num_intervals, \n");
1617  source.append(" s_left_scratch, \n");
1618  source.append(" s_right_scratch, \n");
1619  source.append(" converged); \n");
1620  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1621  // for all active threads
1622  source.append(" if ((gtid < num_intervals) && (0 == converged)) \n");
1623  source.append(" { \n");
1624  // update intervals -- always one child interval survives
1625  source.append(" if (right_count == mid_count) \n");
1626  source.append(" { \n");
1627  source.append(" right = mid; \n");
1628  source.append(" } \n");
1629  source.append(" else \n");
1630  source.append(" { \n");
1631  source.append(" left = mid; \n");
1632  source.append(" } \n");
1633  // check for convergence
1634  source.append(" "); source.append(numeric_string); source.append(" t0 = right - left; \n");
1635  source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(right), fabs(left)) * precision; \n");
1636 
1637  source.append(" if (t0 < min(precision, t1)) \n");
1638  source.append(" { \n");
1639  source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");
1640  source.append(" left = lambda; \n");
1641  source.append(" right = lambda; \n");
1642 
1643  source.append(" converged = 1; \n");
1644  source.append(" } \n");
1645  source.append(" else \n");
1646  source.append(" { \n");
1647  source.append(" converged_all_threads = 0; \n");
1648  source.append(" } \n");
1649  source.append(" } \n");
1650  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1651  source.append(" if (1 == converged_all_threads) \n");
1652  source.append(" { \n");
1653  source.append(" break; \n");
1654  source.append(" } \n");
1655  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1656  source.append(" } \n");
1657  // write data back to global memory
1658  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1659  source.append(" if (gtid < num_intervals) \n");
1660  source.append(" { \n");
1661  // intervals converged so left and right interval limit are both identical
1662  // and identical to the eigenvalue
1663  source.append(" g_left[gtid] = left; \n");
1664  source.append(" } \n");
1665  source.append(" } \n");
1666  }
1667 
1669  // Write data to global memory
1671 
1672  template <typename StringType>
1673  void generate_bisect_kernel_writeToGmem(StringType & source, std::string const & numeric_string)
1674  {
1675  source.append(" \n");
1676  source.append(" void writeToGmem(const unsigned int tid, const unsigned int tid_2, \n");
1677  source.append(" const unsigned int num_threads_active, \n");
1678  source.append(" const unsigned int num_blocks_mult, \n");
1679  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_one, \n");
1680  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_one, \n");
1681  source.append(" __global unsigned int *g_pos_one, \n");
1682  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_mult, \n");
1683  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_mult, \n");
1684  source.append(" __global unsigned int *g_left_count_mult, \n");
1685  source.append(" __global unsigned int *g_right_count_mult, \n");
1686  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
1687  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
1688  source.append(" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1689  source.append(" __global unsigned int *g_blocks_mult, \n");
1690  source.append(" __global unsigned int *g_blocks_mult_sum, \n");
1691  source.append(" __local unsigned short *s_compaction_list, \n");
1692  source.append(" __local unsigned short *s_cl_helper, \n");
1693  source.append(" unsigned int offset_mult_lambda \n");
1694  source.append(" ) \n");
1695  source.append(" { \n");
1696  source.append(" uint glb_id = get_global_id(0); \n");
1697  source.append(" uint grp_id = get_group_id(0); \n");
1698  source.append(" uint grp_nm = get_num_groups(0); \n");
1699  source.append(" uint lcl_id = get_local_id(0); \n");
1700  source.append(" uint lcl_sz = get_local_size(0); \n");
1701 
1702 
1703  source.append(" if (tid < offset_mult_lambda) \n");
1704  source.append(" { \n");
1705 
1706  source.append(" g_left_one[tid] = s_left[tid]; \n");
1707  source.append(" g_right_one[tid] = s_right[tid]; \n");
1708  // right count can be used to order eigenvalues without sorting
1709  source.append(" g_pos_one[tid] = s_right_count[tid]; \n");
1710  source.append(" } \n");
1711  source.append(" else \n");
1712  source.append(" { \n");
1713 
1714  source.append(" \n");
1715  source.append(" g_left_mult[tid - offset_mult_lambda] = s_left[tid]; \n");
1716  source.append(" g_right_mult[tid - offset_mult_lambda] = s_right[tid]; \n");
1717  source.append(" g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid]; \n");
1718  source.append(" g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid]; \n");
1719  source.append(" } \n");
1720 
1721  source.append(" if (tid_2 < num_threads_active) \n");
1722  source.append(" { \n");
1723 
1724  source.append(" if (tid_2 < offset_mult_lambda) \n");
1725  source.append(" { \n");
1726 
1727  source.append(" g_left_one[tid_2] = s_left[tid_2]; \n");
1728  source.append(" g_right_one[tid_2] = s_right[tid_2]; \n");
1729  // right count can be used to order eigenvalues without sorting
1730  source.append(" g_pos_one[tid_2] = s_right_count[tid_2]; \n");
1731  source.append(" } \n");
1732  source.append(" else \n");
1733  source.append(" { \n");
1734 
1735  source.append(" g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2]; \n");
1736  source.append(" g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2]; \n");
1737  source.append(" g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2]; \n");
1738  source.append(" g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2]; \n");
1739  source.append(" } \n");
1740 
1741  source.append(" } \n"); // end writing out data
1742 
1743  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1744 
1745  // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
1746  // s_compaction_list we write the exclusive scan result
1747  source.append(" if (tid <= num_blocks_mult) \n");
1748  source.append(" { \n");
1749  source.append(" g_blocks_mult[tid] = s_compaction_list[tid]; \n");
1750  source.append(" g_blocks_mult_sum[tid] = s_cl_helper[tid]; \n");
1751  source.append(" } \n");
1752  source.append(" if (tid_2 <= num_blocks_mult) \n");
1753  source.append(" { \n");
1754  source.append(" g_blocks_mult[tid_2] = s_compaction_list[tid_2]; \n");
1755  source.append(" g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2]; \n");
1756  source.append(" } \n");
1757  source.append(" } \n");
1758  }
1759 
1761  // Perform final stream compaction before writing data to global memory
1763 
1764  template <typename StringType>
1765  void generate_bisect_kernel_compactStreamsFinal(StringType & source, std::string const & numeric_string)
1766  {
1767  source.append(" \n");
1768  source.append(" void \n");
1769  source.append(" compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, \n");
1770  source.append(" const unsigned int num_threads_active, \n");
1771  source.append(" __local unsigned int *offset_mult_lambda, \n");
1772  source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
1773  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
1774  source.append(" __local unsigned short *s_left_count, __local unsigned short *s_right_count, \n");
1775  source.append(" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
1776  source.append(" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper, \n");
1777  source.append(" unsigned int is_one_lambda, unsigned int is_one_lambda_2, \n");
1778  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
1779  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
1780  source.append(" "); source.append(numeric_string); source.append(" *left_2, \n");
1781  source.append(" "); source.append(numeric_string); source.append(" *right_2, \n");
1782  source.append(" unsigned int *left_count, unsigned int *right_count, \n");
1783  source.append(" unsigned int *left_count_2, unsigned int *right_count_2, \n");
1784  source.append(" unsigned int c_block_iend, unsigned int c_sum_block, \n");
1785  source.append(" unsigned int c_block_iend_2, unsigned int c_sum_block_2 \n");
1786  source.append(" ) \n");
1787  source.append(" { \n");
1788  source.append(" uint glb_id = get_global_id(0); \n");
1789  source.append(" uint grp_id = get_group_id(0); \n");
1790  source.append(" uint grp_nm = get_num_groups(0); \n");
1791  source.append(" uint lcl_id = get_local_id(0); \n");
1792  source.append(" uint lcl_sz = get_local_size(0); \n");
1793 
1794  // cache data before performing compaction
1795  source.append(" *left = s_left[tid]; \n");
1796  source.append(" *right = s_right[tid]; \n");
1797 
1798  source.append(" if (tid_2 < num_threads_active) \n");
1799  source.append(" { \n");
1800  source.append(" *left_2 = s_left[tid_2]; \n");
1801  source.append(" *right_2 = s_right[tid_2]; \n");
1802  source.append(" } \n");
1803 
1804  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1805 
1806  // determine addresses for intervals containing multiple eigenvalues and
1807  // addresses for blocks of intervals
1808  source.append(" unsigned int ptr_w = 0; \n");
1809  source.append(" unsigned int ptr_w_2 = 0; \n");
1810  source.append(" unsigned int ptr_blocking_w = 0; \n");
1811  source.append(" unsigned int ptr_blocking_w_2 = 0; \n");
1812  source.append(" \n");
1813  source.append(" \n");
1814 
1815  source.append(" ptr_w = (1 == is_one_lambda) ? s_cl_one[tid] \n");
1816  source.append(" : s_cl_mult[tid] + *offset_mult_lambda; \n");
1817 
1818  source.append(" if (0 != c_block_iend) \n");
1819  source.append(" { \n");
1820  source.append(" ptr_blocking_w = s_cl_blocking[tid]; \n");
1821  source.append(" } \n");
1822 
1823  source.append(" if (tid_2 < num_threads_active) \n");
1824  source.append(" { \n");
1825  source.append(" ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2] \n");
1826  source.append(" : s_cl_mult[tid_2] + *offset_mult_lambda; \n");
1827 
1828  source.append(" if (0 != c_block_iend_2) \n");
1829  source.append(" { \n");
1830  source.append(" ptr_blocking_w_2 = s_cl_blocking[tid_2]; \n");
1831  source.append(" } \n");
1832  source.append(" } \n");
1833  source.append(" \n");
1834  source.append(" \n");
1835  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1836  source.append(" \n");
1837  // store compactly in shared mem
1838  source.append(" if(tid < num_threads_active) \n");
1839  source.append(" { \n ");
1840  source.append(" s_left[ptr_w] = *left; \n");
1841  source.append(" s_right[ptr_w] = *right; \n");
1842  source.append(" s_left_count[ptr_w] = *left_count; \n");
1843  source.append(" s_right_count[ptr_w] = *right_count; \n");
1844  source.append(" } \n ");
1845  source.append(" \n");
1846  source.append(" \n");
1847  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1848  source.append(" if(tid == 1) \n");
1849  source.append(" { \n");
1850  source.append(" s_left[ptr_w] = *left; \n");
1851  source.append(" s_right[ptr_w] = *right; \n");
1852  source.append(" s_left_count[ptr_w] = *left_count; \n");
1853  source.append(" s_right_count[ptr_w] = *right_count; \n");
1854  source.append(" \n");
1855  source.append(" } \n");
1856  source.append(" if (0 != c_block_iend) \n");
1857  source.append(" { \n");
1858  source.append(" s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1; \n");
1859  source.append(" s_cl_helper[ptr_blocking_w + 1] = c_sum_block; \n");
1860  source.append(" } \n");
1861  source.append(" \n");
1862  source.append(" if (tid_2 < num_threads_active) \n");
1863  source.append(" { \n");
1864  // store compactly in shared mem
1865  source.append(" s_left[ptr_w_2] = *left_2; \n");
1866  source.append(" s_right[ptr_w_2] = *right_2; \n");
1867  source.append(" s_left_count[ptr_w_2] = *left_count_2; \n");
1868  source.append(" s_right_count[ptr_w_2] = *right_count_2; \n");
1869 
1870  source.append(" if (0 != c_block_iend_2) \n");
1871  source.append(" { \n");
1872  source.append(" s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1; \n");
1873  source.append(" s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2; \n");
1874  source.append(" } \n");
1875  source.append(" } \n");
1876 
1877  source.append(" } \n");
1878  }
1879 
1880 
1881 
1883  // Compute addresses to obtain compact list of block start addresses
1885 
1886  template <typename StringType>
1888  {
1889  source.append(" \n");
1890  source.append(" void \n");
1891  source.append(" scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, \n");
1892  source.append(" const unsigned int num_threads_compaction, \n");
1893  source.append(" __local unsigned short *s_cl_blocking, \n");
1894  source.append(" __local unsigned short *s_cl_helper \n");
1895  source.append(" ) \n");
1896  source.append(" { \n");
1897  source.append(" uint glb_id = get_global_id(0); \n");
1898  source.append(" uint grp_id = get_group_id(0); \n");
1899  source.append(" uint grp_nm = get_num_groups(0); \n");
1900  source.append(" uint lcl_id = get_local_id(0); \n");
1901  source.append(" uint lcl_sz = get_local_size(0); \n");
1902 
1903  // prepare for second step of block generation: compaction of the block
1904  // list itself to efficiently write out these
1905  source.append(" s_cl_blocking[tid] = s_cl_helper[tid]; \n");
1906 
1907  source.append(" if (tid_2 < num_threads_compaction) \n");
1908  source.append(" { \n");
1909  source.append(" s_cl_blocking[tid_2] = s_cl_helper[tid_2]; \n");
1910  source.append(" } \n");
1911 
1912  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1913 
1914  // additional scan to compact s_cl_blocking that permits to generate a
1915  // compact list of eigenvalue blocks each one containing about
1916  // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
1917  // processed by one thread block in a subsequent processing step
1918 
1919  source.append(" unsigned int offset = 1; \n");
1920 
1921  // build scan tree
1922  source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
1923  source.append(" { \n");
1924 
1925  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1926 
1927  source.append(" if (tid < d) \n");
1928  source.append(" { \n");
1929 
1930  source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
1931  source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
1932  source.append(" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1933  source.append(" } \n");
1934 
1935  source.append(" offset <<= 1; \n");
1936  source.append(" } \n");
1937 
1938  // traverse down tree: first down to level 2 across
1939  source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
1940  source.append(" { \n");
1941 
1942  source.append(" offset >>= 1; \n");
1943  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1944 
1945  //
1946  source.append(" if (tid < (d-1)) \n");
1947  source.append(" { \n");
1948 
1949  source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
1950  source.append(" unsigned int bi = ai + (offset >> 1); \n");
1951  source.append(" s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai]; \n");
1952  source.append(" } \n");
1953  source.append(" } \n");
1954 
1955  source.append(" } \n");
1956 
1957 
1958  }
1959 
1960 
1962  // Perform scan to obtain number of eigenvalues before a specific block
1964 
1965  template <typename StringType>
1966  void generate_bisect_kernel_scanSumBlocks(StringType & source)
1967  {
1968  source.append(" \n");
1969  source.append(" void \n");
1970  source.append(" scanSumBlocks(const unsigned int tid, const unsigned int tid_2, \n");
1971  source.append(" const unsigned int num_threads_active, \n");
1972  source.append(" const unsigned int num_threads_compaction, \n");
1973  source.append(" __local unsigned short *s_cl_blocking, \n");
1974  source.append(" __local unsigned short *s_cl_helper) \n");
1975  source.append(" { \n");
1976  source.append(" uint glb_id = get_global_id(0); \n");
1977  source.append(" uint grp_id = get_group_id(0); \n");
1978  source.append(" uint grp_nm = get_num_groups(0); \n");
1979  source.append(" uint lcl_id = get_local_id(0); \n");
1980  source.append(" uint lcl_sz = get_local_size(0); \n");
1981 
1982  source.append(" unsigned int offset = 1; \n");
1983 
1984  // first step of scan to build the sum of elements within each block
1985  // build up tree
1986  source.append(" for (int d = num_threads_compaction >> 1; d > 0; d >>= 1) \n");
1987  source.append(" { \n");
1988 
1989  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
1990 
1991  source.append(" if (tid < d) \n");
1992  source.append(" { \n");
1993 
1994  source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
1995  source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
1996 
1997  source.append(" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
1998  source.append(" } \n");
1999 
2000  source.append(" offset *= 2; \n");
2001  source.append(" } \n");
2002 
2003  // first step of scan to build the sum of elements within each block
2004  // traverse down tree
2005  source.append(" for (int d = 2; d < (num_threads_compaction - 1); d <<= 1) \n");
2006  source.append(" { \n");
2007 
2008  source.append(" offset >>= 1; \n");
2009  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2010 
2011  source.append(" if (tid < (d-1)) \n");
2012  source.append(" { \n");
2013  source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
2014  source.append(" unsigned int bi = ai + (offset >> 1); \n");
2015  source.append(" s_cl_blocking[bi] += s_cl_blocking[ai]; \n");
2016  source.append(" } \n");
2017  source.append(" } \n");
2018  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2019 
2020  source.append(" if (0 == tid) \n");
2021  source.append(" { \n");
2022 
2023  // move last element of scan to last element that is valid
2024  // necessary because the number of threads employed for scan is a power
2025  // of two and not necessarily the number of active threasd
2026  source.append(" s_cl_helper[num_threads_active - 1] = \n");
2027  source.append(" s_cl_helper[num_threads_compaction - 1]; \n");
2028  source.append(" s_cl_blocking[num_threads_active - 1] = \n");
2029  source.append(" s_cl_blocking[num_threads_compaction - 1]; \n");
2030  source.append(" } \n");
2031  source.append(" } \n");
2032 
2033 
2034  }
2035 
2037  // Perform initial scan for compaction of intervals containing one and
2038  // multiple eigenvalues; also do initial scan to build blocks
2040 
2041  template <typename StringType>
2042  void generate_bisect_kernel_scanInitial(StringType & source)
2043  {
2044  source.append(" \n");
2045  source.append(" void \n");
2046  source.append(" scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int n, \n");
2047  source.append(" const unsigned int num_threads_active, \n");
2048  source.append(" const unsigned int num_threads_compaction, \n");
2049  source.append(" __local unsigned short *s_cl_one, __local unsigned short *s_cl_mult, \n");
2050  source.append(" __local unsigned short *s_cl_blocking, __local unsigned short *s_cl_helper \n");
2051  source.append(" ) \n");
2052  source.append(" { \n");
2053  source.append(" uint glb_id = get_global_id(0); \n");
2054  source.append(" uint grp_id = get_group_id(0); \n");
2055  source.append(" uint grp_nm = get_num_groups(0); \n");
2056  source.append(" uint lcl_id = get_local_id(0); \n");
2057  source.append(" uint lcl_sz = get_local_size(0); \n");
2058 
2059 
2060  // perform scan to compactly write out the intervals containing one and
2061  // multiple eigenvalues
2062  // also generate tree for blocking of intervals containing multiple
2063  // eigenvalues
2064 
2065  source.append(" unsigned int offset = 1; \n");
2066 
2067  // build scan tree
2068  source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
2069  source.append(" { \n");
2070 
2071  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2072 
2073  source.append(" if (tid < d) \n");
2074  source.append(" { \n");
2075 
2076  source.append(" unsigned int ai = offset*(2*tid+1); \n");
2077  source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
2078 
2079  source.append(" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1]; \n");
2080  source.append(" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1]; \n");
2081 
2082  // s_cl_helper is binary and zero for an internal node and 1 for a
2083  // root node of a tree corresponding to a block
2084  // s_cl_blocking contains the number of nodes in each sub-tree at each
2085  // iteration, the data has to be kept to compute the total number of
2086  // eigenvalues per block that, in turn, is needed to efficiently
2087  // write out data in the second step
2088  source.append(" if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1)) \n");
2089  source.append(" { \n");
2090 
2091  // check how many childs are non terminated
2092  source.append(" if (s_cl_helper[ai - 1] == 1) \n");
2093  source.append(" { \n");
2094  // mark as terminated
2095  source.append(" s_cl_helper[bi] = 1; \n");
2096  source.append(" } \n");
2097  source.append(" else if (s_cl_helper[bi] == 1) \n");
2098  source.append(" { \n");
2099  // mark as terminated
2100  source.append(" s_cl_helper[ai - 1] = 1; \n");
2101  source.append(" } \n");
2102  source.append(" else \n"); // both childs are non-terminated
2103  source.append(" { \n");
2104 
2105  source.append(" unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1]; \n");
2106 
2107  source.append(" if (temp > (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2)) \n");
2108  source.append(" { \n");
2109 
2110  // the two child trees have to form separate blocks, terminate trees
2111  source.append(" s_cl_helper[ai - 1] = 1; \n");
2112  source.append(" s_cl_helper[bi] = 1; \n");
2113  source.append(" } \n");
2114  source.append(" else \n");
2115  source.append(" { \n");
2116  // build up tree by joining subtrees
2117  source.append(" s_cl_blocking[bi] = temp; \n");
2118  source.append(" s_cl_blocking[ai - 1] = 0; \n");
2119  source.append(" } \n");
2120  source.append(" } \n");
2121  source.append(" } \n"); // end s_cl_helper update
2122  source.append(" } \n");
2123  source.append(" offset <<= 1; \n");
2124  source.append(" } \n");
2125 
2126 
2127  // traverse down tree, this only for stream compaction, not for block
2128  // construction
2129  source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
2130  source.append(" { \n");
2131  source.append(" offset >>= 1; \n");
2132  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2133  //
2134  source.append(" if (tid < (d-1)) \n");
2135  source.append(" { \n");
2136  source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
2137  source.append(" unsigned int bi = ai + (offset >> 1); \n");
2138  source.append(" s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai]; \n");
2139  source.append(" s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai]; \n");
2140  source.append(" } \n");
2141  source.append(" } \n");
2142  source.append(" } \n");
2143  }
2144 
2146  // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
2147  //
2148  // g_d diagonal elements in global memory
2149  // g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0
2150  // n size of matrix
2151  // lg lower bound of input interval (e.g. Gerschgorin interval)
2152  // ug upper bound of input interval (e.g. Gerschgorin interval)
2153  // lg_eig_count number of eigenvalues that are smaller than \a lg
2154  // lu_eig_count number of eigenvalues that are smaller than \a lu
2155  // epsilon desired accuracy of eigenvalues to compute
2157 
2158  template <typename StringType>
2159  void generate_bisect_kernel_bisectKernelLarge(StringType & source, std::string const & numeric_string)
2160  {
2161  source.append(" __kernel \n");
2162  source.append(" void \n");
2163  source.append(" bisectKernelLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n");
2164  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
2165  source.append(" const unsigned int n, \n");
2166  source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
2167  source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
2168  source.append(" const unsigned int lg_eig_count, \n");
2169  source.append(" const unsigned int ug_eig_count, \n");
2170  source.append(" "); source.append(numeric_string); source.append(" epsilon, \n");
2171  source.append(" __global unsigned int *g_num_one, \n");
2172  source.append(" __global unsigned int *g_num_blocks_mult, \n");
2173  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_one, \n");
2174  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_one, \n");
2175  source.append(" __global unsigned int *g_pos_one, \n");
2176  source.append(" __global "); source.append(numeric_string); source.append(" *g_left_mult, \n");
2177  source.append(" __global "); source.append(numeric_string); source.append(" *g_right_mult, \n");
2178  source.append(" __global unsigned int *g_left_count_mult, \n");
2179  source.append(" __global unsigned int *g_right_count_mult, \n");
2180  source.append(" __global unsigned int *g_blocks_mult, \n");
2181  source.append(" __global unsigned int *g_blocks_mult_sum \n");
2182  source.append(" ) \n");
2183  source.append(" { \n");
2184  source.append(" g_s = g_s + 1; \n");
2185  source.append(" uint glb_id = get_global_id(0); \n");
2186  source.append(" uint grp_id = get_group_id(0); \n");
2187  source.append(" uint grp_nm = get_num_groups(0); \n");
2188  source.append(" uint lcl_id = get_local_id(0); \n");
2189  source.append(" uint lcl_sz = get_local_size(0); \n");
2190 
2191  source.append(" const unsigned int tid = lcl_id; \n");
2192 
2193  // intervals (store left and right because the subdivision tree is in general
2194  // not dense
2195  source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2196  source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2197 
2198  // number of eigenvalues that are smaller than s_left / s_right
2199  // (correspondence is realized via indices)
2200  source.append(" __local unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2201  source.append(" __local unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2202 
2203  // helper for stream compaction
2204  source.append(" __local unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2205 
2206  // state variables for whole block
2207  // if 0 then compaction of second chunk of child intervals is not necessary
2208  // (because all intervals had exactly one non-dead child)
2209  source.append(" __local unsigned int compact_second_chunk; \n");
2210  // if 1 then all threads are converged
2211  source.append(" __local unsigned int all_threads_converged; \n");
2212 
2213  // number of currently active threads
2214  source.append(" __local unsigned int num_threads_active; \n");
2215 
2216  // number of threads to use for stream compaction
2217  source.append(" __local unsigned int num_threads_compaction; \n");
2218 
2219  // helper for exclusive scan
2220  source.append(" __local unsigned short *s_compaction_list_exc = s_compaction_list + 1; \n");
2221 
2222  // variables for currently processed interval
2223  // left and right limit of active interval
2224  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
2225  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
2226  source.append(" unsigned int left_count = 0; \n");
2227  source.append(" unsigned int right_count = 0; \n");
2228  // midpoint of active interval
2229  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
2230  // number of eigenvalues smaller then mid
2231  source.append(" unsigned int mid_count = 0; \n");
2232  // helper for stream compaction (tracking of threads generating second child)
2233  source.append(" unsigned int is_active_second = 0; \n");
2234 
2235  // initialize lists
2236  source.append(" s_compaction_list[tid] = 0; \n");
2237  source.append(" s_left[tid] = 0; \n");
2238  source.append(" s_right[tid] = 0; \n");
2239  source.append(" s_left_count[tid] = 0; \n");
2240  source.append(" s_right_count[tid] = 0; \n");
2241 
2242  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2243 
2244  // set up initial configuration
2245  source.append(" if (0 == tid) \n");
2246  source.append(" { \n");
2247 
2248  source.append(" s_left[0] = lg; \n");
2249  source.append(" s_right[0] = ug; \n");
2250  source.append(" s_left_count[0] = lg_eig_count; \n");
2251  source.append(" s_right_count[0] = ug_eig_count; \n");
2252 
2253  source.append(" compact_second_chunk = 0; \n");
2254  source.append(" num_threads_active = 1; \n");
2255 
2256  source.append(" num_threads_compaction = 1; \n");
2257 
2258  source.append(" all_threads_converged = 1; \n");
2259  source.append(" } \n");
2260 
2261  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2262 
2263  // for all active threads read intervals from the last level
2264  // the number of (worst case) active threads per level l is 2^l
2265  // determine coarse intervals. On these intervals the kernel for one or for multiple eigenvalues
2266  // will be executed in the second step
2267  source.append(" while( true ) \n");
2268  source.append(" { \n");
2269  source.append(" s_compaction_list[tid] = 0; \n");
2270  source.append(" s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
2271  source.append(" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
2272  source.append(" subdivideActiveIntervalShort(tid, s_left, s_right, s_left_count, s_right_count, \n");
2273  source.append(" num_threads_active, \n");
2274  source.append(" &left, &right, &left_count, &right_count, \n");
2275  source.append(" &mid, &all_threads_converged); \n");
2276 
2277  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2278 
2279  // check if done
2280  source.append(" if (1 == all_threads_converged) \n");
2281  source.append(" { \n");
2282  source.append(" break; \n");
2283  source.append(" } \n");
2284 
2285  // compute number of eigenvalues smaller than mid
2286  // use all threads for reading the necessary matrix data from global
2287  // memory
2288  // use s_left and s_right as scratch space for diagonal and
2289  // superdiagonal of matrix
2290  source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
2291  source.append(" mid, lcl_id, \n");
2292  source.append(" num_threads_active, \n");
2293  source.append(" s_left, s_right, \n");
2294  source.append(" (left == right)); \n");
2295 
2296  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2297 
2298  // store intervals
2299  // for all threads store the first child interval in a continuous chunk of
2300  // memory, and the second child interval -- if it exists -- in a second
2301  // chunk; it is likely that all threads reach convergence up to
2302  // \a epsilon at the same level; furthermore, for higher level most / all
2303  // threads will have only one child, storing the first child compactly will
2304  // (first) avoid to perform a compaction step on the first chunk, (second)
2305  // make it for higher levels (when all threads / intervals have
2306  // exactly one child) unnecessary to perform a compaction of the second
2307  // chunk
2308  source.append(" if (tid < num_threads_active) \n");
2309  source.append(" { \n");
2310 
2311  source.append(" if (left != right) \n");
2312  source.append(" { \n");
2313 
2314  // store intervals
2315  source.append(" storeNonEmptyIntervalsLarge(tid, num_threads_active, \n");
2316  source.append(" s_left, s_right, \n");
2317  source.append(" s_left_count, s_right_count, \n");
2318  source.append(" left, mid, right, \n");
2319  source.append(" left_count, mid_count, right_count, \n");
2320  source.append(" epsilon, &compact_second_chunk, \n");
2321  source.append(" s_compaction_list_exc, \n");
2322  source.append(" &is_active_second); \n");
2323  source.append(" } \n");
2324  source.append(" else \n");
2325  source.append(" { \n");
2326 
2327  // re-write converged interval (has to be stored again because s_left
2328  // and s_right are used as scratch space for
2329  // computeNumSmallerEigenvalsLarge()
2330  source.append(" s_left[tid] = left; \n");
2331  source.append(" s_right[tid] = left; \n");
2332  source.append(" s_left_count[tid] = left_count; \n");
2333  source.append(" s_right_count[tid] = right_count; \n");
2334 
2335  source.append(" is_active_second = 0; \n");
2336  source.append(" } \n");
2337  source.append(" } \n");
2338 
2339  // necessary so that compact_second_chunk is up-to-date
2340  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2341 
2342  // perform compaction of chunk where second children are stored
2343  // scan of (num_threads_active / 2) elements, thus at most
2344  // (num_threads_active / 4) threads are needed
2345  source.append(" if (compact_second_chunk > 0) \n");
2346  source.append(" { \n");
2347 
2348  // create indices for compaction
2349  source.append(" createIndicesCompactionShort(s_compaction_list_exc, num_threads_compaction); \n");
2350  source.append(" } \n");
2351  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2352  source.append(" \n");
2353  source.append(" if (compact_second_chunk > 0) \n");
2354  source.append(" { \n");
2355  source.append(" compactIntervalsShort(s_left, s_right, s_left_count, s_right_count, \n");
2356  source.append(" mid, right, mid_count, right_count, \n");
2357  source.append(" s_compaction_list, num_threads_active, \n");
2358  source.append(" is_active_second); \n");
2359  source.append(" } \n");
2360 
2361  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2362 
2363  // update state variables
2364  source.append(" if (0 == tid) \n");
2365  source.append(" { \n");
2366 
2367  // update number of active threads with result of reduction
2368  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
2369  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
2370 
2371  source.append(" compact_second_chunk = 0; \n");
2372  source.append(" all_threads_converged = 1; \n");
2373  source.append(" } \n");
2374  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2375  source.append(" if (num_threads_compaction > lcl_sz) \n");
2376  source.append(" { \n");
2377  source.append(" break; \n");
2378  source.append(" } \n");
2379  source.append(" } \n");
2380  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2381 
2382  // generate two lists of intervals; one with intervals that contain one
2383  // eigenvalue (or are converged), and one with intervals that need further
2384  // subdivision
2385 
2386  // perform two scans in parallel
2387 
2388  source.append(" unsigned int left_count_2; \n");
2389  source.append(" unsigned int right_count_2; \n");
2390 
2391  source.append(" unsigned int tid_2 = tid + lcl_sz; \n");
2392 
2393  // cache in per thread registers so that s_left_count and s_right_count
2394  // can be used for scans
2395  source.append(" left_count = s_left_count[tid]; \n");
2396  source.append(" right_count = s_right_count[tid]; \n");
2397 
2398  // some threads have to cache data for two intervals
2399  source.append(" if (tid_2 < num_threads_active) \n");
2400  source.append(" { \n");
2401  source.append(" left_count_2 = s_left_count[tid_2]; \n");
2402  source.append(" right_count_2 = s_right_count[tid_2]; \n");
2403  source.append(" } \n");
2404 
2405  // compaction list for intervals containing one and multiple eigenvalues
2406  // do not affect first element for exclusive scan
2407  source.append(" __local unsigned short *s_cl_one = s_left_count + 1; \n");
2408  source.append(" __local unsigned short *s_cl_mult = s_right_count + 1; \n");
2409 
2410  // compaction list for generating blocks of intervals containing multiple
2411  // eigenvalues
2412  source.append(" __local unsigned short *s_cl_blocking = s_compaction_list_exc; \n");
2413  // helper compaction list for generating blocks of intervals
2414  source.append(" __local unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
2415 
2416  source.append(" if (0 == tid) \n");
2417  source.append(" { \n");
2418  // set to 0 for exclusive scan
2419  source.append(" s_left_count[0] = 0; \n");
2420  source.append(" s_right_count[0] = 0; \n");
2421  source.append(" \n");
2422  source.append(" } \n");
2423 
2424  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2425 
2426  // flag if interval contains one or multiple eigenvalues
2427  source.append(" unsigned int is_one_lambda = 0; \n");
2428  source.append(" unsigned int is_one_lambda_2 = 0; \n");
2429 
2430  // number of eigenvalues in the interval
2431  source.append(" unsigned int multiplicity = right_count - left_count; \n");
2432  source.append(" is_one_lambda = (1 == multiplicity); \n");
2433 
2434  source.append(" s_cl_one[tid] = is_one_lambda; \n");
2435  source.append(" s_cl_mult[tid] = (! is_one_lambda); \n");
2436 
2437  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
2438  source.append(" s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity; \n");
2439  source.append(" s_cl_helper[tid] = 0; \n");
2440 
2441  source.append(" if (tid_2 < num_threads_active) \n");
2442  source.append(" { \n");
2443 
2444  source.append(" unsigned int multiplicity = right_count_2 - left_count_2; \n");
2445  source.append(" is_one_lambda_2 = (1 == multiplicity); \n");
2446 
2447  source.append(" s_cl_one[tid_2] = is_one_lambda_2; \n");
2448  source.append(" s_cl_mult[tid_2] = (! is_one_lambda_2); \n");
2449 
2450  // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
2451  source.append(" s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity; \n");
2452  source.append(" s_cl_helper[tid_2] = 0; \n");
2453  source.append(" } \n");
2454  source.append(" else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1)) \n");
2455  source.append(" { \n");
2456 
2457  // clear
2458  source.append(" s_cl_blocking[tid_2] = 0; \n");
2459  source.append(" s_cl_helper[tid_2] = 0; \n");
2460  source.append(" } \n");
2461 
2462 
2463  source.append(" scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction, \n");
2464  source.append(" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper); \n");
2465  source.append(" \n");
2466  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2467 
2468  source.append(" scanSumBlocks(tid, tid_2, num_threads_active, \n");
2469  source.append(" num_threads_compaction, s_cl_blocking, s_cl_helper); \n");
2470 
2471  // end down sweep of scan
2472  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2473 
2474  source.append(" unsigned int c_block_iend = 0; \n");
2475  source.append(" unsigned int c_block_iend_2 = 0; \n");
2476  source.append(" unsigned int c_sum_block = 0; \n");
2477  source.append(" unsigned int c_sum_block_2 = 0; \n");
2478 
2479  // for each thread / interval that corresponds to root node of interval block
2480  // store start address of block and total number of eigenvalues in all blocks
2481  // before this block (particular thread is irrelevant, constraint is to
2482  // have a subset of threads so that one and only one of them is in each
2483  // interval)
2484  source.append(" if (1 == s_cl_helper[tid]) \n");
2485  source.append(" { \n");
2486 
2487  source.append(" c_block_iend = s_cl_mult[tid] + 1; \n");
2488  source.append(" c_sum_block = s_cl_blocking[tid]; \n");
2489  source.append(" } \n");
2490 
2491  source.append(" if (1 == s_cl_helper[tid_2]) \n");
2492  source.append(" { \n");
2493 
2494  source.append(" c_block_iend_2 = s_cl_mult[tid_2] + 1; \n");
2495  source.append(" c_sum_block_2 = s_cl_blocking[tid_2]; \n");
2496  source.append(" } \n");
2497 
2498  source.append(" scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction, \n");
2499  source.append(" s_cl_blocking, s_cl_helper); \n");
2500 
2501 
2502  // finished second scan for s_cl_blocking
2503  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2504 
2505  // determine the global results
2506  source.append(" __local unsigned int num_blocks_mult; \n");
2507  source.append(" __local unsigned int num_mult; \n");
2508  source.append(" __local unsigned int offset_mult_lambda; \n");
2509 
2510  source.append(" if (0 == tid) \n");
2511  source.append(" { \n");
2512 
2513  source.append(" num_blocks_mult = s_cl_blocking[num_threads_active - 1]; \n");
2514  source.append(" offset_mult_lambda = s_cl_one[num_threads_active - 1]; \n");
2515  source.append(" num_mult = s_cl_mult[num_threads_active - 1]; \n");
2516 
2517  source.append(" *g_num_one = offset_mult_lambda; \n");
2518  source.append(" *g_num_blocks_mult = num_blocks_mult; \n");
2519  source.append(" } \n");
2520 
2521  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2522 
2523  source.append(" "); source.append(numeric_string); source.append(" left_2, right_2; \n");
2524  source.append(" --s_cl_one; \n");
2525  source.append(" --s_cl_mult; \n");
2526  source.append(" --s_cl_blocking; \n");
2527  source.append(" \n");
2528  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2529  source.append(" compactStreamsFinal(tid, tid_2, num_threads_active, &offset_mult_lambda, \n");
2530  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
2531  source.append(" s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper, \n");
2532  source.append(" is_one_lambda, is_one_lambda_2, \n");
2533  source.append(" &left, &right, &left_2, &right_2, \n");
2534  source.append(" &left_count, &right_count, &left_count_2, &right_count_2, \n");
2535  source.append(" c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2 \n");
2536  source.append(" ); \n");
2537 
2538  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2539 
2540  // final adjustment before writing out data to global memory
2541  source.append(" if (0 == tid) \n");
2542  source.append(" { \n");
2543  source.append(" s_cl_blocking[num_blocks_mult] = num_mult; \n");
2544  source.append(" s_cl_helper[0] = 0; \n");
2545  source.append(" } \n");
2546 
2547  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
2548 
2549  // write to global memory
2550  source.append(" writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult, \n");
2551  source.append(" g_left_one, g_right_one, g_pos_one, \n");
2552  source.append(" g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult, \n");
2553  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
2554  source.append(" g_blocks_mult, g_blocks_mult_sum, \n");
2555  source.append(" s_compaction_list, s_cl_helper, offset_mult_lambda); \n");
2556  source.append(" \n");
2557 
2558  source.append(" } \n");
2559  }
2560 
2561  // main kernel class
2565  template <class NumericT>
2567  {
2568  static std::string program_name()
2569  {
2570  return viennacl::ocl::type_to_string<NumericT>::apply() + "_bisect_kernel";
2571  }
2572 
2573  static void init(viennacl::ocl::context & ctx)
2574  {
2576  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
2577 
2578  static std::map<cl_context, bool> init_done;
2579  if (!init_done[ctx.handle().get()])
2580  {
2581  std::string source;
2582  source.reserve(8192);
2583 
2584  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
2585 
2586  // only generate for floating points (forces error for integers)
2587  if (numeric_string == "float" || numeric_string == "double")
2588  {
2589  //functions used from bisect_util.cpp
2591  generate_bisect_kernel_floorPow2(source, numeric_string);
2592  generate_bisect_kernel_ceilPow2(source, numeric_string);
2593  generate_bisect_kernel_computeMidpoint(source, numeric_string);
2594 
2595  generate_bisect_kernel_storeInterval(source, numeric_string);
2596  generate_bisect_kernel_storeIntervalShort(source, numeric_string);
2597 
2598  generate_bisect_kernel_computeNumSmallerEigenvals(source, numeric_string);
2600 
2601  generate_bisect_kernel_storeNonEmptyIntervals(source, numeric_string);
2603 
2606 
2607  generate_bisect_kernel_compactIntervals(source, numeric_string);
2608  generate_bisect_kernel_compactIntervalsShort(source, numeric_string);
2609 
2610  generate_bisect_kernel_storeIntervalConverged(source, numeric_string);
2612 
2613  generate_bisect_kernel_subdivideActiveInterval(source, numeric_string);
2615 
2616  generate_bisect_kernel_bisectKernel(source, numeric_string);
2619 
2620 
2621  generate_bisect_kernel_writeToGmem(source, numeric_string);
2622  generate_bisect_kernel_compactStreamsFinal(source, numeric_string);
2626  generate_bisect_kernel_bisectKernelLarge(source, numeric_string);
2627 
2628 
2629  }
2630 
2631  std::string prog_name = program_name();
2632  #ifdef VIENNACL_BUILD_INFO
2633  std::cout << "Creating program " << prog_name << std::endl;
2634  #endif
2635  ctx.add_program(source, prog_name);
2636  init_done[ctx.handle().get()] = true;
2637  } //if
2638  } //init
2639  };
2640 }
2641 }
2642 }
2643 }
2644 
2645 #endif // #ifndef _BISECT_KERNEL_LARGE_H_
void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1317
void generate_bisect_kernel_writeToGmem(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1673
static void init(viennacl::ocl::context &ctx)
Definition: bisect.hpp:2573
void generate_bisect_kernel_bisectKernel(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1085
void generate_bisect_kernel_subdivideActiveInterval(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:971
void generate_bisect_kernel_floorPow2(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:66
void generate_bisect_kernel_bisectKernelLarge(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:2159
Implements a OpenCL platform within ViennaCL.
void generate_bisect_kernel_createIndicesCompactionShort(StringType &source)
Definition: bisect.hpp:667
Various little tools used here and there in ViennaCL.
void generate_bisect_kernel_compactStreamsFinal(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1765
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
void generate_bisect_kernel_compactIntervals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:743
Provides OpenCL-related utilities.
void generate_bisect_kernel_compactIntervalsShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:783
void generate_bisect_kernel_storeIntervalConvergedShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:890
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
Common implementations shared by OpenCL-based operations.
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
void generate_bisect_kernel_createIndicesCompaction(StringType &source)
Definition: bisect.hpp:604
const OCL_TYPE & get() const
Definition: handle.hpp:191
void generate_bisect_kernel_ceilPow2(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:99
void generate_bisect_kernel_scanCompactBlocksStartAddress(StringType &source)
Definition: bisect.hpp:1887
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:367
void generate_bisect_kernel_computeMidpoint(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:133
void generate_bisect_kernel_storeNonEmptyIntervals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:456
void generate_bisect_kernel_config(StringType &source)
Definition: bisect.hpp:50
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1023
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:295
void generate_bisect_kernel_storeIntervalShort(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:228
Representation of an OpenCL kernel in ViennaCL.
void generate_bisect_kernel_scanSumBlocks(StringType &source)
Definition: bisect.hpp:1966
void generate_bisect_kernel_scanInitial(StringType &source)
Definition: bisect.hpp:2042
void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:531
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_bisect_kernel_storeInterval(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:178
Main kernel class for the generation of the bisection kernels and utilities.
Definition: bisect.hpp:2566
void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:1555
void generate_bisect_kernel_storeIntervalConverged(StringType &source, std::string const &numeric_string)
Definition: bisect.hpp:822