ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
bisect_kernel_small.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
30 // includes, project
33 // additional kernel
35 
36 namespace viennacl
37 {
38 namespace linalg
39 {
40 namespace cuda
41 {
42 
/** Bisection kernel: computes all eigenvalues of a real, symmetric, tridiagonal
 *  matrix that is small enough to be processed by one thread block.
 *
 *  Launch expectations (NOTE(review): inferred from the shared-array sizing and
 *  the single-block interval tree below — confirm against the host-side caller):
 *  a single block with blockDim.x <= VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX
 *  and blockDim.x >= n, since every thread owns at most one interval and the
 *  final write-out is guarded by (threadIdx.x < n).
 *
 *  Parameters:
 *    g_d, g_s       - diagonal / superdiagonal of the matrix in global memory
 *                     (they are staged into shared scratch inside
 *                     computeNumSmallerEigenvals)
 *    n              - matrix dimension; also the number of eigenvalues produced
 *    g_left, g_right- output interval limits per eigenvalue (only g_left is
 *                     written here; converged intervals have left == right)
 *    g_left_count, g_right_count - output eigenvalue counts per interval limit
 *                     (only g_left_count is written here; it fixes global order)
 *    lg, ug         - lower / upper bound of the initial search interval
 *                     containing all eigenvalues (presumably a Gerschgorin
 *                     bound — TODO confirm with the host wrapper)
 *    lg_eig_count, ug_eig_count - number of eigenvalues smaller than lg / ug
 *    epsilon        - desired precision for interval convergence
 *
 *  NOTE(review): this listing is a Doxygen rendering; source lines 70-72 and 81
 *  were dropped by the extraction, so the declarations of the shared arrays
 *  s_left, s_right and s_compaction_list do not appear below even though the
 *  code uses them. The leading integer on each line is the Doxygen line number,
 *  not part of the program text.
 */
58 template<typename NumericT>
59 __global__
60 void
61 bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
62  NumericT * g_left, NumericT *g_right,
63  unsigned int *g_left_count, unsigned int *g_right_count,
64  const NumericT lg, const NumericT ug,
65  const unsigned int lg_eig_count, const unsigned int ug_eig_count,
66  NumericT epsilon
67  )
68 {
69  // intervals (store left and right because the subdivision tree is in general
70  // not dense)
// NOTE(review): extraction gap — the original lines 70-72 declared the shared
// arrays s_left and s_right (interval limits) here.
73 
74  // number of eigenvalues that are smaller than s_left / s_right
75  // (correspondence is realized via indices)
76  __shared__ unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
77  __shared__ unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
78 
79  // helper for stream compaction
80  __shared__ unsigned int
// NOTE(review): extraction gap — the original line 81 completed this
// declaration with the s_compaction_list array declarator.
82 
83  // state variables for whole block
84  // if 0 then compaction of second chunk of child intervals is not necessary
85  // (because all intervals had exactly one non-dead child)
86  __shared__ unsigned int compact_second_chunk;
87  __shared__ unsigned int all_threads_converged;
88 
89  // number of currently active threads
90  __shared__ unsigned int num_threads_active;
91 
92  // number of threads to use for stream compaction
93  __shared__ unsigned int num_threads_compaction;
94 
95  // helper for exclusive scan
96  unsigned int *s_compaction_list_exc = s_compaction_list + 1;
97 
98 
99  // variables for currently processed interval
100  // left and right limit of active interval
101  NumericT left = 0.0f;
102  NumericT right = 0.0f;
103  unsigned int left_count = 0;
104  unsigned int right_count = 0;
105  // midpoint of active interval
106  NumericT mid = 0.0f;
107  // number of eigenvalues smaller than mid
108  unsigned int mid_count = 0;
109  // affected from compaction
110  unsigned int is_active_second = 0;
111 
// zero the shared arrays cooperatively (one slot per thread) before use;
// shared memory is uninitialized on entry
112  s_compaction_list[threadIdx.x] = 0;
113  s_left[threadIdx.x] = 0;
114  s_right[threadIdx.x] = 0;
115  s_left_count[threadIdx.x] = 0;
116  s_right_count[threadIdx.x] = 0;
117 
118  __syncthreads();
119 
120  // set up initial configuration
// the root of the interval tree: one interval [lg, ug] covering all
// eigenvalues, processed by a single active thread at level 0
121  if (0 == threadIdx.x)
122  {
123  s_left[0] = lg;
124  s_right[0] = ug;
125  s_left_count[0] = lg_eig_count;
126  s_right_count[0] = ug_eig_count;
127 
128  compact_second_chunk = 0;
129  num_threads_active = 1;
130 
131  num_threads_compaction = 1;
132  }
133 
134  // for all active threads read intervals from the last level
135  // the number of (worst case) active threads per level l is 2^l
136  while (true)
137  {
138 
// optimistically assume convergence; subdivideActiveIntervalMulti clears
// the flag for any interval still wider than the tolerance
139  all_threads_converged = 1;
140  __syncthreads();
141 
142  is_active_second = 0;
143  subdivideActiveIntervalMulti(threadIdx.x,
144  s_left, s_right, s_left_count, s_right_count,
145  num_threads_active,
146  left, right, left_count, right_count,
147  mid, all_threads_converged);
148 
149  __syncthreads();
150 
151  // check if done
// uniform across the block (shared flag read after the barrier), so this
// break does not diverge and the barriers below stay collective
152  if (1 == all_threads_converged)
153  {
154  break;
155  }
156 
157  __syncthreads();
158 
159  // compute number of eigenvalues smaller than mid
160  // use all threads for reading the necessary matrix data from global
161  // memory
162  // use s_left and s_right as scratch space for diagonal and
163  // superdiagonal of matrix
164  mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
165  threadIdx.x, num_threads_active,
166  s_left, s_right,
167  (left == right));
168 
169  __syncthreads();
170 
171  // store intervals
172  // for all threads store the first child interval in a continuous chunk of
173  // memory, and the second child interval -- if it exists -- in a second
174  // chunk; it is likely that all threads reach convergence up to
175  // \a epsilon at the same level; furthermore, for higher level most / all
176  // threads will have only one child, storing the first child compactly will
177  // (first) avoid to perform a compaction step on the first chunk, (second)
178  // make it for higher levels (when all threads / intervals have
179  // exactly one child) unnecessary to perform a compaction of the second
180  // chunk
181  if (threadIdx.x < num_threads_active)
182  {
183 
184  if (left != right)
185  {
186 
187  // store intervals
188  storeNonEmptyIntervals(threadIdx.x, num_threads_active,
189  s_left, s_right, s_left_count, s_right_count,
190  left, mid, right,
191  left_count, mid_count, right_count,
192  epsilon, compact_second_chunk,
193  s_compaction_list_exc,
194  is_active_second);
195  }
196  else
197  {
198 
// degenerate interval (left == right): already an eigenvalue, keep it as-is
199  storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
200  left, mid, right,
201  left_count, mid_count, right_count,
202  s_compaction_list_exc, compact_second_chunk,
203  num_threads_active,
204  is_active_second);
205  }
206  }
207 
208  // necessary so that compact_second_chunk is up-to-date
209  __syncthreads();
210 
211  // perform compaction of chunk where second children are stored
212  // scan of (num_threads_active / 2) elements, thus at most
213  // (num_threads_active / 4) threads are needed
214  if (compact_second_chunk > 0)
215  {
216 
217  createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
218 
219  compactIntervals(s_left, s_right, s_left_count, s_right_count,
220  mid, right, mid_count, right_count,
221  s_compaction_list, num_threads_active,
222  is_active_second);
223  }
224 
225  __syncthreads();
226 
227  if (0 == threadIdx.x)
228  {
229 
230  // update number of active threads with result of reduction
// s_compaction_list[num_threads_active] holds the total number of second
// children produced this level (inclusive end of the scan)
231  num_threads_active += s_compaction_list[num_threads_active];
232 
// round up to the next power of two so the scan in the next compaction
// step has a regular tree shape
233  num_threads_compaction = ceilPow2(num_threads_active);
234 
235  compact_second_chunk = 0;
236  }
237 
238  __syncthreads();
239 
240  }
241 
242  __syncthreads();
243 
244  // write resulting intervals to global mem
245  // for all threads write if they have been converged to an eigenvalue to
246  // a separate array
247 
248  // at most n valid intervals
249  if (threadIdx.x < n)
250  {
251 
252  // intervals converged so left and right limit are identical
253  g_left[threadIdx.x] = s_left[threadIdx.x];
254  // left count is sufficient to have global order
255  g_left_count[threadIdx.x] = s_left_count[threadIdx.x];
256  }
257 }
258 } // namespace cuda
259 } // namespace linalg
260 } // namespace viennacl
261 #endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
__global__ void bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n, NumericT *g_left, NumericT *g_right, unsigned int *g_left_count, unsigned int *g_right_count, const NumericT lg, const NumericT ug, const unsigned int lg_eig_count, const unsigned int ug_eig_count, NumericT epsilon)
Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix.
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
__device__ unsigned int computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
__device__ void storeNonEmptyIntervals(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT mid, NumericT right, const S left_count, const S mid_count, const S right_count, NumericT precision, unsigned int &compact_second_chunk, T *s_compaction_list_exc, unsigned int &is_active_second)
Store all non-empty intervals resulting from the subdivision of the interval currently processed by t...
Utility functions.
__device__ int ceilPow2(int n)
Definition: bisect_util.hpp:66
Global configuration parameters.
float NumericT
Definition: bisect.cpp:40
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
#define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX
Definition: config.hpp:38
__device__ void subdivideActiveIntervalMulti(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ void storeIntervalConverged(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT &left, NumericT &mid, NumericT &right, S &left_count, S &mid_count, S &right_count, T *s_compaction_list_exc, unsigned int &compact_second_chunk, const unsigned int num_threads_active, unsigned int &is_active_second)