ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
bisect_kernel_large_multi.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
30 /* Perform the second step of the bisection algorithm for large matrices, for
31  * intervals that, after the first step, still contain more than one eigenvalue
32  */
33 
34 // includes, project
37 // additional kernel
39 
40 namespace viennacl
41 {
42 namespace linalg
43 {
44 namespace cuda
45 {
/** @brief Perform the second step of the bisection algorithm for large matrices:
 *         refine the blocks of intervals that, after the first step, still
 *         contain more than one eigenvalue.
 *
 *  Launch layout: one thread block per entry of @a blocks_mult (selected by
 *  blockIdx.x). blockDim.x must not exceed VIENNACL_BISECT_MAX_THREADS_BLOCK,
 *  since the shared-memory arrays are sized for that maximum.
 *
 * @param g_d              diagonal elements of the tridiagonal matrix (global memory)
 * @param g_s              superdiagonal elements of the tridiagonal matrix (global memory)
 * @param n                matrix size
 * @param blocks_mult      start/end indices (per thread block) of the blocks of
 *                         multi-eigenvalue intervals; entry i and i+1 delimit block i
 * @param blocks_mult_sum  output offset into g_lambda / g_pos for each block
 * @param g_left           left interval limits from the first step
 * @param g_right          right interval limits from the first step
 * @param g_left_count     number of eigenvalues smaller than the left limits
 * @param g_right_count    number of eigenvalues smaller than the right limits
 * @param g_lambda         output: computed eigenvalue approximations
 * @param g_pos            output: index (eigenvalue count) of each eigenvalue
 * @param precision        desired absolute accuracy of the eigenvalues
 */
template<typename NumericT>
__global__
void
bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                unsigned int *blocks_mult,
                                unsigned int *blocks_mult_sum,
                                NumericT *g_left, NumericT *g_right,
                                unsigned int *g_left_count,
                                unsigned int *g_right_count,
                                NumericT *g_lambda, unsigned int *g_pos,
                                NumericT precision
                               )
{
  const unsigned int tid = threadIdx.x;

  // left and right limits of the intervals (also reused as scratch space when
  // counting eigenvalues, see computeNumSmallerEigenvalsLarge below)
  __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // number of eigenvalues smaller than the interval limits
  __shared__ unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
  __shared__ unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];

  // helper array for stream compaction of the second chunk of child intervals
  __shared__ unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
  // shifted alias used for the exclusive scan: element 0 of the underlying
  // array stays 0 and serves as the scan's base value
  unsigned int *s_compaction_list_exc = s_compaction_list + 1;

  // flag: all threads of the block have converged
  __shared__ unsigned int all_threads_converged;
  // number of currently active threads (== number of active intervals)
  __shared__ unsigned int num_threads_active;
  // number of threads to employ for compaction (next power of two)
  __shared__ unsigned int num_threads_compaction;
  // flag: the second chunk of child intervals has to be compacted
  __shared__ unsigned int compact_second_chunk;

  // parameters of the block of intervals processed by this thread block
  __shared__ unsigned int c_block_start;
  __shared__ unsigned int c_block_end;
  __shared__ unsigned int c_block_offset_output;

  // midpoint of the currently active interval of this thread
  NumericT mid = 0.0f;
  // number of eigenvalues smaller than mid
  unsigned int mid_count = 0;
  // current interval parameters of this thread
  NumericT left = 0.0f;
  NumericT right = 0.0f;
  unsigned int left_count = 0;
  unsigned int right_count = 0;
  // compaction helper: flags whether this thread produced a second child interval
  unsigned int is_active_second = 0;


  __syncthreads();
  // initialize common start conditions (thread 0 only; published to the block
  // by the barrier below)
  if (0 == tid)
  {
    c_block_start = blocks_mult[blockIdx.x];
    c_block_end = blocks_mult[blockIdx.x + 1];
    c_block_offset_output = blocks_mult_sum[blockIdx.x];

    num_threads_active = c_block_end - c_block_start;
    s_compaction_list[0] = 0;
    num_threads_compaction = ceilPow2(num_threads_active);

    all_threads_converged = 1;
    compact_second_chunk = 0;
  }

  // first chunk: sentinel value (overwritten below for active threads);
  // second chunk: cleared so stale data never survives compaction
  s_left_count [tid] = 42;
  s_right_count[tid] = 42;
  s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
  s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

  __syncthreads();


  // read interval data for this block into shared memory
  if (tid < num_threads_active)
  {
    s_left[tid]        = g_left[c_block_start + tid];
    s_right[tid]       = g_right[c_block_start + tid];
    s_left_count[tid]  = g_left_count[c_block_start + tid];
    s_right_count[tid] = g_right_count[c_block_start + tid];
  }

  __syncthreads();

  // bisect until all threads converged
  while (true)
  {
    // clear the compaction list (each thread clears two slots plus the
    // shared top element; the last write is redundant across threads but harmless)
    s_compaction_list[threadIdx.x] = 0;
    s_compaction_list[threadIdx.x + blockDim.x] = 0;
    s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;

    // subdivide interval if currently active and not already converged;
    // resets all_threads_converged when any interval is still open
    subdivideActiveIntervalMulti(tid, s_left, s_right,
                                 s_left_count, s_right_count,
                                 num_threads_active,
                                 left, right, left_count, right_count,
                                 mid, all_threads_converged);
    __syncthreads();

    // stop if all eigenvalues have been found
    // (all_threads_converged is shared, so the break is taken uniformly and
    // the barriers below remain safe)
    if (1 == all_threads_converged)
    {
      break;
    }

    // compute number of eigenvalues smaller than mid for active and not
    // converged intervals; uses all threads for loading data from gmem and
    // s_left / s_right as scratch space to stage the loaded data in shared memory
    mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                mid, tid, num_threads_active,
                                                s_left, s_right,
                                                (left == right));

    __syncthreads();

    if (tid < num_threads_active)
    {
      // store child intervals produced by the subdivision
      if (left != right)
      {
        storeNonEmptyIntervals(tid, num_threads_active,
                               s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               precision, compact_second_chunk,
                               s_compaction_list_exc,
                               is_active_second);
      }
      else
      {
        // interval has collapsed to a point: store the converged eigenvalue
        storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
                               left, mid, right,
                               left_count, mid_count, right_count,
                               s_compaction_list_exc, compact_second_chunk,
                               num_threads_active,
                               is_active_second);
      }
    }

    __syncthreads();

    // compact the second chunk of intervals if any thread generated
    // two child intervals
    if (1 == compact_second_chunk)
    {
      createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
      compactIntervals(s_left, s_right, s_left_count, s_right_count,
                       mid, right, mid_count, right_count,
                       s_compaction_list, num_threads_active,
                       is_active_second);
    }

    __syncthreads();

    // update state variables for the next iteration (thread 0 only)
    if (0 == tid)
    {
      // the scan total at index num_threads_active is the number of new
      // second-child intervals appended this iteration
      num_threads_active += s_compaction_list[num_threads_active];
      num_threads_compaction = ceilPow2(num_threads_active);

      compact_second_chunk = 0;
      all_threads_converged = 1;
    }

    __syncthreads();

    // clear the exclusive-scan portion for the next iteration
    s_compaction_list_exc[threadIdx.x] = 0;
    s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;

    // safety net: more intervals than available threads cannot be processed
    // (num_threads_compaction is shared, so this break is uniform, too)
    if (num_threads_compaction > blockDim.x)
    {
      break;
    }


    __syncthreads();

  } // end until all threads converged

  // write results back to global memory; the converged intervals have
  // collapsed so s_left[tid] holds the eigenvalue and s_right_count[tid] its index
  if (tid < num_threads_active)
  {
    unsigned int addr = c_block_offset_output + tid;

    g_lambda[addr] = s_left[tid];
    g_pos[addr]    = s_right_count[tid];
  }
}
273 } // namespace cuda
274 } // namespace linalg
275 } // namespace viennacl
276 
277 #endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
#define VIENNACL_BISECT_MAX_THREADS_BLOCK
Definition: config.hpp:32
__device__ void storeNonEmptyIntervals(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT mid, NumericT right, const S left_count, const S mid_count, const S right_count, NumericT precision, unsigned int &compact_second_chunk, T *s_compaction_list_exc, unsigned int &is_active_second)
Store all non-empty intervals resulting from the subdivision of the interval currently processed by t...
Utility functions.
__device__ int ceilPow2(int n)
Definition: bisect_util.hpp:66
Global configuration parameters.
float NumericT
Definition: bisect.cpp:40
__global__ void bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, unsigned int *blocks_mult, unsigned int *blocks_mult_sum, NumericT *g_left, NumericT *g_right, unsigned int *g_left_count, unsigned int *g_right_count, NumericT *g_lambda, unsigned int *g_pos, NumericT precision)
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void subdivideActiveIntervalMulti(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ void storeIntervalConverged(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT &left, NumericT &mid, NumericT &right, S &left_count, S &mid_count, S &right_count, T *s_compaction_list_exc, unsigned int &compact_second_chunk, const unsigned int num_threads_active, unsigned int &is_active_second)
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)