1 #ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
56 frexp((
float)n, &exp);
57 return (1 << (exp - 1));
76 frexp((
float)n, &exp);
86 template<
typename NumericT>
96 mid = left + (right - left) * 0.5f;
100 mid = (left + right) * 0.5f;
121 template<
class S,
class T,
class NumericT>
126 T *s_left_count, T *s_right_count,
128 S left_count, S right_count,
131 s_left_count[addr] = left_count;
132 s_right_count[addr] = right_count;
136 NumericT t1 =
max(abs(left), abs(right)) * precision;
144 s_left[addr] = lambda;
145 s_right[addr] = lambda;
152 s_right[addr] = right;
174 template<
typename NumericT>
179 const unsigned int tid,
180 const unsigned int num_intervals_active,
182 unsigned int converged
187 unsigned int count = 0;
194 s_d[threadIdx.x] = *(g_d + threadIdx.x);
195 s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
201 if ((tid < num_intervals_active) && (0 == converged))
206 for (
unsigned int k = 0; k < n; ++k)
208 delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
209 count += (delta < 0) ? 1 : 0;
234 template<
typename NumericT>
239 const unsigned int tid,
240 const unsigned int num_intervals_active,
242 unsigned int converged
246 unsigned int count = 0;
248 unsigned int rem = n;
251 for (
unsigned int i = 0; i < n; i += blockDim.x)
257 if ((i + threadIdx.x) < n)
260 s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
261 s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
267 if (tid < num_intervals_active)
272 for (
unsigned int k = 0; k <
min(rem,blockDim.x); ++k)
274 delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
276 count += (delta < 0) ? 1 : 0;
306 template<
class S,
class T,
class NumericT>
310 const unsigned int num_threads_active,
312 T *s_left_count, T *s_right_count,
318 unsigned int &compact_second_chunk,
319 T *s_compaction_list_exc,
320 unsigned int &is_active_second)
324 if ((left_count != mid_count) && (mid_count != right_count))
328 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
329 left, mid, left_count, mid_count, precision);
333 is_active_second = 1;
334 s_compaction_list_exc[threadIdx.x] = 1;
335 compact_second_chunk = 1;
343 is_active_second = 0;
344 s_compaction_list_exc[threadIdx.x] = 0;
347 if (left_count != mid_count)
349 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
350 left, mid, left_count, mid_count, precision);
354 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
355 mid, right, mid_count, right_count, precision);
374 unsigned int num_threads_compaction)
377 unsigned int offset = 1;
378 const unsigned int tid = threadIdx.x;
383 for (
int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
391 unsigned int ai = offset*(2*tid+1)-1;
392 unsigned int bi = offset*(2*tid+2)-1;
394 s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
395 + s_compaction_list_exc[ai];
402 for (
int d = 2; d < num_threads_compaction; d <<= 1)
411 unsigned int ai = offset*(tid+1) - 1;
412 unsigned int bi = ai + (offset >> 1);
414 s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
415 + s_compaction_list_exc[ai];
437 template<
class T,
class NumericT>
441 T *s_left_count, T *s_right_count,
443 unsigned int mid_count,
unsigned int right_count,
444 T *s_compaction_list,
445 unsigned int num_threads_active,
446 unsigned int is_active_second)
448 const unsigned int tid = threadIdx.x;
452 if ((tid < num_threads_active) && (1 == is_active_second))
454 unsigned int addr_w = num_threads_active + s_compaction_list[tid];
455 s_left[addr_w] = mid;
456 s_right[addr_w] = right;
457 s_left_count[addr_w] = mid_count;
458 s_right_count[addr_w] = right_count;
462 template<
class T,
class S,
class NumericT>
466 T *s_left_count, T *s_right_count,
468 S &left_count, S &mid_count, S &right_count,
469 T *s_compaction_list_exc,
470 unsigned int &compact_second_chunk,
471 const unsigned int num_threads_active,
472 unsigned int &is_active_second)
474 const unsigned int tid = threadIdx.x;
475 const unsigned int multiplicity = right_count - left_count;
477 if (1 == multiplicity)
482 s_right[tid] = right;
483 s_left_count[tid] = left_count;
484 s_right_count[tid] = right_count;
488 is_active_second = 0;
489 s_compaction_list_exc[tid] = 0;
495 mid_count = left_count + (multiplicity >> 1);
499 s_right[tid] = right;
500 s_left_count[tid] = left_count;
501 s_right_count[tid] = mid_count;
505 is_active_second = 1;
506 s_compaction_list_exc[tid] = 1;
507 compact_second_chunk = 1;
526 template<
class T,
class NumericT>
531 T *s_left_count, T *s_right_count,
532 const unsigned int num_threads_active,
534 unsigned int &left_count,
unsigned int &right_count,
535 NumericT &mid,
unsigned int &all_threads_converged)
538 if (tid < num_threads_active)
542 right = s_right[tid];
543 left_count = s_left_count[tid];
544 right_count = s_right_count[tid];
551 all_threads_converged = 0;
553 else if ((right_count - left_count) > 1)
557 all_threads_converged = 0;
579 template<
class T,
class NumericT>
584 T *s_left_count, T *s_right_count,
585 const unsigned int num_threads_active,
587 unsigned int &left_count,
unsigned int &right_count,
588 NumericT &mid,
unsigned int &all_threads_converged)
591 if (tid < num_threads_active)
595 right = s_right[tid];
596 left_count = s_left_count[tid];
597 right_count = s_right_count[tid];
604 all_threads_converged = 0;
612 #endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
__device__ unsigned int computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
__device__ void storeNonEmptyIntervals(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT mid, NumericT right, const S left_count, const S mid_count, const S right_count, NumericT precision, unsigned int &compact_second_chunk, T *s_compaction_list_exc, unsigned int &is_active_second)
Store all non-empty intervals resulting from the subdivision of the interval currently processed by t...
#define VIENNACL_BISECT_MIN_ABS_INTERVAL
__device__ int floorPow2(int n)
__device__ int ceilPow2(int n)
Global configuration parameters.
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void storeInterval(unsigned int addr, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT right, S left_count, S right_count, NumericT precision)
__device__ void subdivideActiveInterval(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ void subdivideActiveIntervalMulti(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ NumericT computeMidpoint(const NumericT left, const NumericT right)
__device__ void storeIntervalConverged(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT &left, NumericT &mid, NumericT &right, S &left_count, S &mid_count, S &right_count, T *s_compaction_list_exc, unsigned int &compact_second_chunk, const unsigned int num_threads_active, unsigned int &is_active_second)
float sign_f(const float &val)
Sign of number (float)
NumericT max(std::vector< NumericT > const &v1)
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
NumericT min(std::vector< NumericT > const &v1)