1 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
65 template<
typename NumericT>
69 unsigned int *blocks_mult,
70 unsigned int *blocks_mult_sum,
72 unsigned int *g_left_count,
73 unsigned int *g_right_count,
74 NumericT *g_lambda,
unsigned int *g_pos,
78 const unsigned int tid = threadIdx.x;
91 unsigned int *s_compaction_list_exc = s_compaction_list + 1;
94 __shared__
unsigned int all_threads_converged;
96 __shared__
unsigned int num_threads_active;
98 __shared__
unsigned int num_threads_compaction;
100 __shared__
unsigned int compact_second_chunk;
103 __shared__
unsigned int c_block_start;
104 __shared__
unsigned int c_block_end;
105 __shared__
unsigned int c_block_offset_output;
110 unsigned int mid_count = 0;
114 unsigned int left_count = 0;
115 unsigned int right_count = 0;
117 unsigned int is_active_second = 0;
125 c_block_start = blocks_mult[blockIdx.x];
126 c_block_end = blocks_mult[blockIdx.x + 1];
127 c_block_offset_output = blocks_mult_sum[blockIdx.x];
130 num_threads_active = c_block_end - c_block_start;
131 s_compaction_list[0] = 0;
132 num_threads_compaction =
ceilPow2(num_threads_active);
134 all_threads_converged = 1;
135 compact_second_chunk = 0;
138 s_left_count [tid] = 42;
139 s_right_count[tid] = 42;
147 if (tid < num_threads_active)
149 s_left[tid] = g_left[c_block_start + tid];
150 s_right[tid] = g_right[c_block_start + tid];
151 s_left_count[tid] = g_left_count[c_block_start + tid];
152 s_right_count[tid] = g_right_count[c_block_start + tid];
156 unsigned int iter = 0;
162 s_compaction_list[threadIdx.x] = 0;
163 s_compaction_list[threadIdx.x + blockDim.x] = 0;
168 s_left_count, s_right_count,
170 left, right, left_count, right_count,
171 mid, all_threads_converged);
175 if (1 == all_threads_converged)
186 mid, tid, num_threads_active,
192 if (tid < num_threads_active)
200 s_left, s_right, s_left_count, s_right_count,
202 left_count, mid_count, right_count,
203 precision, compact_second_chunk,
204 s_compaction_list_exc,
213 left_count, mid_count, right_count,
214 s_compaction_list_exc, compact_second_chunk,
225 if (1 == compact_second_chunk)
230 mid, right, mid_count, right_count,
231 s_compaction_list, num_threads_active,
240 num_threads_active += s_compaction_list[num_threads_active];
241 num_threads_compaction =
ceilPow2(num_threads_active);
243 compact_second_chunk = 0;
244 all_threads_converged = 1;
250 s_compaction_list_exc[threadIdx.x] = 0;
251 s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
253 if (num_threads_compaction > blockDim.x)
264 if (tid < num_threads_active)
267 unsigned int addr = c_block_offset_output + tid;
269 g_lambda[addr] = s_left[tid];
270 g_pos[addr] = s_right_count[tid];
277 #endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
#define VIENNACL_BISECT_MAX_THREADS_BLOCK
__device__ void storeNonEmptyIntervals(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT mid, NumericT right, const S left_count, const S mid_count, const S right_count, NumericT precision, unsigned int &compact_second_chunk, T *s_compaction_list_exc, unsigned int &is_active_second)
Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread.
__device__ int ceilPow2(int n)
Global configuration parameters.
__global__ void bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, unsigned int *blocks_mult, unsigned int *blocks_mult_sum, NumericT *g_left, NumericT *g_right, unsigned int *g_left_count, unsigned int *g_right_count, NumericT *g_lambda, unsigned int *g_pos, NumericT precision)
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void subdivideActiveIntervalMulti(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ void storeIntervalConverged(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT &left, NumericT &mid, NumericT &right, S &left_count, S &mid_count, S &right_count, T *s_compaction_list_exc, unsigned int &compact_second_chunk, const unsigned int num_threads_active, unsigned int &is_active_second)
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)