#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
//
// Compressed matrix (CSR), non-transposed triangular solves
//

template<typename NumericT>
__global__ void csr_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      //traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //column was final before this window: use buffered value
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //column was computed within this window: reload from global memory
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }
      } //for k

      row_at_window_start = current_row;
    } //if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}
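// Illustrative usage sketch (not part of ViennaCL): the kernel above indexes only via
// threadIdx.x and uses shared buffers of 128 entries, so it is meant to be launched as a
// single block with at most 128 threads. The wrapper name and the raw-pointer interface
// below are hypothetical; ViennaCL's own host wrappers (not shown in this excerpt) obtain
// these handles from a viennacl::compressed_matrix<> and a viennacl::vector<>.
template<typename NumericT>
void example_csr_unit_lower_solve(const unsigned int * d_row_indices,    // device pointer, size+1 entries
                                  const unsigned int * d_column_indices, // device pointer, nnz entries
                                  const NumericT     * d_elements,       // device pointer, nnz entries
                                  NumericT           * d_rhs,            // right-hand side, overwritten with the solution
                                  unsigned int         size)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(d_row_indices, d_column_indices, d_elements, d_rhs, size);
  cudaDeviceSynchronize(); // synchronize here only to keep the sketch simple
}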
template<typename NumericT>
__global__ void csr_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  NumericT current_vector_entry = vector[0];
  NumericT diagonal_entry = 0;
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      //traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //column was final before this window: use buffered value
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //column was computed within this window: reload from global memory
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }
        else if (col_index_buffer[k] == current_row) //remember the diagonal entry for the division above
          diagonal_entry = element_buffer[k];
      } //for k

      row_at_window_start = current_row;
    } //if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}
template<typename NumericT>
__global__ void csr_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1) //walk through the nonzeros from back to front, one window of blockDim.x entries at a time
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      //traverse the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;
        if (i+k >= nnz) //entry beyond the end of the matrix, nothing loaded
          continue;

        if (col_index_buffer[k] > row_at_window_start) //column was final before this window: use buffered value
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //column was computed within this window: reload from global memory
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }
      } //for k2

      row_at_window_start = current_row;
    } //if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x) //this was the first window, so we are done
      break;
    i -= blockDim.x;
  } //while
}
template<typename NumericT>
__global__ void csr_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ NumericT element_buffer[128];
  __shared__ NumericT vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  NumericT current_vector_entry = vector[size-1];
  NumericT diagonal_entry = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1) //walk through the nonzeros from back to front, one window of blockDim.x entries at a time
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      //traverse the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;
        if (i+k >= nnz) //entry beyond the end of the matrix, nothing loaded
          continue;

        if (col_index_buffer[k] > row_at_window_start) //column was final before this window: use buffered value
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //column was computed within this window: reload from global memory
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        else if (col_index_buffer[k] == current_row) //remember the diagonal entry for the division below
          diagonal_entry = element_buffer[k];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }
      } //for k2

      row_at_window_start = current_row;
    } //if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x) //this was the first window, so we are done
      break;
    i -= blockDim.x;
  } //while
}
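// Summary of the four kernels above: they perform standard row-wise substitution on a CSR
// matrix, streaming the nonzeros through shared memory in windows of blockDim.x entries.
// Forward substitution computes
//   x_i = b_i - sum_{j < i} a_ij * x_j            (unit diagonal)
//   x_i = (b_i - sum_{j < i} a_ij * x_j) / a_ii   (general diagonal)
// and backward substitution the analogous sums over j > i, starting from the last row.
// The window is loaded cooperatively, but the elimination itself is carried out by
// thread 0 only, so these kernels are intended for a single thread block.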
// transposed triangular solves
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  for (unsigned int row = 0; row < size; ++row)
  {
    NumericT result_entry = vector[row];
    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    //eliminate the column with index 'row' in parallel:
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index > row)
        vector[col_index] -= result_entry * elements[entry_index];
    }
    __syncthreads();
  }
}
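// In the "trans" kernels the matrix is traversed by its stored CSR rows, which act as
// columns of the triangular factor that is actually being solved. Once the solution
// component of 'row' is known, its contribution is scattered into the remaining
// right-hand-side entries (vector[col_index] -= result_entry * elements[entry_index]),
// instead of gathering contributions row by row as in the non-transposed kernels above.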
template<typename NumericT>
__global__ void csr_trans_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      //determine the row this nonzero belongs to:
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }
}
template<typename NumericT>
__global__ void csr_trans_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;

    __syncthreads();

    if (i < nnz)
    {
      //determine the row this nonzero belongs to:
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  //final step: divide the vector entries by the diagonal
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];
}
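// Note on the two divisions in the kernel above: while sweeping over the nonzeros,
// vector[row] still holds the not-yet-divided right-hand-side value, so the final
// solution component needed for the scatter step is formed on the fly as
// vector[row] / diagonal_entries[row]. The stored vector entries themselves are
// divided exactly once in the trailing loop, after all updates have been applied.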
template<typename NumericT>
__global__ void csr_trans_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1; //traverse the nonzeros from back to front
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      //determine the row this nonzero belongs to:
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }
}
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  NumericT result_entry = 0;

  //backward elimination, using U and the diagonal:
  for (unsigned int row2 = 0; row2 < size; ++row2)
  {
    unsigned int row = (size - row2) - 1;
    result_entry = vector[row] / diagonal_entries[row];

    //eliminate the column with index 'row' in parallel:
    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index < row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();

    if (threadIdx.x == 0)
      vector[row] = result_entry;
    __syncthreads();
  }
}
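// In the kernel above the already divided value result_entry is written back to
// vector[row] by a single thread and only after the barrier: this avoids duplicate
// writes to the same location and stores each component in its final form, so no
// separate division pass over the vector is needed (in contrast to the windowed
// kernels, which divide all entries at the very end).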
template<typename NumericT>
__global__ void csr_trans_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * diagonal_entries,
                NumericT * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  NumericT matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1; //traverse the nonzeros from back to front
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i]       : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      //determine the row this nonzero belongs to:
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      NumericT result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  //final step: divide the vector entries by the diagonal
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];
}
// block solves (one CUDA block per independent matrix block)
template<typename NumericT>
__global__ void csr_block_trans_unit_lu_forward(
          const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
          const unsigned int * column_indices_L,
          const NumericT * elements_L,
          const unsigned int * block_offsets,
                NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start = row_jumper_L[col_start];
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //forward elimination, using L:
  for (unsigned int col = col_start; col < col_stop; ++col)
  {
    result_entry = result[col];
    row_stop = row_jumper_L[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
    row_start = row_stop; //avoid reloading the same row pointer in the next iteration
    __syncthreads();
  }
}
template<typename NumericT>
__global__ void csr_block_trans_lu_backward(
          const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
          const unsigned int * column_indices_U,
          const NumericT * elements_U,
          const NumericT * diagonal_U,
          const unsigned int * block_offsets,
                NumericT * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start;
  unsigned int row_stop;
  NumericT result_entry = 0;

  if (col_start >= col_stop)
    return;

  //backward elimination, using U and diagonal_U:
  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
  {
    unsigned int col = (col_stop - iter) - 1;
    result_entry = result[col] / diagonal_U[col];
    row_start = row_jumper_U[col];
    row_stop  = row_jumper_U[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
    __syncthreads();
  }

  //divide the result vector by the diagonal entries:
  for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
    result[col] /= diagonal_U[col];
}
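// Illustrative launch sketch (not part of ViennaCL): the two block kernels above read their
// working range from block_offsets[2*blockIdx.x] and block_offsets[2*blockIdx.x+1], i.e. one
// CUDA block handles one independent matrix block, so the grid size has to match the number
// of index pairs stored in block_offsets. The wrapper name and the thread count of 128 below
// are assumptions made for this sketch only.
template<typename NumericT>
void example_block_unit_lower_solve(const unsigned int * d_row_jumper_L,
                                    const unsigned int * d_column_indices_L,
                                    const NumericT     * d_elements_L,
                                    const unsigned int * d_block_offsets, // 2 * num_blocks entries
                                    NumericT           * d_result,
                                    unsigned int         num_blocks,
                                    unsigned int         size)
{
  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(d_row_jumper_L, d_column_indices_L,
                                                       d_elements_L, d_block_offsets,
                                                       d_result, size);
}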