1 #ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
46 template<
typename IndexT>
48 const IndexT * A_row_indices,
49 const IndexT * A_col_indices,
51 unsigned int * L_row_indices)
53 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
55 row += gridDim.x * blockDim.x)
57 unsigned int row_begin = A_row_indices[
row];
58 unsigned int row_end = A_row_indices[
row+1];
60 unsigned int num_entries_L = 0;
61 for (
unsigned int j=row_begin; j<row_end; ++j)
63 unsigned int col = A_col_indices[j];
68 L_row_indices[
row] = num_entries_L;
72 template<
typename NumericT>
74 unsigned int const *A_row_indices,
75 unsigned int const *A_col_indices,
79 unsigned int const *L_row_indices,
80 unsigned int *L_col_indices,
83 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
85 row += gridDim.x * blockDim.x)
87 unsigned int row_begin = A_row_indices[
row];
88 unsigned int row_end = A_row_indices[
row+1];
90 unsigned int index_L = L_row_indices[
row];
91 for (
unsigned int j = row_begin; j < row_end; ++j)
93 unsigned int col = A_col_indices[j];
98 L_col_indices[index_L] = col;
99 L_elements[index_L] = value;
106 template<
typename NumericT>
113 extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
114 viennacl::cuda_arg<unsigned int>(A.
handle2()),
115 static_cast<unsigned int>(A.
size1()),
116 viennacl::cuda_arg<unsigned int>(L.
handle1())
130 extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
131 viennacl::cuda_arg<unsigned int>(A.
handle2()),
132 viennacl::cuda_arg<NumericT>(A.
handle()),
133 static_cast<unsigned int>(A.
size1()),
134 viennacl::cuda_arg<unsigned int>(L.
handle1()),
135 viennacl::cuda_arg<unsigned int>(L.
handle2()),
136 viennacl::cuda_arg<NumericT>(L.
handle())
147 template<
typename NumericT>
149 unsigned int const *A_row_indices,
150 unsigned int const *A_col_indices,
152 unsigned int A_size1,
156 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
158 row += gridDim.x * blockDim.x)
160 unsigned int row_begin = A_row_indices[
row];
161 unsigned int row_end = A_row_indices[
row+1];
163 for (
unsigned int j = row_begin; j < row_end; ++j)
165 unsigned int col = A_col_indices[j];
168 D_elements[
row] =
NumericT(1) / sqrt(fabs(A_elements[j]));
176 template<
typename NumericT>
178 unsigned int const *R_row_indices,
179 unsigned int const *R_col_indices,
181 unsigned int R_size1,
185 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
187 row += gridDim.x * blockDim.x)
189 unsigned int row_begin = R_row_indices[
row];
190 unsigned int row_end = R_row_indices[
row+1];
194 for (
unsigned int j = row_begin; j < row_end; ++j)
195 R_elements[j] *= D_row * D_elements[R_col_indices[j]];
202 template<
typename NumericT>
209 ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
210 viennacl::cuda_arg<unsigned int>(A.
handle2()),
211 viennacl::cuda_arg<NumericT>(A.
handle()),
212 static_cast<unsigned int>(A.
size1()),
218 ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
219 viennacl::cuda_arg<unsigned int>(L.
handle2()),
220 viennacl::cuda_arg<NumericT>(L.
handle()),
221 static_cast<unsigned int>(L.
size1()),
230 template<
typename NumericT>
232 unsigned int const *L_row_indices,
233 unsigned int const *L_col_indices,
236 unsigned int L_size1,
239 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
241 row += gridDim.x * blockDim.x)
246 unsigned int row_Li_start = L_row_indices[
row];
247 unsigned int row_Li_end = L_row_indices[
row + 1];
249 for (
unsigned int i = row_Li_start; i < row_Li_end; ++i)
251 unsigned int col = L_col_indices[i];
253 unsigned int row_Lj_start = L_row_indices[col];
254 unsigned int row_Lj_end = L_row_indices[col + 1];
257 unsigned int index_Lj = row_Lj_start;
258 unsigned int col_Lj = L_col_indices[index_Lj];
260 for (
unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
262 unsigned int col_Li = L_col_indices[index_Li];
265 while (col_Lj < col_Li)
268 col_Lj = L_col_indices[index_Lj];
271 if (col_Lj == col_Li)
272 s -= L_backup[index_Li] * L_backup[index_Lj];
276 L_elements[i] = (
row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]);
284 template<
typename NumericT>
292 icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
293 viennacl::cuda_arg<unsigned int>(L.
handle2()),
294 viennacl::cuda_arg<NumericT>(L.
handle()),
295 viennacl::cuda_arg<NumericT>(L_backup),
296 static_cast<unsigned int>(L.
size1()),
298 viennacl::cuda_arg<NumericT>(aij_L.
handle())
307 template<
typename IndexT>
309 const IndexT * A_row_indices,
310 const IndexT * A_col_indices,
311 unsigned int A_size1,
313 unsigned int * L_row_indices,
315 unsigned int * U_row_indices)
317 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
319 row += gridDim.x * blockDim.x)
321 unsigned int row_begin = A_row_indices[
row];
322 unsigned int row_end = A_row_indices[
row+1];
324 unsigned int num_entries_L = 0;
325 unsigned int num_entries_U = 0;
326 for (
unsigned int j=row_begin; j<row_end; ++j)
328 unsigned int col = A_col_indices[j];
335 L_row_indices[
row] = num_entries_L;
336 U_row_indices[
row] = num_entries_U;
340 template<
typename NumericT>
342 unsigned int const *A_row_indices,
343 unsigned int const *A_col_indices,
345 unsigned int A_size1,
347 unsigned int const *L_row_indices,
348 unsigned int *L_col_indices,
351 unsigned int const *U_row_indices,
352 unsigned int *U_col_indices,
355 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
357 row += gridDim.x * blockDim.x)
359 unsigned int row_begin = A_row_indices[
row];
360 unsigned int row_end = A_row_indices[
row+1];
362 unsigned int index_L = L_row_indices[
row];
363 unsigned int index_U = U_row_indices[
row];
364 for (
unsigned int j = row_begin; j < row_end; ++j)
366 unsigned int col = A_col_indices[j];
371 L_col_indices[index_L] = col;
372 L_elements[index_L] = value;
378 U_col_indices[index_U] = col;
379 U_elements[index_U] = value;
386 template<
typename NumericT>
394 extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
395 viennacl::cuda_arg<unsigned int>(A.
handle2()),
396 static_cast<unsigned int>(A.
size1()),
397 viennacl::cuda_arg<unsigned int>(L.
handle1()),
398 viennacl::cuda_arg<unsigned int>(U.
handle1())
416 extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
417 viennacl::cuda_arg<unsigned int>(A.
handle2()),
418 viennacl::cuda_arg<NumericT>(A.
handle()),
419 static_cast<unsigned int>(A.
size1()),
420 viennacl::cuda_arg<unsigned int>(L.
handle1()),
421 viennacl::cuda_arg<unsigned int>(L.
handle2()),
422 viennacl::cuda_arg<NumericT>(L.
handle()),
423 viennacl::cuda_arg<unsigned int>(U.
handle1()),
424 viennacl::cuda_arg<unsigned int>(U.
handle2()),
425 viennacl::cuda_arg<NumericT>(U.
handle())
437 template<
typename NumericT>
445 ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.
handle1()),
446 viennacl::cuda_arg<unsigned int>(A.
handle2()),
447 viennacl::cuda_arg<NumericT>(A.
handle()),
448 static_cast<unsigned int>(A.
size1()),
449 viennacl::cuda_arg<NumericT>(D.handle())
454 ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
455 viennacl::cuda_arg<unsigned int>(L.
handle2()),
456 viennacl::cuda_arg<NumericT>(L.
handle()),
457 static_cast<unsigned int>(L.
size1()),
458 viennacl::cuda_arg<NumericT>(D.handle())
463 ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.
handle1()),
464 viennacl::cuda_arg<unsigned int>(U.
handle2()),
465 viennacl::cuda_arg<NumericT>(U.
handle()),
466 static_cast<unsigned int>(U.
size1()),
467 viennacl::cuda_arg<NumericT>(D.handle())
475 template<
typename NumericT>
477 unsigned int const *L_row_indices,
478 unsigned int const *L_col_indices,
481 unsigned int L_size1,
485 unsigned int const *U_trans_row_indices,
486 unsigned int const *U_trans_col_indices,
492 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
494 row += gridDim.x * blockDim.x)
499 unsigned int row_L_start = L_row_indices[
row];
500 unsigned int row_L_end = L_row_indices[
row + 1];
502 for (
unsigned int j = row_L_start; j < row_L_end; ++j)
504 unsigned int col = L_col_indices[j];
509 unsigned int row_U_start = U_trans_row_indices[col];
510 unsigned int row_U_end = U_trans_row_indices[col + 1];
513 unsigned int index_U = row_U_start;
514 unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
516 for (
unsigned int k = row_L_start; k < j; ++k)
518 unsigned int col_L = L_col_indices[k];
521 while (col_U < col_L)
524 col_U = U_trans_col_indices[index_U];
528 sum += L_backup[k] * U_trans_backup[index_U];
532 L_elements[j] = (aij_L[j] -
sum) / U_trans_backup[row_U_end - 1];
539 unsigned int row_U_start = U_trans_row_indices[
row];
540 unsigned int row_U_end = U_trans_row_indices[
row + 1];
541 for (
unsigned int j = row_U_start; j < row_U_end; ++j)
543 unsigned int col = U_trans_col_indices[j];
545 row_L_start = L_row_indices[col];
546 row_L_end = L_row_indices[col + 1];
549 unsigned int index_L = row_L_start;
550 unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
552 for (
unsigned int k = row_U_start; k < j; ++k)
554 unsigned int col_U = U_trans_col_indices[k];
557 while (col_L < col_U)
560 col_L = L_col_indices[index_L];
564 sum += L_backup[index_L] * U_trans_backup[k];
568 U_trans_elements[j] = aij_U_trans[j] -
sum;
575 template<
typename NumericT>
589 ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.
handle1()),
590 viennacl::cuda_arg<unsigned int>(L.
handle2()),
591 viennacl::cuda_arg<NumericT>(L.
handle()),
592 viennacl::cuda_arg<NumericT>(L_backup),
593 static_cast<unsigned int>(L.
size1()),
595 viennacl::cuda_arg<NumericT>(aij_L.
handle()),
597 viennacl::cuda_arg<unsigned int>(U_trans.
handle1()),
598 viennacl::cuda_arg<unsigned int>(U_trans.
handle2()),
599 viennacl::cuda_arg<NumericT>(U_trans.
handle()),
600 viennacl::cuda_arg<NumericT>(U_backup),
602 viennacl::cuda_arg<NumericT>(aij_U_trans.
handle())
610 template<
typename NumericT>
612 unsigned int const *R_row_indices,
613 unsigned int const *R_col_indices,
615 unsigned int R_size1,
619 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
621 row += gridDim.x * blockDim.x)
623 unsigned int row_begin = R_row_indices[
row];
624 unsigned int row_end = R_row_indices[
row+1];
628 for (
unsigned int j = row_begin; j < row_end; ++j)
630 unsigned int col = R_col_indices[j];
633 diag = R_elements[j];
641 for (
unsigned int j = row_begin; j < row_end; ++j)
642 R_elements[j] /= -diag;
648 template<
typename NumericT>
652 ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.
handle1()),
653 viennacl::cuda_arg<unsigned int>(R.
handle2()),
654 viennacl::cuda_arg<NumericT>(R.
handle()),
655 static_cast<unsigned int>(R.
size1()),
656 viennacl::cuda_arg<NumericT>(diag_R.
handle())
__global__ void ilu_chow_patel_sweep_kernel(unsigned int const *L_row_indices, unsigned int const *L_col_indices, NumericT *L_elements, NumericT const *L_backup, unsigned int L_size1, NumericT const *aij_L, unsigned int const *U_trans_row_indices, unsigned int const *U_trans_col_indices, NumericT *U_trans_elements, NumericT const *U_trans_backup, NumericT const *aij_U_trans)
CUDA kernel for one Chow-Patel-ILU sweep.
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector.
__global__ void extract_LU_kernel_2(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, unsigned int const *L_row_indices, unsigned int *L_col_indices, NumericT *L_elements, unsigned int const *U_row_indices, unsigned int *U_col_indices, NumericT *U_elements)
Generic size and resize functionality for different vector and matrix types.
void extract_LU(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L, compressed_matrix< NumericT > &U)
Implementations of vector operations.
const vcl_size_t & size1() const
Returns the number of rows.
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
This file provides the forward declarations for the main types used within ViennaCL.
__global__ void ilu_form_neumann_matrix_kernel(unsigned int const *R_row_indices, unsigned int const *R_col_indices, NumericT *R_elements, unsigned int R_size1, NumericT *D_elements)
Determines row and column increments for matrices and matrix proxies.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
void extract_L(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L)
void ilu_form_neumann_matrix(compressed_matrix< NumericT > &R, vector< NumericT > &diag_R)
void generate_row_block_information()
Builds the row block information needed for fast sparse matrix-vector multiplications.
__global__ void extract_L_kernel_2(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, unsigned int const *L_row_indices, unsigned int *L_col_indices, NumericT *L_elements)
__global__ void ilu_scale_kernel_1(unsigned int const *A_row_indices, unsigned int const *A_col_indices, NumericT const *A_elements, unsigned int A_size1, NumericT *D_elements)
__global__ void extract_L_kernel_1(const IndexT *A_row_indices, const IndexT *A_col_indices, unsigned int A_size1, unsigned int *L_row_indices)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
void ilu_scale(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L, compressed_matrix< NumericT > &U)
Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L a...
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void reserve(vcl_size_t new_nonzeros, bool preserve=true)
Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved...
void memory_copy(mem_handle const &src_buffer, mem_handle &dst_buffer, vcl_size_t src_offset, vcl_size_t dst_offset, vcl_size_t bytes_to_copy)
Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'ds...
void icc_scale(compressed_matrix< NumericT > const &A, compressed_matrix< NumericT > &L)
Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L a...
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
__global__ void icc_chow_patel_sweep_kernel(unsigned int const *L_row_indices, unsigned int const *L_col_indices, NumericT *L_elements, NumericT const *L_backup, unsigned int L_size1, NumericT const *aij_L)
CUDA kernel for one Chow-Patel-ICC sweep.
__global__ void ilu_scale_kernel_2(unsigned int const *R_row_indices, unsigned int const *R_col_indices, NumericT *R_elements, unsigned int R_size1, NumericT *D_elements)
Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the di...
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
vcl_size_t raw_size() const
Returns the number of bytes of the currently active buffer.
void exclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an exclusive scan.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If the second argument is provided, the buffer is initialized...
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
void ilu_chow_patel_sweep(compressed_matrix< NumericT > &L, vector< NumericT > const &aij_L, compressed_matrix< NumericT > &U_trans, vector< NumericT > const &aij_U_trans)
Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) ...
Implementation of the ViennaCL scalar class.
__global__ void extract_LU_kernel_1(const IndexT *A_row_indices, const IndexT *A_col_indices, unsigned int A_size1, unsigned int *L_row_indices, unsigned int *U_row_indices)
const handle_type & handle() const
Returns the memory handle.
Simple enable-if variant that uses the SFINAE pattern.
void icc_chow_patel_sweep(compressed_matrix< NumericT > &L, vector< NumericT > const &aij_L)
Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) ...