1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
40 template<
typename NumericT>
43 unsigned int A_start1,
unsigned int A_start2,
44 unsigned int A_inc1,
unsigned int A_inc2,
45 unsigned int A_size1,
unsigned int A_size2,
46 unsigned int A_internal_size1,
unsigned int A_internal_size2,
50 unsigned int B_start1,
unsigned int B_start2,
51 unsigned int B_inc1,
unsigned int B_inc2,
52 unsigned int B_size1,
unsigned int B_size2,
53 unsigned int B_internal_size1,
unsigned int B_internal_size2,
61 for (
unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
63 unsigned int row = A_size1 - 1 - row_cnt;
72 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
73 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
75 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
76 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
83 temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
85 temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
88 for (
unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
91 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
93 entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
96 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
98 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
106 template<
typename NumericT>
109 unsigned int A_start1,
unsigned int A_start2,
110 unsigned int A_inc1,
unsigned int A_inc2,
111 unsigned int A_size1,
unsigned int A_size2,
112 unsigned int A_internal_size1,
unsigned int A_internal_size2,
116 unsigned int B_start1,
unsigned int B_start2,
117 unsigned int B_inc1,
unsigned int B_inc2,
118 unsigned int B_size1,
unsigned int B_size2,
119 unsigned int B_internal_size1,
unsigned int B_internal_size2,
127 for (
unsigned int row = 0;
row < A_size1; ++
row)
134 if (threadIdx.x == 0)
137 B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
138 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
140 B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
141 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
148 temp = B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
150 temp = B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
153 for (
unsigned int elim =
row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
156 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)];
158 entry_A = A[(elim * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1];
161 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
163 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
176 template<
typename TagT>
182 template<
typename TagT>
188 template<
typename Matrix1T,
typename Matrix2T,
typename SolverTagT>
191 SolverTagT
const & tag)
196 dim3 grid(B.size2());
252 template<
typename NumericT,
typename SolverTagT>
265 template<
typename NumericT>
268 unsigned int A_start1,
unsigned int A_start2,
269 unsigned int A_inc1,
unsigned int A_inc2,
270 unsigned int A_size1,
unsigned int A_size2,
271 unsigned int A_internal_size1,
unsigned int A_internal_size2,
273 unsigned int v_start,
277 unsigned int options)
280 unsigned int unit_diagonal_flag = (options & (1 << 0));
282 unsigned int is_lower_solve = (options & (1 << 2));
284 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
286 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
287 if (!unit_diagonal_flag)
290 if (threadIdx.x == 0)
291 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
296 temp = v[row * v_inc + v_start];
298 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
299 elim < (is_lower_solve ? A_size1 : row);
301 v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
306 template<
typename NumericT>
309 unsigned int A_start1,
unsigned int A_start2,
310 unsigned int A_inc1,
unsigned int A_inc2,
311 unsigned int A_size1,
unsigned int A_size2,
312 unsigned int A_internal_size1,
unsigned int A_internal_size2,
314 unsigned int v_start,
317 unsigned int options)
320 unsigned int unit_diagonal_flag = (options & (1 << 0));
322 unsigned int is_lower_solve = (options & (1 << 2));
324 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
326 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
327 if (!unit_diagonal_flag)
330 if (threadIdx.x == 0)
331 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
336 temp = v[row * v_inc + v_start];
338 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
339 elim < (is_lower_solve ? A_size1 : row);
341 v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
353 template<
typename MatrixT,
typename VectorT>
356 unsigned int options)
397 template<
typename NumericT,
typename SolverTagT>
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
void inplace_solve_vector_impl(MatrixT const &mat, VectorT &vec, unsigned int options)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void triangular_substitute_inplace_row_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Implementation of the dense matrix class.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
__global__ void matrix_matrix_lower_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void matrix_matrix_upper_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
bool is_unit_solve(TagT const &tag)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
A tag class representing an upper triangular matrix.
result_of::size_type< T >::type start(T const &obj)
__global__ void triangular_substitute_inplace_col_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void inplace_solve_impl(Matrix1T const &A, Matrix2T &B, SolverTagT const &tag)
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal.
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
A tag class representing an upper triangular matrix with unit diagonal.
bool is_upper_solve(TagT const &tag)