ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
direct_solve.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/vector.hpp"
27 #include "viennacl/matrix.hpp"
28 
29 
31 
32 
33 namespace viennacl
34 {
35 namespace linalg
36 {
37 namespace cuda
38 {
39 
40 template<typename NumericT>
42  const NumericT * A,
43  unsigned int A_start1, unsigned int A_start2,
44  unsigned int A_inc1, unsigned int A_inc2,
45  unsigned int A_size1, unsigned int A_size2,
46  unsigned int A_internal_size1, unsigned int A_internal_size2,
47  bool row_major_A,
48 
49  NumericT * B,
50  unsigned int B_start1, unsigned int B_start2,
51  unsigned int B_inc1, unsigned int B_inc2,
52  unsigned int B_size1, unsigned int B_size2,
53  unsigned int B_internal_size1, unsigned int B_internal_size2,
54  bool row_major_B,
55 
56  bool unit_diagonal)
57 {
58  NumericT temp;
59  NumericT entry_A;
60 
61  for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
62  {
63  unsigned int row = A_size1 - 1 - row_cnt;
64 
65  if (!unit_diagonal)
66  {
67  __syncthreads();
68 
69  if (threadIdx.x == 0)
70  {
71  if (row_major_B)
72  B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
73  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
74  else //if (!row_major_B)
75  B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
76  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
77  }
78  }
79 
80  __syncthreads();
81 
82  if (row_major_B)
83  temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
84  else //if (!row_major_B)
85  temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
86 
87  //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
88  for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
89  {
90  if (row_major_A)
91  entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
92  else //if (!row_major_A)
93  entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
94 
95  if (row_major_B)
96  B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
97  else //if (!row_major_B)
98  B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
99 
100  }
101  }
102 }
103 
104 
105 
106 template<typename NumericT>
108  const NumericT * A,
109  unsigned int A_start1, unsigned int A_start2,
110  unsigned int A_inc1, unsigned int A_inc2,
111  unsigned int A_size1, unsigned int A_size2,
112  unsigned int A_internal_size1, unsigned int A_internal_size2,
113  bool row_major_A,
114 
115  NumericT * B,
116  unsigned int B_start1, unsigned int B_start2,
117  unsigned int B_inc1, unsigned int B_inc2,
118  unsigned int B_size1, unsigned int B_size2,
119  unsigned int B_internal_size1, unsigned int B_internal_size2,
120  bool row_major_B,
121 
122  bool unit_diagonal)
123 {
124  NumericT temp;
125  NumericT entry_A;
126 
127  for (unsigned int row = 0; row < A_size1; ++row)
128  {
129 
130  if (!unit_diagonal)
131  {
132  __syncthreads();
133 
134  if (threadIdx.x == 0)
135  {
136  if (row_major_B)
137  B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
138  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
139  else //if (!row_major_B)
140  B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
141  : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
142  }
143  }
144 
145  __syncthreads();
146 
147  if (row_major_B)
148  temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
149  else //if (!row_major_B)
150  temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
151 
152  //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
153  for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
154  {
155  if (row_major_A)
156  entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
157  else //if (!row_major_A)
158  entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
159 
160  if (row_major_B)
161  B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
162  else //if (!row_major_B)
163  B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
164 
165  }
166  }
167 }
168 
169 
170 
171 
172 
173 
174 namespace detail
175 {
176  template<typename TagT>
177  bool is_unit_solve(TagT const & tag) { return false; }
178 
179  inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
180  inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }
181 
182  template<typename TagT>
183  bool is_upper_solve(TagT const & tag) { return false; }
184 
185  inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
186  inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }
187 
188  template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
189  void inplace_solve_impl(Matrix1T const & A,
190  Matrix2T & B,
191  SolverTagT const & tag)
192  {
193  typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type value_type;
194 
195  dim3 threads(128);
196  dim3 grid(B.size2());
197 
198  if (is_upper_solve(tag))
199  {
200  matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
201  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
202  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
203  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
204  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
205  bool(A.row_major()),
206 
208  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
209  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
210  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
211  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
212  bool(B.row_major()),
213 
214  is_unit_solve(tag)
215  );
216  }
217  else
218  {
219  matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
220  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
221  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
222  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
223  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
224  bool(A.row_major()),
225 
227  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
228  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
229  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
230  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
231  bool(B.row_major()),
232 
233  is_unit_solve(tag)
234  );
235  }
236 
237  }
238 }
239 
240 
241 //
242 // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
243 //
244 
246 
252 template<typename NumericT, typename SolverTagT>
255  SolverTagT tag)
256 {
257  detail::inplace_solve_impl(A, B, tag);
258 }
259 
260 
261 //
262 // Solve on vector
263 //
264 
265 template<typename NumericT>
267  NumericT const * A,
268  unsigned int A_start1, unsigned int A_start2,
269  unsigned int A_inc1, unsigned int A_inc2,
270  unsigned int A_size1, unsigned int A_size2,
271  unsigned int A_internal_size1, unsigned int A_internal_size2,
272  NumericT * v,
273  unsigned int v_start,
274  unsigned int v_inc,
275  unsigned int v_size,
276 
277  unsigned int options)
278 {
279  NumericT temp;
280  unsigned int unit_diagonal_flag = (options & (1 << 0));
281 
282  unsigned int is_lower_solve = (options & (1 << 2));
283  unsigned int row;
284  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
285  {
286  row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
287  if (!unit_diagonal_flag)
288  {
289  __syncthreads();
290  if (threadIdx.x == 0)
291  v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
292  }
293 
294  __syncthreads();
295 
296  temp = v[row * v_inc + v_start];
297 
298  for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
299  elim < (is_lower_solve ? A_size1 : row);
300  elim += blockDim.x)
301  v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
302  }
303 }
304 
305 
306 template<typename NumericT>
308  NumericT const * A,
309  unsigned int A_start1, unsigned int A_start2,
310  unsigned int A_inc1, unsigned int A_inc2,
311  unsigned int A_size1, unsigned int A_size2,
312  unsigned int A_internal_size1, unsigned int A_internal_size2,
313  NumericT * v,
314  unsigned int v_start,
315  unsigned int v_inc,
316  unsigned int v_size,
317  unsigned int options)
318 {
319  NumericT temp;
320  unsigned int unit_diagonal_flag = (options & (1 << 0));
321 
322  unsigned int is_lower_solve = (options & (1 << 2));
323  unsigned int row;
324  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
325  {
326  row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
327  if (!unit_diagonal_flag)
328  {
329  __syncthreads();
330  if (threadIdx.x == 0)
331  v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
332  }
333 
334  __syncthreads();
335 
336  temp = v[row * v_inc + v_start];
337 
338  for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
339  elim < (is_lower_solve ? A_size1 : row);
340  elim += blockDim.x)
341  v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
342  }
343 }
344 
345 
346 namespace detail
347 {
348  inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; }
349  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
350  inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); }
351  inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
352 
353  template<typename MatrixT, typename VectorT>
354  void inplace_solve_vector_impl(MatrixT const & mat,
355  VectorT & vec,
356  unsigned int options)
357  {
358  typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type value_type;
359 
360  if (mat.row_major())
361  {
362  triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
363  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
364  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
365  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
366  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
367  viennacl::cuda_arg(vec),
368  static_cast<unsigned int>(viennacl::traits::start(vec)),
369  static_cast<unsigned int>(viennacl::traits::stride(vec)),
370  static_cast<unsigned int>(viennacl::traits::size(vec)),
371  options
372  );
373  }
374  else
375  {
376  triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
377  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
378  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
379  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
380  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
381  viennacl::cuda_arg(vec),
382  static_cast<unsigned int>(viennacl::traits::start(vec)),
383  static_cast<unsigned int>(viennacl::traits::stride(vec)),
384  static_cast<unsigned int>(viennacl::traits::size(vec)),
385  options
386  );
387  }
388  }
389 
390 }
391 
397 template<typename NumericT, typename SolverTagT>
399  vector_base<NumericT> & vec,
400  SolverTagT)
401 {
402  unsigned int options = detail::get_option_for_solver_tag(SolverTagT());
403 
404  detail::inplace_solve_vector_impl(mat, vec, options);
405 }
406 
407 
408 }
409 }
410 }
411 
412 #endif
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
void inplace_solve_vector_impl(MatrixT const &mat, VectorT &vec, unsigned int options)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
__global__ void triangular_substitute_inplace_row_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Implementation of the dense matrix class.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:386
__global__ void matrix_matrix_lower_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
A tag class representing a lower triangular matrix.
Definition: forwards.h:849
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:394
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
__global__ void matrix_matrix_upper_solve_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool unit_diagonal)
float NumericT
Definition: bisect.cpp:40
bool is_unit_solve(TagT const &tag)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
A tag class representing an upper triangular matrix.
Definition: forwards.h:854
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
__global__ void triangular_substitute_inplace_col_kernel(NumericT const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:271
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
void inplace_solve_impl(Matrix1T const &A, Matrix2T &B, SolverTagT const &tag)
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:859
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
Definition: common.hpp:39
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:864
bool is_upper_solve(TagT const &tag)