ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
qr.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
2 #define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
27 #include <utility>
28 #include <iostream>
29 #include <fstream>
30 #include <string>
31 #include <algorithm>
32 #include <vector>
33 #include <math.h>
34 #include <cmath>
35 #include <sstream>
36 #include "viennacl/ocl/backend.hpp"
37 #include "boost/numeric/ublas/vector.hpp"
38 #include "boost/numeric/ublas/matrix.hpp"
39 #include "boost/numeric/ublas/matrix_proxy.hpp"
40 #include "boost/numeric/ublas/storage.hpp"
41 #include "boost/numeric/ublas/io.hpp"
42 #include "boost/numeric/ublas/matrix_expression.hpp"
43 #include "boost/numeric/ublas/detail/matrix_assign.hpp"
44 
45 #include "viennacl/vector.hpp"
46 #include "viennacl/matrix.hpp"
47 
51 
52 namespace viennacl
53 {
54 namespace linalg
55 {
56 namespace detail
57 {
58 namespace spai
59 {
60 
61 //********** DEBUG FUNCTIONS *****************//
62 template< typename T, typename InputIteratorT>
63 void Print(std::ostream & ostr, InputIteratorT it_begin, InputIteratorT it_end)
64 {
65  //std::ostream_iterator<int> it_os(ostr, delimiter);
66  std::string delimiters = " ";
67  std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str()));
68  ostr << std::endl;
69 }
70 
71 template<typename VectorT, typename MatrixT>
72 void write_to_block(VectorT & con_A_I_J,
73  unsigned int start_ind,
74  std::vector<unsigned int> const & I,
75  std::vector<unsigned int> const & J,
76  MatrixT& m)
77 {
78  m.resize(I.size(), J.size(), false);
79  for (vcl_size_t i = 0; i < J.size(); ++i)
80  for (vcl_size_t j = 0; j < I.size(); ++j)
81  m(j,i) = con_A_I_J[start_ind + i*I.size() + j];
82 }
83 
84 template<typename VectorT>
85 void print_continious_matrix(VectorT & con_A_I_J,
86  std::vector<cl_uint> & blocks_ind,
87  std::vector<std::vector<unsigned int> > const & g_I,
88  std::vector<std::vector<unsigned int> > const & g_J)
89 {
90  typedef typename VectorT::value_type NumericType;
91 
92  std::vector<boost::numeric::ublas::matrix<NumericType> > com_A_I_J(g_I.size());
93  for (vcl_size_t i = 0; i < g_I.size(); ++i)
94  {
95  write_to_block(con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]);
96  std::cout << com_A_I_J[i] << std::endl;
97  }
98 }
99 
100 template<typename VectorT>
101 void print_continious_vector(VectorT & con_v,
102  std::vector<cl_uint> & block_ind,
103  std::vector<std::vector<unsigned int> > const & g_J)
104 {
105  typedef typename VectorT::value_type NumericType;
106 
107  std::vector<boost::numeric::ublas::vector<NumericType> > com_v(g_J.size());
108  //Print<ScalarType>(std::cout, con_v.begin(), con_v.end());
109  for (vcl_size_t i = 0; i < g_J.size(); ++i)
110  {
111  com_v[i].resize(g_J[i].size());
112  for (vcl_size_t j = 0; j < g_J[i].size(); ++j)
113  com_v[i](j) = con_v[block_ind[i] + j];
114  std::cout << com_v[i] << std::endl;
115  }
116 }
117 
118 
120 
129 inline void compute_blocks_size(std::vector<std::vector<unsigned int> > const & g_I,
130  std::vector<std::vector<unsigned int> > const & g_J,
131  unsigned int& sz,
132  std::vector<cl_uint> & blocks_ind,
133  std::vector<cl_uint> & matrix_dims)
134 {
135  sz = 0;
136  for (vcl_size_t i = 0; i < g_I.size(); ++i)
137  {
138  sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size());
139  matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size());
140  matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size());
141  blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size());
142  }
143 }
144 
150 template<typename SizeT>
151 void get_size(std::vector<std::vector<SizeT> > const & inds,
152  SizeT & size)
153 {
154  size = 0;
155  for (vcl_size_t i = 0; i < inds.size(); ++i)
156  size += static_cast<unsigned int>(inds[i].size());
157 }
158 
164 template<typename SizeT>
165 void init_start_inds(std::vector<std::vector<SizeT> > const & inds,
166  std::vector<cl_uint>& start_inds)
167 {
168  for (vcl_size_t i = 0; i < inds.size(); ++i)
169  start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size());
170 }
171 
172 
173 //************************************* QR FUNCTIONS ***************************************//
174 
181 template<typename MatrixT, typename NumericT>
182 void dot_prod(MatrixT const & A,
183  unsigned int beg_ind,
184  NumericT & res)
185 {
186  res = NumericT(0);
187  for (vcl_size_t i = beg_ind; i < A.size1(); ++i)
188  res += A(i, beg_ind-1)*A(i, beg_ind-1);
189 }
190 
199 template<typename MatrixT, typename VectorT, typename NumericT>
200 void custom_inner_prod(MatrixT const & A,
201  VectorT const & v,
202  unsigned int col_ind,
203  unsigned int start_ind,
204  NumericT & res)
205 {
206  res = static_cast<NumericT>(0);
207  for (unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i)
208  res += A(i, col_ind)*v(i);
209 }
210 
217 template<typename MatrixT, typename VectorT>
218 void copy_vector(MatrixT const & A,
219  VectorT & v,
220  unsigned int beg_ind)
221 {
222  for (unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i)
223  v(i) = A( i, beg_ind-1);
224 }
225 
226 
227 //householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
235 template<typename MatrixT, typename VectorT, typename NumericT>
236 void householder_vector(MatrixT const & A,
237  unsigned int j,
238  VectorT & v,
239  NumericT & b)
240 {
241  NumericT sg;
242 
243  dot_prod(A, j+1, sg);
244  copy_vector(A, v, j+1);
245  NumericT mu;
246  v(j) = static_cast<NumericT>(1.0);
247  if (!sg)
248  b = 0;
249  else
250  {
251  mu = std::sqrt(A(j,j)*A(j, j) + sg);
252  if (A(j, j) <= 0)
253  v(j) = A(j, j) - mu;
254  else
255  v(j) = -sg/(A(j, j) + mu);
256 
257  b = 2*(v(j)*v(j))/(sg + v(j)*v(j));
258  v = v/v(j);
259  }
260 }
261 
262 
270 template<typename MatrixT, typename VectorT, typename NumericT>
272  unsigned int iter_cnt,
273  VectorT & v,
274  NumericT b)
275 {
276  //update every column of matrix A
277  NumericT in_prod_res;
278 
279  for (unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i)
280  {
281  //update each column in a fashion: ai = ai - b*v*(v'*ai)
282  custom_inner_prod(A, v, i, iter_cnt, in_prod_res);
283  for (unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j)
284  A(j, i) -= b*in_prod_res*v(j);
285  }
286 }
287 
294 template<typename MatrixT, typename VectorT>
295 void store_householder_vector(MatrixT & A,
296  unsigned int ind,
297  VectorT & v)
298 {
299  for (unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i)
300  A(i, ind-1) = v(i);
301 }
302 
303 
304 //QR algorithm
310 template<typename MatrixT, typename VectorT>
311 void single_qr(MatrixT & R, VectorT & b_v)
312 {
313  typedef typename MatrixT::value_type NumericType;
314 
315  if ((R.size1() > 0) && (R.size2() > 0))
316  {
317  VectorT v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size1()));
318  b_v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size2()));
319 
320  for (unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i)
321  {
322  householder_vector(R, i, v, b_v[i]);
323  apply_householder_reflection(R, i, v, b_v[i]);
324  if (i < R.size1())
325  store_householder_vector(R, i+1, v);
326  }
327  }
328 }
329 
330 //********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************//
331 
337 template<typename SizeT>
338 void get_max_block_size(std::vector<std::vector<SizeT> > const & inds,
339  SizeT & max_size)
340 {
341  max_size = 0;
342  for (vcl_size_t i = 0; i < inds.size(); ++i)
343  if (inds[i].size() > max_size)
344  max_size = static_cast<SizeT>(inds[i].size());
345 }
346 
354 template<typename MatrixT, typename VectorT, typename NumericT>
355 void custom_dot_prod(MatrixT const & A,
356  VectorT const & v,
357  unsigned int ind,
358  NumericT & res)
359 {
360  res = static_cast<NumericT>(0);
361  for (unsigned int j = ind; j < A.size1(); ++j)
362  {
363  if (j == ind)
364  res += v(j);
365  else
366  res += A(j, ind)*v(j);
367  }
368 }
369 
376 template<typename MatrixT, typename VectorT>
377 void apply_q_trans_vec(MatrixT const & R,
378  VectorT const & b_v,
379  VectorT & y)
380 {
381  typedef typename MatrixT::value_type NumericT;
382 
383  NumericT inn_prod = NumericT(0);
384  for (vcl_size_t i = 0; i < R.size2(); ++i)
385  {
386  custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod);
387  for (vcl_size_t j = i; j < R.size1(); ++j)
388  {
389  if (i == j)
390  y(j) -= b_v(i)*inn_prod;
391  else
392  y(j) -= b_v(i)*inn_prod*R(j,i);
393  }
394  }
395 }
396 
403 template<typename MatrixT, typename VectorT>
404 void apply_q_trans_mat(MatrixT const & R,
405  VectorT const & b_v,
406  MatrixT & A)
407 {
408  VectorT tmp_v;
409  for (vcl_size_t i = 0; i < A.size2(); ++i)
410  {
411  tmp_v = static_cast<VectorT>(column(A,i));
412  apply_q_trans_vec(R, b_v, tmp_v);
413  column(A,i) = tmp_v;
414  }
415 }
416 
417 //parallel QR for GPU
427 template<typename NumericT>
428 void block_qr(std::vector<std::vector<unsigned int> > & g_I,
429  std::vector<std::vector<unsigned int> > & g_J,
430  block_matrix & g_A_I_J_vcl,
431  block_vector & g_bv_vcl,
432  std::vector<cl_uint> & g_is_update,
433  viennacl::context ctx)
434 {
435  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
436 
437  //typedef typename MatrixType::value_type ScalarType;
438  unsigned int bv_size = 0;
439  unsigned int v_size = 0;
440  //set up arguments for GPU
441  //find maximum size of rows/columns
442  unsigned int local_r_n = 0;
443  unsigned int local_c_n = 0;
444  //find max size for blocks
445  get_max_block_size(g_I, local_r_n);
446  get_max_block_size(g_J, local_c_n);
447  //get size
448  get_size(g_J, bv_size);
449  get_size(g_I, v_size);
450  //get start indices
451  std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
452  std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
453  init_start_inds(g_J, start_bv_inds);
454  init_start_inds(g_I, start_v_inds);
455  //init arrays
456  std::vector<NumericT> b_v(bv_size, NumericT(0));
457  std::vector<NumericT> v(v_size, NumericT(0));
458  //call qr program
459  block_vector v_vcl;
460 
461  g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
462  static_cast<unsigned int>(sizeof(NumericT)*bv_size),
463  &(b_v[0]));
464 
465  v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
466  static_cast<unsigned int>(sizeof(NumericT)*v_size),
467  &(v[0]));
468  //the same as j_start_inds
469  g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
470  static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
471  &(start_bv_inds[0]));
472 
473  v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
474  static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
475  &(start_v_inds[0]));
476  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
477  static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
478  &(g_is_update[0]));
479  //local memory
480  //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
483 
484  qr_kernel.local_work_size(0, local_c_n);
485  qr_kernel.global_work_size(0, local_c_n*256);
486  viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(),
487  v_vcl.handle(), g_A_I_J_vcl.handle2(),
488  g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
489  viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
490  static_cast<cl_uint>(g_I.size())));
491 
492 }
493 }
494 }
495 }
496 }
497 #endif
void householder_vector(MatrixT const &A, unsigned int j, VectorT &v, NumericT &b)
Computation of Householder vector, householder reflection c.f. Gene H. Golub, Charles F...
Definition: qr.hpp:236
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
Definition: spai.hpp:587
Represents contigious matrices on GPU.
void print_continious_vector(VectorT &con_v, std::vector< cl_uint > &block_ind, std::vector< std::vector< unsigned int > > const &g_J)
Definition: qr.hpp:101
viennacl::ocl::handle< cl_mem > & handle()
Returns a handle to the elements.
viennacl::ocl::handle< cl_mem > & handle1()
Return handle to start indices.
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
Implementation of the dense matrix class.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
OpenCL kernel file for sparse approximate inverse operations.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
void compute_blocks_size(std::vector< std::vector< unsigned int > > const &g_I, std::vector< std::vector< unsigned int > > const &g_J, unsigned int &sz, std::vector< cl_uint > &blocks_ind, std::vector< cl_uint > &matrix_dims)
**************************************** BLOCK FUNCTIONS ************************************// ...
Definition: qr.hpp:129
void get_size(std::vector< std::vector< SizeT > > const &inds, SizeT &size)
Computes size of particular container of index set.
Definition: qr.hpp:151
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
void write_to_block(VectorT &con_A_I_J, unsigned int start_ind, std::vector< unsigned int > const &I, std::vector< unsigned int > const &J, MatrixT &m)
Definition: qr.hpp:72
void copy_vector(MatrixT const &A, VectorT &v, unsigned int beg_ind)
Copying part of matrix column.
Definition: qr.hpp:218
float NumericT
Definition: bisect.cpp:40
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39
void single_qr(MatrixT &R, VectorT &b_v)
Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224.
Definition: qr.hpp:311
void apply_q_trans_vec(MatrixT const &R, VectorT const &b_v, VectorT &y)
Recovery Q from matrix R and vector of betas b_v.
Definition: qr.hpp:377
Implementation of a bunch of (small) matrices on GPU. Experimental.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
void Print(std::ostream &ostr, InputIteratorT it_begin, InputIteratorT it_end)
Definition: qr.hpp:63
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33
void store_householder_vector(MatrixT &A, unsigned int ind, VectorT &v)
Storage of vector v in column(A, ind), starting from ind-1 index of a column.
Definition: qr.hpp:295
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:605
void init_start_inds(std::vector< std::vector< SizeT > > const &inds, std::vector< cl_uint > &start_inds)
Initializes start indices of particular index set.
Definition: qr.hpp:165
Implementation of a bunch of vectors on GPU. Experimental.
void custom_inner_prod(MatrixT const &A, VectorT const &v, unsigned int col_ind, unsigned int start_ind, NumericT &res)
Dot prod of particular matrix column with arbitrary vector: A(:, col_ind)
Definition: qr.hpp:200
viennacl::ocl::handle< cl_mem > & handle1()
Returns a handle to the matrix dimensions.
viennacl::ocl::handle< cl_mem > & handle()
Return handle to the elements.
std::size_t vcl_size_t
Definition: forwards.h:75
Implementations of the OpenCL backend, where all contexts are stored in.
viennacl::ocl::handle< cl_mem > & handle2()
Returns a handle to the start indices of matrix.
void get_max_block_size(std::vector< std::vector< SizeT > > const &inds, SizeT &max_size)
Getting max size of rows/columns from container of index set.
Definition: qr.hpp:338
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void custom_dot_prod(MatrixT const &A, VectorT const &v, unsigned int ind, NumericT &res)
Dot_prod(column(A, ind), v) starting from index ind+1.
Definition: qr.hpp:355
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
Represents a contiguous vector on the GPU to represent a concatentation of small vectors.
static void init(viennacl::ocl::context &ctx)
Definition: spai.hpp:594
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
void apply_q_trans_mat(MatrixT const &R, VectorT const &b_v, MatrixT &A)
Multiplication of Q'*A, where Q is in implicit for lower part of R and vector of betas - b_v...
Definition: qr.hpp:404
void block_qr(std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, block_matrix &g_A_I_J_vcl, block_vector &g_bv_vcl, std::vector< cl_uint > &g_is_update, viennacl::context ctx)
Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 performed on GPU.
Definition: qr.hpp:428
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_column > column(const matrix_base< NumericT, F > &A, unsigned int j)
Definition: matrix.hpp:918
void print_continious_matrix(VectorT &con_A_I_J, std::vector< cl_uint > &blocks_ind, std::vector< std::vector< unsigned int > > const &g_I, std::vector< std::vector< unsigned int > > const &g_J)
Definition: qr.hpp:85
void apply_householder_reflection(MatrixT &A, unsigned int iter_cnt, VectorT &v, NumericT b)
Inplace application of Householder vector to a matrix A.
Definition: qr.hpp:271
viennacl::ocl::handle< cl_mem > create_memory(cl_mem_flags flags, unsigned int size, void *ptr=NULL) const
Creates a memory buffer within the context.
Definition: context.hpp:216