ViennaCL - The Vienna Computing Library 1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/host_based/iterative_operations.hpp
    @brief Implementation of specialized routines for the iterative solvers using the host-based execution backend.
*/

#include <cmath>
#include <algorithm> // for std::max and std::min

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/detail/op_applier.hpp"

// Minimum vector size for using OpenMP on vector operations:
#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
#endif
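// Because of the #ifndef guard above, the threshold can be tuned per build without
// editing this header; a hypothetical compiler invocation would look like:
//   g++ -DVIENNACL_WITH_OPENMP -fopenmp -DVIENNACL_OPENMP_VECTOR_MIN_SIZE=10000 app.cpp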

namespace viennacl
{
namespace linalg
{
namespace host_based
{

namespace detail
{
  /** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
    *
    * This routine computes Ap = prod(A, p) together with the first reduction stage for the
    * inner products (Ap, Ap), (p, Ap), and, if r0star is provided, (Ap, r0star).
    */
  template<typename NumericT>
  void pipelined_prod_impl(compressed_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements    = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle1());
    unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle2());
    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (long row = 0; row < static_cast<long>(A.size1()); ++row)
    {
      value_type dot_prod = 0;
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row

      vcl_size_t row_end = row_buffer[row+1];
      for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
        dot_prod += elements[i] * p_buf[col_buffer[i]];

      // update contributions for the inner products (Ap, Ap) and (p, Ap)
      Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
      inner_prod_ApAp += dot_prod * dot_prod;
      inner_prod_pAp  += val_p_diag * dot_prod;
      inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
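  // Layout of inner_prod_buffer as can be read off the indexing above and in the update
  // routines below: the buffer is split into chunks of buffer_chunk_size entries, and on
  // the host backend only the first entry of each chunk is written:
  //   chunk 0: (r, r) for CG, resp. (r, r0star) for BiCGStab
  //   chunk 1: (Ap, Ap)
  //   chunk 2: (p, Ap)
  //   chunk at buffer_chunk_offset (chunk 3 for BiCGStab): (Ap, r0star)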


  /** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(coordinate_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf        = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements     = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
    value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    // flush result buffer (cannot be expected to be zero)
    for (vcl_size_t i = 0; i < Ap.size(); ++i)
      Ap_buf[i] = 0;

    // matrix-vector product with a general COO format
    for (vcl_size_t i = 0; i < A.nnz(); ++i)
      Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];

    // computing the inner products (Ap, Ap) and (p, Ap):
    // Note: The COO format does not allow injecting the subsequent operations into the
    //       matrix-vector product, because its row and column ordering guarantees are too weak.
    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t i = 0; i < Ap.size(); ++i)
    {
      NumericT value_Ap = Ap_buf[i];
      NumericT value_p  = p_buf[i];

      inner_prod_ApAp += value_Ap * value_Ap;
      inner_prod_pAp  += value_Ap * value_p;
      inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }


  /** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(ell_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements    = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * coords      = detail::extract_raw_pointer<unsigned int>(A.handle2());
    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t row = 0; row < A.size1(); ++row)
    {
      value_type sum = 0;
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row

      for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
      {
        vcl_size_t offset = row + item_id * A.internal_size1();
        value_type val = elements[offset];

        if (val)
          sum += (p_buf[coords[offset]] * val);
      }

      Ap_buf[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += val_p_diag * sum;
      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
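  // Storage convention of the ELL format, as assumed by the indexing above: the nonzeros
  // are stored column-major in a dense (internal_size1() x internal_maxnnz()) array, so
  // entry 'item_id' of row 'row' sits at offset row + item_id * internal_size1(), with
  // zero-padding for rows holding fewer than internal_maxnnz() nonzeros.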


  /** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT, typename IndexT>
  void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type       * Ap_buf            = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const * p_buf             = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const * elements          = detail::extract_raw_pointer<value_type>(A.handle());
    IndexT const     * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
    IndexT const     * column_indices    = detail::extract_raw_pointer<IndexT>(A.handle2());
    IndexT const     * block_start       = detail::extract_raw_pointer<IndexT>(A.handle3());
    value_type       * data_buffer       = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
    std::vector<value_type> result_values(A.rows_per_block());

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
    {
      vcl_size_t current_columns_per_block = columns_per_block[block_idx];

      for (vcl_size_t i = 0; i < result_values.size(); ++i)
        result_values[i] = 0;

      for (IndexT column_entry_index = 0;
                  column_entry_index < current_columns_per_block;
                ++column_entry_index)
      {
        vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
        // Note: This for-loop may be unrolled by hand to exploit vectorization.
        //       Careful benchmarking is recommended first; the memory channels may be saturated already!
        for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
        {
          value_type val = elements[stride_start + row_in_block];

          result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
        }
      }

      vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
      for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
      {
        vcl_size_t row = first_row_in_matrix + row_in_block;
        if (row < Ap.size())
        {
          value_type row_result = result_values[row_in_block];

          Ap_buf[row] = row_result;
          inner_prod_ApAp += row_result * row_result;
          inner_prod_pAp  += p_buf[row] * row_result;
          inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
        }
      }
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
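  // Storage convention of the sliced ELL format, as assumed above: rows are grouped into
  // blocks of rows_per_block() rows; block b starts at block_start[b] and stores
  // columns_per_block[b] entries per row, column-major within the block, so the entry in
  // column slot c of row r within the block sits at block_start[b] + c * rows_per_block() + r.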


  /** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(hyb_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;
    typedef unsigned int index_type;

    value_type       * Ap_buf         = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const * p_buf          = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const * elements       = detail::extract_raw_pointer<value_type>(A.handle());
    index_type const * coords         = detail::extract_raw_pointer<index_type>(A.handle2());
    value_type const * csr_elements   = detail::extract_raw_pointer<value_type>(A.handle5());
    index_type const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.handle3());
    index_type const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.handle4());
    value_type       * data_buffer    = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t row = 0; row < A.size1(); ++row)
    {
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row
      value_type sum = 0;

      //
      // Part 1: Process ELL part
      //
      for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
      {
        vcl_size_t offset = row + item_id * A.internal_size1();
        value_type val = elements[offset];

        if (val)
          sum += p_buf[coords[offset]] * val;
      }

      //
      // Part 2: Process CSR part (the remainder of the HYB format)
      //
      vcl_size_t col_begin = csr_row_buffer[row];
      vcl_size_t col_end   = csr_row_buffer[row + 1];

      for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
        sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];

      Ap_buf[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += val_p_diag * sum;
      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }

} // namespace detail


/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
  *
  * This routine computes for vectors 'result', 'p', 'r', 'Ap':
  *   result += alpha * p;
  *   r      -= alpha * Ap;
  *   p       = r + beta * p;
  * and runs the first reduction stage for computing inner_prod(r, r)
  */
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT value_type;

  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_p = data_p[static_cast<vcl_size_t>(i)];
    value_type value_r = data_r[static_cast<vcl_size_t>(i)];

    data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
    value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
    value_p  = value_r + beta * value_p;
    inner_prod_r += value_r * value_r;

    data_p[static_cast<vcl_size_t>(i)] = value_p;
    data_r[static_cast<vcl_size_t>(i)] = value_r;
  }

  data_buffer[0] = inner_prod_r;
}
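// Usage sketch (illustrative only, not part of the library API): one iteration of the
// pipelined CG method chains the fused product below with the fused vector update above.
// The names n, chunk_size, alpha, and beta are assumptions maintained by the surrounding
// solver loop (see viennacl/linalg/cg.hpp for the actual driver).
#if 0
viennacl::vector<double> result(n), p(n), r(n), Ap(n);
viennacl::vector<double> inner_prod_buffer(3 * chunk_size); // chunks: (r,r), (Ap,Ap), (p,Ap)

pipelined_cg_prod(A, p, Ap, inner_prod_buffer);                               // Ap = A*p, partial (Ap,Ap), (p,Ap)
pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer); // x, r, p updates, partial (r,r)
#endif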


/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes s = r - alpha * Ap, with alpha obtained by finishing the reduction
  * over the inner products (r, r0star) and (Ap, r0star), and runs the first reduction stage for (s, s).
  */
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in BiCGStab, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(s);

  // part 1: compute alpha:
  value_type r_in_r0 = 0;
  value_type Ap_in_r0 = 0;
  for (vcl_size_t i = 0; i < buffer_chunk_size; ++i)
  {
    r_in_r0  += data_buffer[i];
    Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
  }
  value_type alpha = r_in_r0 / Ap_in_r0;

  // part 2: s = r - alpha * Ap and first reduction stage for (s, s):
  value_type inner_prod_s = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];

    inner_prod_s += value_s * value_s;

    data_s[static_cast<vcl_size_t>(i)] = value_s;
  }

  data_buffer[buffer_chunk_offset] = inner_prod_s;
}
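// Note on part 1 above: chunk 0 of inner_prod_buffer holds the result of (r, r0star) and
// chunk 3 that of (Ap, r0star), so summing each chunk yields
//   alpha = <r, r0star> / <Ap, r0star>,
// the standard BiCGStab step size.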

/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes for vectors 'result', 'p', 's', 'residual', 'As', 'Ap', 'r0star':
  *   result   += alpha * p + omega * s;
  *   residual  = s - omega * As;
  *   p         = residual + beta * (p - omega * Ap);
  * and runs the first reduction stage for computing inner_prod(residual, r0star)
  */
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
                                      NumericT beta, vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer,
                                      vcl_size_t buffer_chunk_size)
{
  typedef NumericT value_type;

  value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
  value_type const * data_s        = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_As       = detail::extract_raw_pointer<value_type>(As);
  value_type const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
  value_type const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
  value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r_r0star = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    vcl_size_t index = static_cast<vcl_size_t>(i);
    value_type value_result   = data_result[index];
    value_type value_p        = data_p[index];
    value_type value_s        = data_s[index];
    value_type value_residual = data_residual[index];
    value_type value_As       = data_As[index];
    value_type value_Ap       = data_Ap[index];
    value_type value_r0star   = data_r0star[index];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);
    inner_prod_r_r0star += value_residual * value_r0star;

    data_result[index]   = value_result;
    data_residual[index] = value_residual;
    data_p[index]        = value_p;
  }

  (void)buffer_chunk_size; // not needed here, just silences a compiler warning about an unused variable
  data_buffer[0] = inner_prod_r_r0star;
}

/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}


/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
  *
  * This routine computes the norm of v_k from the reduction buffer, writes it to R_buffer,
  * normalizes v_k, and runs the first reduction stage for computing inner_prod(r, v_k).
  */
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  typedef T value_type;

  value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
  value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
  value_type const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
  value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);

  // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
  vcl_size_t size     = viennacl::traits::size(v_k);
  vcl_size_t vk_start = viennacl::traits::start(v_k);

  // part 1: compute the norm of v_k:
  value_type norm_vk = 0;
  for (vcl_size_t i = 0; i < buffer_chunk_size; ++i)
    norm_vk += data_buffer[i + buffer_chunk_size];
  norm_vk = std::sqrt(norm_vk);
  data_R[offset_in_R] = norm_vk;

  // part 2: compute <r, v_k> after normalization of v_k:
  value_type inner_prod_r_dot_vk = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;

    inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;

    data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
  }

  data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
}
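// Note on part 1 above: chunk 1 of inner_prod_buffer is where the pipelined product
// kernels deposit the (Ap, Ap) contribution; with Ap being the freshly generated
// Krylov vector v_k, summing that chunk yields ||v_k||^2.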


/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1. */
template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
  value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);

  // reset buffer:
  for (vcl_size_t j = 0; j < k; ++j)
    data_inner_prod[j*buffer_chunk_size] = value_type(0);

  // compute inner products:
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
  }
}


/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1,
  *        then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||.
  */
template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);

  std::vector<T> values_vi_in_vk(k);

  // Step 1: Finish the reduction of <v_i, v_k> to obtain scalars:
  for (std::size_t i = 0; i < k; ++i)
    for (vcl_size_t j = 0; j < buffer_chunk_size; ++j)
      values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];

  // Step 2: Compute v_k -= <v_i, v_k> v_i and the first reduction stage for ||v_k||:
  value_type norm_vk = 0;
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];

    norm_vk += value_vk * value_vk;
    data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
  }

  // Step 3: Write the <v_i, v_k> values to R_buffer:
  for (std::size_t i = 0; i < k; ++i)
    R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];

  inner_prod_buffer[buffer_chunk_size] = norm_vk;
}

/** @brief Computes x += eta_0 * r + sum_{i=1}^{k-1} eta_i * v_{i-1}, where eta_i = coefficients[i]. */
template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t k)
{
  typedef T value_type;

  value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
  value_type const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
  value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);

  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_result = data_result[i];

    value_result += data_coefficients[0] * data_residual[i];
    for (vcl_size_t j = 1; j < k; ++j)
      value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];

    data_result[i] = value_result;
  }
}

// Reuse implementation from CG:
template <typename MatrixType, typename T>
void pipelined_gmres_prod(MatrixType const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
}
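// Usage sketch (illustrative only, not part of the library API): per Krylov iteration k,
// the fused GMRES kernels in this file would be chained roughly as below. All names
// (n, internal_n, chunk_size, chunk_offset, krylov_basis, coefficients, ...) are
// assumptions; the actual orchestration lives in viennacl/linalg/gmres.hpp.
#if 0
pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);                    // next Krylov vector + reductions
pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R,
                             inner_prod_buffer, r_dot_vk_buffer,
                             chunk_size, chunk_offset);               // ||v_k||, v_k /= ||v_k||, partial <r, v_k>
pipelined_gmres_gram_schmidt_stage1(krylov_basis, n, internal_n, k,
                                    vi_in_vk_buffer, chunk_size);     // partial <v_i, v_k>
pipelined_gmres_gram_schmidt_stage2(krylov_basis, n, internal_n, k,
                                    vi_in_vk_buffer, R_buffer,
                                    krylov_dim, inner_prod_buffer,
                                    chunk_size);                      // orthogonalize v_k, partial ||v_k||^2
pipelined_gmres_update_result(result, residual, krylov_basis, n,
                              internal_n, coefficients, k);           // assemble solution update
#endif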


} //namespace host_based
} //namespace linalg
} //namespace viennacl


#endif