1 #ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
41 #ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
42 #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
60 template<
typename NumericT>
73 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
74 unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle1());
75 unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle2());
76 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
78 value_type inner_prod_ApAp = 0;
79 value_type inner_prod_pAp = 0;
80 value_type inner_prod_Ap_r0star = 0;
81 for (
long row = 0; row < static_cast<long>(A.
size1()); ++
row)
88 dot_prod += elements[i] * p_buf[col_buffer[i]];
92 inner_prod_ApAp += dot_prod *
dot_prod;
93 inner_prod_pAp += val_p_diag *
dot_prod;
94 inner_prod_Ap_r0star += r0star ? dot_prod * r0star[
static_cast<vcl_size_t>(
row)] : value_type(0);
97 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
98 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
100 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
111 template<
typename NumericT>
124 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
125 unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.
handle12());
126 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
134 Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
138 value_type inner_prod_ApAp = 0;
139 value_type inner_prod_pAp = 0;
140 value_type inner_prod_Ap_r0star = 0;
146 inner_prod_ApAp += value_Ap * value_Ap;
147 inner_prod_pAp += value_Ap * value_p;
148 inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
151 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
152 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
154 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
164 template<
typename NumericT>
177 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
178 unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(A.
handle2());
179 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
181 value_type inner_prod_ApAp = 0;
182 value_type inner_prod_pAp = 0;
183 value_type inner_prod_Ap_r0star = 0;
189 for (
unsigned int item_id = 0; item_id < A.
internal_maxnnz(); ++item_id)
192 value_type val = elements[offset];
195 sum += (p_buf[coords[offset]] * val);
199 inner_prod_ApAp += sum *
sum;
200 inner_prod_pAp += val_p_diag *
sum;
201 inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
204 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
205 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
207 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
217 template<
typename NumericT,
typename IndexT>
230 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
231 IndexT
const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.
handle1());
232 IndexT
const * column_indices = detail::extract_raw_pointer<IndexT>(A.
handle2());
233 IndexT
const * block_start = detail::extract_raw_pointer<IndexT>(A.
handle3());
234 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
239 value_type inner_prod_ApAp = 0;
240 value_type inner_prod_pAp = 0;
241 value_type inner_prod_Ap_r0star = 0;
242 for (
vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
244 vcl_size_t current_columns_per_block = columns_per_block[block_idx];
246 for (
vcl_size_t i=0; i<result_values.size(); ++i)
247 result_values[i] = 0;
249 for (IndexT column_entry_index = 0;
250 column_entry_index < current_columns_per_block;
251 ++column_entry_index)
256 for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
258 value_type val = elements[stride_start + row_in_block];
260 result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
265 for (IndexT row_in_block = 0; row_in_block < A.
rows_per_block(); ++row_in_block)
270 value_type row_result = result_values[row_in_block];
272 Ap_buf[
row] = row_result;
273 inner_prod_ApAp += row_result * row_result;
274 inner_prod_pAp += p_buf[
row] * row_result;
275 inner_prod_Ap_r0star += r0star ? row_result * r0star[
row] : value_type(0);
280 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
281 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
283 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
293 template<
typename NumericT>
303 typedef unsigned int index_type;
307 value_type
const * elements = detail::extract_raw_pointer<value_type>(A.
handle());
308 index_type
const * coords = detail::extract_raw_pointer<index_type>(A.
handle2());
309 value_type
const * csr_elements = detail::extract_raw_pointer<value_type>(A.
handle5());
310 index_type
const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.
handle3());
311 index_type
const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.
handle4());
312 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
314 value_type inner_prod_ApAp = 0;
315 value_type inner_prod_pAp = 0;
316 value_type inner_prod_Ap_r0star = 0;
328 value_type val = elements[offset];
331 sum += p_buf[coords[offset]] * val;
340 for (
vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
341 sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
344 inner_prod_ApAp += sum *
sum;
345 inner_prod_pAp += val_p_diag *
sum;
346 inner_prod_Ap_r0star += r0star ? sum * r0star[
row] : value_type(0);
349 data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
350 data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
352 data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
366 template<
typename NumericT>
377 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
378 value_type * data_p = detail::extract_raw_pointer<value_type>(p);
379 value_type * data_r = detail::extract_raw_pointer<value_type>(r);
380 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
381 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
386 value_type inner_prod_r = 0;
387 for (
long i = 0; i < static_cast<long>(
size); ++i)
389 value_type value_p = data_p[
static_cast<vcl_size_t>(i)];
390 value_type value_r = data_r[
static_cast<vcl_size_t>(i)];
393 data_result[
static_cast<vcl_size_t>(i)] += alpha * value_p;
394 value_r -= alpha * data_Ap[
static_cast<vcl_size_t>(i)];
395 value_p = value_r + beta * value_p;
396 inner_prod_r += value_r * value_r;
402 data_buffer[0] = inner_prod_r;
412 template<
typename NumericT>
430 template<
typename NumericT>
447 template<
typename NumericT>
464 template<
typename NumericT,
typename IndexT>
483 template<
typename NumericT>
503 template<
typename NumericT>
513 value_type * data_s = detail::extract_raw_pointer<value_type>(s);
514 value_type * data_r = detail::extract_raw_pointer<value_type>(r);
515 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
516 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
522 value_type r_in_r0 = 0;
523 value_type Ap_in_r0 = 0;
524 for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
526 r_in_r0 += data_buffer[i];
527 Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
529 value_type alpha = r_in_r0 / Ap_in_r0;
532 value_type inner_prod_s = 0;
533 for (
long i = 0; i < static_cast<long>(
size); ++i)
535 value_type value_s = data_s[
static_cast<vcl_size_t>(i)];
537 value_s = data_r[
static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
538 inner_prod_s += value_s * value_s;
543 data_buffer[buffer_chunk_offset] = inner_prod_s;
553 template<
typename NumericT>
563 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
564 value_type * data_p = detail::extract_raw_pointer<value_type>(p);
565 value_type
const * data_s = detail::extract_raw_pointer<value_type>(s);
566 value_type * data_residual = detail::extract_raw_pointer<value_type>(residual);
567 value_type
const * data_As = detail::extract_raw_pointer<value_type>(As);
568 value_type
const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
569 value_type
const * data_r0star = detail::extract_raw_pointer<value_type>(r0star);
570 value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
574 value_type inner_prod_r_r0star = 0;
575 for (
long i = 0; i < static_cast<long>(
size); ++i)
578 value_type value_result = data_result[index];
579 value_type value_p = data_p[index];
580 value_type value_s = data_s[index];
581 value_type value_residual = data_residual[index];
582 value_type value_As = data_As[index];
583 value_type value_Ap = data_Ap[index];
584 value_type value_r0star = data_r0star[index];
586 value_result += alpha * value_p + omega * value_s;
587 value_residual = value_s - omega * value_As;
588 value_p = value_residual + beta * (value_p - omega * value_Ap);
589 inner_prod_r_r0star += value_residual * value_r0star;
591 data_result[index] = value_result;
592 data_residual[index] = value_residual;
593 data_p[index] = value_p;
596 (void)buffer_chunk_size;
597 data_buffer[0] = inner_prod_r_r0star;
606 template<
typename NumericT>
615 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
626 template<
typename NumericT>
635 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
646 template<
typename NumericT>
655 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
666 template<
typename NumericT,
typename IndexT>
675 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
686 template<
typename NumericT>
695 NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
710 template <
typename T>
720 typedef T value_type;
722 value_type * data_v_k = detail::extract_raw_pointer<value_type>(v_k);
723 value_type
const * data_residual = detail::extract_raw_pointer<value_type>(residual);
724 value_type * data_R = detail::extract_raw_pointer<value_type>(R_buffer);
725 value_type
const * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
726 value_type * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
733 value_type norm_vk = 0;
734 for (
vcl_size_t i=0; i<buffer_chunk_size; ++i)
735 norm_vk += data_buffer[i + buffer_chunk_size];
736 norm_vk = std::sqrt(norm_vk);
737 data_R[offset_in_R] = norm_vk;
740 value_type inner_prod_r_dot_vk = 0;
741 for (
long i = 0; i < static_cast<long>(
size); ++i)
743 value_type value_vk = data_v_k[
static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
745 inner_prod_r_dot_vk += data_residual[
static_cast<vcl_size_t>(i)] * value_vk;
747 data_v_k[
static_cast<vcl_size_t>(i) + vk_start] = value_vk;
750 data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
759 template <
typename T>
767 typedef T value_type;
769 value_type
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
770 value_type * data_inner_prod = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
774 data_inner_prod[j*buffer_chunk_size] = value_type(0);
779 value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
782 data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
791 template <
typename T>
802 typedef T value_type;
804 value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
806 std::vector<T> values_vi_in_vk(k);
809 for (std::size_t i=0; i<k; ++i)
810 for (
vcl_size_t j=0; j<buffer_chunk_size; ++j)
811 values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];
815 value_type norm_vk = 0;
818 value_type value_vk = data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size];
821 value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
823 norm_vk += value_vk * value_vk;
824 data_krylov_basis[
static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
828 for (std::size_t i=0; i<k; ++i)
829 R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];
831 inner_prod_buffer[buffer_chunk_size] = norm_vk;
835 template <
typename T>
844 typedef T value_type;
846 value_type * data_result = detail::extract_raw_pointer<value_type>(result);
847 value_type
const * data_residual = detail::extract_raw_pointer<value_type>(residual);
848 value_type
const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
849 value_type
const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
853 value_type value_result = data_result[i];
855 value_result += data_coefficients[0] * data_residual[i];
857 value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
859 data_result[i] = value_result;
865 template <
typename MatrixType,
typename T>
vcl_size_t internal_ellnnz() const
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
viennacl::scalar_expression< const viennacl::vector_base< NumericT >, const viennacl::vector_base< NumericT >, viennacl::op_sum > sum(viennacl::vector_base< NumericT > const &x)
User interface function for computing the sum of all elements of a vector.
Generic size and resize functionality for different vector and matrix types.
const handle_type & handle3() const
const vcl_size_t & size1() const
Returns the number of rows.
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
const handle_type & handle() const
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t internal_size1() const
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Computes the second reduction stage for multiple inner products , i=0..k-1, then updates v_k -= v_i and computes the first reduction stage for ||v_k||.
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t nnz() const
Returns the number of nonzero entries.
Determines row and column increments for matrices and matrix proxies.
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
const handle_type & handle4() const
vcl_size_t rows_per_block() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab a...
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
vcl_size_t internal_size1() const
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
const handle_type & handle2() const
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Sparse matrix class using the sliced ELLPACK with parameters C, .
result_of::size_type< T >::type start(T const &obj)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
Common routines for single-threaded or OpenMP-enabled execution on CPU.
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorit...
void pipelined_prod_impl(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, NumericT const *r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined C...
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm...
size_type size() const
Returns the length of the vector (cf. std::vector)
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t k)
Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1}.
const handle_type & handle5() const
vcl_size_t internal_maxnnz() const
Defines the action of certain unary and binary operators and its arguments (for host execution)...
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
Computes first reduction stage for multiple inner products , i=0..k-1.
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
void pipelined_gmres_prod(MatrixType const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
Simple enable-if variant that uses the SFINAE pattern.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...