ViennaCL - The Vienna Computing Library 1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
iterative_operations.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/host_based/iterative_operations.hpp
    @brief Implementation of specialized routines for the iterative solvers using the host-based execution backend.
*/

#include <cmath>
#include <algorithm> // for std::max and std::min

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/detail/op_applier.hpp"

// Minimum vector size for using OpenMP on vector operations:
#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
#endif
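// Because of the #ifndef guard above, the threshold can be tuned per build without
// editing this header; a hypothetical compiler invocation would look like:
//   g++ -DVIENNACL_WITH_OPENMP -fopenmp -DVIENNACL_OPENMP_VECTOR_MIN_SIZE=10000 app.cpp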

namespace viennacl
{
namespace linalg
{
namespace host_based
{

namespace detail
{
  /** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
    *
    * This routine computes Ap = prod(A, p) together with the first reduction stage for the
    * inner products (Ap, Ap), (p, Ap), and, if r0star is provided, (Ap, r0star).
    */
  template<typename NumericT>
  void pipelined_prod_impl(compressed_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements    = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle1());
    unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(A.handle2());
    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (long row = 0; row < static_cast<long>(A.size1()); ++row)
    {
      value_type dot_prod = 0;
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row

      vcl_size_t row_end = row_buffer[row+1];
      for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
        dot_prod += elements[i] * p_buf[col_buffer[i]];

      // update contributions for the inner products (Ap, Ap) and (p, Ap)
      Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
      inner_prod_ApAp += dot_prod * dot_prod;
      inner_prod_pAp  += val_p_diag * dot_prod;
      inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
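  // Layout of inner_prod_buffer as can be read off the indexing above and in the update
  // routines below: the buffer is split into chunks of buffer_chunk_size entries, and on
  // the host backend only the first entry of each chunk is written:
  //   chunk 0: (r, r) for CG, resp. (r, r0star) for BiCGStab
  //   chunk 1: (Ap, Ap)
  //   chunk 2: (p, Ap)
  //   chunk at buffer_chunk_offset (chunk 3 for BiCGStab): (Ap, r0star)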


  /** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(coordinate_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf       = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf        = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements     = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
    value_type         * data_buffer  = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    // flush result buffer (cannot be expected to be zero)
    for (vcl_size_t i = 0; i < Ap.size(); ++i)
      Ap_buf[i] = 0;

    // matrix-vector product with a general COO format
    for (vcl_size_t i = 0; i < A.nnz(); ++i)
      Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];

    // computing the inner products (Ap, Ap) and (p, Ap):
    // Note: The COO format does not allow injecting the subsequent operations into the
    //       matrix-vector product, because its row and column ordering guarantees are too weak.
    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t i = 0; i < Ap.size(); ++i)
    {
      NumericT value_Ap = Ap_buf[i];
      NumericT value_p  = p_buf[i];

      inner_prod_ApAp += value_Ap * value_Ap;
      inner_prod_pAp  += value_Ap * value_p;
      inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }


  /** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(ell_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type         * Ap_buf      = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const   * p_buf       = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const   * elements    = detail::extract_raw_pointer<value_type>(A.handle());
    unsigned int const * coords      = detail::extract_raw_pointer<unsigned int>(A.handle2());
    value_type         * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t row = 0; row < A.size1(); ++row)
    {
      value_type sum = 0;
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row

      for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
      {
        vcl_size_t offset = row + item_id * A.internal_size1();
        value_type val = elements[offset];

        if (val)
          sum += (p_buf[coords[offset]] * val);
      }

      Ap_buf[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += val_p_diag * sum;
      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
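  // Storage convention of the ELL format, as assumed by the indexing above: the nonzeros
  // are stored column-major in a dense (internal_size1() x internal_maxnnz()) array, so
  // entry 'item_id' of row 'row' sits at offset row + item_id * internal_size1(), with
  // zero-padding for rows holding fewer than internal_maxnnz() nonzeros.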


  /** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT, typename IndexT>
  void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;

    value_type       * Ap_buf            = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const * p_buf             = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const * elements          = detail::extract_raw_pointer<value_type>(A.handle());
    IndexT const     * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
    IndexT const     * column_indices    = detail::extract_raw_pointer<IndexT>(A.handle2());
    IndexT const     * block_start       = detail::extract_raw_pointer<IndexT>(A.handle3());
    value_type       * data_buffer       = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
    std::vector<value_type> result_values(A.rows_per_block());

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
    {
      vcl_size_t current_columns_per_block = columns_per_block[block_idx];

      for (vcl_size_t i = 0; i < result_values.size(); ++i)
        result_values[i] = 0;

      for (IndexT column_entry_index = 0;
                  column_entry_index < current_columns_per_block;
                ++column_entry_index)
      {
        vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
        // Note: This for-loop may be unrolled by hand to exploit vectorization.
        //       Careful benchmarking is recommended first; the memory channels may be saturated already!
        for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
        {
          value_type val = elements[stride_start + row_in_block];

          result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
        }
      }

      vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
      for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
      {
        vcl_size_t row = first_row_in_matrix + row_in_block;
        if (row < Ap.size())
        {
          value_type row_result = result_values[row_in_block];

          Ap_buf[row] = row_result;
          inner_prod_ApAp += row_result * row_result;
          inner_prod_pAp  += p_buf[row] * row_result;
          inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
        }
      }
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }
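  // Storage convention of the sliced ELL format, as assumed above: rows are grouped into
  // blocks of rows_per_block() rows; block b starts at block_start[b] and stores
  // columns_per_block[b] entries per row, column-major within the block, so the entry in
  // column slot c of row r within the block sits at block_start[b] + c * rows_per_block() + r.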


  /** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
  template<typename NumericT>
  void pipelined_prod_impl(hyb_matrix<NumericT> const & A,
                           vector_base<NumericT> const & p,
                           vector_base<NumericT> & Ap,
                           NumericT const * r0star,
                           vector_base<NumericT> & inner_prod_buffer,
                           vcl_size_t buffer_chunk_size,
                           vcl_size_t buffer_chunk_offset)
  {
    typedef NumericT value_type;
    typedef unsigned int index_type;

    value_type       * Ap_buf         = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
    value_type const * p_buf          = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
    value_type const * elements       = detail::extract_raw_pointer<value_type>(A.handle());
    index_type const * coords         = detail::extract_raw_pointer<index_type>(A.handle2());
    value_type const * csr_elements   = detail::extract_raw_pointer<value_type>(A.handle5());
    index_type const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.handle3());
    index_type const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.handle4());
    value_type       * data_buffer    = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

    value_type inner_prod_ApAp = 0;
    value_type inner_prod_pAp = 0;
    value_type inner_prod_Ap_r0star = 0;
    for (vcl_size_t row = 0; row < A.size1(); ++row)
    {
      value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; // likely to be loaded from cache if required again in this row
      value_type sum = 0;

      //
      // Part 1: Process ELL part
      //
      for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
      {
        vcl_size_t offset = row + item_id * A.internal_size1();
        value_type val = elements[offset];

        if (val)
          sum += p_buf[coords[offset]] * val;
      }

      //
      // Part 2: Process CSR part (the remainder of the HYB format)
      //
      vcl_size_t col_begin = csr_row_buffer[row];
      vcl_size_t col_end   = csr_row_buffer[row + 1];

      for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
        sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];

      Ap_buf[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += val_p_diag * sum;
      inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
    }

    data_buffer[    buffer_chunk_size] = inner_prod_ApAp;
    data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
    if (r0star)
      data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
  }

} // namespace detail


/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
  *
  * This routine computes for vectors 'result', 'p', 'r', 'Ap':
  *   result += alpha * p;
  *   r      -= alpha * Ap;
  *   p       = r + beta * p;
  * and runs the first reduction stage for computing inner_prod(r, r)
  */
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT value_type;

  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p      = detail::extract_raw_pointer<value_type>(p);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in CG, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_p = data_p[static_cast<vcl_size_t>(i)];
    value_type value_r = data_r[static_cast<vcl_size_t>(i)];

    data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
    value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
    value_p  = value_r + beta * value_p;
    inner_prod_r += value_r * value_r;

    data_p[static_cast<vcl_size_t>(i)] = value_p;
    data_r[static_cast<vcl_size_t>(i)] = value_r;
  }

  data_buffer[0] = inner_prod_r;
}
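// Usage sketch (illustrative only, not part of the library API): one iteration of the
// pipelined CG method chains the fused product below with the fused vector update above.
// The names n, chunk_size, alpha, and beta are assumptions maintained by the surrounding
// solver loop (see viennacl/linalg/cg.hpp for the actual driver).
#if 0
viennacl::vector<double> result(n), p(n), r(n), Ap(n);
viennacl::vector<double> inner_prod_buffer(3 * chunk_size); // chunks: (r,r), (Ap,Ap), (p,Ap)

pipelined_cg_prod(A, p, Ap, inner_prod_buffer);                               // Ap = A*p, partial (Ap,Ap), (p,Ap)
pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer); // x, r, p updates, partial (r,r)
#endif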


/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */
template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  typedef NumericT const * PtrType;
  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
}


/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes s = r - alpha * Ap, with alpha obtained by finishing the reduction
  * over the inner products (r, r0star) and (Ap, r0star), and runs the first reduction stage for (s, s).
  */
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  typedef NumericT value_type;

  value_type       * data_s      = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_r      = detail::extract_raw_pointer<value_type>(r);
  value_type const * data_Ap     = detail::extract_raw_pointer<value_type>(Ap);
  value_type       * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  // Note: Due to the special setting in BiCGStab, there is no need to check for sizes and strides
  vcl_size_t size = viennacl::traits::size(s);

  // part 1: compute alpha:
  value_type r_in_r0 = 0;
  value_type Ap_in_r0 = 0;
  for (vcl_size_t i = 0; i < buffer_chunk_size; ++i)
  {
    r_in_r0  += data_buffer[i];
    Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
  }
  value_type alpha = r_in_r0 / Ap_in_r0;

  // part 2: s = r - alpha * Ap and first reduction stage for (s, s):
  value_type inner_prod_s = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];

    inner_prod_s += value_s * value_s;

    data_s[static_cast<vcl_size_t>(i)] = value_s;
  }

  data_buffer[buffer_chunk_offset] = inner_prod_s;
}
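// Note on part 1 above: chunk 0 of inner_prod_buffer holds the result of (r, r0star) and
// chunk 3 that of (Ap, r0star), so summing each chunk yields
//   alpha = <r, r0star> / <Ap, r0star>,
// the standard BiCGStab step size.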

/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes for vectors 'result', 'p', 's', 'residual', 'As', 'Ap', 'r0star':
  *   result   += alpha * p + omega * s;
  *   residual  = s - omega * As;
  *   p         = residual + beta * (p - omega * Ap);
  * and runs the first reduction stage for computing inner_prod(residual, r0star)
  */
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
                                      NumericT beta, vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer,
                                      vcl_size_t buffer_chunk_size)
{
  typedef NumericT value_type;

  value_type       * data_result   = detail::extract_raw_pointer<value_type>(result);
  value_type       * data_p        = detail::extract_raw_pointer<value_type>(p);
  value_type const * data_s        = detail::extract_raw_pointer<value_type>(s);
  value_type       * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_As       = detail::extract_raw_pointer<value_type>(As);
  value_type const * data_Ap       = detail::extract_raw_pointer<value_type>(Ap);
  value_type const * data_r0star   = detail::extract_raw_pointer<value_type>(r0star);
  value_type       * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);

  vcl_size_t size = viennacl::traits::size(result);

  value_type inner_prod_r_r0star = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    vcl_size_t index = static_cast<vcl_size_t>(i);
    value_type value_result   = data_result[index];
    value_type value_p        = data_p[index];
    value_type value_s        = data_s[index];
    value_type value_residual = data_residual[index];
    value_type value_As       = data_As[index];
    value_type value_Ap       = data_Ap[index];
    value_type value_r0star   = data_r0star[index];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);
    inner_prod_r_r0star += value_residual * value_r0star;

    data_result[index]   = value_result;
    data_residual[index] = value_residual;
    data_p[index]        = value_p;
  }

  (void)buffer_chunk_size; // not needed here, just silences a compiler warning about an unused variable
  data_buffer[0] = inner_prod_r_r0star;
}

/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT, typename IndexT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}

/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm. */
template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);

  viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
}


/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
  *
  * This routine computes the norm of v_k from the reduction buffer, writes it to R_buffer,
  * normalizes v_k, and runs the first reduction stage for computing inner_prod(r, v_k).
  */
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  typedef T value_type;

  value_type       * data_v_k      = detail::extract_raw_pointer<value_type>(v_k);
  value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
  value_type       * data_R        = detail::extract_raw_pointer<value_type>(R_buffer);
  value_type const * data_buffer   = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
  value_type       * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);

  // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
  vcl_size_t size     = viennacl::traits::size(v_k);
  vcl_size_t vk_start = viennacl::traits::start(v_k);

  // part 1: compute the norm of v_k:
  value_type norm_vk = 0;
  for (vcl_size_t i = 0; i < buffer_chunk_size; ++i)
    norm_vk += data_buffer[i + buffer_chunk_size];
  norm_vk = std::sqrt(norm_vk);
  data_R[offset_in_R] = norm_vk;

  // part 2: compute <r, v_k> after normalization of v_k:
  value_type inner_prod_r_dot_vk = 0;
  for (long i = 0; i < static_cast<long>(size); ++i)
  {
    value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;

    inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;

    data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
  }

  data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
}
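// Note on part 1 above: chunk 1 of inner_prod_buffer is where the pipelined product
// kernels deposit the (Ap, Ap) contribution; with Ap being the freshly generated
// Krylov vector v_k, summing that chunk yields ||v_k||^2.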


/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1. */
template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
  value_type       * data_inner_prod   = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);

  // reset buffer:
  for (vcl_size_t j = 0; j < k; ++j)
    data_inner_prod[j*buffer_chunk_size] = value_type(0);

  // compute inner products:
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
  }
}


/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i = 0..k-1,
  *        then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||.
  */
template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  typedef T value_type;

  value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);

  std::vector<T> values_vi_in_vk(k);

  // Step 1: Finish the reduction of <v_i, v_k> to obtain scalars:
  for (std::size_t i = 0; i < k; ++i)
    for (vcl_size_t j = 0; j < buffer_chunk_size; ++j)
      values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];

  // Step 2: Compute v_k -= <v_i, v_k> v_i and the first reduction stage for ||v_k||:
  value_type norm_vk = 0;
  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];

    for (vcl_size_t j = 0; j < k; ++j)
      value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];

    norm_vk += value_vk * value_vk;
    data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
  }

  // Step 3: Write the <v_i, v_k> values to R_buffer:
  for (std::size_t i = 0; i < k; ++i)
    R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];

  inner_prod_buffer[buffer_chunk_size] = norm_vk;
}

/** @brief Computes x += eta_0 * r + sum_{i=1}^{k-1} eta_i * v_{i-1}, where eta_i = coefficients[i]. */
template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t k)
{
  typedef T value_type;

  value_type       * data_result       = detail::extract_raw_pointer<value_type>(result);
  value_type const * data_residual     = detail::extract_raw_pointer<value_type>(residual);
  value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
  value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);

  for (vcl_size_t i = 0; i < v_k_size; ++i)
  {
    value_type value_result = data_result[i];

    value_result += data_coefficients[0] * data_residual[i];
    for (vcl_size_t j = 1; j < k; ++j)
      value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];

    data_result[i] = value_result;
  }
}

// Reuse implementation from CG:
template <typename MatrixType, typename T>
void pipelined_gmres_prod(MatrixType const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
}
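// Usage sketch (illustrative only, not part of the library API): per Krylov iteration k,
// the fused GMRES kernels in this file would be chained roughly as below. All names
// (n, internal_n, chunk_size, chunk_offset, krylov_basis, coefficients, ...) are
// assumptions; the actual orchestration lives in viennacl/linalg/gmres.hpp.
#if 0
pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);                    // next Krylov vector + reductions
pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R,
                             inner_prod_buffer, r_dot_vk_buffer,
                             chunk_size, chunk_offset);               // ||v_k||, v_k /= ||v_k||, partial <r, v_k>
pipelined_gmres_gram_schmidt_stage1(krylov_basis, n, internal_n, k,
                                    vi_in_vk_buffer, chunk_size);     // partial <v_i, v_k>
pipelined_gmres_gram_schmidt_stage2(krylov_basis, n, internal_n, k,
                                    vi_in_vk_buffer, R_buffer,
                                    krylov_dim, inner_prod_buffer,
                                    chunk_size);                      // orthogonalize v_k, partial ||v_k||^2
pipelined_gmres_update_result(result, residual, krylov_basis, n,
                              internal_n, coefficients, k);           // assemble solution update
#endif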


} //namespace host_based
} //namespace linalg
} //namespace viennacl


#endif