#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_

// ... (license header and #include directives elided)

#ifdef VIENNACL_WITH_OPENMP
#include <omp.h>
#endif
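// The routines in this header form the single-threaded / OpenMP-enabled CPU backend of
// ViennaCL's sparse linear algebra (namespace viennacl::linalg::host_based). They are not
// called directly by user code; the generic interfaces viennacl::linalg::prod() and
// viennacl::linalg::inplace_solve() dispatch to them when the data resides in main memory.

//
// Compressed matrix (CSR format)
//

// row_info(): extracts per-row information of a CSR matrix into a dense vector. Depending on
// info_selector this is the row-wise infinity norm, 1-norm, 2-norm, or the diagonal entry.
// handle1() holds the CSR row pointers, handle2() the column indices, handle() the values.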
template<typename NumericT, unsigned int AlignmentV>
void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  for (vcl_size_t row = 0; row < mat.size1(); ++row)
  {
    NumericT value = 0;
    unsigned int row_end = row_buffer[row+1];

    switch (info_selector)
    {
      // ... (case labels and break statements elided) infinity norm of the row:
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value = std::max<NumericT>(value, std::fabs(elements[i]));
      // ... 1-norm of the row:
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value += std::fabs(elements[i]);
      // ... 2-norm of the row:
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          value += elements[i] * elements[i];
        value = std::sqrt(value);
      // ... diagonal entry of the row:
        for (unsigned int i = row_buffer[row]; i < row_end; ++i)
          if (col_buffer[i] == row)
            value = elements[i];
    }
    result_buf[row] = value;
  }
}
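// prod_impl(): sparse matrix-vector product result = alpha * mat * vec + beta * result for a
// CSR matrix. Rows are independent, so the outer loop can be parallelized with OpenMP; the
// vector is addressed via its start() offset and stride() so that vector ranges and slices work.
//
// Illustrative usage sketch (user code, not part of this file); the generic interface dispatches
// to this kernel on the CPU backend:
//
//   viennacl::compressed_matrix<double> A(n, n);   // assume n and the entries are set up
//   viennacl::vector<double> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);              // eventually calls host_based::prod_impl()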
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
  {
    NumericT dot_prod = 0;
    // ... (loop over the nonzeros of this row)
      dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];

    if (beta < 0 || beta > 0)   // beta != 0: blend with the previous content of result
    {
      // ... (index = row * result.stride() + result.start())
      result_buf[index] = alpha * dot_prod + beta * result_buf[index];
    }
    // ... (otherwise the entry is overwritten with alpha * dot_prod)
  }
}
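// prod_impl(): sparse-times-dense product result = sp_mat * d_mat for a CSR matrix and a dense
// matrix. The dense operands are addressed through strided accessor wrappers (one instance for
// row-major and one for column-major storage), and the loop order is chosen according to the
// storage layout of d_mat so that the dense matrix is traversed with stride-friendly accesses.
// The transposed variant prod_impl(sp_mat, trans(d_mat), result) below follows the same pattern
// with the dense indices swapped.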
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, d_mat, result)
{
  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // ... (for each column of d_mat, temp accumulates one entry of the result)
        for (vcl_size_t k = row_start; k < row_end; ++k)
          temp += sp_mat_elements[k] * d_mat_wrapper_row(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), col);

        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
        // ... (row_start/row_end of the sparse row; temp accumulator)
        for (vcl_size_t k = row_start; k < row_end; ++k)
          temp += sp_mat_elements[k] * d_mat_wrapper_col(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), static_cast<vcl_size_t>(col));

        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, transposed dense matrix proxy d_mat, result)
{
  NumericT     const * sp_mat_elements   = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
  unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // ... (row_start/row_end of the sparse row)
      for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
      {
        // ... (temp accumulates result(row, col); note the swapped dense indices due to the transposition)
        for (vcl_size_t k = row_start; k < row_end; ++k)
          temp += sp_mat_elements[k] * d_mat_wrapper_row(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));

        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
      }
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      // ... (loop over the rows of sp_mat; temp accumulator)
        for (vcl_size_t k = row_start; k < row_end; ++k)
          temp += sp_mat_elements[k] * d_mat_wrapper_col(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));

        if (result.row_major())
          result_wrapper_row(row, col) = temp;
        else
          result_wrapper_col(row, col) = temp;
    }
  }
}
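// prod_impl(): sparse-times-sparse product C = A * B for CSR matrices ("spGEMM"). The kernel
// works row-wise in three phases:
//   1. For every row of C, compute a cheap upper bound on its number of nonzeros (the sum of the
//      lengths of the rows of B selected by the nonzeros of A) and size per-thread work buffers.
//   2. Symbolic phase: row_C_scan_symbolic_vector() determines the exact number of nonzeros per
//      row of C; an exclusive scan turns these counts into the CSR row pointer array and
//      C.reserve() allocates the nonzero storage.
//   3. Numeric phase: row_C_scan_numeric_vector() fills the column indices and values of C.
//
// Illustrative usage sketch (user code, not part of this file):
//
//   viennacl::compressed_matrix<double> A(n, n), B(n, n), C;
//   C = viennacl::linalg::prod(A, B);   // dispatches to this kernel on the CPU backend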
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: A, B, C)
{
  NumericT     const * A_elements   = detail::extract_raw_pointer<NumericT>(A.handle());
  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());

  NumericT     const * B_elements   = detail::extract_raw_pointer<NumericT>(B.handle());
  unsigned int const * B_row_buffer = detail::extract_raw_pointer<unsigned int>(B.handle1());
  unsigned int const * B_col_buffer = detail::extract_raw_pointer<unsigned int>(B.handle2());

  unsigned int * C_row_buffer = detail::extract_raw_pointer<unsigned int>(C.handle1());

#if defined(VIENNACL_WITH_OPENMP)
  unsigned int block_factor = 10;
  unsigned int max_threads  = omp_get_max_threads();
  long chunk_size = long(A.size1()) / long(block_factor * max_threads) + 1;
#else
  unsigned int max_threads = 1;
#endif
  std::vector<unsigned int>   max_length_row_C(max_threads);
  std::vector<unsigned int *> row_C_temp_index_buffers(max_threads);
  std::vector<NumericT *>     row_C_temp_value_buffers(max_threads);
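  // Phase 1: upper bound on the number of nonzeros per row of C. For row i of A, the bound is
  // the sum of the lengths of all rows of B hit by the nonzeros of A's row i, capped at
  // B.size2(). Each thread records the largest bound it has seen so that one scratch buffer per
  // thread suffices for all of its rows.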
#if defined(VIENNACL_WITH_OPENMP)
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i = 0; i < long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_upper_bound_row = 0;
    for (unsigned int j = row_start_A; j < row_end_A; ++j)
    {
      unsigned int row_B = A_col_buffer[j];
      unsigned int entries_in_row = B_row_buffer[row_B+1] - B_row_buffer[row_B];
      row_C_upper_bound_row += entries_in_row;
    }

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif
    max_length_row_C[thread_id] = std::max(max_length_row_C[thread_id],
                                           std::min(row_C_upper_bound_row, static_cast<unsigned int>(B.size2())));
  }

  // reduce the per-thread maxima and allocate the index scratch buffers (three per thread):
  for (std::size_t i = 1; i < max_length_row_C.size(); ++i)
    max_length_row_C[0] = std::max(max_length_row_C[0], max_length_row_C[i]);

  for (unsigned int i = 0; i < max_threads; ++i)
    row_C_temp_index_buffers[i] = (unsigned int *)malloc(sizeof(unsigned int)*3*max_length_row_C[0]);
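  // Phase 2 (symbolic): determine the exact number of nonzeros of each row of C. The per-thread
  // scratch buffer is split into three sub-buffers which row_C_scan_symbolic_vector() uses to
  // merge the column index lists of the rows of B selected by row i of A. The count is written
  // to C_row_buffer[i] and converted to row pointers afterwards.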
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i = 0; i < long(A.size1()); ++i)
  {
    unsigned int thread_id = 0;
#ifdef VIENNACL_WITH_OPENMP
    thread_id = omp_get_thread_num();
#endif
    unsigned int buffer_len = max_length_row_C[0];

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + buffer_len;
    unsigned int *row_C_vector_3 = row_C_vector_2 + buffer_len;

    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    C_row_buffer[i] = row_C_scan_symbolic_vector(row_start_A, row_end_A, A_col_buffer,
                                                 B_row_buffer, B_col_buffer, static_cast<unsigned int>(B.size2()),
                                                 row_C_vector_1, row_C_vector_2, row_C_vector_3);
  }
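  // Convert the per-row counts in C_row_buffer into CSR row pointers via an exclusive scan:
  // e.g. counts [2, 1, 3] become offsets [0, 2, 3] with a total of 6 nonzeros, which is exactly
  // the amount of storage requested from C.reserve() below.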
  unsigned int current_offset = 0;
  for (std::size_t i = 0; i < C.size1(); ++i)
  {
    unsigned int tmp = C_row_buffer[i];
    C_row_buffer[i] = current_offset;
    current_offset += tmp;
  }
  C_row_buffer[C.size1()] = current_offset;
  C.reserve(current_offset, false);
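  // Phase 3 (numeric): with the sparsity pattern of C fixed, allocate matching per-thread value
  // buffers and let row_C_scan_numeric_vector() write the column indices and values of each row
  // of C directly into the target arrays C_col_buffer / C_elements.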
  for (unsigned int i = 0; i < max_threads; ++i)
    row_C_temp_value_buffers[i] = (NumericT *)malloc(sizeof(NumericT)*3*max_length_row_C[0]);

  NumericT     * C_elements   = detail::extract_raw_pointer<NumericT>(C.handle());
  unsigned int * C_col_buffer = detail::extract_raw_pointer<unsigned int>(C.handle2());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for schedule(dynamic, chunk_size)
#endif
  for (long i = 0; i < long(A.size1()); ++i)
  {
    unsigned int row_start_A = A_row_buffer[i];
    unsigned int row_end_A   = A_row_buffer[i+1];

    unsigned int row_C_buffer_start = C_row_buffer[i];
    unsigned int row_C_buffer_end   = C_row_buffer[i+1];

#ifdef VIENNACL_WITH_OPENMP
    unsigned int thread_id = omp_get_thread_num();
#else
    unsigned int thread_id = 0;
#endif

    unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
    unsigned int *row_C_vector_2 = row_C_vector_1 + max_length_row_C[0];
    unsigned int *row_C_vector_3 = row_C_vector_2 + max_length_row_C[0];

    NumericT *row_C_vector_1_values = row_C_temp_value_buffers[thread_id];
    NumericT *row_C_vector_2_values = row_C_vector_1_values + max_length_row_C[0];
    NumericT *row_C_vector_3_values = row_C_vector_2_values + max_length_row_C[0];

    row_C_scan_numeric_vector(row_start_A, row_end_A, A_col_buffer, A_elements,
                              B_row_buffer, B_col_buffer, B_elements, static_cast<unsigned int>(B.size2()),
                              row_C_buffer_start, row_C_buffer_end, C_col_buffer, C_elements,
                              row_C_vector_1, row_C_vector_1_values,
                              row_C_vector_2, row_C_vector_2_values,
                              row_C_vector_3, row_C_vector_3_values);
  }

  // clean up the per-thread scratch buffers:
  for (unsigned int i = 0; i < max_threads; ++i)
  {
    free(row_C_temp_index_buffers[i]);
    free(row_C_temp_value_buffers[i]);
  }
}
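//
// Triangular solves for a CSR matrix: A \ b, with b overwritten by the solution
//

// The detail::csr_inplace_solve() kernels below operate directly on the raw CSR arrays. The four
// overloads are selected by tag dispatch: the unit-lower and lower variants use forward
// substitution (rows from top to bottom), the unit-upper and upper variants use backward
// substitution (rows from bottom to top); the non-unit variants additionally pick up the
// diagonal entry of the current row and divide by it.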
template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_inplace_solve(IndexArrayT const & row_buffer,
                       IndexArrayT const & col_buffer,
                       ConstScalarArrayT const & element_buffer,
                       ScalarArrayT & vec_buffer,
                       vcl_size_t num_cols,
                       viennacl::linalg::unit_lower_tag)
{
  // ... (forward substitution: loop over rows from top to bottom; vec_entry starts as vec_buffer[row])
    for (vcl_size_t i = row_begin; i < row_end; ++i)
    {
      // ... (only entries with col_index < row contribute)
        vec_entry -= vec_buffer[col_index] * element_buffer[i];
    }
    vec_buffer[row] = vec_entry;
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_inplace_solve(IndexArrayT const & row_buffer,
                       IndexArrayT const & col_buffer,
                       ConstScalarArrayT const & element_buffer,
                       ScalarArrayT & vec_buffer,
                       vcl_size_t num_cols,
                       viennacl::linalg::lower_tag)
{
  // ... (forward substitution with explicit diagonal)
    for (vcl_size_t i = row_begin; i < row_end; ++i)
    {
      // ... (entries left of the diagonal are subtracted, the diagonal itself is remembered)
        vec_entry -= vec_buffer[col_index] * element_buffer[i];
      else if (col_index == row)
        diagonal_entry = element_buffer[i];
    }
    vec_buffer[row] = vec_entry / diagonal_entry;
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_inplace_solve(IndexArrayT const & row_buffer,
                       IndexArrayT const & col_buffer,
                       ConstScalarArrayT const & element_buffer,
                       ScalarArrayT & vec_buffer,
                       vcl_size_t num_cols,
                       viennacl::linalg::unit_upper_tag)
{
  for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
  {
    // ... (backward substitution: row runs from the bottom towards the top)
    for (vcl_size_t i = row_begin; i < row_end; ++i)
    {
      // ... (only entries with col_index > row contribute)
        vec_entry -= vec_buffer[col_index] * element_buffer[i];
    }
    vec_buffer[row] = vec_entry;
  }
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_inplace_solve(IndexArrayT const & row_buffer,
                       IndexArrayT const & col_buffer,
                       ConstScalarArrayT const & element_buffer,
                       ScalarArrayT & vec_buffer,
                       vcl_size_t num_cols,
                       viennacl::linalg::upper_tag)
{
  for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
  {
    // ... (backward substitution with explicit diagonal)
    for (vcl_size_t i = row_begin; i < row_end; ++i)
    {
      // ... (entries right of the diagonal are subtracted, the diagonal itself is remembered)
        vec_entry -= vec_buffer[col_index] * element_buffer[i];
      else if (col_index == row)
        diagonal_entry = element_buffer[i];
    }
    vec_buffer[row] = vec_entry / diagonal_entry;
  }
}
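// The public inplace_solve() overloads for compressed_matrix simply extract the raw CSR arrays
// and forward to detail::csr_inplace_solve() together with the solver tag (lower, unit lower,
// upper, or unit upper).
//
// Illustrative usage sketch (user code, not part of this file):
//
//   viennacl::compressed_matrix<double> L(n, n);   // assume a lower triangular matrix
//   viennacl::vector<double> b(n);
//   viennacl::linalg::inplace_solve(L, b, viennacl::linalg::lower_tag());   // b <- L^{-1} b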
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: lower-triangular matrix L, vector vec, and a lower-type solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (second lower-triangular overload; identical except for the solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(L.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: upper-triangular matrix U, vector vec, and an upper-type solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (second upper-triangular overload; identical except for the solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(U.handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());

  detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
}
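//
// Triangular solves with a transposed CSR matrix: A^T \ b
//

// For the transposed system the CSR row arrays of A describe the columns of A^T, so the
// detail::csr_trans_inplace_solve() kernels below use column-oriented substitution: once an
// unknown is finalized, its contribution is immediately scattered into the remaining entries of
// the right-hand side (entries below the diagonal for the lower variants, above it for the
// upper variants).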
template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                             IndexArrayT const & col_buffer,
                             ConstScalarArrayT const & element_buffer,
                             ScalarArrayT & vec_buffer,
                             vcl_size_t num_cols,
                             viennacl::linalg::unit_lower_tag)
{
  for (vcl_size_t col = 0; col < num_cols; ++col)
  {
    NumericT vec_entry = vec_buffer[col];
    // ... (col_begin/col_end from row_buffer)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      unsigned int row_index = col_buffer[i];
      // ... (only entries with row_index > col contribute)
        vec_buffer[row_index] -= vec_entry * element_buffer[i];
    }
  }
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                             IndexArrayT const & col_buffer,
                             ConstScalarArrayT const & element_buffer,
                             ScalarArrayT & vec_buffer,
                             vcl_size_t num_cols,
                             viennacl::linalg::lower_tag)
{
  for (vcl_size_t col = 0; col < num_cols; ++col)
  {
    // ... (first pass over the column: locate the diagonal entry)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index = col_buffer[i])
      if (row_index == col)
        diagonal_entry = element_buffer[i];
    }

    NumericT vec_entry = vec_buffer[col] / diagonal_entry;
    vec_buffer[col] = vec_entry;
    // ... (second pass: scatter the finalized unknown into the entries below the diagonal)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index > col)
        vec_buffer[row_index] -= vec_entry * element_buffer[i];
    }
  }
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                             IndexArrayT const & col_buffer,
                             ConstScalarArrayT const & element_buffer,
                             ScalarArrayT & vec_buffer,
                             vcl_size_t num_cols,
                             viennacl::linalg::unit_upper_tag)
{
  for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
  {
    // ... (col runs from the last column towards the first)
    NumericT vec_entry = vec_buffer[col];
    // ... (col_begin/col_end from row_buffer)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (only entries with row_index < col contribute)
        vec_buffer[row_index] -= vec_entry * element_buffer[i];
    }
  }
}

template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
                             IndexArrayT const & col_buffer,
                             ConstScalarArrayT const & element_buffer,
                             ScalarArrayT & vec_buffer,
                             vcl_size_t num_cols,
                             viennacl::linalg::upper_tag)
{
  for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
  {
    // ... (col runs from the last column towards the first; locate the diagonal entry first)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index = col_buffer[i])
      if (row_index == col)
        diagonal_entry = element_buffer[i];
    }

    NumericT vec_entry = vec_buffer[col] / diagonal_entry;
    vec_buffer[col] = vec_entry;
    // ... (scatter into the entries above the diagonal)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index < col)
        vec_buffer[row_index] -= vec_entry * element_buffer[i];
    }
  }
}
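// block_inplace_solve(): the same column-oriented substitution specialized for a transposed
// compressed_matrix proxy (matrix_expression with op_trans), as used by block preconditioners.
// The variants for non-unit diagonals read the diagonal entries from a separately supplied
// vector (L_diagonal / U_diagonal) instead of searching for them inside the columns.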
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                 const compressed_matrix<NumericT, AlignmentV>, op_trans> & L,
                         viennacl::backend::mem_handle const &, vcl_size_t,
                         vector_base<NumericT> const &,
                         vector_base<NumericT> & vec,
                         viennacl::linalg::unit_lower_tag)
{
  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
  NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
  NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

  for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
  {
    NumericT vec_entry = vec_buffer[col];
    // ... (col_begin/col_end from row_buffer)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      unsigned int row_index = col_buffer[i];
      // ... (row_index > col)
        vec_buffer[row_index] -= vec_entry * elements[i];
    }
  }
}

template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: as above, but with a diagonal vector L_diagonal and lower_tag)
{
  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
  NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
  NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(L_diagonal.handle());
  NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

  for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
  {
    // ... (col_begin/col_end from row_buffer)
    NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
    vec_buffer[col] = vec_entry;
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index = col_buffer[i]; row_index > col)
        vec_buffer[row_index] -= vec_entry * elements[i];
    }
  }
}

template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: transposed upper-triangular proxy U, vector vec, unit_upper_tag)
{
  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
  NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
  NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

  for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
  {
    vcl_size_t col = (U.lhs().size1() - col2) - 1;
    NumericT vec_entry = vec_buffer[col];
    // ... (col_begin/col_end from row_buffer)
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index = col_buffer[i]; row_index < col)
        vec_buffer[row_index] -= vec_entry * elements[i];
    }
  }
}

template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: U, diagonal vector U_diagonal, vector vec, upper_tag)
{
  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
  NumericT     const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
  NumericT     const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(U_diagonal.handle());
  NumericT           * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());

  for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
  {
    vcl_size_t col = (U.lhs().size1() - col2) - 1;
    // ... (col_begin/col_end from row_buffer)
    NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
    vec_buffer[col] = vec_entry;
    for (vcl_size_t i = col_begin; i < col_end; ++i)
    {
      // ... (row_index = col_buffer[i]; row_index < col)
        vec_buffer[row_index] -= vec_entry * elements[i];
    }
  }
}
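// inplace_solve() overloads for a transposed compressed_matrix (matrix_expression with op_trans):
// they extract the CSR arrays of proxy.lhs() and forward to detail::csr_trans_inplace_solve()
// together with the respective solver tag.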
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: transposed proxy, vector vec, solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (second overload; identical except for the solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (third overload; identical except for the solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}

template<typename NumericT, unsigned int AlignmentV>
// ... (fourth overload; identical except for the solver tag)
{
  NumericT           * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
  unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
  unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());

  detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
}
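//
// Compressed compressed matrix (CSR variant storing only the rows that carry nonzeros)
//

// prod_impl(): matrix-vector product for compressed_compressed_matrix. handle3() provides the
// indices of the stored rows (row_indices); the result vector is first scaled by beta (or
// cleared), and the contributions of the stored rows are then added on top, so that rows without
// nonzeros are never touched.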
template<typename NumericT>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf  = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf     = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements    = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * row_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle1());
  unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * col_buffer  = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  if (beta < 0 || beta > 0)   // beta != 0: scale the previous content of result
  {
    // ... (loop over all entries of result)
      result_buf[i * result.stride() + result.start()] *= beta;
  }
  else
  {
    // ... (loop over all entries of result)
      result_buf[i * result.stride() + result.start()] = 0;
  }

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)   // loop over the stored rows only
  {
    // ... (dot product of the stored row with vec)
    for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
      dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];

    if (beta > 0 || beta < 0)
    // ... (the result entry at row_indices[i] is updated with alpha * dot_prod)
  }
}
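//
// Coordinate matrix (COO format: nonzeros stored as (row, column, value) triplets)
//

// row_info(): the COO analogue of the CSR version above. handle12() stores the interleaved
// (row, column) index pairs, so entry i has row coord_buffer[2*i] and column coord_buffer[2*i+1].
// Since the entries are grouped by row, the accumulated value is flushed to the result vector
// whenever the row index changes.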
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: coordinate_matrix mat, vector vec, info_selector)
{
  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

  NumericT value = 0;
  unsigned int last_row = 0;

  // ... (loop over all nonzeros i)
    unsigned int current_row = coord_buffer[2*i];

    if (current_row != last_row)
    {
      // ... (for the 2-norm, take the square root of the accumulated sum)
        value = std::sqrt(value);

      result_buf[last_row] = value;
      // ... (reset value to zero)
      last_row = current_row;
    }

    switch (info_selector)
    {
      // ... (case labels and break statements elided) infinity norm:
        value = std::max<NumericT>(value, std::fabs(elements[i]));
      // ... 1-norm:
        value += std::fabs(elements[i]);
      // ... 2-norm:
        value += elements[i] * elements[i];
      // ... diagonal entry:
        if (coord_buffer[2*i+1] == current_row)
          value = elements[i];
    }

  // flush the last row:
  // ... (for the 2-norm)
    value = std::sqrt(value);
  result_buf[last_row] = value;
}
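// prod_impl(): matrix-vector product result = alpha * mat * vec + beta * result for a COO
// matrix. The result vector is first scaled by beta (or cleared), after which each triplet
// (r, c, v) simply scatters alpha * v * vec[c] into result[r].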
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf   = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf      = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements     = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());

  if (beta < 0 || beta > 0)   // beta != 0
  {
    // ... (loop over all entries of result)
      result_buf[i * result.stride() + result.start()] *= beta;
  }
  else
  {
    // ... (loop over all entries of result)
      result_buf[i * result.stride() + result.start()] = 0;
  }

  // ... (loop over all nonzeros i)
    result_buf[coord_buffer[2*i] * result.stride() + result.start()]
      += alpha * elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, d_mat, result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() )
  {
    // clear the result, then scatter the contribution of each triplet:
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // ... (set row 'row' of the result to zero)
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i)
    {
      // ... (r = row index, c = column index, x = value of triplet i; loop over the columns 'col' of d_mat)
        NumericT y = d_mat_wrapper_row( c, col);
        if (result.row_major())
          result_wrapper_row(r, col) += x * y;
        else
          result_wrapper_col(r, col) += x * y;
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      // ... (set column 'col' of the result to zero)
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      // ... (for each triplet (r, c, x) of sp_mat)
        NumericT y = d_mat_wrapper_col( c, col);
        if (result.row_major())
          result_wrapper_row( r, col) += x*y;
        else
          result_wrapper_col( r, col) += x*y;
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, transposed dense proxy d_mat, result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // clear row 'row' of the result:
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = 0;
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = 0;
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      // ... (r = row index, c = column index, x = value of triplet i)
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row( col, c);   // transposed access
          result_wrapper_row(r, col) += x * y;
        }
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_row( col, c);
          result_wrapper_col(r, col) += x * y;
        }
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      // ... (clear column 'col' of the result)
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
      // ... (r, c, x as above)
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col( col, c);
          result_wrapper_row(r, col) += x * y;
        }
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
          NumericT y = d_mat_wrapper_col( col, c);
          result_wrapper_col(r, col) += x * y;
        }
    }
  }
}
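//
// ELL matrix (ELLPACK format: a fixed number of entry slots per row, padded with zeros)
//

// prod_impl(): matrix-vector product for an ell_matrix. Every row stores the same number of
// entries (maxnnz(), padded internally to internal_maxnnz()), stored slot-wise so that the
// offset of a slot is derived from the slot index and the internal row count; padding entries
// are skipped via the "val != 0" test below.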
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords     = detail::extract_raw_pointer<unsigned int>(mat.handle2());

  // ... (loop over rows; sum accumulates the dot product of the current row)
    for (unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
    {
      // ... (offset into the ELL arrays; val = elements[offset])
      if (val > 0 || val < 0)   // skip padding zeros
      {
        unsigned int col = coords[offset];
        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
      }
    }

    if (beta < 0 || beta > 0)
    {
      // ... (index = row * result.stride() + result.start())
      result_buf[index] = alpha * sum + beta * result_buf[index];
    }
    // ... (otherwise the entry is overwritten with alpha * sum)
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, d_mat, result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.row_major() )
  {
    // clear the result, then accumulate row by row:
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // ... (set row 'row' of the result to zero)
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
      {
        // ... (sp_mat_val and sp_mat_col of this ELL slot)
        if (sp_mat_val < 0 || sp_mat_val > 0)   // skip padding entries
        {
          // ... (loop over the columns 'col' of d_mat)
            if (result.row_major())
              result_wrapper_row(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
            else
              result_wrapper_col(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
        }
      }
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      // clear column 'col' of the result:
      if (result.row_major())
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_row(row, col) = 0;
      else
        for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
          result_wrapper_col(row, col) = 0;
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
      for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
        // ... (loop over rows; sp_mat_val and sp_mat_col of this slot)
          if (sp_mat_val < 0 || sp_mat_val > 0)
          {
            if (result.row_major())
              result_wrapper_row(row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
            else
              result_wrapper_col(row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
          }
      }
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: sp_mat, transposed dense proxy d_mat, result)
{
  NumericT     const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
  unsigned int const * sp_mat_coords   = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());

  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  if ( d_mat.lhs().row_major() )
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
    {
      // clear row 'row' of the result:
      if (result.row_major())
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_row(row, col) = 0;
      else
        for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
          result_wrapper_col(row, col) = 0;

      for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
        for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
          // ... (sp_mat_val and sp_mat_col of this ELL slot)
          if (sp_mat_val < 0 || sp_mat_val > 0)   // skip padding entries
          {
            if (result.row_major())
              result_wrapper_row(row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
            else
              result_wrapper_col(row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
          }
        }
      }
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
    {
      // ... (clear column 'col' of the result)
    }

#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for
#endif
    // ... (loop over the rows of sp_mat)
      for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id) {
        // ... (sp_mat_val and sp_mat_col of this ELL slot)
        if (sp_mat_val < 0 || sp_mat_val > 0)
        {
          if (result.row_major())
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_row(row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
          else
            for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
              result_wrapper_col(row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
        }
      }
  }
}
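//
// Sliced ELL matrix (the rows are grouped into blocks; each block stores its own number of
// entry slots per row)
//

// prod_impl(): matrix-vector product for a sliced_ell_matrix. handle1() gives the number of
// stored entry slots per block (columns_per_block), handle2() the column indices, and handle3()
// the start offset of each block (block_start); each block of rows_per_block() rows is processed
// independently, so the blocks can be distributed across OpenMP threads.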
template<typename NumericT, typename IndexT>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf    = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements   = detail::extract_raw_pointer<NumericT>(mat.handle());
  IndexT       const * columns_per_block = detail::extract_raw_pointer<IndexT>(mat.handle1());
  IndexT       const * column_indices    = detail::extract_raw_pointer<IndexT>(mat.handle2());
  IndexT       const * block_start       = detail::extract_raw_pointer<IndexT>(mat.handle3());

  // ... (num_blocks derived from the matrix size and rows_per_block())

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long block_idx2 = 0; block_idx2 < static_cast<long>(num_blocks); ++block_idx2)
  {
    // ... (block_idx, first_row_in_matrix, and the per-block accumulator result_values[])
    vcl_size_t current_columns_per_block = columns_per_block[block_idx];

    for (IndexT column_entry_index = 0;
                column_entry_index < current_columns_per_block;
              ++column_entry_index)
    {
      // ... (stride_start = offset of this entry slot within the block)
      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
      {
        NumericT val = elements[stride_start + row_in_block];
        result_values[row_in_block] += (val > 0 || val < 0) ? vec_buf[column_indices[stride_start + row_in_block] * vec.stride() + vec.start()] * val : 0;
      }
    }

    if (beta < 0 || beta > 0)
    {
      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
      {
        if (first_row_in_matrix + row_in_block < result.size())
        {
          // ... (index = (first_row_in_matrix + row_in_block) * result.stride() + result.start())
          result_buf[index] = alpha * result_values[row_in_block] + beta * result_buf[index];
        }
      }
    }
    else
    {
      for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
      {
        if (first_row_in_matrix + row_in_block < result.size())
          result_buf[(first_row_in_matrix + row_in_block) * result.stride() + result.start()] = alpha * result_values[row_in_block];
      }
    }
  }
}
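//
// Hybrid matrix (HYB: an ELL part for the regular entries plus a CSR part for the overflow)
//

// prod_impl(): matrix-vector product for a hyb_matrix. Each row first accumulates its ELL part
// (values in handle(), column indices in handle2(), padding skipped) and then adds the CSR
// overflow entries stored in handle3()/handle4()/handle5() before the alpha/beta update of the
// corresponding result entry.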
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, vec, alpha, result, beta)
{
  NumericT           * result_buf     = detail::extract_raw_pointer<NumericT>(result.handle());
  NumericT     const * vec_buf        = detail::extract_raw_pointer<NumericT>(vec.handle());
  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());

  // ... (loop over rows; sum accumulates the dot product of the current row)

    // ELL part:
    for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
    {
      // ... (offset into the ELL arrays; val = elements[offset])
      if (val > 0 || val < 0)   // skip padding zeros
      {
        unsigned int col = coords[offset];
        sum += (vec_buf[col * vec.stride() + vec.start()] * val);
      }
    }

    // CSR part (entries that did not fit into the ELL slots):
    // ... (col_begin/col_end from csr_row_buffer)
    for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
      sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);

    if (beta < 0 || beta > 0)
    {
      // ... (index = row * result.stride() + result.start())
      result_buf[index] = alpha * sum + beta * result_buf[index];
    }
    // ... (otherwise the entry is overwritten with alpha * sum)
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, d_mat, result)
{
  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());

  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
  {
    // ... (loop over the rows of mat; sum accumulates one entry of the result)

      // ELL part:
      for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
      {
        // ... (offset, val and col of this ELL slot)
        if (val < 0 || val > 0)
        {
          if (d_mat.row_major())
            sum += d_mat_wrapper_row(col, result_col) * val;
          else
            sum += d_mat_wrapper_col(col, result_col) * val;
        }
      }

      // CSR part:
      // ... (col_begin/col_end from csr_row_buffer)
      if (d_mat.row_major())
        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
          sum += d_mat_wrapper_row(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
      else
        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
          sum += d_mat_wrapper_col(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];

      if (result.row_major())
        result_wrapper_row(row, result_col) = sum;
      else
        result_wrapper_col(row, result_col) = sum;
  }
}
template<typename NumericT, unsigned int AlignmentV>
// ... (signature elided: mat, transposed dense proxy d_mat, result)
{
  NumericT const * d_mat_data  = detail::extract_raw_pointer<NumericT>(d_mat);
  NumericT       * result_data = detail::extract_raw_pointer<NumericT>(result);

  // ... (start/stride/internal-size queries and the accessor wrapper declarations elided)
  d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
  result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
  result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);

  NumericT     const * elements       = detail::extract_raw_pointer<NumericT>(mat.handle());
  unsigned int const * coords         = detail::extract_raw_pointer<unsigned int>(mat.handle2());
  NumericT     const * csr_elements   = detail::extract_raw_pointer<NumericT>(mat.handle5());
  unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
  unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());

  for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
  {
    // ... (loop over the rows of mat; sum accumulates one entry of the result)

      // ELL part (transposed access into d_mat):
      for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
      {
        // ... (offset, val and col of this ELL slot)
        if (val < 0 || val > 0)
        {
          if (d_mat.lhs().row_major())
            sum += d_mat_wrapper_row(result_col, col) * val;
          else
            sum += d_mat_wrapper_col(result_col, col) * val;
        }
      }

      // CSR part:
      // ... (col_begin/col_end from csr_row_buffer)
      if (d_mat.lhs().row_major())
        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
          sum += d_mat_wrapper_row(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];
      else
        for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
          sum += d_mat_wrapper_col(result_col, static_cast<vcl_size_t>(csr_col_buffer[item_id])) * csr_elements[item_id];

      if (result.row_major())
        result_wrapper_row(row, result_col) = sum;
      else
        result_wrapper_col(row, result_col) = sum;
  }
}
// ... (remainder of the file, including the closing namespaces and the #endif of the include guard, elided)