doc/spai-dynamic_8hpp_source.html

 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP

 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP


 /* =========================================================================

    Copyright (c) 2010-2016, Institute for Microelectronics,

                             Institute for Analysis and Scientific Computing,

                             TU Wien.

    Portions of this software are copyright by UChicago Argonne, LLC.


                             -----------------

                   ViennaCL - The Vienna Computing Library

                             -----------------


    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at


    (A list of authors and contributors can be found in the manual)


    License:         MIT (X11), see file LICENSE in the base directory

 ============================================================================= */


 #include <utility>

 #include <iostream>

 #include <fstream>

 #include <string>

 #include <algorithm>

 #include <vector>

 #include <math.h>

 #include <map>

 //#include "block_matrix.hpp"

 //#include "block_vector.hpp"

 //#include "benchmark-utils.hpp"

 #include "boost/numeric/ublas/vector.hpp"

 #include "boost/numeric/ublas/matrix.hpp"

 #include "boost/numeric/ublas/matrix_proxy.hpp"

 #include "boost/numeric/ublas/vector_proxy.hpp"

 #include "boost/numeric/ublas/storage.hpp"

 #include "boost/numeric/ublas/io.hpp"

 #include "boost/numeric/ublas/lu.hpp"

 #include "boost/numeric/ublas/triangular.hpp"

 #include "boost/numeric/ublas/matrix_expression.hpp"

 // ViennaCL includes

 #include "viennacl/linalg/prod.hpp"

 #include "viennacl/matrix.hpp"

 #include "viennacl/compressed_matrix.hpp"

 #include "viennacl/linalg/sparse_matrix_operations.hpp"

 #include "viennacl/linalg/matrix_operations.hpp"

 #include "viennacl/scalar.hpp"

 #include "viennacl/linalg/cg.hpp"

 #include "viennacl/linalg/inner_prod.hpp"

 #include "viennacl/linalg/ilu.hpp"

 #include "viennacl/ocl/backend.hpp"


 #include "viennacl/linalg/detail/spai/block_matrix.hpp"

 #include "viennacl/linalg/detail/spai/block_vector.hpp"

 #include "viennacl/linalg/detail/spai/qr.hpp"

 #include "viennacl/linalg/detail/spai/spai-static.hpp"

 #include "viennacl/linalg/detail/spai/spai.hpp"

 #include "viennacl/linalg/detail/spai/spai_tag.hpp"

 #include "viennacl/linalg/opencl/kernels/spai.hpp"


 namespace viennacl

 {

 namespace linalg

 {

 namespace detail

 {

 namespace spai

 {


 /** @brief Helper functor for comparing std::pair<> objects based on their second member.
  *
  * Returns true if the second member of the left pair is strictly greater than the one of the
  * right pair, so sorting with this functor places the pair with the largest second member first.
  */
 struct CompareSecond
 {
   /** @brief Compares the second members of two pairs after conversion to double.
    *
    * The operator is const-qualified because it does not touch any state; this allows the functor
    * to be invoked through const objects and const references as well.
    *
    * @param left   First pair
    * @param right  Second pair
    * @return True if left.second > right.second (both converted to double)
    */
   template<typename T1, typename T2>
   bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right) const
   {
     return static_cast<double>(left.second) > static_cast<double>(right.second);
   }
 };


 template<typename MatrixT>

 void composeNewR(MatrixT const & A,

                  MatrixT const & R_n,

                  MatrixT & R)

 {

   typedef typename MatrixT::value_type        NumericType;


   vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());

   MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2());


   //write original R to new Composite R

   boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;

   //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2()

   boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),

                                                                                                             R.size2() + A.size2())) +=

   boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));


   //adding decomposed(QR) block to Composite R

   if (R_n.size1() > 0 && R_n.size2() > 0)

       boost::numeric::ublas::project(C,

                                      boost::numeric::ublas::range(R.size2(), R.size1() + row_n),

                                      boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;

   R = C;

 }


 template<typename VectorT>

 void composeNewVector(VectorT const & v_n,

                       VectorT       & v)

 {

   typedef typename VectorT::value_type          NumericType;


   VectorT w  = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size());

   boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;

   boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;

   v = w;

 }


 /** @brief Computes the Euclidean norm of a sparse vector.
  *
  * The sum of squares is accumulated in a local variable starting from zero, so the result does
  * not depend on the value stored in 'norm' when the function is entered.
  *
  * @param v      Sparse vector; iterated from begin() to end(), the value of each entry is read via ->second
  * @param norm   Output: square root of the sum of the squared entries (zero for an empty vector)
  */
 template<typename SparseVectorT, typename NumericT>
 void sparse_norm_2(SparseVectorT const & v,
                    NumericT & norm)
 {
   NumericT sum_of_squares = 0;
   for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
     sum_of_squares += (vec_it->second)*(vec_it->second);

   norm = std::sqrt(sum_of_squares);
 }


 /** @brief Computes the inner product of two sparse vectors.
  *
  * Both vectors are traversed simultaneously; whenever the indices (->first) of the current
  * entries agree, the product of the values (->second) is added to the result. Otherwise the
  * iterator pointing to the smaller index is advanced. This requires that both vectors are
  * iterated in ascending index order (as is the case for std::map).
  *
  * The sum is accumulated in a local variable starting from zero, so the result does not depend
  * on the value stored in 'res_v' when the function is entered.
  *
  * @param v1      First sparse vector
  * @param v2      Second sparse vector
  * @param res_v   Output: sum of products over all indices present in both vectors
  */
 template<typename SparseVectorT, typename NumericT>
 void sparse_inner_prod(SparseVectorT const & v1,
                        SparseVectorT const & v2,
                        NumericT & res_v)
 {
   typename SparseVectorT::const_iterator v_it1 = v1.begin();
   typename SparseVectorT::const_iterator v_it2 = v2.begin();

   NumericT dot = 0;
   while ((v_it1 != v1.end()) && (v_it2 != v2.end()))
   {
     if (v_it1->first == v_it2->first)
     {
       dot += (v_it1->second)*(v_it2->second);
       ++v_it1;
       ++v_it2;
     }
     else if (v_it1->first < v_it2->first)
       ++v_it1;
     else
       ++v_it2;
   }

   res_v = dot;
 }


 template<typename SparseVectorT, typename NumericT>

 bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c,

                             SparseVectorT const & res,

                             std::vector<unsigned int> & J,

                             std::vector<unsigned int> & J_u,

                             spai_tag const & tag)

 {

   std::vector<std::pair<unsigned int, NumericT> > p;

   vcl_size_t cur_size = 0;

   NumericT inprod, norm2;


   for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)

   {

     if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))

     {

       inprod = norm2 = 0;

       sparse_inner_prod(res, A_v_c[res_it->first], inprod);

       sparse_norm_2(A_v_c[res_it->first], norm2);

       p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2)));

     }

   }


   std::sort(p.begin(), p.end(), CompareSecond());

   while ((cur_size < J.size()) && (p.size() > 0))

   {

     J_u.push_back(p[0].first);

     p.erase(p.begin());

     cur_size++;

   }

   p.clear();

   return (cur_size > 0);

 }


 /** @brief Collects the new row indices I_n induced by the new column indices J_n.
  *
  * For every index in J_n the entries of the sparse vector A_v_c[index] are visited. The index
  * of an entry is appended to I_n if it is contained neither in I nor in I_n so far.
  *
  * @param A_v_c   Container of sparse vectors, indexed by the entries of J_n
  * @param I       Current set of row indices
  * @param J_n     Set of new column indices
  * @param I_n     Output: new row indices are appended here
  */
 template<typename SparseVectorT>
 void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c,
                     std::vector<unsigned int>  const & I,
                     std::vector<unsigned int>  const & J_n,
                     std::vector<unsigned int>        & I_n)
 {
   typedef typename SparseVectorT::const_iterator     EntryIterator;

   for (std::vector<unsigned int>::const_iterator new_col = J_n.begin(); new_col != J_n.end(); ++new_col)
   {
     SparseVectorT const & column = A_v_c[*new_col];

     for (EntryIterator entry = column.begin(); entry != column.end(); ++entry)
     {
       // skip indices which are already known, either from the old set or from this run
       if (isInIndexSet(I, entry->first) || isInIndexSet(I_n, entry->first))
         continue;

       I_n.push_back(entry->first);
     }
   }
 }


 template<typename MatrixT>

 void QRBlockComposition(MatrixT const & A_I_J,

                         MatrixT const & A_I_J_u,

                         MatrixT       & A_I_u_J_u)

 {

   typedef typename MatrixT::value_type     NumericType;


   vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();

   vcl_size_t row_n2 = A_I_u_J_u.size1();

   vcl_size_t row_n = row_n1 + row_n2;

   vcl_size_t col_n = A_I_J_u.size2();


   MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n);

   boost::numeric::ublas::project(C,

                                  boost::numeric::ublas::range(0, row_n1),

                                  boost::numeric::ublas::range(0, col_n))

   += boost::numeric::ublas::project(A_I_J_u,

                                     boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),

                                     boost::numeric::ublas::range(0, col_n));


   boost::numeric::ublas::project(C,

                                  boost::numeric::ublas::range(row_n1, row_n1 + row_n2),

                                  boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;

   A_I_u_J_u = C;

 }


 template<typename SparseMatrixT,

          typename SparseVectorT,

          typename DenseMatrixT,

          typename VectorT>

 void block_update(SparseMatrixT const & A,

                   std::vector<SparseVectorT> const & A_v_c,

                   std::vector<SparseVectorT>       & g_res,

                   std::vector<bool> & g_is_update,

                   std::vector<std::vector<unsigned int> >& g_I,

                   std::vector<std::vector<unsigned int> >& g_J,

                   std::vector<VectorT>      & g_b_v,

                   std::vector<DenseMatrixT> & g_A_I_J,

                   spai_tag const & tag)

 {

   typedef typename DenseMatrixT::value_type     NumericType;


   std::vector<std::vector<unsigned int> > g_J_u(g_J.size());   // set of new column indices

   std::vector<std::vector<unsigned int> > g_I_u(g_J.size());   // set of new row indices

   std::vector<DenseMatrixT> g_A_I_J_u(g_J.size());             // matrix A(I, \tilde J), cf. Kallischko p.31-32

   std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size());           // matrix A(\tilde I, \tilde J), cf. Kallischko

   std::vector<VectorT>      g_b_v_u(g_J.size());               // new vector of beta coefficients from QR factorization


 #ifdef VIENNACL_WITH_OPENMP

   #pragma omp parallel for

 #endif

   for (long i = 0; i < static_cast<long>(g_J.size()); ++i)

   {

     if (g_is_update[static_cast<vcl_size_t>(i)])

     {

       if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))

       {

         //initialize matrix A_I_\hatJ

         initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);

         //multiplication of Q'*A_I_\hatJ

         apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);

         //building new rows index set \hatI

         buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);

         initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);

         //composition of block for new QR factorization

         QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);

         //QR factorization

         single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]);

         //composition of new R and new vector b_v

         composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]);

         composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]);

         //composition of new sets: I and J

         g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());

         g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());

       }

       else

       {

         g_is_update[static_cast<vcl_size_t>(i)] = false;

       }

     }

   }

 }


 /**************************************************** GPU SPAI Update ****************************************************************/


 //performs Q'*A(I, \tilde J) on GPU

 template<typename NumericT>

 void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u,

                             std::vector<std::vector<unsigned int> > const & g_I,

                             block_matrix & g_A_I_J_vcl,

                             block_vector & g_bv_vcl,

                             block_matrix & g_A_I_J_u_vcl,

                             std::vector<cl_uint> & g_is_update,

                             viennacl::context ctx)

 {

   viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

   unsigned int local_r_n = 0;

   unsigned int local_c_n = 0;

   unsigned int sz_blocks = 0;


   get_max_block_size(g_I,   local_r_n);

   get_max_block_size(g_J_u, local_c_n);


   //for debug

   std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));

   std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));

   compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);

   //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));


   viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),

                                                                            &(g_is_update[0]));

   viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);

   viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult");


   block_q_kernel.local_work_size(0,      local_c_n);

   block_q_kernel.global_work_size(0, 128*local_c_n);

   viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),

                                         g_bv_vcl.handle(),

                                         g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,

                                         viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),

                                         static_cast<cl_uint>(g_I.size())));

 }


 template<typename SizeT>

 void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I,

                           std::vector<std::vector<SizeT> > const & g_J,

                           std::vector<std::vector<SizeT> > const & g_I_u,

                           std::vector<std::vector<SizeT> >       & g_I_q)

 {

 #ifdef VIENNACL_WITH_OPENMP

   #pragma omp parallel for

 #endif

   for (long i = 0; i < static_cast<long>(g_I.size()); ++i)

   {

     for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j)

       g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]);


     for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j)

       g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]);

   }

 }


 template<typename NumericT>

 void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J,

                        std::vector<std::vector<unsigned int> > const& g_I,

                        std::vector<std::vector<unsigned int> > const& g_J_u,

                        std::vector<std::vector<unsigned int> > const& g_I_u,

                        std::vector<std::vector<unsigned int> >& g_I_q,

                        block_matrix & g_A_I_J_u_vcl,

                        viennacl::ocl::handle<cl_mem> & matrix_dimensions,

                        block_matrix & g_A_I_u_J_u_vcl,

                        std::vector<cl_uint> & g_is_update,

                        bool is_empty_block,

                        viennacl::context ctx)

 {

   viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());


   //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());

   assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);

   unsigned int sz_blocks;

   std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));

   std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));


   compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);


   std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0));


   block_matrix g_A_I_J_q_vcl;

   //need to allocate memory for QR block

   g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                     static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),

                                                     &(con_A_I_J_q[0]));

   g_A_I_J_q_vcl.handle().context(opencl_ctx);


   g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),

                                                      &(matrix_dims[0]));

   g_A_I_J_q_vcl.handle1().context(opencl_ctx);


   g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                       static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),

                                                       &(blocks_ind[0]));

   g_A_I_J_q_vcl.handle2().context(opencl_ctx);


   viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),

                                                                            &(g_is_update[0]));


   viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);

   if (!is_empty_block)

   {

     viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly");

     qr_assembly_kernel.local_work_size(0, 1);

     qr_assembly_kernel.global_work_size(0, 256);

     viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,

                                               g_A_I_J_u_vcl.handle(),

                                               g_A_I_J_u_vcl.handle2(),

                                               g_A_I_J_u_vcl.handle1(),

                                               g_A_I_u_J_u_vcl.handle(),

                                               g_A_I_u_J_u_vcl.handle2(),

                                               g_A_I_u_J_u_vcl.handle1(),

                                               g_A_I_J_q_vcl.handle(),

                                               g_A_I_J_q_vcl.handle2(),

                                               g_A_I_J_q_vcl.handle1(),

                                               g_is_update_vcl,

                                               static_cast<unsigned int>(g_I.size())));

   }

   else

   {

     viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1");

     qr_assembly_kernel.local_work_size(0, 1);

     qr_assembly_kernel.global_work_size(0, 256);

     viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),

                                               g_A_I_J_u_vcl.handle1(),

                                               g_A_I_J_q_vcl.handle(),

                                               g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),

                                               g_is_update_vcl,

                                               static_cast<unsigned int>(g_I.size())));

   }

   g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();

   g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();

   g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();

 }


 template<typename NumericT>

 void assemble_r(std::vector<std::vector<unsigned int> > & g_I,

                 std::vector<std::vector<unsigned int> > & g_J,

                 block_matrix & g_A_I_J_vcl,

                 block_matrix & g_A_I_J_u_vcl,

                 block_matrix & g_A_I_u_J_u_vcl,

                 block_vector & g_bv_vcl,

                 block_vector & g_bv_vcl_u,

                 std::vector<cl_uint> & g_is_update,

                 viennacl::context ctx)

 {

   viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

   std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));

   std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));

   std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);

   unsigned int sz_blocks, bv_size;


   compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);

   get_size(g_J, bv_size);

   init_start_inds(g_J, start_bv_r_inds);


   std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0));

   std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0));


   block_matrix g_A_I_J_r_vcl;

   block_vector g_bv_r_vcl;

   g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                     static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),

                                                     &(con_A_I_J_r[0]));

   g_A_I_J_r_vcl.handle().context(opencl_ctx);


   g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),

                                                      &(matrix_dims[0]));

   g_A_I_J_r_vcl.handle1().context(opencl_ctx);


   g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                      static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),

                                                      &(blocks_ind[0]));

   g_A_I_J_r_vcl.handle2().context(opencl_ctx);


   g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                  static_cast<unsigned int>(sizeof(NumericT)*bv_size),

                                                  &(b_v_r[0]));

   g_bv_r_vcl.handle().context(opencl_ctx);


   g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),

                                                   &(start_bv_r_inds[0]));

   g_bv_r_vcl.handle().context(opencl_ctx);


   viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),

                                                                            &(g_is_update[0]));

   viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);

   viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly");

   r_assembly_kernel.local_work_size(0, 1);

   r_assembly_kernel.global_work_size(0, 256);


   viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),

                                           g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),

                                           g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),

                                           g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),

                                           g_is_update_vcl, static_cast<cl_uint>(g_I.size())));


   viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly");

   bv_assembly_kernel.local_work_size(0, 1);

   bv_assembly_kernel.global_work_size(0, 256);

   viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),

                                             g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),

                                             g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,

                                             static_cast<cl_uint>(g_I.size())));

   g_bv_vcl.handle() = g_bv_r_vcl.handle();

   g_bv_vcl.handle1() = g_bv_r_vcl.handle1();


   g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();

   g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();

   g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();

 }


 /** @brief GPU-based dynamic update step for the SPAI preconditioner.
  *
  * The new column and row index sets are determined on the host for every entry with a nonzero
  * update flag. The remaining steps are delegated, in this order, to block_assembly(),
  * block_q_multiplication(), block_assembly(), assemble_qr_block(), block_qr() and, after the new
  * indices have been appended to g_J and g_I, assemble_r().
  *
  * @param A             System matrix; its context is used for all OpenCL operations
  * @param A_v_c         Container of sparse vectors passed on to buildAugmentedIndexSet() and buildNewRowSet()
  * @param g_is_update   Update flags
  * @param g_res         Container of residuals
  * @param g_J           Container of column index sets, extended by the new column indices
  * @param g_I           Container of row index sets, extended by the new row indices
  * @param g_A_I_J_vcl   Block matrix passed on to the GPU routines
  * @param g_bv_vcl      Block vector passed on to the GPU routines
  * @param tag           SPAI configuration tag
  */
 template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
 void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                   std::vector<SparseVectorT> const & A_v_c,
                   std::vector<cl_uint> & g_is_update,
                   std::vector<SparseVectorT> & g_res,
                   std::vector<std::vector<unsigned int> > & g_J,
                   std::vector<std::vector<unsigned int> > & g_I,
                   block_matrix & g_A_I_J_vcl,
                   block_vector & g_bv_vcl,
                   spai_tag const & tag)
 {
   typedef std::vector<std::vector<unsigned int> >     IndexSetContainer;

   viennacl::context ctx = viennacl::traits::context(A);

   IndexSetContainer g_J_u(g_J.size());   // new column indices
   IndexSetContainer g_I_u(g_J.size());   // new row indices
   IndexSetContainer g_I_q(g_J.size());   // mixed set of old and new row indices

   block_matrix g_A_I_J_u_vcl;            // GPU memory for A_I_\hatJ
   block_matrix g_A_I_u_J_u_vcl;          // GPU memory for A_\hatI_\hatJ
   bool is_empty_block;
   block_vector g_bv_u_vcl;               // GPU memory for new b_v

   // determine the new index sets on the host
 #ifdef VIENNACL_WITH_OPENMP
   #pragma omp parallel for
 #endif
   for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
   {
     vcl_size_t const col = static_cast<vcl_size_t>(i);

     if (!g_is_update[col])
       continue;

     if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[col], g_J[col], g_J_u[col], tag))
       buildNewRowSet(A_v_c, g_I[col], g_J_u[col], g_I_u[col]);
   }

   // assemble new A_I_J_u blocks on GPU and multiply them with Q'
   block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
   block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);

   // assemble A_\hatI_\hatJ and the blocks for the new QR factorization
   block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
   assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
                               g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);

   block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);

   // concatenation of old and new indices
 #ifdef VIENNACL_WITH_OPENMP
   #pragma omp parallel for
 #endif
   for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
   {
     vcl_size_t const col = static_cast<vcl_size_t>(i);

     g_J[col].insert(g_J[col].end(), g_J_u[col].begin(), g_J_u[col].end());
     g_I[col].insert(g_I[col].end(), g_I_u[col].begin(), g_I_u[col].end());
   }

   assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, ctx);
 }


 }

 }

 }

 }

 #endif

viennacl::linalg::opencl::kernels::spai
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
Definition: spai.hpp:587

viennacl::linalg::detail::spai::buildAugmentedIndexSet
bool buildAugmentedIndexSet(std::vector< SparseVectorT > const &A_v_c, SparseVectorT const &res, std::vector< unsigned int > &J, std::vector< unsigned int > &J_u, spai_tag const &tag)
Building a new set of column indices J_u, cf. Kallischko dissertation p.31.
Definition: spai-dynamic.hpp:188

viennacl::linalg::detail::spai::block_matrix
Represents contiguous matrices on GPU.
Definition: block_matrix.hpp:49

matrix_operations.hpp
Implementations of dense matrix related operations including matrix-vector products.

viennacl::linalg::detail::spai::CompareSecond
Helper functor for comparing std::pair<> based on the second member.
Definition: spai-dynamic.hpp:77

viennacl::linalg::detail::spai::block_matrix::handle
viennacl::ocl::handle< cl_mem > & handle()
Returns a handle to the elements.
Definition: block_matrix.hpp:56

viennacl::linalg::detail::spai::sparse_norm_2
void sparse_norm_2(SparseVectorT const &v, NumericT &norm)
Computation of Euclidean norm for sparse vector.
Definition: spai-dynamic.hpp:141

viennacl::linalg::detail::spai::block_vector::handle1
viennacl::ocl::handle< cl_mem > & handle1()
Return handle to start indices.
Definition: block_vector.hpp:58

prod.hpp
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...

viennacl::linalg::detail::spai::CompareSecond::operator()
bool operator()(std::pair< T1, T2 > const &left, std::pair< T1, T2 > const &right)
Definition: spai-dynamic.hpp:80

viennacl::ocl::kernel
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58

matrix.hpp
Implementation of the dense matrix class.

viennacl::ocl::kernel::local_work_size
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742

spai.hpp
OpenCL kernel file for sparse approximate inverse operations.

viennacl::ocl::context
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55

viennacl::linalg::detail::spai::compute_blocks_size
void compute_blocks_size(std::vector< std::vector< unsigned int > > const &g_I, std::vector< std::vector< unsigned int > > const &g_J, unsigned int &sz, std::vector< cl_uint > &blocks_ind, std::vector< cl_uint > &matrix_dims)
**************************************** BLOCK FUNCTIONS ************************************// ...
Definition: qr.hpp:129

viennacl::linalg::detail::spai::get_size
void get_size(std::vector< std::vector< SizeT > > const &inds, SizeT &size)
Computes size of particular container of index set.
Definition: qr.hpp:151

viennacl::linalg::detail::spai::assemble_qr_row_inds
void assemble_qr_row_inds(std::vector< std::vector< SizeT > > const &g_I, std::vector< std::vector< SizeT > > const &g_J, std::vector< std::vector< SizeT > > const &g_I_u, std::vector< std::vector< SizeT > > &g_I_q)
Assembly of container of index row sets: I_q, row indices for new "QR block".
Definition: spai-dynamic.hpp:406

spai.hpp
Main implementation of SPAI (not FSPAI). Experimental.

viennacl::linalg::detail::spai::spai_tag
A tag for SPAI.
Definition: spai_tag.hpp:64

inner_prod.hpp
Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.

viennacl::linalg::detail::spai::block_update
void block_update(SparseMatrixT const &A, std::vector< SparseVectorT > const &A_v_c, std::vector< SparseVectorT > &g_res, std::vector< bool > &g_is_update, std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, std::vector< VectorT > &g_b_v, std::vector< DenseMatrixT > &g_A_I_J, spai_tag const &tag)
CPU-based dynamic update for SPAI preconditioner.
Definition: spai-dynamic.hpp:291

viennacl::ocl::handle::context
viennacl::ocl::context const & context() const
Definition: handle.hpp:193

NumericT
float NumericT
Definition: bisect.cpp:40

viennacl::linalg::detail::spai::isInIndexSet
bool isInIndexSet(std::vector< SizeT > const &J, SizeT ind)
Determines if element ind is in set {J}.
Definition: spai-static.hpp:72

viennacl::linalg::detail::spai::buildNewRowSet
void buildNewRowSet(std::vector< SparseVectorT > const &A_v_c, std::vector< unsigned int > const &I, std::vector< unsigned int > const &J_n, std::vector< unsigned int > &I_n)
Building new indices for the current set of row indices I_n, cf. Kallischko dissertation p...
Definition: spai-dynamic.hpp:228

viennacl::context
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39

viennacl::linalg::detail::spai::single_qr
void single_qr(MatrixT &R, VectorT &b_v)
In-place QR factorization via Householder reflections, cf. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224.
Definition: qr.hpp:311

viennacl::range
basic_range range
Definition: forwards.h:424

viennacl::linalg::detail::spai::block_assembly
void block_assembly(viennacl::compressed_matrix< NumericT, AlignmentV > const &A, std::vector< std::vector< unsigned int > > const &g_J, std::vector< std::vector< unsigned int > > const &g_I, block_matrix &g_A_I_J_vcl, std::vector< cl_uint > &g_is_update, bool &is_empty_block)
Assembly of blocks on GPU by a given set of row indices: g_I and column indices: g_J.
Definition: spai.hpp:507

v1
viennacl::vector< float > v1
Definition: global_variables.cpp:60

block_matrix.hpp
Implementation of a bunch of (small) matrices on GPU. Experimental.

viennacl::traits::size
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239

viennacl::ocl::local_mem
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33

insert
void insert(MatrixType &matrix, long row, long col, ScalarType value)
Definition: vector-io.hpp:31

qr.hpp
Implementation of a simultaneous QR factorization of multiple matrices. Experimental.

viennacl::linalg::detail::spai::composeNewVector
void composeNewVector(VectorT const &v_n, VectorT &v)
Composition of new vector of coefficients beta from QR factorizations (necessary for Q recovery) ...
Definition: spai-dynamic.hpp:124

viennacl::ocl::context::get_kernel
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:605

viennacl::linalg::detail::spai::init_start_inds
void init_start_inds(std::vector< std::vector< SizeT > > const &inds, std::vector< cl_uint > &start_inds)
Initializes start indices of particular index set.
Definition: qr.hpp:165

ilu.hpp
Implementations of incomplete factorization preconditioners. Convenience header file.

spai-static.hpp
Implementation of a static SPAI. Experimental.

viennacl::linalg::detail::spai::assemble_r
void assemble_r(std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, block_matrix &g_A_I_J_vcl, block_matrix &g_A_I_J_u_vcl, block_matrix &g_A_I_u_J_u_vcl, block_vector &g_bv_vcl, block_vector &g_bv_vcl_u, std::vector< cl_uint > &g_is_update, viennacl::context ctx)
Performs assembly for new R matrix on GPU.
Definition: spai-dynamic.hpp:533

compressed_matrix.hpp
Implementation of the compressed_matrix class.

block_vector.hpp
Implementation of a bunch of vectors on GPU. Experimental.

sparse_matrix_operations.hpp
Implementations of operations using sparse matrices.

viennacl::linalg::detail::spai::block_matrix::handle1
viennacl::ocl::handle< cl_mem > & handle1()
Returns a handle to the matrix dimensions.
Definition: block_matrix.hpp:59

viennacl::linalg::detail::spai::assemble_qr_block
void assemble_qr_block(std::vector< std::vector< unsigned int > > const &g_J, std::vector< std::vector< unsigned int > > const &g_I, std::vector< std::vector< unsigned int > > const &g_J_u, std::vector< std::vector< unsigned int > > const &g_I_u, std::vector< std::vector< unsigned int > > &g_I_q, block_matrix &g_A_I_J_u_vcl, viennacl::ocl::handle< cl_mem > &matrix_dimensions, block_matrix &g_A_I_u_J_u_vcl, std::vector< cl_uint > &g_is_update, bool is_empty_block, viennacl::context ctx)
Performs assembly for new QR block.
Definition: spai-dynamic.hpp:439

viennacl::linalg::detail::spai::initProjectSubMatrix
void initProjectSubMatrix(SparseMatrixT const &A_in, std::vector< unsigned int > const &J, std::vector< unsigned int > &I, DenseMatrixT &A_out)
Initializes a dense matrix from a sparse one.
Definition: spai.hpp:156

viennacl::project
matrix_range< MatrixType > project(MatrixType const &A, viennacl::range const &r1, viennacl::range const &r2)
Definition: matrix_proxy.hpp:326

viennacl::linalg::detail::spai::block_vector::handle
viennacl::ocl::handle< cl_mem > & handle()
Return handle to the elements.
Definition: block_vector.hpp:55

spai_tag.hpp
Implementation of the spai tag holding SPAI configuration parameters. Experimental.

viennacl::vcl_size_t
std::size_t vcl_size_t
Definition: forwards.h:75

cg.hpp
The conjugate gradient method is implemented here.

backend.hpp
Implementations of the OpenCL backend, in which all contexts are stored.

viennacl::linalg::detail::spai::block_matrix::handle2
viennacl::ocl::handle< cl_mem > & handle2()
Returns a handle to the start indices of matrix.
Definition: block_matrix.hpp:62

viennacl::linalg::detail::spai::sparse_inner_prod
void sparse_inner_prod(SparseVectorT const &v1, SparseVectorT const &v2, NumericT &res_v)
Dot product of two sparse vectors.
Definition: spai-dynamic.hpp:157

viennacl::traits::context
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40

viennacl::linalg::detail::spai::get_max_block_size
void get_max_block_size(std::vector< std::vector< SizeT > > const &inds, SizeT &max_size)
Getting max size of rows/columns from container of index set.
Definition: qr.hpp:338

v2
viennacl::vector< int > v2
Definition: global_variables.cpp:61

viennacl::ocl::enqueue
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50

viennacl::linalg::detail::spai::block_q_multiplication
void block_q_multiplication(std::vector< std::vector< unsigned int > > const &g_J_u, std::vector< std::vector< unsigned int > > const &g_I, block_matrix &g_A_I_J_vcl, block_vector &g_bv_vcl, block_matrix &g_A_I_J_u_vcl, std::vector< cl_uint > &g_is_update, viennacl::context ctx)
Performs multiplication Q'*A(I, \tilde J) on GPU.
Definition: spai-dynamic.hpp:361

viennacl::linalg::detail::spai::block_vector
Represents a contiguous vector on the GPU that holds a concatenation of small vectors.
Definition: block_vector.hpp:48

viennacl::linalg::opencl::kernels::spai::init
static void init(viennacl::ocl::context &ctx)
Definition: spai.hpp:594

viennacl::ocl::kernel::global_work_size
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751

viennacl::linalg::detail::spai::apply_q_trans_mat
void apply_q_trans_mat(MatrixT const &R, VectorT const &b_v, MatrixT &A)
Multiplication of Q'*A, where Q is in implicit form in the lower part of R and the vector of betas - b_v...
Definition: qr.hpp:404

viennacl::compressed_matrix
A sparse square matrix in compressed sparse rows format.
Definition: compressed_matrix.hpp:559

scalar.hpp
Implementation of the ViennaCL scalar class.

viennacl::linalg::detail::spai::QRBlockComposition
void QRBlockComposition(MatrixT const &A_I_J, MatrixT const &A_I_J_u, MatrixT &A_I_u_J_u)
Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4...
Definition: spai-dynamic.hpp:250

viennacl::linalg::detail::spai::composeNewR
void composeNewR(MatrixT const &A, MatrixT const &R_n, MatrixT &R)
Composition of new matrix R, which is going to be used in least-squares problem solving.
Definition: spai-dynamic.hpp:94

viennacl::ocl::handle< cl_mem >

viennacl::linalg::detail::spai::spai_tag::getResidualThreshold
double getResidualThreshold() const
Definition: spai_tag.hpp:88

viennacl::ocl::context::create_memory
viennacl::ocl::handle< cl_mem > create_memory(cl_mem_flags flags, unsigned int size, void *ptr=NULL) const
Creates a memory buffer within the context.
Definition: context.hpp:216