doc/detail_2spai_2spai_8hpp_source.html

 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP

 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP


 /* =========================================================================

    Copyright (c) 2010-2016, Institute for Microelectronics,

                             Institute for Analysis and Scientific Computing,

                             TU Wien.

    Portions of this software are copyright by UChicago Argonne, LLC.


                             -----------------

                   ViennaCL - The Vienna Computing Library

                             -----------------


    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at


    (A list of authors and contributors can be found in the manual)


    License:         MIT (X11), see file LICENSE in the base directory

 ============================================================================= */


 #include <math.h>

 #include <algorithm>
 #include <cstddef>
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <string>
 #include <utility>
 #include <vector>


 //local includes

 #include "viennacl/linalg/detail/spai/spai_tag.hpp"

 #include "viennacl/linalg/qr.hpp"

 #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"

 #include "viennacl/linalg/detail/spai/spai-static.hpp"

 #include "viennacl/linalg/detail/spai/sparse_vector.hpp"

 #include "viennacl/linalg/detail/spai/block_matrix.hpp"

 #include "viennacl/linalg/detail/spai/block_vector.hpp"


 //boost includes

 #include "boost/numeric/ublas/vector.hpp"

 #include "boost/numeric/ublas/matrix.hpp"

 #include "boost/numeric/ublas/matrix_proxy.hpp"

 #include "boost/numeric/ublas/vector_proxy.hpp"

 #include "boost/numeric/ublas/storage.hpp"

 #include "boost/numeric/ublas/io.hpp"

 #include "boost/numeric/ublas/lu.hpp"

 #include "boost/numeric/ublas/triangular.hpp"

 #include "boost/numeric/ublas/matrix_expression.hpp"


 // ViennaCL includes

 #include "viennacl/linalg/prod.hpp"

 #include "viennacl/matrix.hpp"

 #include "viennacl/compressed_matrix.hpp"

 #include "viennacl/linalg/sparse_matrix_operations.hpp"

 #include "viennacl/linalg/matrix_operations.hpp"

 #include "viennacl/scalar.hpp"

 #include "viennacl/linalg/inner_prod.hpp"

 #include "viennacl/linalg/ilu.hpp"

 #include "viennacl/ocl/backend.hpp"

 #include "viennacl/linalg/opencl/kernels/spai.hpp"


 #define VIENNACL_SPAI_K_b 20


 namespace viennacl

 {

 namespace linalg

 {

 namespace detail

 {

 namespace spai

 {


 /** @brief Debug helper: prints every stored entry of a sparse vector to std::cout.
 *
 * Each entry is written on its own line in the form "[ index ]:value".
 *
 * @param v   Sparse vector; its iterator must expose the index as 'first' and the value as 'second'
 */
 template<typename SparseVectorT>
 void print_sparse_vector(SparseVectorT const & v)
 {
   typedef typename SparseVectorT::const_iterator   EntryIterator;

   EntryIterator entry = v.begin();
   while (entry != v.end())
   {
     std::cout << "[ " << entry->first << " ]:" << entry->second << std::endl;
     ++entry;
   }
 }


 /** @brief Debug helper: prints a dense matrix to std::cout.
 *
 * The outer loop runs over the second index (0 .. size2()-1) and the inner loop over the
 * first index (0 .. size1()-1). Each output line therefore holds m(0,c), m(1,c), ...
 * for one fixed second index c, every value followed by a blank.
 *
 * @param m   Matrix providing size1(), size2() and operator()(i, j)
 */
 template<typename DenseMatrixT>
 void print_matrix(DenseMatrixT & m)
 {
   // Unsigned counters avoid the signed/unsigned comparison against size1()/size2().
   for (std::size_t second = 0; second < m.size2(); ++second)
   {
     for (std::size_t first = 0; first < m.size1(); ++first)
       std::cout << m(first, second) << " ";
     std::cout << std::endl;
   }
 }


 /** @brief Accumulates a scaled sparse vector into another one: res_v += b * v.
 *
 * Entries already present in res_v are updated; indices that only occur in v are
 * created in res_v through operator[].
 *
 * @param v       Sparse vector to be scaled and added
 * @param b       Scaling factor
 * @param res_v   Sparse vector that receives the sum (modified in place)
 */
 template<typename SparseVectorT, typename NumericT>
 void add_sparse_vectors(SparseVectorT const & v, NumericT b,  SparseVectorT & res_v)
 {
   typedef typename SparseVectorT::const_iterator   EntryIterator;

   EntryIterator entry = v.begin();
   while (entry != v.end())
   {
     res_v[entry->first] += b * entry->second;
     ++entry;
   }
 }


 /** @brief Computes the residual res = A*v - e_ind using column-wise sparse storage of A.
 *
 * The product is accumulated into res, so the caller has to pass an empty (cleared)
 * vector if only the residual is wanted.
 *
 * @param A_v_c   Columns of the matrix A, one sparse vector per column
 * @param v       Sparse vector the matrix is applied to
 * @param ind     Index of the unit vector that is subtracted
 * @param res     Sparse vector receiving the residual (modified in place)
 */
 template<typename SparseVectorT, typename NumericT>
 void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
                            SparseVectorT const & v,
                            unsigned int ind,
                            SparseVectorT & res)
 {
   typedef typename SparseVectorT::const_iterator   EntryIterator;

   // sparse matrix-vector product: every stored entry of v scales one column of A
   EntryIterator entry = v.begin();
   while (entry != v.end())
   {
     add_sparse_vectors(A_v_c[entry->first], entry->second, res);
     ++entry;
   }

   // subtract the unit vector
   res[ind] -= NumericT(1);
 }


 /** @brief Sets up the column index set J and the row index set I for one column.
 *
 * Thin wrapper: J is filled by buildColumnIndexSet(v, J), afterwards I is filled by
 * projectRows(A_v_c, J, I). Both helpers are defined outside this file section.
 * NOTE(review): presumably J receives the indices of the stored entries of v and I the
 * rows touched by the columns listed in J - confirm against the helper definitions.
 *
 * @param A_v_c   Column-wise sparse vectors of the matrix (forwarded to projectRows)
 * @param v       Sparse vector of the column under consideration
 * @param J       Output: column index set
 * @param I       Output: row index set
 */
 template<typename SparseVectorT>

 void build_index_set(std::vector<SparseVectorT> const & A_v_c,

                      SparseVectorT const & v,

                      std::vector<unsigned int> & J,

                      std::vector<unsigned int> & I)

 {

   buildColumnIndexSet(v, J);

   projectRows(A_v_c, J, I);

 }


 template<typename SparseMatrixT, typename DenseMatrixT>

 void initProjectSubMatrix(SparseMatrixT const & A_in,

                           std::vector<unsigned int> const & J,

                           std::vector<unsigned int> & I,

                           DenseMatrixT & A_out)

 {

   A_out.resize(I.size(), J.size(), false);

   for (vcl_size_t j = 0; j < J.size(); ++j)

     for (vcl_size_t i = 0; i < I.size(); ++i)

       A_out(i,j) = A_in(I[i],J[j]);

 }


 /************************************************** CPU BLOCK SET UP ***************************************/


 template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>

 void block_set_up(SparseMatrixT const & A,

                   std::vector<SparseVectorT> const & A_v_c,

                   std::vector<SparseVectorT> const & M_v,

                   std::vector<std::vector<unsigned int> >& g_I,

                   std::vector<std::vector<unsigned int> >& g_J,

                   std::vector<DenseMatrixT>& g_A_I_J,

                   std::vector<VectorT>& g_b_v)

 {

 #ifdef VIENNACL_WITH_OPENMP

   #pragma omp parallel for

 #endif

   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)

   {

     vcl_size_t i = static_cast<vcl_size_t>(i2);

     build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);

     initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);

     //print_matrix(g_A_I_J[i]);

     single_qr(g_A_I_J[i], g_b_v[i]);

     //print_matrix(g_A_I_J[i]);

   }

 }


 template<typename SparseVectorT>

 void index_set_up(std::vector<SparseVectorT> const & A_v_c,

                   std::vector<SparseVectorT> const & M_v,

                   std::vector<std::vector<unsigned int> > & g_J,

                   std::vector<std::vector<unsigned int> > & g_I)

 {

 #ifdef VIENNACL_WITH_OPENMP

   #pragma omp parallel for

 #endif

   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)

   {

     vcl_size_t i = static_cast<vcl_size_t>(i2);

     build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);

   }

 }


 /************************************************** GPU BLOCK SET UP ***************************************/


 /** @brief GPU variant of the block set-up: index sets on the host, block assembly and QR via OpenCL helpers.
 *
 * Runs three steps in order: index_set_up() fills g_J and g_I, block_assembly() builds the
 * blocks in g_A_I_J, and block_qr() processes them together with g_bv.
 *
 * @param A             System matrix; also supplies the viennacl::context used for block_qr
 * @param A_v_c         Column-wise sparse vectors of the system matrix
 * @param M_v           Sparse vectors of the columns to be processed
 * @param g_is_update   Per-column update flags. Passed BY VALUE, so anything the helpers write
 *                      into it is not visible to the caller.
 *                      NOTE(review): confirm that the copy is intentional.
 * @param g_I           Per-column row index sets (output)
 * @param g_J           Per-column column index sets (output)
 * @param g_A_I_J       Block matrix filled by block_assembly
 * @param g_bv          Block vector passed to block_qr
 */
 template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>

 void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,

                   std::vector<SparseVectorT> const & A_v_c,

                   std::vector<SparseVectorT> const & M_v,

                   std::vector<cl_uint> g_is_update,

                   std::vector<std::vector<unsigned int> > & g_I,

                   std::vector<std::vector<unsigned int> > & g_J,

                   block_matrix & g_A_I_J,

                   block_vector & g_bv)

 {

   viennacl::context ctx = viennacl::traits::context(A);

   // set by block_assembly(); its value is not read afterwards in this function
   bool is_empty_block;


   //build index set

   index_set_up(A_v_c, M_v, g_J, g_I);

   block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);

   block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);

 }


 /***************************************************************************************************/

 /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/

 /***************************************************************************************************/


 template<typename NumericT, typename SparseVectorT>

 void custom_fan_out(std::vector<NumericT> const & m_in,

                     unsigned int start_m_ind,

                     std::vector<unsigned int> const & J,

                     SparseVectorT & m)

 {

   unsigned int  cnt = 0;

   for (vcl_size_t i = 0; i < J.size(); ++i)

     m[J[i]] = m_in[start_m_ind + cnt++];

 }


 //GPU based least square problem

 template<typename SparseVectorT, typename NumericT>

 void least_square_solve(std::vector<SparseVectorT> & A_v_c,

                         std::vector<SparseVectorT> & M_v,

                         std::vector<std::vector<unsigned int> >& g_I,

                         std::vector<std::vector<unsigned int> > & g_J,

                         block_matrix & g_A_I_J_vcl,

                         block_vector & g_bv_vcl,

                         std::vector<SparseVectorT> & g_res,

                         std::vector<cl_uint> & g_is_update,

                         const spai_tag & tag,

                         viennacl::context ctx)

 {

   viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

   unsigned int y_sz, m_sz;

   std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));

   std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));


   get_size(g_I, y_sz);

   init_start_inds(g_I, y_inds);

   init_start_inds(g_J, m_inds);


   //create y_v

   std::vector<NumericT> y_v(y_sz, NumericT(0));

   for (vcl_size_t i = 0; i < M_v.size(); ++i)

   {

     for (vcl_size_t j = 0; j < g_I[i].size(); ++j)

     {

       if (g_I[i][j] == i)

         y_v[y_inds[i] + j] = NumericT(1.0);

     }

   }

   //compute m_v

   get_size(g_J, m_sz);

   std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));


   block_vector y_v_vcl;

   block_vector m_v_vcl;

   //prepearing memory for least square problem on GPU

   y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                               static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),

                                               &(y_v[0]));

   m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                               static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),

                                               &(m_v[0]));

   y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),

                                                &(y_inds[0]));

   viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,

                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),

                                                                            &(g_is_update[0]));

   viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);

   viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");

   ls_kernel.local_work_size(0, 1);

   ls_kernel.global_work_size(0, 256);

   viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),

                                    y_v_vcl.handle(), y_v_vcl.handle1(),

                                    g_A_I_J_vcl.handle1(), g_is_update_vcl,

                                    //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),

                                    static_cast<unsigned int>(M_v.size())));

   //copy vector m_v back from GPU to CPU

   cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),

                                        m_v_vcl.handle().get(), CL_TRUE, 0,

                                        sizeof(NumericT)*(m_v.size()),

                                        &(m_v[0]), 0, NULL, NULL);

   VIENNACL_ERR_CHECK(vcl_err);


   //fan out vector in parallel

   //#pragma omp parallel for

   for (long i = 0; i < static_cast<long>(M_v.size()); ++i)

   {

     if (g_is_update[static_cast<vcl_size_t>(i)])

     {

       //faned out onto sparse vector

       custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);

       g_res[static_cast<vcl_size_t>(i)].clear();

       compute_spai_residual<SparseVectorT, NumericT>(A_v_c,  M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);

       NumericT res_norm = 0;

       //compute norm of res - just to make sure that this implementatino works correct

       sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);

       //std::cout<<"Residual norm of column #: "<<i<<std::endl;

       //std::cout<<res_norm<<std::endl;

       //std::cout<<"************************"<<std::endl;

       g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);

     }

   }

 }


 //CPU based least square problems

 template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>

 void least_square_solve(std::vector<SparseVectorT> const & A_v_c,

                         std::vector<DenseMatrixT> & g_R,

                         std::vector<VectorT> & g_b_v,

                         std::vector<std::vector<unsigned int> > & g_I,

                         std::vector<std::vector<unsigned int> > & g_J,

                         std::vector<SparseVectorT> & g_res,

                         std::vector<bool> & g_is_update,

                         std::vector<SparseVectorT> & M_v,

                         spai_tag const & tag)

 {

   typedef typename DenseMatrixT::value_type       NumericType;


 #ifdef VIENNACL_WITH_OPENMP

   #pragma omp parallel for

 #endif

   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)

   {

     vcl_size_t i = static_cast<vcl_size_t>(i2);

     if (g_is_update[i])

     {

       VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());


       projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));

       apply_q_trans_vec(g_R[i], g_b_v[i], y);


       VectorT m_new =  boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());

       backwardSolve(g_R[i], y, m_new);

       fanOutVector(m_new, g_J[i], M_v[i]);

       g_res[i].clear();


       compute_spai_residual<SparseVectorT, NumericType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);


       NumericType res_norm = 0;

       sparse_norm_2(g_res[i], res_norm);

 //                    std::cout<<"Residual norm of column #: "<<i<<std::endl;

 //                    std::cout<<res_norm<<std::endl;

 //                    std::cout<<"************************"<<std::endl;

       g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic());

     }

   }

 }


 //************************************ UPDATE CHECK ***************************************************//


 /** @brief Reports whether at least one entry of the flag container is set.
 *
 * Despite its name, the function returns true as soon as ANY flag evaluates to true and
 * false only if none does (in particular for an empty container).
 *
 * @param parallel_is_update   Container of update flags; only size() and operator[] are used
 * @return true if any flag is set, false otherwise
 */
 template<typename VectorType>
 bool is_all_update(VectorType& parallel_is_update)
 {
   bool any_pending = false;
   unsigned int pos = 0;

   while (!any_pending && pos < parallel_is_update.size())
   {
     if (parallel_is_update[pos])
       any_pending = true;
     ++pos;
   }

   return any_pending;
 }


 //********************************** MATRIX VECTORIZATION ***********************************************//


 /** @brief Splits a sparse matrix into one sparse vector per column.
 *
 * Every stored entry (row, column, value) of M_in is written to M_v[column][row].
 * M_v must already provide at least as many vectors as the largest column index requires.
 *
 * @param M_in   Sparse matrix traversed through const_iterator1 / const_iterator2
 * @param M_v    Column vectors receiving the entries
 */
 template<typename SparseMatrixT, typename SparseVectorT>
 void vectorize_column_matrix(SparseMatrixT const & M_in,
                              std::vector<SparseVectorT> & M_v)
 {
   typedef typename SparseMatrixT::const_iterator1   OuterIterator;
   typedef typename SparseMatrixT::const_iterator2   EntryIterator;

   OuterIterator outer = M_in.begin1();
   while (outer != M_in.end1())
   {
     EntryIterator entry = outer.begin();
     while (entry != outer.end())
     {
       unsigned int const column = static_cast<unsigned int>(entry.index2());
       unsigned int const row    = static_cast<unsigned int>(entry.index1());
       M_v[column][row] = *entry;
       ++entry;
     }
     ++outer;
   }
 }


 /** @brief Splits a sparse matrix into one sparse vector per row.
 *
 * Every stored entry (row, column, value) of M_in is written to M_v[row][column].
 * M_v must already provide at least as many vectors as the largest row index requires.
 *
 * @param M_in   Sparse matrix traversed through const_iterator1 / const_iterator2
 * @param M_v    Row vectors receiving the entries
 */
 template<typename SparseMatrixT, typename SparseVectorT>
 void vectorize_row_matrix(SparseMatrixT const & M_in,
                           std::vector<SparseVectorT> & M_v)
 {
   typedef typename SparseMatrixT::const_iterator1   OuterIterator;
   typedef typename SparseMatrixT::const_iterator2   EntryIterator;

   OuterIterator outer = M_in.begin1();
   while (outer != M_in.end1())
   {
     EntryIterator entry = outer.begin();
     while (entry != outer.end())
     {
       unsigned int const row    = static_cast<unsigned int>(entry.index1());
       unsigned int const column = static_cast<unsigned int>(entry.index2());
       M_v[row][column] = *entry;
       ++entry;
     }
     ++outer;
   }
 }


 //************************************* BLOCK ASSEMBLY CODE *********************************************//


 template<typename SizeT>

 void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,

                         std::vector<cl_uint> & a)

 {

   vcl_size_t cnt = 0;


   for (vcl_size_t i = 0; i < ind_set.size(); ++i)

     for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)

       a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);

 }


 /** @brief Assembles the blocks A(I_k, J_k) of all columns on the OpenCL device.
 *
 * The index sets are flattened into contiguous host arrays together with their start
 * indices, the block sizes are obtained from compute_blocks_size(), the data is placed in
 * OpenCL buffers and the kernel "assemble_blocks" is enqueued. If the flattened row or
 * column index set is empty, nothing is enqueued and is_empty_block is set to true.
 *
 * @param A                System matrix on the device
 * @param g_J              Per-column column index sets
 * @param g_I              Per-column row index sets
 * @param g_A_I_J_vcl      Block matrix receiving the element buffer (handle), the matrix
 *                         dimensions (handle1) and the block start indices (handle2)
 * @param g_is_update      Per-column update flags, passed to the kernel via a device buffer
 * @param is_empty_block   Output: true if no block was assembled, false otherwise
 */
 template<typename NumericT, unsigned int AlignmentV>
 void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     std::vector<std::vector<unsigned int> > const & g_J,
                     std::vector<std::vector<unsigned int> > const & g_I,
                     block_matrix & g_A_I_J_vcl,
                     std::vector<cl_uint> & g_is_update,
                     bool & is_empty_block)
 {
   //computing start indices for index sets and start indices for block matrices
   unsigned int sz_I, sz_J, sz_blocks;
   std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
   std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
   std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
   std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));

   init_start_inds(g_J, j_ind);
   init_start_inds(g_I, i_ind);

   get_size(g_J, sz_J);
   get_size(g_I, sz_I);
   std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
   std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));

   // writing index sets to contiguous arrays
   write_set_to_array(g_I, I_set);
   write_set_to_array(g_J, J_set);

   // if block for assembly does exist
   if (I_set.size() > 0 && J_set.size() > 0)
   {
     viennacl::context ctx = viennacl::traits::context(A);
     viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
     compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
     std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));

     block_vector set_I_vcl, set_J_vcl;

     //init memory on GPU
     //contiguous g_A_I_J
     g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)),
                                                     &(con_A_I_J[0]));
     g_A_I_J_vcl.handle().context(opencl_ctx);

     //matrix_dimensions
     g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
                                                      &(matrix_dims[0]));
     g_A_I_J_vcl.handle1().context(opencl_ctx);

     //start_block inds
     g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                      static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                      &(blocks_ind[0]));
     g_A_I_J_vcl.handle2().context(opencl_ctx);

     //set_I
     set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                   static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
                                                   &(I_set[0]));
     set_I_vcl.handle().context(opencl_ctx);

     //set_J
     set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                   static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
                                                   &(J_set[0]));
     set_J_vcl.handle().context(opencl_ctx);

     //i_ind
     set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                    &(i_ind[0]));
     // context belongs to the start-index handle just created (was set on handle() a second time)
     set_I_vcl.handle1().context(opencl_ctx);

     //j_ind
     set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                    &(j_ind[0]));
     // same correction as for set_I_vcl above
     set_J_vcl.handle1().context(opencl_ctx);

     viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                              static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
                                                                              &(g_is_update[0]));

     viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
     viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks");
     assembly_kernel.local_work_size(0, 1);
     assembly_kernel.global_work_size(0, 256);
     viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                                            set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
                                            set_J_vcl.handle1(),
                                            g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
                                            g_is_update_vcl,
                                            static_cast<unsigned int>(g_I.size())));
     is_empty_block = false;
   }
   else
     is_empty_block = true;
 }


 /************************************************************************************************************************/


 /** @brief Writes the vectorized matrix back into a sparse matrix.
 *
 * For the k-th vector and each of its entries (index, value):
 *  - is_right == true:  M(index, k) = value  (vectors are columns of M)
 *  - is_right == false: M(k, index) = value  (transposed fill, vectors become rows of M)
 *
 * @param M_v        Sparse vectors to be inserted
 * @param M          Destination matrix, written through operator()(row, column)
 * @param is_right   Selects column-wise (true) or transposed (false) insertion
 */
 template<typename SparseMatrixT, typename SparseVectorT>
 void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
                            SparseMatrixT& M,
                            bool is_right)
 {
   typedef typename SparseVectorT::const_iterator   EntryIterator;

   for (unsigned int k = 0; k < M_v.size(); ++k)
   {
     SparseVectorT const & current = M_v[k];

     for (EntryIterator entry = current.begin(); entry != current.end(); ++entry)
     {
       if (is_right)
         M(entry->first, k) = entry->second;
       else  //transposed fill of M
         M(k, entry->first) = entry->second;
     }
   }
 }


 template<typename MatrixT>

 void sparse_transpose(MatrixT const & A_in, MatrixT & A)

 {

   typedef typename MatrixT::value_type         NumericType;


   std::vector<std::map<vcl_size_t, NumericType> >   temp_A(A_in.size2());

   A.resize(A_in.size2(), A_in.size1(), false);


   for (typename MatrixT::const_iterator1 row_it = A_in.begin1();

        row_it != A_in.end1();

        ++row_it)

   {

     for (typename MatrixT::const_iterator2 col_it = row_it.begin();

          col_it != row_it.end();

          ++col_it)

     {

       temp_A[col_it.index2()][col_it.index1()] = *col_it;

     }

   }


   for (vcl_size_t i=0; i<temp_A.size(); ++i)

   {

     for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();

          it != temp_A[i].end();

          ++it)

       A(i, it->first) = it->second;

   }

 }


 //        template<typename SparseVectorType>

 //        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){

 //            for (int i = 0; i < l_M_v.size(); ++i){

 //                l_M_v[i] = M_v[i + beg_ind];

 //            }

 //        }


 /** @brief Construction of the SPAI preconditioner on the CPU.
 *
 * The columns of M are processed in windows of at most VIENNACL_SPAI_K_b columns; the current
 * window [begin, end) is stored in the tag. For every window the blocks are set up
 * (block_set_up in the first iteration, block_update afterwards) and the least-squares
 * problems are solved until no column requests a further update, the iteration limit of the
 * tag is reached, or the tag is static (single pass). Finally M is resized without
 * preserving its content and refilled from the computed vectors.
 *
 * @param A     System matrix
 * @param M     On entry: initial guess/pattern of the approximate inverse. On exit: the result
 * @param tag   SPAI configuration; its begin/end indices are overwritten during the run
 */
 template<typename MatrixT>
 void computeSPAI(MatrixT const & A,
                  MatrixT & M,
                  spai_tag & tag)
 {
   typedef typename MatrixT::value_type                                       NumericT;
   typedef typename boost::numeric::ublas::vector<NumericT>                   VectorType;
   typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>   SparseVectorType;
   typedef typename boost::numeric::ublas::matrix<NumericT>                   DenseMatrixType;

   unsigned int cur_iter = 0;
   tag.setBegInd(0);
   // The first window must not extend past the last column: without the clamp, matrices with
   // fewer than VIENNACL_SPAI_K_b columns made the std::copy below read beyond M_v.end().
   tag.setEndInd(std::min(static_cast<long>(VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
   bool go_on = true;
   std::vector<SparseVectorType> A_v_c(M.size2());
   std::vector<SparseVectorType> M_v(M.size2());
   vectorize_column_matrix(A, A_v_c);
   vectorize_column_matrix(M, M_v);

   while (go_on)
   {
     // this is the last window if it already reaches the final column
     go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
     cur_iter = 0;
     unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
     std::vector<bool> g_is_update(l_sz, true);

     // local copy of the columns in the current window
     std::vector<SparseVectorType> l_M_v(l_sz);
     std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());

     //working blocks
     std::vector<DenseMatrixType> g_A_I_J(l_sz);
     std::vector<VectorType> g_b_v(l_sz);
     std::vector<SparseVectorType> g_res(l_sz);
     std::vector<std::vector<unsigned int> > g_I(l_sz);
     std::vector<std::vector<unsigned int> > g_J(l_sz);

     while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
     {
       // PHASE ONE: set up (first pass) or update (later passes) the blocks
       if (cur_iter == 0)
         block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
       else
         block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);

       // PHASE TWO: least squares solution
       least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);

       if (tag.getIsStatic()) break;
       cur_iter++;
     }

     // store the window result and advance to the next window
     std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
     tag.setBegInd(tag.getEndInd());
     tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
   }

   M.resize(M.size1(), M.size2(), false);
   insert_sparse_columns(M_v, M, tag.getIsRight());
 }


 /** @brief Construction of the SPAI preconditioner using OpenCL for block set-up and least squares.
 *
 * All columns are processed at once: blocks are set up (block_set_up in the first iteration,
 * block_update afterwards) and the least-squares problems are solved until no column requests
 * a further update, the iteration limit of the tag is reached, or the tag is static (single
 * pass). The result is assembled in cpu_M on the host and then copied to M.
 *
 * @param A       System matrix on the device
 * @param cpu_A   Host copy of the system matrix
 * @param cpu_M   Host matrix: initial guess/pattern on entry, result on exit
 * @param M       Device matrix receiving a copy of the result
 * @param tag     SPAI configuration
 */
 template<typename NumericT, unsigned int AlignmentV>
 void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input
                  boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,
                  boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output
                  viennacl::compressed_matrix<NumericT, AlignmentV> & M,
                  spai_tag const & tag)
 {
   typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>        SparseVectorType;

   // every column starts out flagged for update
   std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));

   // column-wise host representation of A and M
   std::vector<SparseVectorType> A_v_c(cpu_M.size2());
   std::vector<SparseVectorType> M_v(cpu_M.size2());
   vectorize_column_matrix(cpu_A, A_v_c);
   vectorize_column_matrix(cpu_M, M_v);

   std::vector<SparseVectorType> g_res(cpu_M.size2());
   std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
   std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());

   //OpenCL variables
   block_matrix g_A_I_J_vcl;
   block_vector g_bv_vcl;

   for (unsigned int cur_iter = 0;
        (cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update);
        ++cur_iter)
   {
     // PHASE ONE: set up (first pass) or update (later passes) the blocks
     if (cur_iter == 0)
       block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
     else
       block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);

     // PHASE TWO: least squares solution
     least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));

     // a static pattern needs exactly one pass
     if (tag.getIsStatic())
       break;
   }

   cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
   insert_sparse_columns(M_v, cpu_M, tag.getIsRight());

   //copy back to GPU
   M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
   viennacl::copy(cpu_M, M);
 }


 }

 }

 }

 }

 #endif

viennacl::linalg::opencl::kernels::spai
Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners...
Definition: spai.hpp:587

viennacl::linalg::detail::spai::insert_sparse_columns
void insert_sparse_columns(std::vector< SparseVectorT > const &M_v, SparseMatrixT &M, bool is_right)
Insertion of vectorized matrix column into original sparse matrix.
Definition: spai.hpp:616

viennacl::linalg::detail::spai::block_matrix
Represents contiguous matrices on GPU.
Definition: block_matrix.hpp:49

viennacl::linalg::detail::spai::is_all_update
bool is_all_update(VectorType &parallel_is_update)
Definition: spai.hpp:443

viennacl::linalg::detail::spai::add_sparse_vectors
void add_sparse_vectors(SparseVectorT const &v, NumericT b, SparseVectorT &res_v)
Accumulates a scaled sparse vector into another one: res_v += b*v.
Definition: spai.hpp:105

matrix_operations.hpp
Implementations of dense matrix related operations including matrix-vector products.

viennacl::linalg::detail::spai::block_matrix::handle
viennacl::ocl::handle< cl_mem > & handle()
Returns a handle to the elements.
Definition: block_matrix.hpp:56

viennacl::linalg::detail::spai::sparse_norm_2
void sparse_norm_2(SparseVectorT const &v, NumericT &norm)
Computation of Euclidean norm for sparse vector.
Definition: spai-dynamic.hpp:141

viennacl::linalg::detail::spai::block_vector::handle1
viennacl::ocl::handle< cl_mem > & handle1()
Return handle to start indices.
Definition: block_vector.hpp:58

viennacl::ocl::context::get_queue
viennacl::ocl::command_queue & get_queue()
Definition: context.hpp:266

prod.hpp
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...

viennacl::ocl::kernel
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58

matrix.hpp
Implementation of the dense matrix class.

viennacl::linalg::detail::spai::sparse_transpose
void sparse_transpose(MatrixT const &A_in, MatrixT &A)
Transposition of sparse matrix.
Definition: spai.hpp:640

viennacl::ocl::kernel::local_work_size
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742

spai.hpp
OpenCL kernel file for sparse approximate inverse operations.

viennacl::linalg::detail::spai::build_index_set
void build_index_set(std::vector< SparseVectorT > const &A_v_c, SparseVectorT const &v, std::vector< unsigned int > &J, std::vector< unsigned int > &I)
Setting up index set of columns and rows for certain column.
Definition: spai.hpp:139

viennacl::linalg::detail::spai::spai_tag::setBegInd
void setBegInd(long beg_ind)
Definition: spai_tag.hpp:124

viennacl::linalg::detail::spai::computeSPAI
void computeSPAI(MatrixT const &A, MatrixT &M, spai_tag &tag)
Construction of SPAI preconditioner on CPU.
Definition: spai.hpp:686

viennacl::ocl::context
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55

viennacl::linalg::detail::spai::compute_spai_residual
void compute_spai_residual(std::vector< SparseVectorT > const &A_v_c, SparseVectorT const &v, unsigned int ind, SparseVectorT &res)
Computation of residual res = A*v - e.
Definition: spai.hpp:120

viennacl::linalg::detail::spai::compute_blocks_size
void compute_blocks_size(std::vector< std::vector< unsigned int > > const &g_I, std::vector< std::vector< unsigned int > > const &g_J, unsigned int &sz, std::vector< cl_uint > &blocks_ind, std::vector< cl_uint > &matrix_dims)
Computes the overall element count, start indices and matrix dimensions for the blocks given by the index sets g_I and g_J.
Definition: qr.hpp:129

viennacl::linalg::detail::spai::custom_fan_out
void custom_fan_out(std::vector< NumericT > const &m_in, unsigned int start_m_ind, std::vector< unsigned int > const &J, SparseVectorT &m)
Extraction of sparse vector m for a particular column from m_in - contiguous vector for all columns...
Definition: spai.hpp:271

viennacl::linalg::detail::spai::get_size
void get_size(std::vector< std::vector< SizeT > > const &inds, SizeT &size)
Computes size of particular container of index set.
Definition: qr.hpp:151

viennacl::linalg::detail::spai::least_square_solve
void least_square_solve(std::vector< SparseVectorT > &A_v_c, std::vector< SparseVectorT > &M_v, std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, block_matrix &g_A_I_J_vcl, block_vector &g_bv_vcl, std::vector< SparseVectorT > &g_res, std::vector< cl_uint > &g_is_update, const spai_tag &tag, viennacl::context ctx)
Solution of the least-squares problem on the GPU.
Definition: spai.hpp:298

viennacl::linalg::detail::spai::vectorize_row_matrix
void vectorize_row_matrix(SparseMatrixT const &M_in, std::vector< SparseVectorT > &M_v)
Definition: spai.hpp:472

viennacl::traits::clear
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
Definition: clear.hpp:43

viennacl::linalg::detail::spai::spai_tag
A tag for SPAI.
Definition: spai_tag.hpp:64

inner_prod.hpp
Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.

sparse_vector.hpp
Implementation of a helper sparse vector class for SPAI. Experimental.

viennacl::compressed_matrix::handle
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
Definition: compressed_matrix.hpp:942

viennacl::compressed_matrix::handle1
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Definition: compressed_matrix.hpp:936

viennacl::linalg::detail::spai::print_matrix
void print_matrix(DenseMatrixT &m)
Definition: spai.hpp:88

viennacl::linalg::detail::spai::block_update
void block_update(SparseMatrixT const &A, std::vector< SparseVectorT > const &A_v_c, std::vector< SparseVectorT > &g_res, std::vector< bool > &g_is_update, std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, std::vector< VectorT > &g_b_v, std::vector< DenseMatrixT > &g_A_I_J, spai_tag const &tag)
CPU-based dynamic update for SPAI preconditioner.
Definition: spai-dynamic.hpp:291

viennacl::ocl::handle::context
viennacl::ocl::context const & context() const
Definition: handle.hpp:193

viennacl::ocl::command_queue::handle
viennacl::ocl::handle< cl_command_queue > const & handle() const
Definition: command_queue.hpp:81

NumericT
float NumericT
Definition: bisect.cpp:40

viennacl::context
Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also su...
Definition: context.hpp:39

VIENNACL_ERR_CHECK
#define VIENNACL_ERR_CHECK(err)
Definition: error.hpp:681

viennacl::linalg::detail::spai::single_qr
void single_qr(MatrixT &R, VectorT &b_v)
Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224.
Definition: qr.hpp:311

viennacl::linalg::detail::spai::apply_q_trans_vec
void apply_q_trans_vec(MatrixT const &R, VectorT const &b_v, VectorT &y)
Recovers Q from the matrix R and the vector of betas b_v.
Definition: qr.hpp:377

viennacl::linalg::detail::spai::block_assembly
void block_assembly(viennacl::compressed_matrix< NumericT, AlignmentV > const &A, std::vector< std::vector< unsigned int > > const &g_J, std::vector< std::vector< unsigned int > > const &g_I, block_matrix &g_A_I_J_vcl, std::vector< cl_uint > &g_is_update, bool &is_empty_block)
Assembly of blocks on GPU by a given set of row indices: g_I and column indices: g_J.
Definition: spai.hpp:507

viennacl::linalg::detail::spai::projectRows
void projectRows(std::vector< SparseVectorT > const &A_v_c, std::vector< unsigned int > const &J, std::vector< unsigned int > &I)
Row projection for matrix A(:,J) -> A(I,J), building index set of non-zero rows.
Definition: spai-static.hpp:171

block_matrix.hpp
Implementation of a bunch of (small) matrices on GPU. Experimental.

viennacl::ocl::handle::get
const OCL_TYPE & get() const
Definition: handle.hpp:191

viennacl::linalg::detail::spai::backwardSolve
void backwardSolve(MatrixT const &R, VectorT const &y, VectorT &x)
Solution of the linear system R*x = y by backward substitution.
Definition: spai-static.hpp:102

viennacl::ocl::context::get_kernel
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:605

viennacl::linalg::detail::spai::init_start_inds
void init_start_inds(std::vector< std::vector< SizeT > > const &inds, std::vector< cl_uint > &start_inds)
Initializes start indices of particular index set.
Definition: qr.hpp:165

ilu.hpp
Implementations of incomplete factorization preconditioners. Convenience header file.

spai-static.hpp
Implementation of a static SPAI. Experimental.

viennacl::linalg::detail::spai::spai_tag::getEndInd
long getEndInd() const
Definition: spai_tag.hpp:98

viennacl::linalg::detail::spai::spai_tag::getIsRight
bool getIsRight() const
Definition: spai_tag.hpp:94

compressed_matrix.hpp
Implementation of the compressed_matrix class.

block_vector.hpp
Implementation of a bunch of vectors on GPU. Experimental.

viennacl::linalg::detail::spai::spai_tag::getBegInd
long getBegInd() const
Definition: spai_tag.hpp:96

sparse_matrix_operations.hpp
Implementations of operations using sparse matrices.

viennacl::linalg::detail::spai::block_matrix::handle1
viennacl::ocl::handle< cl_mem > & handle1()
Returns a handle to the matrix dimensions.
Definition: block_matrix.hpp:59

viennacl::linalg::detail::spai::initProjectSubMatrix
void initProjectSubMatrix(SparseMatrixT const &A_in, std::vector< unsigned int > const &J, std::vector< unsigned int > &I, DenseMatrixT &A_out)
Initializes a dense matrix from a sparse one.
Definition: spai.hpp:156

viennacl::compressed_matrix::handle2
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
Definition: compressed_matrix.hpp:938

viennacl::linalg::detail::spai::sparse_vector
Represents a sparse vector based on std::map
Definition: sparse_vector.hpp:50

viennacl::linalg::detail::spai::block_vector::handle
viennacl::ocl::handle< cl_mem > & handle()
Return handle to the elements.
Definition: block_vector.hpp:55

qr.hpp
Provides a QR factorization using a block-based approach.

spai_tag.hpp
Implementation of the spai tag holding SPAI configuration parameters. Experimental.

viennacl::vcl_size_t
std::size_t vcl_size_t
Definition: forwards.h:75

viennacl::linalg::detail::spai::spai_tag::getResidualNormThreshold
double getResidualNormThreshold() const
Definition: spai_tag.hpp:86

VIENNACL_SPAI_K_b
#define VIENNACL_SPAI_K_b
Definition: spai.hpp:68

backend.hpp
Implementation of the OpenCL backend, in which all contexts are stored.

viennacl::linalg::detail::spai::block_matrix::handle2
viennacl::ocl::handle< cl_mem > & handle2()
Returns a handle to the start indices of matrix.
Definition: block_matrix.hpp:62

viennacl::linalg::detail::spai::spai_tag::getIsStatic
bool getIsStatic() const
Definition: spai_tag.hpp:92

viennacl::linalg::detail::spai::block_set_up
void block_set_up(SparseMatrixT const &A, std::vector< SparseVectorT > const &A_v_c, std::vector< SparseVectorT > const &M_v, std::vector< std::vector< unsigned int > > &g_I, std::vector< std::vector< unsigned int > > &g_J, std::vector< DenseMatrixT > &g_A_I_J, std::vector< VectorT > &g_b_v)
Setting up blocks and QR factorizing them on CPU.
Definition: spai.hpp:181

viennacl::linalg::detail::spai::spai_tag::setEndInd
void setEndInd(long end_ind)
Definition: spai_tag.hpp:126

viennacl::traits::context
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40

viennacl::ocl::enqueue
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50

viennacl::linalg::detail::spai::write_set_to_array
void write_set_to_array(std::vector< std::vector< SizeT > > const &ind_set, std::vector< cl_uint > &a)
Definition: spai.hpp:484

viennacl::copy
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
Definition: circulant_matrix.hpp:150

viennacl::linalg::detail::spai::block_vector
Represents a contiguous vector on the GPU to represent a concatenation of small vectors.
Definition: block_vector.hpp:48

viennacl::linalg::detail::spai::fanOutVector
void fanOutVector(VectorT const &m_in, std::vector< unsigned int > const &J, SparseVectorT &m)
Projects solution of LS problem onto original column m.
Definition: spai-static.hpp:88

viennacl::linalg::detail::spai::buildColumnIndexSet
void buildColumnIndexSet(SparseVectorT const &v, std::vector< unsigned int > &J)
Builds index set of projected columns for current column of preconditioner.
Definition: spai-static.hpp:140

viennacl::linalg::detail::spai::spai_tag::getIterationLimit
unsigned int getIterationLimit() const
Definition: spai_tag.hpp:90

viennacl::linalg::opencl::kernels::spai::init
static void init(viennacl::ocl::context &ctx)
Definition: spai.hpp:594

viennacl::ocl::kernel::global_work_size
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751

viennacl::linalg::detail::spai::print_sparse_vector
void print_sparse_vector(SparseVectorT const &v)
Definition: spai.hpp:81

viennacl::linalg::detail::spai::vectorize_column_matrix
void vectorize_column_matrix(SparseMatrixT const &M_in, std::vector< SparseVectorT > &M_v)
Stores the sparse matrix M_in as a vector of sparse column vectors M_v.
Definition: spai.hpp:462

viennacl::compressed_matrix
A sparse square matrix in compressed sparse rows format.
Definition: compressed_matrix.hpp:559

viennacl::linalg::detail::min
T min(const T &lhs, const T &rhs)
Minimum.
Definition: util.hpp:45

viennacl::linalg::detail::spai::index_set_up
void index_set_up(std::vector< SparseVectorT > const &A_v_c, std::vector< SparseVectorT > const &M_v, std::vector< std::vector< unsigned int > > &g_J, std::vector< std::vector< unsigned int > > &g_I)
Setting up index set of columns and rows for all columns.
Definition: spai.hpp:211

scalar.hpp
Implementation of the ViennaCL scalar class.

viennacl::compressed_matrix::resize
void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve=true)
Resize the matrix.
Definition: compressed_matrix.hpp:829

spai-dynamic.hpp
Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates. Experimental...

viennacl::ocl::handle< cl_mem >

viennacl::ocl::context::create_memory
viennacl::ocl::handle< cl_mem > create_memory(cl_mem_flags flags, unsigned int size, void *ptr=NULL) const
Creates a memory buffer within the context.
Definition: context.hpp:216