#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP

/** @file viennacl/linalg/cuda/amg_operations.hpp
    @brief Implementations of routines for algebraic multigrid (AMG) using CUDA. Experimental.
*/

#include <cstdlib>
#include <cmath>
#include <map>
#include <set>
#include <stdexcept>

#include "viennacl/linalg/detail/amg/amg_base.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace amg
{
/** @brief CUDA kernel for the assembly of the influence matrix in the trivial case: all connections are considered strong. */
__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size1; i += global_size)
  {
    unsigned int tmp = row_indices[i];
    influences_row[i] = tmp;
    influences_values[i] = row_indices[i+1] - tmp; // number of influences = nonzeros in row i
  }

  for (unsigned int i = global_id; i < nnz; i += global_size)
    influences_id[i] = column_indices[i];

  if (global_id == 0)
    influences_row[size1] = row_indices[size1]; // finalize the row array
}
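
// Note: in the trivial case the influence pattern is just a copy of the CSR sparsity
// pattern of A: influences_row/influences_id mirror row_indices/column_indices, and
// influences_values[i] holds the number of (strong) influences of point i.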
/** @brief Routine for taking all connections in the matrix as strong. */
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}
/** @brief Routine for extracting strongly connected points considering a user-provided threshold value. */
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
                            viennacl::linalg::amg_tag & tag)
{
  throw std::runtime_error("not implemented yet");
}
/** @brief Dispatcher for influence processing. */
template<typename NumericT>
void amg_influence(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  // TODO: dispatch based on the influence measure selected by the user
  amg_influence_trivial(A, amg_context, tag);
}
/** @brief Assign IDs to coarse points. */
inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
{
  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
  viennacl::backend::memory_read(amg_context.coarse_id_.handle(),   0, coarse_ids.raw_size(),  coarse_ids.get());

  unsigned int coarse_id = 0;
  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
  {
    coarse_ids.set(i, coarse_id);
    if (point_types.get(i) == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      ++coarse_id;
  }

  amg_context.num_coarse_ = coarse_id;

  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
}
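
// Illustration of the enumeration above (C = coarse, F = fine): for
//   point_types_ = [C, F, C, C, F]
// the loop yields
//   coarse_id_   = [0, 1, 1, 2, 3]   and   num_coarse_ = 3.
// Only the IDs stored at coarse points are meaningful at this stage; fine points
// receive the ID of their aggregate in a later propagation step.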
/** @brief CUDA kernel initializing the work vectors at each PMIS iteration. */
template<typename IndexT>
__global__ void amg_pmis2_init_workdata(IndexT *work_state,
                                        IndexT *work_random,
                                        IndexT *work_index,
                                        IndexT const *point_types,
                                        IndexT const *random_weights,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    switch (point_types[i])
    {
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:      work_state[i] = 0; break;
    case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    work_state[i] = 2; break;
    default: break;
    }

    work_random[i] = random_weights[i];
    work_index[i]  = i;
  }
}
/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation. */
template<typename IndexT>
__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
                                           IndexT const *work_random,
                                           IndexT const *work_index,
                                           IndexT       *work_state2,
                                           IndexT       *work_random2,
                                           IndexT       *work_index2,
                                           IndexT const *influences_row,
                                           IndexT const *influences_id,
                                           unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    // load the current triple for node i:
    unsigned int state  = work_state[i];
    unsigned int random = work_random[i];
    unsigned int index  = work_index[i];

    // lexicographic max over the triples of all influencing neighbors:
    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int influenced_point_id = influences_id[j];

      if (state < work_state[influenced_point_id])
      {
        state  = work_state[influenced_point_id];
        random = work_random[influenced_point_id];
        index  = work_index[influenced_point_id];
      }
      else if (state == work_state[influenced_point_id])
      {
        if (random < work_random[influenced_point_id])
        {
          state  = work_state[influenced_point_id];
          random = work_random[influenced_point_id];
          index  = work_index[influenced_point_id];
        }
        else if (random == work_random[influenced_point_id])
        {
          if (index < work_index[influenced_point_id])
          {
            state  = work_state[influenced_point_id];
            random = work_random[influenced_point_id];
            index  = work_index[influenced_point_id];
          }
        }
      }
    }

    // write the maximum triple back (double-buffered to avoid read/write races):
    work_state2[i]  = state;
    work_random2[i] = random;
    work_index2[i]  = index;
  }
}
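
// Note: amg_coarse_ag_stage1_mis2() below applies this kernel twice per PMIS iteration,
// so each node effectively sees the maximum triple over its distance-2 neighborhood.
// This is what makes the resulting independent set a MIS of distance 2 (MIS-2).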
/** @brief CUDA kernel for marking MIS and non-MIS nodes. */
template<typename IndexT>
__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
                                         IndexT const *work_index,
                                         IndexT *point_types,
                                         IndexT *undecided_buffer,
                                         unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  unsigned int num_undecided = 0;
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    unsigned int max_state = work_state[i];
    unsigned int max_index = work_index[i];

    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      if (i == max_index) // node i is the maximum in its neighborhood, so it becomes a MIS (coarse) node
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
      else if (max_state == 2) // a neighbor is a MIS node (cf. state mapping in amg_pmis2_init_workdata), so node i is fine
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      else
        num_undecided += 1;
    }
  }

  // reduction of the number of undecided nodes within each block:
  __shared__ unsigned int shared_buffer[256];
  shared_buffer[threadIdx.x] = num_undecided;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    undecided_buffer[blockIdx.x] = shared_buffer[0];
}
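
// Note: the block-level reduction above assumes a power-of-two block size with
// blockDim.x <= 256 (cf. shared_buffer); all launches in this file use 128 threads per block.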
/** @brief CUDA kernel for resetting non-MIS (i.e. fine) points to undecided so that subsequent kernels work correctly. */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}
/** @brief AG (aggregation based) coarsening, stage 1: Build aggregates by computing a maximal independent set of distance 2 (MIS-2) in parallel. */
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  (void)tag;

  // assign a random weight to each point in order to break ties between neighbors:
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // work vectors for the state triples (status, weight, node ID), double-buffered:
  viennacl::vector<unsigned int> work_state(A.size1(),   viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(),   viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));

  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
  viennacl::vector<unsigned int> undecided_buffer(128, viennacl::traits::context(A)); // one entry per CUDA block
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  unsigned int pmis_iters = 0; // iteration counter (useful for debugging)
  while (num_undecided > 0)
  {
    ++pmis_iters;

    // initialize temporary work data:
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state), viennacl::cuda_arg(work_random), viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_), viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata");

    // two max-sweeps over the one-ring neighborhood, i.e. a max over the distance-2 neighborhood:
    for (unsigned int r = 0; r < 2; ++r)
    {
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state), viennacl::cuda_arg(work_random), viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2), viennacl::cuda_arg(work_random2), viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_), viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1()));
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    // mark MIS and non-MIS nodes:
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state), viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_), viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes");

    // sum up the per-block counters of undecided points on the host:
    viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];
  }

  // reset tentative fine points to undecided, so that the subsequent aggregation stages can assign them:
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}
/** @brief CUDA kernel propagating the coarse aggregate ID of each coarse point to all points it influences. */
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT       *point_types,
                                                 IndexT       *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
    {
      unsigned int coarse_index = coarse_ids[i];

      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        coarse_ids[influenced_point_id] = coarse_index; // assign the fine point to the aggregate of point i

        if (influenced_point_id != i) // any write races between threads are harmless here
          point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      }
    }
  }
}
/** @brief CUDA kernel merging each undecided point into the aggregate of its first decided (coarse or fine) neighbor. */
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT       *point_types,
                                        IndexT       *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      unsigned int j_stop = influences_row[i + 1];
      for (unsigned int j = influences_row[i]; j < j_stop; ++j)
      {
        unsigned int influenced_point_id = influences_id[j];
        if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
        {
          coarse_ids[i] = coarse_ids[influenced_point_id]; // adopt the aggregate of the first decided neighbor
          break;
        }
      }
    }
  }
}
/** @brief CUDA kernel setting the remaining undecided points to fine points (their coarse IDs were already set by amg_agg_merge_undecided). */
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
                                          unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}
/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG) */
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  amg_influence_trivial(A, amg_context, tag);

  // Stage 1: Build aggregates:
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  enumerate_coarse_points(amg_context);

  // Stage 2: Propagate coarse aggregate indices to neighbors:
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");

  // Stage 3: Merge remaining undecided points into the first aggregate found among their neighbors:
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  // Stage 4: Set the remaining undecided points to fine points (coarse IDs were already set in Stage 3).
  //          Stages 3 and 4 are split into two kernels in order to avoid race conditions.
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                          static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}
/** @brief Calls the right coarsening procedure. */
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_coarsening_method())
  {
  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
  default: throw std::runtime_error("not implemented yet");
  }
}
/** @brief CUDA kernel assembling the aggregation-based prolongation matrix: row i holds a single unit entry in the column of its aggregate. */
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int i = global_id; i < size; i += global_size)
  {
    P_row_buffer[i] = i;
    P_col_buffer[i] = coarse_ids[i];
    P_elements[i]   = NumericT(1);
  }

  if (global_id == 0)
    P_row_buffer[size] = size; // finalize the row array
}
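
// Illustration: for coarse_ids = [0, 0, 1, 1] the kernel produces the Boolean
// aggregation prolongator
//   P = [1 0]
//       [1 0]
//       [0 1]
//       [0 1]
// in CSR format, i.e. exactly one unit entry per row, placed in column coarse_ids[i].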
/** @brief AG (aggregation based) interpolation. Multi-threaded! (VIENNACL_INTERPOL_AG) */
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}
/** @brief CUDA kernel assembling the damped Jacobi smoother matrix used for smoothing the tentative prolongator. */
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT     *Jacobi_elements,
          NumericT     omega)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int row = global_id; row < A_size1; row += global_size)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    Jacobi_row_indices[row] = row_begin;

    // Step 1: Extract the diagonal entry of the current row:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }
    }

    // Step 2: Write the entries of the Jacobi smoother matrix:
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      if (col_index == row)
        Jacobi_elements[j] = NumericT(1) - omega;
      else
        Jacobi_elements[j] = - omega * A_elements[j] / diag;
    }
  }

  if (global_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz; // finalize the row array
}
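
// The assembled matrix is the damped Jacobi smoother J = I - omega * D^{-1} * A
// (with D the diagonal of A): 1 - omega on the diagonal, -omega * a_ij / a_ii off
// the diagonal. amg_interpol_sa() below then forms P = J * P_tentative, i.e. the
// tentative prolongator smoothed by one damped Jacobi sweep.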
/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA) */
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  // Step 1: Build the tentative prolongator from the aggregates:
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  // Step 2: Assemble the damped Jacobi smoother matrix:
  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       NumericT(tag.get_jacobi_weight()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  // Step 3: Smooth the tentative prolongator:
  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}
/** @brief Dispatcher for building the interpolation matrix. */
template<typename MatrixT>
void amg_interpol(MatrixT const & A,
                  MatrixT & P,
                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
                  viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_interpolation_method())
  {
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag(A, P, amg_context, tag); break;
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa(A, P, amg_context, tag); break;
  default: throw std::runtime_error("Not implemented yet!");
  }
}
/** @brief CUDA kernel writing the nonzeros of a CSR matrix to the respective entries of a dense matrix. */
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start,     unsigned int B_col_start,
          unsigned int B_row_inc,       unsigned int B_col_inc,
          unsigned int B_row_size,      unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < B_row_size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
      B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
  }
}
template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}
/** @brief CUDA kernel for one damped Jacobi sweep: x_new = weight * D^{-1} * (rhs - (A - D) * x_old) + (1 - weight) * x_old. */
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT sum  = NumericT(0);
    NumericT diag = NumericT(1);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag = elements[j];
      else
        sum += elements[j] * x_old[col];
    }
    x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row];
  }
}
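
// Note: 'diag' falls back to 1 if a row carries no diagonal entry; for the AMG use case
// the system matrix is assumed to store all diagonal entries explicitly.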
/** @brief Damped Jacobi Smoother (CUDA version) */
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int i=0; i<iterations; ++i)
  {
    x_backup = x; // keep a copy of the current iterate, since the sweep reads x_old and writes x

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}
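
// A minimal usage sketch (illustrative only; 'A', 'x', 'x_temp', and 'rhs' are
// hypothetical names assumed to be set up by the caller, e.g. by the AMG cycle driver):
//
//   viennacl::compressed_matrix<double> A;        // system matrix, assembled elsewhere
//   viennacl::vector<double> x(A.size1());        // current iterate
//   viennacl::vector<double> x_temp(A.size1());   // scratch buffer holding x_old
//   viennacl::vector<double> rhs(A.size1());      // right-hand side
//
//   // two damped Jacobi sweeps with damping factor 0.67:
//   viennacl::linalg::cuda::amg::smooth_jacobi(2u, A, x, x_temp, rhs, 0.67);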
} //namespace amg
} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif