#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
// pipelined_cg_vector_kernel: fused update of result, r and p, plus per-block partial sums of <r,r>
template<typename NumericT>
__global__ void pipelined_cg_vector_kernel(NumericT * result, NumericT alpha, NumericT * p, NumericT * r,
                                           NumericT const * Ap, NumericT beta,
                                           NumericT * inner_prod_buffer, unsigned int size)

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    result[i] += alpha * value_p;
    value_r   -= alpha * Ap[i];
    value_p    = value_r + beta * value_p;
    inner_prod_contrib += value_r * value_r;

  // block-wide reduction of the partial inner product <r,r>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_contrib;
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  inner_prod_buffer[blockIdx.x] = shared_array[0];
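// Common structure of the pipelined solver kernels in this file: a grid-stride loop does the
// matrix-vector or vector work while each thread accumulates inner-product contributions in
// registers; those contributions are then reduced within the block via a shared-memory
// stride-halving reduction, and thread 0 of each block writes one partial value per inner product
// into inner_prod_buffer. A small follow-up reduction (on the host or inside the next kernel)
// combines the per-block partials, so no separate dot-product kernel launches are needed.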
// host wrapper pipelined_cg_vector_update(): launches pipelined_cg_vector_kernel
template<typename NumericT>
// CSR matrix-vector product kernel for pipelined CG, using sub-warps of SubWarpSizeV threads per row
template<unsigned int SubWarpSizeV, typename NumericT>
__global__ void pipelined_cg_csr_vec_mul_blocked_kernel(const unsigned int * row_indices,
                                                        const unsigned int * column_indices,
                                                        const NumericT * elements,
                                                        const NumericT * p, NumericT * Ap,
                                                        unsigned int size,
                                                        NumericT * inner_prod_buffer,
                                                        unsigned int buffer_size)

  __shared__ NumericT shared_elements[256];

  const unsigned int id_in_row       = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start     = blockIdx.x * block_increment;
  const unsigned int block_stop      = min(block_start + block_increment, size);

  for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
                    row < block_stop;
                    row += blockDim.x / SubWarpSizeV)
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * p[column_indices[i]];

    // sub-warp reduction of the per-thread partial row sums via XOR offsets in shared memory
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    Ap[row] = shared_elements[threadIdx.x];
    inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
    inner_prod_pAp  += p[row] * shared_elements[threadIdx.x];
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
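// Buffer layout used by the CG kernels: inner_prod_buffer[0..buffer_size) holds the per-block
// partials of <r,r>, [buffer_size..2*buffer_size) those of <Ap,Ap>, and [2*buffer_size..3*buffer_size)
// those of <p,Ap>; the host dispatchers below derive buffer_size as inner_prod_buffer.size() / 3.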
// Adaptive CSR matrix-vector product kernel for pipelined CG
template<typename NumericT>
__global__ void pipelined_cg_csr_vec_mul_adaptive_kernel(const unsigned int * row_indices,
                                                         const unsigned int * column_indices,
                                                         const unsigned int * row_blocks,
                                                         const NumericT * elements,
                                                         unsigned int num_blocks,
                                                         const NumericT * p, NumericT * Ap,
                                                         unsigned int size,
                                                         NumericT * inner_prod_buffer,
                                                         unsigned int buffer_size)

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // the nonzeros of several rows are staged in shared memory
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        inner_prod_ApAp += dot_prod * dot_prod;

    // otherwise the group consists of a single (long) row, reduced cooperatively by the whole block:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
        shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      if (threadIdx.x == 0)
        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  += p[row_start] * shared_elements[0];

  // block-wide reduction of <Ap,Ap> and <p,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
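// row_blocks[block_id]..row_blocks[block_id+1] delimits a group of rows whose nonzeros fit into the
// 1024-entry shared buffer (cf. blocks1(), the "number of row blocks for an adaptive SpMV"). Groups
// with several rows stage the products elements[i]*p[column_indices[i]] in shared memory and let one
// thread sum up each row; a group consisting of a single long row is instead reduced cooperatively
// by the whole block (the stride loop of that reduction is not shown in this excerpt).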
// host dispatcher pipelined_cg_prod() for compressed_matrix<NumericT>: picks a kernel based on the
// average number of nonzeros per row
template<typename NumericT>

  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(
#else
  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(
#endif
                                                           viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           buffer_size_per_vector);
  // for matrices with few nonzeros per row, the adaptive kernel is launched instead:
    pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           static_cast<unsigned int>(A.blocks1()),
                                                           buffer_size_per_vector);
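// Dispatch heuristic visible above: on devices of compute capability >= 5.0 the blocked kernel with
// sub-warps of 8 threads is used once the average number of nonzeros per row exceeds 6.4, while
// older architectures use sub-warps of 16 threads for averages above 12.0; sparser matrices fall
// back to the adaptive (row-blocks) kernel. The launch arguments elided from this excerpt are the
// usual p, Ap, size and inner_prod_buffer pointers passed via viennacl::cuda_arg().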
// COO (coordinate format) matrix-vector product kernel for pipelined CG, based on a segmented
// reduction over (row, column) pairs
template<typename NumericT>
__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords,
                                                const NumericT * elements,
                                                const unsigned int * group_boundaries,
                                                const NumericT * p, NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)

  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end       = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // carry partial results over from the previous sweep
    if (threadIdx.x == 0 && k > 0)
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      // otherwise the carried row is complete and its entry of Ap can be written:
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];

    // segmented parallel reduction: only accumulate values belonging to the same row
    shared_rows[threadIdx.x]   = tmp.x;
    inter_results[threadIdx.x] = val;
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      inter_results[threadIdx.x] += left;

    // write out rows that are completed within this sweep
    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];

    if (local_index + 1 == group_end)
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];

  // block-wide reduction of <Ap,Ap> and <p,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
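// The COO kernel uses a classic segmented reduction: each sweep loads blockDim.x (row, column, value)
// triplets, shared_rows/inter_results keep row indices and partial sums side by side, and a value is
// only added to a neighbour at distance 'stride' if both entries belong to the same row. Partial sums
// of rows that straddle two sweeps are carried over via the last shared-memory slot.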
// host dispatcher pipelined_cg_prod() for coordinate_matrix<NumericT>
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                               viennacl::cuda_arg<NumericT>(A.handle()),
                                               viennacl::cuda_arg<unsigned int>(A.handle3()),
                                               buffer_size_per_vector);
// ELL matrix-vector product kernel for pipelined CG (one thread per row, column-major padded storage)
template<typename NumericT>
__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
                                                const NumericT * elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p, NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      sum += val ? p[coords[offset]] * val : NumericT(0);

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];

  // block-wide reduction of <Ap,Ap> and <p,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
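// In the ELL kernels each row stores exactly items_per_row entries, padded with zeros; the storage is
// column-major over internal_row_num rows, so consecutive items of one row are internal_row_num apart
// (hence offset += internal_row_num). The 'val ? ... : 0' guard skips the zero padding without an
// explicit per-row column count.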
// host dispatcher pipelined_cg_prod() for ell_matrix<NumericT>
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                buffer_size_per_vector);
// Sliced ELL matrix-vector product kernel for pipelined CG (rows grouped into slices of block_size rows)
template<typename NumericT>
__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                       const unsigned int * column_indices,
                                                       const unsigned int * block_start,
                                                       const NumericT * elements,
                                                       const NumericT * p, NumericT * Ap,
                                                       unsigned int size,
                                                       unsigned int block_size,
                                                       NumericT * inner_prod_buffer,
                                                       unsigned int buffer_size)

  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block            = threadIdx.x % block_size;
  unsigned int num_blocks             = (size - 1) / block_size + 1;
  unsigned int global_warp_count      = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id         = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
      unsigned int index = offset + item_id * block_size + id_in_block;
      sum += val ? (p[column_indices[index]] * val) : 0;

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];

  // block-wide reduction of <Ap,Ap> and <p,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
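// Sliced ELL stores the matrix in slices of block_size consecutive rows; each slice has its own
// number of stored columns (columns_per_block) and its own offset into the element array
// (block_start), so padding is only paid per slice rather than for the whole matrix. One sub-group
// of block_size threads handles one slice, with thread id_in_block owning one row of it.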
// host dispatcher pipelined_cg_prod() for sliced_ell_matrix<NumericT>
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       buffer_size_per_vector);
// HYB matrix-vector product kernel for pipelined CG (ELL part plus CSR part for the remaining entries)
template<typename NumericT>
__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                const NumericT * ell_elements,
                                                const unsigned int * csr_rows,
                                                const unsigned int * csr_cols,
                                                const NumericT * csr_elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p, NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    // ELL part of the row
    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);

    // CSR part holding the remaining entries of the row
    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];
    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];

  // block-wide reduction of <Ap,Ap> and <p,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
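// The HYB (hybrid) format keeps up to items_per_row entries per row in an ELL block and spills the
// remainder into a small CSR structure; each row's product is therefore the sum of an ELL
// contribution and a CSR contribution, as computed above.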
// host dispatcher pipelined_cg_prod() for hyb_matrix<NumericT>
template<typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                viennacl::cuda_arg<NumericT>(A.handle5()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                buffer_size_per_vector);
// pipelined_bicgstab_update_s_kernel: computes alpha from the partial sums stored in inner_prod_buffer
// (segments starting at 0 and at 3*chunk_size), updates s = r - alpha * Ap, and accumulates <s,s>
template<typename NumericT>
__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
                                                   NumericT const * residual,
                                                   NumericT const * Ap,
                                                   unsigned int size,
                                                   NumericT * inner_prod_buffer,
                                                   unsigned int chunk_size,
                                                   unsigned int chunk_offset)

  // reduce the per-block partial sums left in inner_prod_buffer by the preceding kernels
  __shared__ NumericT shared_array[256];
  __shared__ NumericT shared_array_Ap_in_r0[256];

  shared_array[threadIdx.x]          = inner_prod_buffer[threadIdx.x];
  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
    if (threadIdx.x < stride) {
      shared_array[threadIdx.x]          += shared_array[threadIdx.x + stride];
      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];

  alpha = shared_array[0] / shared_array_Ap_in_r0[0];

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    value_s = residual[i] - alpha * Ap[i];
    inner_prod_contrib += value_s * value_s;

  // block-wide reduction of <s,s>
  shared_array[threadIdx.x] = inner_prod_contrib;
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
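// Here the cross-kernel reduction happens on the device: the first entries of two buffer segments
// (offsets 0 and 3*chunk_size) are reduced in shared memory to form alpha, so no round trip to the
// host is required between the matrix-vector product and the update of s. The new partial sums of
// <s,s> are written to the segment starting at chunk_offset.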
// host wrapper pipelined_bicgstab_update_s(): launches pipelined_bicgstab_update_s_kernel
template<typename NumericT>

  unsigned int size         = static_cast<unsigned int>(s.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
// pipelined_bicgstab_vector_kernel: fused BiCGStab vector updates plus per-block partial sums of <r,r0*>
template<typename NumericT>
__global__ void pipelined_bicgstab_vector_kernel(NumericT * result, NumericT alpha, NumericT * p, NumericT omega,
                                                 NumericT const * s, NumericT * residual, NumericT const * As,
                                                 NumericT beta, NumericT const * Ap, NumericT const * r0star,
                                                 NumericT * inner_prod_buffer, unsigned int size)

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    NumericT value_residual = residual[i];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);

    result[i]   = value_result;
    residual[i] = value_residual;

    inner_prod_r_r0star += value_residual * value_r0star;

  // block-wide reduction of <r,r0*>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_r_r0star;
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
// host wrapper pipelined_bicgstab_vector_update(): launches pipelined_bicgstab_vector_kernel
template<typename NumericT>

  (void)buffer_chunk_size;
  unsigned int size = static_cast<unsigned int>(result.size());
// CSR matrix-vector product kernel for pipelined BiCGStab (sub-warps of SubWarpSizeV threads per row);
// in addition to <Ap,Ap> and <p,Ap> it also accumulates <r0*,Ap>
template<unsigned int SubWarpSizeV, typename NumericT>
__global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel(const unsigned int * row_indices,
                                                              const unsigned int * column_indices,
                                                              const NumericT * elements,
                                                              const NumericT * p, NumericT * Ap,
                                                              const NumericT * r0star,
                                                              unsigned int size,
                                                              NumericT * inner_prod_buffer,
                                                              unsigned int buffer_size,
                                                              unsigned int buffer_offset)

  __shared__ NumericT shared_elements[256];

  const unsigned int id_in_row       = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start     = blockIdx.x * block_increment;
  const unsigned int block_stop      = min(block_start + block_increment, size);

  for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
                    row < block_stop;
                    row += blockDim.x / SubWarpSizeV)
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * p[column_indices[i]];

    // sub-warp reduction of the per-thread partial row sums via XOR offsets in shared memory
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    Ap[row] = shared_elements[threadIdx.x];
    inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
    inner_prod_pAp  += p[row]      * shared_elements[threadIdx.x];
    inner_prod_r0Ap += r0star[row] * shared_elements[threadIdx.x];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
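// The BiCGStab matrix-vector kernels mirror their CG counterparts but accumulate a third inner
// product, <r0*,Ap>, whose per-block partials go into the buffer segment starting at buffer_offset;
// pipelined_bicgstab_update_s_kernel then consumes the reduced inner products to form alpha without
// a host round trip.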
// Adaptive CSR matrix-vector product kernel for pipelined BiCGStab
template<typename NumericT>
__global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel(const unsigned int * row_indices,
                                                               const unsigned int * column_indices,
                                                               const unsigned int * row_blocks,
                                                               const NumericT * elements,
                                                               unsigned int num_blocks,
                                                               const NumericT * p, NumericT * Ap,
                                                               const NumericT * r0star,
                                                               unsigned int size,
                                                               NumericT * inner_prod_buffer,
                                                               unsigned int buffer_size,
                                                               unsigned int buffer_offset)

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // the nonzeros of several rows are staged in shared memory
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        inner_prod_ApAp += dot_prod * dot_prod;

    // otherwise the group consists of a single (long) row, reduced cooperatively by the whole block:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      if (threadIdx.x == 0)
        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  += p[row_start]      * shared_elements[0];
        inner_prod_r0Ap += r0star[row_start] * shared_elements[0];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
    if (threadIdx.x < stride)
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
// host dispatcher pipelined_bicgstab_prod() for compressed_matrix<NumericT>
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_bicgstab_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(
#else
  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>(
#endif
                                                                 viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                 viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                 viennacl::cuda_arg<NumericT>(A.handle()),
  // for matrices with few nonzeros per row, the adaptive kernel is launched instead:
    pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                 viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                 viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                                 viennacl::cuda_arg<NumericT>(A.handle()),
                                                                 static_cast<unsigned int>(A.blocks1()),
// COO matrix-vector product kernel for pipelined BiCGStab (segmented reduction, also forming <r0*,Ap>)
template<typename NumericT>
__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords,
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * p, NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)

  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end       = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // carry partial results over from the previous sweep
    if (threadIdx.x == 0 && k > 0)
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      // otherwise the carried row is complete and its entry of Ap can be written:
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;

    // segmented parallel reduction: only accumulate values belonging to the same row
    shared_rows[threadIdx.x]   = tmp.x;
    inter_results[threadIdx.x] = val;
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      inter_results[threadIdx.x] += left;

    // write out rows that are completed within this sweep
    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;

    if (local_index + 1 == group_end)
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
      inner_prod_r0Ap += Ap_entry * r0star[tmp.x];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
    if (threadIdx.x < stride)
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
// host dispatcher pipelined_bicgstab_prod() for coordinate_matrix<NumericT>
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                                     viennacl::cuda_arg<NumericT>(A.handle()),
                                                     viennacl::cuda_arg<unsigned int>(A.handle3()),
// ELL matrix-vector product kernel for pipelined BiCGStab (one thread per row)
template<typename NumericT>
__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
                                                      const NumericT * elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p, NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      sum += val ? p[coords[offset]] * val : NumericT(0);

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
    if (threadIdx.x < stride)
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
// host dispatcher pipelined_bicgstab_prod() for ell_matrix<NumericT>
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                      viennacl::cuda_arg<NumericT>(A.handle()),
                                                      static_cast<unsigned int>(A.maxnnz()),
// Sliced ELL matrix-vector product kernel for pipelined BiCGStab
template<typename NumericT>
__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                             const unsigned int * column_indices,
                                                             const unsigned int * block_start,
                                                             const NumericT * elements,
                                                             const NumericT * p, NumericT * Ap,
                                                             const NumericT * r0star,
                                                             unsigned int size,
                                                             unsigned int block_size,
                                                             NumericT * inner_prod_buffer,
                                                             unsigned int buffer_size,
                                                             unsigned int buffer_offset)

  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block            = threadIdx.x % block_size;
  unsigned int num_blocks             = (size - 1) / block_size + 1;
  unsigned int global_warp_count      = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id         = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
      unsigned int index = offset + item_id * block_size + id_in_block;
      sum += val ? (p[column_indices[index]] * val) : 0;

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
    if (threadIdx.x < stride)
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
// host dispatcher pipelined_bicgstab_prod() for sliced_ell_matrix<NumericT>
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                             viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                             viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                             viennacl::cuda_arg<NumericT>(A.handle()),
// HYB matrix-vector product kernel for pipelined BiCGStab (ELL part plus CSR remainder)
template<typename NumericT>
__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                      const NumericT * ell_elements,
                                                      const unsigned int * csr_rows,
                                                      const unsigned int * csr_cols,
                                                      const NumericT * csr_elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p, NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)

  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

    // ELL part of the row
    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);

    // CSR part holding the remaining entries of the row
    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];
    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];

  // block-wide reduction of <Ap,Ap>, <p,Ap> and <r0*,Ap>
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
    if (threadIdx.x < stride)
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
// host dispatcher pipelined_bicgstab_prod() for hyb_matrix<NumericT>
template<typename NumericT>

  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                      viennacl::cuda_arg<NumericT>(A.handle()),
                                                      viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                      viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                      viennacl::cuda_arg<NumericT>(A.handle5()),
                                                      static_cast<unsigned int>(A.ell_nnz()),
// pipelined_gmres_normalize_vk_kernel: normalizes v_k using the squared norm accumulated in
// inner_prod_buffer, records the norm in R_buffer, and forms per-block partial sums of <r, v_k>
template <typename T>
__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
                                                    unsigned int vk_offset,
                                                    T const * residual,
                                                    T * R_buffer,
                                                    unsigned int R_offset,
                                                    T const * inner_prod_buffer,
                                                    unsigned int chunk_size,
                                                    T * r_dot_vk_buffer,
                                                    unsigned int chunk_offset,
                                                    unsigned int size)

  // reduce the per-block partial sums of <v_k, v_k> stored in the second buffer chunk
  __shared__ T shared_array[128];
  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  norm_vk = sqrt(shared_array[0]);

  // normalize v_k and accumulate <r, v_k>
  T inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    T value_vk = vk[i + vk_offset] / norm_vk;
    inner_prod_contrib += residual[i] * value_vk;
    vk[i + vk_offset] = value_vk;

  // block-wide reduction of <r, v_k>
  shared_array[threadIdx.x] = inner_prod_contrib;
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
    R_buffer[R_offset] = norm_vk;
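// Every block reduces the same chunk of inner_prod_buffer, so all blocks arrive at the same norm_vk
// without any global synchronization; v_k is then scaled in place, norm_vk is recorded in R_buffer,
// and the per-block partials of <r, v_k> are left in r_dot_vk_buffer for a subsequent reduction.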
// host wrapper pipelined_gmres_normalize_vk(): launches pipelined_gmres_normalize_vk_kernel
template <typename T>

  unsigned int R_offset     = offset_in_R;
  unsigned int chunk_size   = buffer_chunk_size;
  unsigned int chunk_offset = buffer_chunk_offset;
  unsigned int size         = v_k.size();
// pipelined_gmres_gram_schmidt_stage1_kernel: computes the inner products <v_i, v_k> for i < k,
// processing up to seven basis vectors per sweep
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T * vi_in_vk_buffer,
                                                           unsigned int chunk_size)

  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = 0;

    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];

    // block-wide reduction of the partial sums for each of the (up to seven) vectors
      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];

    if (threadIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];

    k_base += vecs_in_iteration;
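// Stage 1 of the pipelined (classical) Gram-Schmidt step batches the projections: each outer sweep
// handles up to seven previous Krylov vectors at once, reusing one load of v_k for seven inner
// products and storing the per-block partials contiguously in vi_in_vk_buffer (one chunk_size-sized
// segment per basis vector).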
// host wrapper pipelined_gmres_gram_schmidt_stage1()
template <typename T>

  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size       = v_k_size;
  unsigned int k          = param_k;

  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
// pipelined_gmres_gram_schmidt_stage2_kernel: subtracts the projections <v_i, v_k> v_i from v_k,
// writes the projection coefficients into R_buffer, and accumulates ||v_k||^2 of the updated vector
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T const * vi_in_vk_buffer,
                                                           unsigned int chunk_size,
                                                           T * R_buffer,
                                                           unsigned int krylov_dim,
                                                           T * inner_prod_buffer)

  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    // reduce the per-block partial sums of <v_i, v_k> produced in stage 1
    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];

    // subtract the projections from v_k
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;

    // write the Gram-Schmidt coefficients to R_buffer
    if (blockIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];

    k_base += vecs_in_iteration;

  // block-wide reduction of ||v_k||^2 after orthogonalization
  shared_array[threadIdx.x] = vk_dot_vk;
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];

  if (threadIdx.x == 0)
    inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
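// The squared norm of the orthogonalized v_k is only accumulated during the final sweep
// (k_base + vecs_in_iteration == k) and is written to the second chunk of inner_prod_buffer, which
// is the location pipelined_gmres_normalize_vk_kernel reads for the subsequent normalization.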
// host wrapper pipelined_gmres_gram_schmidt_stage2()
template <typename T>

  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size       = v_k_size;
  unsigned int k          = param_k;
  unsigned int krylov     = krylov_dim;

  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
// pipelined_gmres_update_result_kernel: result += coefficients[0] * residual + sum_j coefficients[j] * v_j
template <typename T>
__global__ void pipelined_gmres_update_result_kernel(T * result,
                                                     T const * residual,
                                                     T const * krylov_basis,
                                                     unsigned int size,
                                                     unsigned int internal_size,
                                                     T const * coefficients,
                                                     unsigned int k)

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    T value_result = result[i] + coefficients[0] * residual[i];

    for (unsigned int j = 1; j < k; ++j)
      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];

    result[i] = value_result;
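// The solution update treats the residual as the zeroth basis vector: coefficient 0 multiplies the
// residual directly, while coefficients 1..k-1 multiply the stored Krylov vectors, which are laid
// out column-wise with a stride of internal_size elements.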
// host wrapper pipelined_gmres_update_result(): launches pipelined_gmres_update_result_kernel
template <typename T>

  unsigned int size = v_k_size;
  unsigned int k    = param_k;
// host dispatcher pipelined_gmres_prod() for compressed_matrix<NumericT>; the CG product kernels are
// reused, since GMRES needs the same Ap, <Ap,Ap> and <p,Ap> quantities
template <typename NumericT>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(A.nnz()) / double(A.size1()) > 6.4)
    pipelined_cg_csr_vec_mul_blocked_kernel<8,  NumericT><<<256, 256>>>(
#else
  if (double(A.nnz()) / double(A.size1()) > 12.0)
    pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>(
#endif
                                                           viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           buffer_size_per_vector);
  // for matrices with few nonzeros per row, the adaptive kernel is launched instead:
    pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                           viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                           viennacl::cuda_arg<NumericT>(A.handle()),
                                                           static_cast<unsigned int>(A.blocks1()),
                                                           buffer_size_per_vector);
// host dispatcher pipelined_gmres_prod() for coordinate_matrix<T>
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
                                               viennacl::cuda_arg<T>(A.handle()),
                                               viennacl::cuda_arg<unsigned int>(A.handle3()),
                                               buffer_size_per_vector);
// host dispatcher pipelined_gmres_prod() for ell_matrix<T>
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<T>(A.handle()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                buffer_size_per_vector);
// host dispatcher pipelined_gmres_prod() for sliced_ell_matrix<T>
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                       viennacl::cuda_arg<T>(A.handle()),
                                                       buffer_size_per_vector);
// host dispatcher pipelined_gmres_prod() for hyb_matrix<T>
template <typename T>

  unsigned int size = p.size();
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                viennacl::cuda_arg<T>(A.handle()),
                                                viennacl::cuda_arg<unsigned int>(A.handle3()),
                                                viennacl::cuda_arg<unsigned int>(A.handle4()),
                                                viennacl::cuda_arg<T>(A.handle5()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                buffer_size_per_vector);

#endif // VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_