#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_

#include <algorithm>   // std::min, std::swap

#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif
#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // lanes 0..7 handle rows row_indices_B_begin[0..7]; a lane is active (sign bit set) only if that row exists:
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // restore the mask (the gather instruction consumes its mask register)
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2;
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  // initial index front: first column index of each row, or the sentinel B_size2 for empty/inactive rows:
          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // find the minimum index in the front by a three-step reduction:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 78 == 0b01001110: swap 64-bit halves within each 128-bit lane
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 177 == 0b10110001: swap neighboring 32-bit entries
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // all lanes now hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];

    if (min_index_in_front == B_size2) // all eight rows exhausted
      break;

    // write the merged column index:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    // advance the index front:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // set lanes equal to the minimum to the sentinel B_size2:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // increment row_start in those lanes:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // reload the next column index wherever entries remain:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
/** @brief Merges up to IndexNum rows from B into the result buffer.
  *
  * The current result buffer (row_C_vector_input) participates as an additional merge source, so the index front has length IndexNum+1.
  **/
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // set up start and end pointers for the IndexNum rows of B:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load the first index of each source (B_size2 serves as sentinel for exhausted sources):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // find the minimum index in the current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all sources exhausted
      break;

    // advance every source currently holding the minimum:
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write the merged column index:
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
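
// Output policy tags for the merge routines below: the "enabled" writer stores the merged column
// index, while the "disabled" writer is a no-op so that only the resulting row length is counted.
// Minimal definitions, reconstructed to match their use as template arguments in this file:
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
struct spgemm_output_write_disabled { static void apply(unsigned int *,    unsigned int)       {               } };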
/** @brief Merges two sorted index ranges into the output range, writing each distinct index once.
  *
  * OutputWriterT controls whether merged indices are written to output_begin or only counted (used for the final merge, where only the row length is needed).
  **/
template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index) // both ranges exhausted
      break;

    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry (or count only, depending on OutputWriterT):
    OutputWriterT::apply(output_ptr, min_index);
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}
/** @brief Computes the number of nonzeros in one row of C = A * B, where the respective row of A holds the entries A_col_buffer[row_start_A], ..., A_col_buffer[row_end_A - 1]. row_C_vector_1/2/3 are temporary work buffers. **/
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // trivial case: empty row in A means empty row in C:
  if (row_start_A == row_end_A)
    return 0;

  // a single row of A: the row of C has the same sparsity pattern as the corresponding row of B:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  // all other row lengths:
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2)
  {
    // directly merge the two involved rows of B (no output written, only counted):
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // more than two rows of B involved: merge the first rows into the temporary buffer
  {
#ifdef VIENNACL_WITH_AVX2
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8; // the AVX2 routine merges up to eight rows at once
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                          B_size2,
                                                                          row_C_vector_1);
    row_start_A += 2;
#endif
  }
  // merge the remaining rows of B into the current result:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2)
    {
      // merge up to eight more rows of B into buffer 3, then merge buffer 3 with the current result in buffer 1:
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A) // last merge: only the length is needed
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                               B_size2,
                                                                               row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // only one more row to merge: counting suffices
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A) // at least two more rows left: merge them first
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge: only the length is needed
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else // merge a single row of B with the current result
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2); // the current result now resides in row_C_vector_1 again
  }

  return row_C_len;
}
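
// Usage sketch (illustration only; A_size1, A_row_buffer, C_row_buffer and buf1..buf3 are hypothetical
// caller-side names, not part of this file): a Gustavson-style SpGEMM driver would first run the
// symbolic pass above once per row of C to obtain the row lengths, build C's row pointer array via a
// prefix sum, allocate C, and then run the numeric pass below to fill the column indices and values:
//
//   for (unsigned int i = 0; i < A_size1; ++i)
//     C_row_buffer[i+1] = row_C_scan_symbolic_vector(A_row_buffer[i], A_row_buffer[i+1], A_col_buffer,
//                                                    B_row_buffer, B_col_buffer, B_size2,
//                                                    buf1, buf2, buf3);
//   // ...inclusive scan over C_row_buffer, then one call to row_C_scan_numeric_vector() per row.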
/** @brief Merges up to IndexNum rows from B into the result buffer (numeric stage, i.e. including the values).
  *
  * The current result (indices and values) participates as an additional merge source with an implicit factor of one.
  **/
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const     *value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // set up start and end pointers (indices and values) for the IndexNum rows of B:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements   + B_row_buffer[row_B];
    values_A[i]          = val_A[i];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum]          = NumericT(1); // the partial result enters the merge unscaled

  // load the first index of each source (B_size2 serves as sentinel for exhausted sources):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // find the minimum index in the current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all sources exhausted
      break;

    // advance every source currently holding the minimum and accumulate the corresponding value:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write the merged column index and its value:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                            double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  // lanes 0..7 handle rows row_indices_B_begin[0..7]; a lane is active (sign bit set) only if that row exists:
  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load the values of A for the (up to) eight rows; doubles are gathered in two halves of four,
  // with the 32-bit lane mask expanded to 64-bit lanes via the permutation and reinterpreted as a double mask:
          avx_load_mask    = avx_load_mask2;
  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
          avx_load_mask    = avx_load_mask2;
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);
          avx_load_mask   = avx_load_mask2;
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2;
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  // initial index front: first column index of each row, or the sentinel B_size2 for empty/inactive rows:
          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
          avx_load_mask2  = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // initial value front: the values of B matching the loaded column indices (again gathered as two halves of four doubles):
          avx_load_mask        = avx_load_mask2;
  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 0),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
          avx_load_mask        = avx_load_mask2;
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 1),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // find the minimum index in the front by a three-step reduction:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 78 == 0b01001110: swap 64-bit halves within each 128-bit lane
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 177 == 0b10110001: swap neighboring 32-bit entries
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // all lanes now hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];

    if (min_index_in_front == B_size2) // all eight rows exhausted
      break;

    // accumulate the value of C at this column index from all lanes currently holding the minimum:
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write the merged column index:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    // advance the index front:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // set lanes equal to the minimum to the sentinel B_size2:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // increment row_start in those lanes:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // reload the next column index wherever entries remain:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2  = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
    // reload the matching values of B wherever new column indices were loaded:
    avx_load_mask = avx_load_mask2;
    avx_value_front_low  = _mm256_mask_i32gather_pd(avx_value_front_low,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 0),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
    avx_load_mask = avx_load_mask2;
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 1),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
/** @brief Merges two sorted (index, value) ranges into the output, scaling the values of the first range by factor1 and those of the second range by factor2. **/
template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index) // both ranges exhausted
      break;

    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write the merged column index and its value:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}
/** @brief Computes one row of C = A * B (numeric stage): column indices and values are written to C_col_buffer and C_elements starting at offset row_start_C. row_C_vector_1/2/3 (plus the *_values buffers) are temporary work buffers. **/
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  // trivial case: empty row in A means empty row in C:
  if (row_start_A == row_end_A)
    return;

  // a single row of A: the row of C is a scaled copy of the respective row of B:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col   = A_col_buffer[row_start_A];
    unsigned int B_end   = B_row_buffer[A_col + 1];
    NumericT     A_value = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements   += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements   = A_value * B_elements[j];
    }
    return;
  }
  // all other row lengths:
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2)
  {
    // directly merge the two involved rows of B, writing straight to C:
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // more than ten rows of B involved: use the AVX2 kernel to merge up to eight of them at once
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements,
                                               int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // merge the first two involved rows of B into the temporary buffer
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }
  // merge the remaining rows of B into the current result:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9)
    {
      // merge up to eight more rows of B into buffer 3, then combine buffer 3 with the current result in buffer 1:
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements,
                                                               int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row to merge: write the result directly to C
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A) // at least three more rows left: merge the next two first
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else // merge a single row of B with the current result
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    std::swap(row_C_vector_1,        row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}

#endif