#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_kernel(DestNumericT * dest,
                               unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                               SrcNumericT const * src,
                               unsigned int start_src, unsigned int inc_src)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size_dest;
                    i += gridDim.x * blockDim.x)
    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
}
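// Illustrative sketch (not taken from the original sources): the kernels in this file use
// grid-stride loops and are launched with the fixed <<<128, 128>>> configuration, so any
// vector length is handled. Assuming raw device pointers d_dst (float) and d_src (double)
// holding n contiguous entries (offset 0, stride 1), a cast-copy would look like
//
//   convert_kernel<<<128, 128>>>(d_dst, 0u, 1u, static_cast<unsigned int>(n),
//                                d_src, 0u, 1u);
//
// Library users reach this through viennacl::linalg::cuda::convert(dest, src) below.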
template<typename DestNumericT, typename SrcNumericT>
void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
{
  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
                               static_cast<unsigned int>(viennacl::traits::start(dest)),
                               static_cast<unsigned int>(viennacl::traits::stride(dest)),
                               static_cast<unsigned int>(viennacl::traits::size(dest)),
                               viennacl::cuda_arg(src),
                               static_cast<unsigned int>(viennacl::traits::start(src)),
                               static_cast<unsigned int>(viennacl::traits::stride(src)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
}
// av: vec1 = vec2 * alpha (or / alpha), with optional sign flip; alpha provided via device pointer (GPU scalar)
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          const NumericT * fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
// overload with alpha passed by value (host scalar)
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(vec2),
                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
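// Note on the options bitfield (illustrative sketch): detail::make_options(len, reciprocal,
// flip_sign) packs the scalar modifiers so that a single kernel covers x = y * a, x = y / a
// and the sign-flipped variants. Bit 0 means "negate the scalar", bit 1 means "divide instead
// of multiply"; the kernels above test exactly these two bits. For example, an expression
// requiring x = y / (-a) would be driven by
//
//   unsigned int opts = detail::make_options(1, true, true);   // bits 0 and 1 set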
// avbv: vec1 = alpha * vec2 + beta * vec3 (with optional reciprocal/sign-flip on alpha, beta).
// Four overloads cover alpha/beta arriving via device pointer (GPU scalar) or by value (host scalar).
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: alpha by value (host scalar), beta via device pointer
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: alpha via device pointer, beta by value (host scalar)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: both alpha and beta by value (host scalars)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
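// Illustrative sketch: at the library level an expression such as
//
//   viennacl::vector<float> x(n), y(n), z(n);
//   float a = 2.0f, b = -1.5f;
//   x = a * y + b * z;       // handled by avbv()/avbv_kernel on the CUDA backend
//
// ends up in one of the avbv_kernel overloads above, with options2/options3 encoding sign
// flips and reciprocals so that no temporary vector is created.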
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                            static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), options_alpha,
                            viennacl::cuda_arg(vec2),
                            static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), options_beta,
                            viennacl::cuda_arg(vec3),
                            static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}
// avbv_v: vec1 += alpha * vec2 + beta * vec3 (accumulating variant of avbv).
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: alpha by value (host scalar), beta via device pointer
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: alpha via device pointer, beta by value (host scalar)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
// overload: both alpha and beta by value (host scalars)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;
  if (reciprocal_alpha)
    data_alpha = static_cast<value_type>(1) / data_alpha;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                              static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), options_alpha,
                              viennacl::cuda_arg(vec2),
                              static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), options_beta,
                              viennacl::cuda_arg(vec3),
                              static_cast<unsigned int>(viennacl::traits::start(vec3)), static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}
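// Illustrative sketch: accumulating expressions of the form x += a * y + b * z (and the
// corresponding x -= ...) are routed to avbv_v()/avbv_v_kernel, which adds into vec1 rather
// than overwriting it:
//
//   x += 0.5f * y - 2.0f * z;   // one avbv_v_kernel launch, no temporaries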
template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                     unsigned int internal_size1, NumericT alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < internal_size1;
                    i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
}
/** @brief Assign a constant value to a vector (-range/-slice) */
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size())
                                          : static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), size,
                                     static_cast<unsigned int>(vec1.internal_size()),
                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}
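// Illustrative sketch: vector_assign backs calls such as
//
//   viennacl::vector<float> v(n);
//   v.clear();                 // assigns 0 to every entry
//
// With up_to_internal_size == true the padding entries beyond size() are written as well,
// which keeps the zero-padded tail of the buffer consistent.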
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                   NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT tmp;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
  {
    tmp = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = tmp;
  }
}
/** @brief Swaps the contents of two vectors, data is copied */
template<typename NumericT>
void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
{
  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                   static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                   viennacl::cuda_arg(vec2),
                                   static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}
// Binary element-wise operations for floating point types; op_type: 0 = product, 1 = division, 2 = power
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                  NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                  unsigned int op_type)
{
  if (op_type == 2)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
  else if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
// Binary element-wise operations for integer types (no power branch)
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                      NumericT const * vec3, unsigned int start3, unsigned int inc3,
                                      unsigned int op_type)
{
  if (op_type == 1)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}
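// Illustrative sketch: op_type encodes 0 = element-wise product, 1 = element-wise division,
// 2 = element-wise power (floating point only). At the API level:
//
//   viennacl::vector<float> a(n), b(n), c(n);
//   c = viennacl::linalg::element_prod(a, b);   // op_type == 0
//   c = viennacl::linalg::element_div(a, b);    // op_type == 1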
// Host dispatchers for binary element-wise operations: the generic overload serves the
// integer types, the float/double overloads use element_op_kernel (which adds the pow branch).
template<typename NumericT, typename OpT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                      viennacl::cuda_arg(proxy.lhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                      viennacl::cuda_arg(proxy.rhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                      op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
}

template<typename OpT>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

template<typename OpT>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
// Unary element-wise operations. Each kernel below has a corresponding element_op() host
// overload dispatching on op_element_unary<op_xyz> and launching it with <<<128, 128>>>.

// acos
template<typename NumericT>
__global__ void vec_element_acos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

// asin
template<typename NumericT>
__global__ void vec_element_asin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

// atan
template<typename NumericT>
__global__ void vec_element_atan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

// ceil
template<typename NumericT>
__global__ void vec_element_ceil_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

// cos
template<typename NumericT>
__global__ void vec_element_cos_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

// cosh
template<typename NumericT>
__global__ void vec_element_cosh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}

// exp
template<typename NumericT>
__global__ void vec_element_exp_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
}

// fabs (floating point)
template<typename NumericT>
__global__ void vec_element_fabs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
}

// abs (integers)
template<typename NumericT>
__global__ void vec_element_abs_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
}

// floor
template<typename NumericT>
__global__ void vec_element_floor_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
}

// log
template<typename NumericT>
__global__ void vec_element_log_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
}

// log10
template<typename NumericT>
__global__ void vec_element_log10_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                         NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
}

// sin
template<typename NumericT>
__global__ void vec_element_sin_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
}

// sinh
template<typename NumericT>
__global__ void vec_element_sinh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
}

// sqrt
template<typename NumericT>
__global__ void vec_element_sqrt_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
}

// tan
template<typename NumericT>
__global__ void vec_element_tan_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                       NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
}

// tanh
template<typename NumericT>
__global__ void vec_element_tanh_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
}
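// Illustrative sketch: these kernels implement the unary element-wise functions of the
// library API, e.g.
//
//   viennacl::vector<double> v(n), w(n);
//   w = viennacl::linalg::element_exp(v);    // launches vec_element_exp_kernel
//   w = viennacl::linalg::element_sqrt(v);   // launches vec_element_sqrt_kernel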
template<typename NumericT>
__global__ void inner_prod_kernel(const NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                  const NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                  NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];
  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;

  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
                               - (  blockIdx.x * size1) / (gridDim.x);

  NumericT tmp = 0;
  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
  tmp_buffer[threadIdx.x] = tmp;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
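// Reduction scheme (illustrative sketch): inner products run in two stages. Stage 1 launches
// inner_prod_kernel with 128 blocks, each reducing its slice of the two vectors into one
// partial result in group_buffer. Stage 2 reduces the 128 partial results with a single-block
// sum kernel (see the launchers below), so the final value never leaves the GPU unless a
// *_cpu() variant is used. Conceptually:
//
//   inner_prod_kernel<<<128, 128>>>(x_ptr, ..., y_ptr, ..., partial_ptr);        // 128 partial sums
//   vector_sum_kernel_floats<<<1, 128>>>(partial_ptr, 0u, 1u, 128u, 1u, result_ptr);  // final sum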
template<typename NumericT>
__global__ void vector_sum_kernel_floats(const NumericT * vec1,
                                         unsigned int start1,
                                         unsigned int inc1,
                                         unsigned int size1,
                                         unsigned int option, // 0: max of absolute values, 1: plain sum, 2: sum, then sqrt of the sum
                                         NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}
template<typename NumericT>
__global__ void vector_sum_kernel_integers(const NumericT * vec1,
                                           unsigned int start1,
                                           unsigned int inc1,
                                           unsigned int size1,
                                           unsigned int option, // 0: max of absolute values, 1: plain sum
                                           NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(const NumericT * vec1,
                                                    unsigned int start1,
                                                    unsigned int inc1,
                                                    unsigned int size1,
                                                    unsigned int option, // 0: maximum, 1: plain sum
                                                    NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
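// Note on the 'option' argument as used by the wrappers below (illustrative sketch):
// 1 accumulates a plain sum (inner products, norm_1), 2 accumulates a sum and writes
// sqrt(sum) (second stage of norm_2, floats only), and 0 keeps a running maximum of absolute
// values (second stage of norm_inf). For example, the second stage of a norm_2 computation
// corresponds to
//
//   vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp), 0u, 1u, 128u,
//                                        2u /* sum + sqrt */, viennacl::cuda_arg(result));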
namespace detail
{
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                             static_cast<unsigned int>(viennacl::traits::start(temp)),
                                             static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                             static_cast<unsigned int>(viennacl::traits::size(temp)),
                                             static_cast<unsigned int>(option),
                                             viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_integers");
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_unsigned_integers");
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
                                           static_cast<unsigned int>(viennacl::traits::start(temp)),
                                           static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                           static_cast<unsigned int>(viennacl::traits::size(temp)),
                                           static_cast<unsigned int>(option),
                                           viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_floats");
    }
  };

  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int>   : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float>          : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double>         : public vector_sum_kernel_launcher_floats {};
} // namespace detail
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}

template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Final reduction of the partial results on the CPU:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}
#define VIENNACL_MDOT_WORKGROUP_SIZE  128
#define VIENNACL_MDOT_WORKGROUP_NUM   128
// Simultaneous inner products of x with two vectors y0, y1
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}
// Simultaneous inner products of x with three vectors y0, y1, y2
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}
// Simultaneous inner products of x with four vectors y0, ..., y3
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}
// Simultaneous inner products of x with eight vectors y0, ..., y7
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex);

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}
// Sums the partial results of the inner_prod_N_kernels above; one block per inner product.
template<typename NumericT>
__global__ void vector_multi_sum_kernel(NumericT const * vec1,
                                        NumericT * result,
                                        unsigned int start_result,
                                        unsigned int inc_result)
{
  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_NUM];

  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * blockDim.x];

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
}
// Dispatcher for the simultaneous computation of multiple inner products <x, y_i>, where the
// y_i are passed as a vector_tuple. The tuple is processed in chunks of up to 8 vectors: each
// pass launches the matching inner_prod_{8,4,3,2}_kernel (or the plain inner_prod_kernel for a
// single remaining vector) and then reduces the per-workgroup partial results with
// vector_multi_sum_kernel, writing directly into the proper entries of 'result'.
template<typename NumericT>
void inner_prod_impl(vector_base<NumericT> const & x,
                     vector_tuple<NumericT> const & vec_tuple,
                     vector_base<NumericT> & result)
{
  typedef NumericT value_type;

  viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM, viennacl::traits::context(x));

  vcl_size_t current_index = 0;
  while (vec_tuple.const_size() > current_index)
  {
    switch (vec_tuple.const_size() - current_index)
    {
    case 7:
    case 6:
    case 5:
    case 4:
      // inner_prod_4_kernel on y_{current_index}, ..., y_{current_index+3}, followed by
      //   vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp), ...);
      current_index += 4;
      break;

    case 3:
      // inner_prod_3_kernel, followed by
      //   vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp), ...);
      current_index += 3;
      break;

    case 2:
      // inner_prod_2_kernel, followed by
      //   vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp), ...);
      current_index += 2;
      break;

    case 1:
      // plain inner_prod_kernel followed by a single-block summation
      current_index += 1;
      break;

    default: // 8 or more vectors remaining
      // inner_prod_8_kernel, followed by
      //   vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(viennacl::cuda_arg(temp), ...);
      current_index += 8;
      break;
    }
  }
}

#undef VIENNACL_MDOT_WORKGROUP_NUM
#undef VIENNACL_MDOT_WORKGROUP_SIZE
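// Illustrative sketch: the tuple code path is exercised by the multi-inner-product API, e.g.
//
//   viennacl::vector<float> x(n), y0(n), y1(n), y2(n);
//   viennacl::vector<float> results(3);
//   results = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));
//
// which ends up in inner_prod_3_kernel followed by vector_multi_sum_kernel with three blocks.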
// First reduction stage for norms, min and max of floating point vectors.
// norm_selector: 0 = sup-norm, 1 = l^1-norm, 2 = l^2-norm (squares summed), 3 = min, 4 = max.
template<typename NumericT>
__global__ void norm_kernel_floats(const NumericT * vec,
                                   unsigned int start1,
                                   unsigned int inc1,
                                   unsigned int size1,
                                   unsigned int norm_selector,
                                   NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += fabs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 2) //norm_2
  {
    NumericT vec_entry = 0;
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
    {
      vec_entry = vec[i*inc1 + start1];
      tmp += vec_entry * vec_entry;
    }
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
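// Illustrative sketch of the norm_selector encoding shared by the three norm kernels:
// 0 -> sup-norm (max |x_i|), 1 -> l^1-norm (sum |x_i|), 2 -> l^2-norm (sum x_i^2, square root
// taken in the second stage), 3 -> minimum, 4 -> maximum. For instance, norm_2_impl() further
// down calls
//
//   detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
//
// and the square root is applied during the final single-block reduction.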
// First reduction stage for signed integer vectors (no l^2 branch; abs() instead of fabs()).
template<typename NumericT>
__global__ void norm_kernel_integers(const NumericT * vec,
                                     unsigned int start1,
                                     unsigned int inc1,
                                     unsigned int size1,
                                     unsigned int norm_selector,
                                     NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
// First reduction stage for unsigned integer vectors (no abs() needed).
template<typename NumericT>
__global__ void norm_kernel_unsigned_integers(const NumericT * vec,
                                              unsigned int start1,
                                              unsigned int inc1,
                                              unsigned int size1,
                                              unsigned int norm_selector,
                                              NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];

  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
  unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
  group_stop = (group_stop > size1) ? size1 : group_stop;

  if (norm_selector == 1) //norm_1
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp += vec[i*inc1 + start1];
  }
  else if (norm_selector == 0) //norm_inf
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
      tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
  }

  tmp_buffer[threadIdx.x] = tmp;

  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
    }
  }
  else if (norm_selector == 3) //min
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else if (norm_selector == 4) //max
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
    }
  }
  else //norm_inf
  {
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride)
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
    }
  }

  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];
}
namespace detail
{
  struct norm_kernel_launcher_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                         static_cast<unsigned int>(option),
                                         viennacl::cuda_arg(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_integers");
    }
  };

  struct norm_kernel_launcher_unsigned_integers
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_unsigned_integers<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                                  static_cast<unsigned int>(option),
                                                  viennacl::cuda_arg(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_unsigned_integers");
    }
  };

  struct norm_kernel_launcher_floats
  {
    template<typename NumericT>
    static void apply(vector_base<NumericT> const & vec1,
                      vector_base<NumericT> & temp,
                      unsigned int option)
    {
      norm_kernel_floats<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       static_cast<unsigned int>(option),
                                       viennacl::cuda_arg(temp) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel_floats");
    }
  };

  template<typename NumericT>
  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};

  template<>
  struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned int>   : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};

  template<>
  struct norm_kernel_launcher<float>          : public norm_kernel_launcher_floats {};

  template<>
  struct norm_kernel_launcher<double>         : public norm_kernel_launcher_floats {};
} // namespace detail
/** @brief Computes the l^1-norm of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void norm_1_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}

/** @brief Computes the l^1-norm of a vector; second reduction stage on the CPU */
template<typename NumericT>
void norm_1_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}

/** @brief Computes the l^2-norm of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void norm_2_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 2, result);
}

/** @brief Computes the l^2-norm of a vector; second reduction stage on the CPU */
template<typename NumericT>
void norm_2_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
  result = std::sqrt(result);
}

/** @brief Computes the supremum-norm of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void norm_inf_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 0, result);
}

/** @brief Computes the supremum-norm of a vector; second reduction stage on the CPU */
template<typename NumericT>
void norm_inf_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::max(result, *it);
}
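// Illustrative sketch: at the user level these routines are reached through the generic norm
// functions, e.g.
//
//   viennacl::vector<double> v(n);
//   double nrm = viennacl::linalg::norm_2(v);     // norm_2_cpu / norm_2_impl underneath
//   double sup = viennacl::linalg::norm_inf(v);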
// Second reduction stage for min/max: reduces the per-workgroup extrema computed by the norm kernels.
template<typename NumericT>
__global__ void vector_maxmin_kernel(const NumericT * vec1,
                                     unsigned int start1,
                                     unsigned int inc1,
                                     unsigned int size1,
                                     unsigned int option, // 0: take max, 1: take min
                                     NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_minmax = vec1[start1];
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    if (option > 0) //min
      thread_minmax = (vec1[i*inc1+start1] < thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
    else //max
      thread_minmax = (vec1[i*inc1+start1] > thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
  }

  tmp_buffer[threadIdx.x] = thread_minmax;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0) //min
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
      else //max
        tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
/** @brief Computes the maximum of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void max_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  vector_maxmin_kernel<<<1, 128>>>(viennacl::cuda_arg(temp),
                                   static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
                                   static_cast<unsigned int>(0),  // option 0: maximum
                                   viennacl::cuda_arg(result) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
}

/** @brief Computes the maximum of a vector; second reduction stage on the CPU */
template<typename NumericT>
void max_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::max(result, *it);
}

/** @brief Computes the minimum of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void min_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  vector_maxmin_kernel<<<1, 128>>>(viennacl::cuda_arg(temp),
                                   static_cast<unsigned int>(viennacl::traits::start(temp)), static_cast<unsigned int>(viennacl::traits::stride(temp)), static_cast<unsigned int>(viennacl::traits::size(temp)),
                                   static_cast<unsigned int>(1),  // option 1: minimum
                                   viennacl::cuda_arg(result) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
}

/** @brief Computes the minimum of a vector; second reduction stage on the CPU */
template<typename NumericT>
void min_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups, viennacl::traits::context(vec1));

  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);

  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = temp_cpu[0];
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result = std::min(result, *it);
}
/** @brief Computes the sum over all entries of a vector; both reduction stages run on the GPU */
template<typename NumericT>
void sum_impl(vector_base<NumericT> const & vec1, scalar<NumericT> & result)
{
  viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(vec1.size(), NumericT(1), viennacl::traits::context(vec1));
  viennacl::linalg::cuda::inner_prod_impl(vec1, all_ones, result);
}

/** @brief Computes the sum over all entries of a vector; second reduction stage on the CPU */
template<typename NumericT>
void sum_cpu(vector_base<NumericT> const & vec1, NumericT & result)
{
  viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(vec1.size(), NumericT(1), viennacl::traits::context(vec1));
  viennacl::linalg::cuda::inner_prod_cpu(vec1, all_ones, result);
}

// Absolute value helpers for index_norm_inf: identity for unsigned types.
template<typename NumericT>
__device__ NumericT cuda_abs(NumericT val) { return (val < 0) ? -val : val; }

__device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
__device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
__device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
__device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
template<typename NumericT>
__global__ void index_norm_inf_kernel(const NumericT * vec,
                                      unsigned int start1,
                                      unsigned int inc1,
                                      unsigned int size1,
                                      unsigned int * result)
{
  __shared__ NumericT float_buffer[128];
  __shared__ unsigned int index_buffer[128];

  float_buffer[threadIdx.x] = 0;
  index_buffer[threadIdx.x] = 0;

  //step 1: fill buffer:
  NumericT cur_max = NumericT(0);
  NumericT tmp;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    tmp = vec[i*inc1+start1];
    tmp = cuda_abs(tmp);
    if (cur_max < tmp) //find the maximum and remember its index
    {
      float_buffer[threadIdx.x] = tmp;
      index_buffer[threadIdx.x] = i;
      cur_max = tmp;
    }
  }

  //step 2: parallel reduction:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
      {
        index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
        float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
      }
    }
  }

  if (threadIdx.x == 0)
    *result = index_buffer[0];
}
/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus */
template<typename NumericT>
vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
{
  viennacl::backend::mem_handle h;
  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));

  index_norm_inf_kernel<<<1, 128>>>(viennacl::cuda_arg(vec1),
                                    static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                    static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                    static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                    viennacl::cuda_arg<unsigned int>(h)
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");

  unsigned int ret = 0;
  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
  return static_cast<vcl_size_t>(ret);
}
template<typename NumericT>
__global__ void plane_rotation_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT * vec2, unsigned int start2, unsigned int inc2, unsigned int size2,
                                      NumericT alpha, NumericT beta)
{
  NumericT tmp1 = 0;
  NumericT tmp2 = 0;

  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
  {
    tmp1 = vec1[i*inc1+start1];
    tmp2 = vec2[i*inc2+start2];

    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
  }
}
/** @brief Computes a plane rotation of two vectors */
template<typename NumericT>
void plane_rotation(vector_base<NumericT> & vec1,
                    vector_base<NumericT> & vec2,
                    NumericT alpha, NumericT beta)
{
  typedef NumericT value_type;

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    temporary_beta = beta;

  plane_rotation_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)), static_cast<unsigned int>(viennacl::traits::stride(vec1)), static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                      viennacl::cuda_arg(vec2),
                                      static_cast<unsigned int>(viennacl::traits::start(vec2)), static_cast<unsigned int>(viennacl::traits::stride(vec2)), static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                      viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                                      viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
}
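// Illustrative sketch: plane_rotation applies a Givens rotation in place,
// (x, y) <- (alpha*x + beta*y, alpha*y - beta*x):
//
//   viennacl::vector<float> x(n), y(n);
//   viennacl::linalg::plane_rotation(x, y, 0.8f, 0.6f);   // alpha^2 + beta^2 == 1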
// Worker routine for scan routines: computes the prefix sums within each workgroup and
// records each workgroup's total in 'carries'.
template<typename NumericT>
__global__ void scan_kernel_1(NumericT const *X,
                              unsigned int startX,
                              unsigned int incX,
                              unsigned int sizeX,

                              NumericT * Y,
                              unsigned int startY,
                              unsigned int incY,

                              unsigned int scan_offset, // 0 for inclusive scan, 1 for exclusive scan
                              NumericT * carries)
{
  __shared__ NumericT shared_buffer[256];
  NumericT my_value;

  unsigned int work_per_thread = (sizeX - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int block_start  = work_per_thread * blockDim.x *  blockIdx.x;
  unsigned int block_stop   = work_per_thread * blockDim.x * (blockIdx.x + 1);
  unsigned int block_offset = 0;

  // run scan on each section:
  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
  {
    // load data:
    my_value = (i < sizeX) ? X[i * incX + startX] : 0;

    // inclusive scan in shared buffer:
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      __syncthreads();
      shared_buffer[threadIdx.x] = my_value;
      __syncthreads();
      if (threadIdx.x >= stride)
        my_value += shared_buffer[threadIdx.x - stride];
    }
    __syncthreads();
    shared_buffer[threadIdx.x] = my_value;
    __syncthreads();

    // exclusive scan requires a shift by one entry:
    if (scan_offset > 0)
      my_value = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;

    // write to output array:
    if (i < sizeX)
      Y[i * incY + startY] = block_offset + my_value;

    block_offset += shared_buffer[blockDim.x-1];
  }

  // write carry for this workgroup:
  if (threadIdx.x == 0)
    carries[blockIdx.x] = block_offset;
}
// Worker routine for scan routines: exclusive scan of the per-workgroup carries (single block).
template<typename NumericT>
__global__ void scan_kernel_2(NumericT * carries)
{
  __shared__ NumericT shared_buffer[256];

  // load data:
  NumericT my_carry = carries[threadIdx.x];

  // scan in shared buffer:
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    __syncthreads();
    shared_buffer[threadIdx.x] = my_carry;
    __syncthreads();
    if (threadIdx.x >= stride)
      my_carry += shared_buffer[threadIdx.x - stride];
  }
  __syncthreads();
  shared_buffer[threadIdx.x] = my_carry;
  __syncthreads();

  // write to output array (shifted by one for the exclusive scan):
  carries[threadIdx.x] = (threadIdx.x > 0) ? shared_buffer[threadIdx.x - 1] : 0;
}
// Worker routine for scan routines: adds the scanned carries to each workgroup's section of Y.
template<typename NumericT>
__global__ void scan_kernel_3(NumericT * Y,
                              unsigned int startY,
                              unsigned int incY,
                              unsigned int sizeY,
                              NumericT const * carries)
{
  unsigned int work_per_thread = (sizeY - 1) / (gridDim.x * blockDim.x) + 1;
  unsigned int block_start = work_per_thread * blockDim.x *  blockIdx.x;
  unsigned int block_stop  = work_per_thread * blockDim.x * (blockIdx.x + 1);

  __shared__ NumericT shared_offset;

  if (threadIdx.x == 0)
    shared_offset = carries[blockIdx.x];

  __syncthreads();

  // add the offset to this block's section of Y:
  for (unsigned int i = block_start + threadIdx.x; i < block_stop; i += blockDim.x)
    if (i < sizeY)
      Y[i * incY + startY] += shared_offset;
}
/** @brief Worker routine for scan routines; the exclusive scan is obtained from the inclusive scan by a shift of one */
template<typename NumericT>
void scan_impl(vector_base<NumericT> const & input,
               vector_base<NumericT>       & output,
               bool is_inclusive)
{
  vcl_size_t block_num = 128;
  vcl_size_t threads_per_block = 128;

  viennacl::backend::mem_handle cuda_carries;
  viennacl::backend::memory_create(cuda_carries, sizeof(NumericT)*block_num, viennacl::traits::context(input));

  // First step: scan within each block and remember the per-block carry:
  scan_kernel_1<<<block_num, threads_per_block>>>(viennacl::cuda_arg(input),
                                                  static_cast<unsigned int>(viennacl::traits::start(input)), static_cast<unsigned int>(viennacl::traits::stride(input)), static_cast<unsigned int>(viennacl::traits::size(input)),
                                                  viennacl::cuda_arg(output),
                                                  static_cast<unsigned int>(viennacl::traits::start(output)), static_cast<unsigned int>(viennacl::traits::stride(output)),
                                                  static_cast<unsigned int>(is_inclusive ? 0 : 1),
                                                  viennacl::cuda_arg<NumericT>(cuda_carries)
                                                 );

  // Second step: exclusive scan of the carries:
  scan_kernel_2<<<1, block_num>>>(viennacl::cuda_arg<NumericT>(cuda_carries));

  // Third step: add the scanned carries to each block's section of the output:
  scan_kernel_3<<<block_num, threads_per_block>>>(viennacl::cuda_arg(output),
                                                  static_cast<unsigned int>(viennacl::traits::start(output)), static_cast<unsigned int>(viennacl::traits::stride(output)), static_cast<unsigned int>(viennacl::traits::size(output)),
                                                  viennacl::cuda_arg<NumericT>(cuda_carries)
                                                 );
}
/** @brief This function implements an inclusive scan using CUDA */
template<typename NumericT>
void inclusive_scan(vector_base<NumericT> const & input, vector_base<NumericT> & output)
{
  scan_impl(input, output, true);
}

/** @brief This function implements an exclusive scan using CUDA */
template<typename NumericT>
void exclusive_scan(vector_base<NumericT> const & input, vector_base<NumericT> & output)
{
  scan_impl(input, output, false);
}
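// Illustrative sketch: the three scan kernels implement prefix sums over a vector:
//
//   viennacl::vector<int> v(n), ps(n);
//   viennacl::linalg::inclusive_scan(v, ps);   // ps[i] = v[0] + ... + v[i]
//   viennacl::linalg::exclusive_scan(v, ps);   // ps[i] = v[0] + ... + v[i-1], ps[0] = 0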