1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
// Fragment of a kernel template that copies every element of B (SrcNumericT)
// into A, element by element. The declarator line, the A pointer parameter
// (presumably DestNumericT* -- TODO confirm) and the braces are not visible
// in this excerpt.
// Both operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column index is scaled by internal_size1 (column-major storage).
// A_internal_size2 and B_internal_size2 are not used by the visible code.
33 template<
typename DestNumericT,
typename SrcNumericT>
35 unsigned int A_start1,
unsigned int A_start2,
36 unsigned int A_inc1,
unsigned int A_inc2,
37 unsigned int A_size1,
unsigned int A_size2,
38 unsigned int A_internal_size1,
unsigned int A_internal_size2,
40 const SrcNumericT * B,
41 unsigned int B_start1,
unsigned int B_start2,
42 unsigned int B_inc1,
unsigned int B_inc2,
43 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x reduces to blockIdx.x and
// the remainder to threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops below step columns by gridDim.x and rows by
// blockDim.x, so every element is visited exactly once only when
// gridDim.x == blockDim.x. Other fragments in this file use '%' for row_gid
// and '/' for col_gid, which matches the strides -- confirm which is intended.
45 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
46 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// A(row, col) = B(row, col) over the A_size1 x A_size2 range.
48 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
49 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
50 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
// Fragment of a kernel template computing, element by element, either
// A = B / alpha or A = B * alpha. The declarator line, the A and B pointer
// parameters, the scalar parameter, the definition of alpha, the braces and
// the 'else' keyword are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
58 template<
typename NumericT>
60 unsigned int A_start1,
unsigned int A_start2,
61 unsigned int A_inc1,
unsigned int A_inc2,
62 unsigned int A_size1,
unsigned int A_size2,
63 unsigned int A_internal_size1,
unsigned int A_internal_size2,
66 unsigned int options2,
68 unsigned int B_start1,
unsigned int B_start2,
69 unsigned int B_inc1,
unsigned int B_inc2,
70 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Bit 0 of options2 guards a statement that is missing from this excerpt
// (presumably a sign flip of alpha -- TODO confirm).
73 if (options2 & (1 << 0))
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// The next fragment in this file uses '%' for row_gid and '/' for col_gid,
// which matches the strides -- confirm which is intended.
76 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
77 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 selects the dividing loop nest; the multiplying loop nest
// further down is presumably the 'else' branch.
79 if (options2 & (1 << 1))
81 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
82 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
83 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
// A(row, col) = B(row, col) * alpha.
87 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
88 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
89 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
// Fragment of a kernel template computing, element by element, either
// A = B / alpha or A = B * alpha. The declarator line, the A and B pointer
// parameters, the scalar parameter, the definition of alpha, the braces and
// the 'else' keyword are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
94 template<
typename NumericT>
96 unsigned int A_start1,
unsigned int A_start2,
97 unsigned int A_inc1,
unsigned int A_inc2,
98 unsigned int A_size1,
unsigned int A_size2,
99 unsigned int A_internal_size1,
unsigned int A_internal_size2,
102 unsigned int options2,
104 unsigned int B_start1,
unsigned int B_start2,
105 unsigned int B_inc1,
unsigned int B_inc2,
106 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Bit 0 of options2 guards a statement that is missing from this excerpt
// (presumably a sign flip of alpha -- TODO confirm).
109 if (options2 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below: each
// block walks columns and its threads walk the rows of a column.
112 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
113 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 selects the dividing loop nest; the multiplying loop nest
// further down is presumably the 'else' branch.
115 if (options2 & (1 << 1))
117 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
118 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
119 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
// A(row, col) = B(row, col) * alpha.
123 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
124 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
125 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
135 template<
typename NumericT>
137 unsigned int A_start1,
unsigned int A_start2,
138 unsigned int A_inc1,
unsigned int A_inc2,
139 unsigned int A_size1,
unsigned int A_size2,
140 unsigned int A_internal_size1,
unsigned int A_internal_size2,
143 unsigned int options2,
145 unsigned int B_start1,
unsigned int B_start2,
146 unsigned int B_inc1,
unsigned int B_inc2,
147 unsigned int B_internal_size1,
unsigned int B_internal_size2,
150 unsigned int options3,
152 unsigned int C_start1,
unsigned int C_start2,
153 unsigned int C_inc1,
unsigned int C_inc2,
154 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
157 if (options2 & (1 << 0))
161 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
164 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
165 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
167 if (options2 & (1 << 1))
169 if (options3 & (1 << 1))
// B / alpha + C / beta
171 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
172 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
173 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
174 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
175 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
179 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
180 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
181 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
182 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
183 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
188 if (options3 & (1 << 1))
// B * alpha + C / beta
190 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
191 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
192 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
193 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
194 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
198 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
199 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
200 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
201 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
202 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
209 template<
typename NumericT>
211 unsigned int A_start1,
unsigned int A_start2,
212 unsigned int A_inc1,
unsigned int A_inc2,
213 unsigned int A_size1,
unsigned int A_size2,
214 unsigned int A_internal_size1,
unsigned int A_internal_size2,
217 unsigned int options2,
219 unsigned int B_start1,
unsigned int B_start2,
220 unsigned int B_inc1,
unsigned int B_inc2,
221 unsigned int B_internal_size1,
unsigned int B_internal_size2,
224 unsigned int options3,
226 unsigned int C_start1,
unsigned int C_start2,
227 unsigned int C_inc1,
unsigned int C_inc2,
228 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
231 if (options2 & (1 << 0))
235 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
238 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
239 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
241 if (options2 & (1 << 1))
243 if (options3 & (1 << 1))
// B / alpha + C / beta
245 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
246 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
247 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
248 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
249 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
253 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
254 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
255 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
256 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
257 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
262 if (options3 & (1 << 1))
// B * alpha + C / beta
264 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
265 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
266 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
267 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
268 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
272 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
273 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
274 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
275 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
276 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
282 template<
typename NumericT>
284 unsigned int A_start1,
unsigned int A_start2,
285 unsigned int A_inc1,
unsigned int A_inc2,
286 unsigned int A_size1,
unsigned int A_size2,
287 unsigned int A_internal_size1,
unsigned int A_internal_size2,
290 unsigned int options2,
292 unsigned int B_start1,
unsigned int B_start2,
293 unsigned int B_inc1,
unsigned int B_inc2,
294 unsigned int B_internal_size1,
unsigned int B_internal_size2,
297 unsigned int options3,
299 unsigned int C_start1,
unsigned int C_start2,
300 unsigned int C_inc1,
unsigned int C_inc2,
301 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
304 if (options2 & (1 << 0))
308 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
311 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
312 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
314 if (options2 & (1 << 1))
316 if (options3 & (1 << 1))
// B / alpha + C / beta
318 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
319 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
320 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
321 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
322 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
326 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
327 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
328 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
329 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
330 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
335 if (options3 & (1 << 1))
// B * alpha + C / beta
337 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
338 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
339 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
340 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
341 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
345 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
346 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
347 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
348 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
349 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
356 template<
typename NumericT>
359 unsigned int A_start1,
unsigned int A_start2,
360 unsigned int A_inc1,
unsigned int A_inc2,
361 unsigned int A_size1,
unsigned int A_size2,
362 unsigned int A_internal_size1,
unsigned int A_internal_size2,
365 unsigned int options2,
367 unsigned int B_start1,
unsigned int B_start2,
368 unsigned int B_inc1,
unsigned int B_inc2,
369 unsigned int B_internal_size1,
unsigned int B_internal_size2,
372 unsigned int options3,
374 unsigned int C_start1,
unsigned int C_start2,
375 unsigned int C_inc1,
unsigned int C_inc2,
376 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
379 if (options2 & (1 << 0))
383 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
386 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
387 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
389 if (options2 & (1 << 1))
391 if (options3 & (1 << 1))
// B / alpha + C / beta
393 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
394 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
395 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
396 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
397 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
401 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
402 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
403 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
404 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
405 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
410 if (options3 & (1 << 1))
// B * alpha + C / beta
412 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
413 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
414 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
415 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
416 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
420 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
421 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
422 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
423 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
424 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template that accumulates into A, element by element:
//   A += (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
435 template<
typename NumericT>
438 unsigned int A_start1,
unsigned int A_start2,
439 unsigned int A_inc1,
unsigned int A_inc2,
440 unsigned int A_size1,
unsigned int A_size2,
441 unsigned int A_internal_size1,
unsigned int A_internal_size2,
444 unsigned int options2,
446 unsigned int B_start1,
unsigned int B_start2,
447 unsigned int B_inc1,
unsigned int B_inc2,
448 unsigned int B_internal_size1,
unsigned int B_internal_size2,
451 unsigned int options3,
453 unsigned int C_start1,
unsigned int C_start2,
454 unsigned int C_inc1,
unsigned int C_inc2,
455 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
458 if (options2 & (1 << 0))
462 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
465 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
466 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations. All four use '+=' on A.
468 if (options2 & (1 << 1))
470 if (options3 & (1 << 1))
// A += B / alpha + C / beta
472 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
473 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
474 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
475 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
476 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// A += B / alpha + C * beta
480 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
481 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
482 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
483 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
484 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
489 if (options3 & (1 << 1))
// A += B * alpha + C / beta
491 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
492 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
493 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
494 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
495 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// A += B * alpha + C * beta
499 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
500 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
501 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
502 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
503 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// NOTE(review): the fragment immediately preceding this one in the file has
// the same parameter list but accumulates with '+=' in all four loop nests,
// whereas this one overwrites A with '='. If this is meant to be the same
// accumulate operation with a different scalar source, '=' discards the old
// contents of A -- confirm against the kernel name and its callers.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
510 template<
typename NumericT>
513 unsigned int A_start1,
unsigned int A_start2,
514 unsigned int A_inc1,
unsigned int A_inc2,
515 unsigned int A_size1,
unsigned int A_size2,
516 unsigned int A_internal_size1,
unsigned int A_internal_size2,
519 unsigned int options2,
521 unsigned int B_start1,
unsigned int B_start2,
522 unsigned int B_inc1,
unsigned int B_inc2,
523 unsigned int B_internal_size1,
unsigned int B_internal_size2,
526 unsigned int options3,
528 unsigned int C_start1,
unsigned int C_start2,
529 unsigned int C_inc1,
unsigned int C_inc2,
530 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
533 if (options2 & (1 << 0))
537 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
540 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
541 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
543 if (options2 & (1 << 1))
545 if (options3 & (1 << 1))
// B / alpha + C / beta
547 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
548 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
549 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
550 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
551 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
555 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
556 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
557 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
558 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
559 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
564 if (options3 & (1 << 1))
// B * alpha + C / beta
566 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
567 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
568 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
569 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
570 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
574 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
575 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
576 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
577 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
578 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// NOTE(review): an earlier fragment in this file with the same parameter list
// accumulates with '+=' in all four loop nests, whereas this one overwrites A
// with '='. If this is meant to be the same accumulate operation with a
// different scalar source, '=' discards the old contents of A -- confirm
// against the kernel name and its callers.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
584 template<
typename NumericT>
587 unsigned int A_start1,
unsigned int A_start2,
588 unsigned int A_inc1,
unsigned int A_inc2,
589 unsigned int A_size1,
unsigned int A_size2,
590 unsigned int A_internal_size1,
unsigned int A_internal_size2,
593 unsigned int options2,
595 unsigned int B_start1,
unsigned int B_start2,
596 unsigned int B_inc1,
unsigned int B_inc2,
597 unsigned int B_internal_size1,
unsigned int B_internal_size2,
600 unsigned int options3,
602 unsigned int C_start1,
unsigned int C_start2,
603 unsigned int C_inc1,
unsigned int C_inc2,
604 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
607 if (options2 & (1 << 0))
611 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
614 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
615 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
617 if (options2 & (1 << 1))
619 if (options3 & (1 << 1))
// B / alpha + C / beta
621 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
622 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
623 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
624 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
625 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
629 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
630 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
631 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
632 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
633 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
638 if (options3 & (1 << 1))
// B * alpha + C / beta
640 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
641 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
642 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
643 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
644 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
648 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
649 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
650 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
651 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
652 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template computing, element by element,
//   A = (B / alpha  or  B * alpha) + (C / beta  or  C * beta).
// The declarator line, the A/B/C pointer parameters, the scalar parameters,
// the definitions of alpha and beta, the braces and the 'else' keywords are
// not visible in this excerpt.
// NOTE(review): an earlier fragment in this file with the same parameter list
// accumulates with '+=' in all four loop nests, whereas this one overwrites A
// with '='. If this is meant to be the same accumulate operation with a
// different scalar source, '=' discards the old contents of A -- confirm
// against the kernel name and its callers.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
659 template<
typename NumericT>
662 unsigned int A_start1,
unsigned int A_start2,
663 unsigned int A_inc1,
unsigned int A_inc2,
664 unsigned int A_size1,
unsigned int A_size2,
665 unsigned int A_internal_size1,
unsigned int A_internal_size2,
668 unsigned int options2,
670 unsigned int B_start1,
unsigned int B_start2,
671 unsigned int B_inc1,
unsigned int B_inc2,
672 unsigned int B_internal_size1,
unsigned int B_internal_size2,
675 unsigned int options3,
677 unsigned int C_start1,
unsigned int C_start2,
678 unsigned int C_inc1,
unsigned int C_inc2,
679 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 / options3 each guard a statement that is missing from
// this excerpt (presumably sign flips of alpha / beta -- TODO confirm).
682 if (options2 & (1 << 0))
686 if (options3 & (1 << 0))
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
689 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
690 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 chooses division vs. multiplication for the B term,
// bit 1 of options3 does the same for the C term; the four loop nests below
// are the four combinations.
692 if (options2 & (1 << 1))
694 if (options3 & (1 << 1))
// B / alpha + C / beta
696 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
697 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
698 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
699 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
700 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B / alpha + C * beta
704 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
705 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
706 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
707 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
708 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
713 if (options3 & (1 << 1))
// B * alpha + C / beta
715 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
716 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
717 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
718 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
719 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
// B * alpha + C * beta
723 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
724 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
725 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
726 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
727 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fragment of a kernel template that writes the value alpha to every element
// of the A_size1 x A_size2 range of A. The declarator line, the A pointer
// parameter, the declaration of alpha and the braces are not visible in this
// excerpt.
// Elements are addressed as
//   (row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1
// (column index scaled by A_internal_size1, i.e. column-major storage).
738 template<
typename NumericT>
741 unsigned int A_start1,
unsigned int A_start2,
742 unsigned int A_inc1,
unsigned int A_inc2,
743 unsigned int A_size1,
unsigned int A_size2,
744 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
747 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
748 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
750 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
751 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
752 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
// Fragment of a kernel template that writes the value alpha to the elements
// of A whose row and column index are equal (the same loop variable feeds both
// the row and the column term of the address). The declarator line, the A
// pointer parameter, the declaration of alpha and the braces are not visible
// in this excerpt.
756 template<
typename NumericT>
759 unsigned int A_start1,
unsigned int A_start2,
760 unsigned int A_inc1,
unsigned int A_inc2,
761 unsigned int A_size1,
unsigned int A_size2,
762 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Flat global thread index along x.
765 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
// Each thread starts at its own index and advances by the total number of
// threads along x (blockDim.x * gridDim.x).
// NOTE(review): only A_size1 bounds the loop; A_size2 is never consulted, so
// this presumably assumes A_size1 <= A_size2 (e.g. a square matrix) -- confirm.
767 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
768 A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1] = alpha;
// Fragment of a kernel template applying a binary element-wise operation,
// A(row, col) = op(B(row, col), C(row, col)), where op is pow(), division or
// multiplication. The declarator line, the A/B/C pointer parameters, the
// braces and the condition guarding the pow() loop nest are not visible in
// this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
775 template<
typename NumericT>
778 unsigned int A_start1,
unsigned int A_start2,
779 unsigned int A_inc1,
unsigned int A_inc2,
780 unsigned int A_size1,
unsigned int A_size2,
781 unsigned int A_internal_size1,
unsigned int A_internal_size2,
784 unsigned int B_start1,
unsigned int B_start2,
785 unsigned int B_inc1,
unsigned int B_inc2,
786 unsigned int B_internal_size1,
unsigned int B_internal_size2,
789 unsigned int C_start1,
unsigned int C_start2,
790 unsigned int C_inc1,
unsigned int C_inc2,
791 unsigned int C_internal_size1,
unsigned int C_internal_size2,
793 unsigned int op_type)
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
795 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
796 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// A = pow(B, C). The guarding condition is missing from this excerpt
// (presumably a test of op_type against a third value -- TODO confirm).
800 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
801 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
802 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
803 = pow(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
804 C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
// op_type == 1: element-wise quotient A = B / C.
806 else if (op_type == 1)
808 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
809 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
810 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
811 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
812 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// op_type == 0: element-wise product A = B * C.
814 else if (op_type == 0)
816 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
817 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
818 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
819 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
820 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// Fragment of a kernel template applying a binary element-wise operation,
// A(row, col) = B(row, col) / C(row, col) or B(row, col) * C(row, col).
// The declarator line, the A/B/C pointer parameters, the braces and the
// condition guarding the division loop nest are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
824 template<
typename NumericT>
827 unsigned int A_start1,
unsigned int A_start2,
828 unsigned int A_inc1,
unsigned int A_inc2,
829 unsigned int A_size1,
unsigned int A_size2,
830 unsigned int A_internal_size1,
unsigned int A_internal_size2,
833 unsigned int B_start1,
unsigned int B_start2,
834 unsigned int B_inc1,
unsigned int B_inc2,
835 unsigned int B_internal_size1,
unsigned int B_internal_size2,
838 unsigned int C_start1,
unsigned int C_start2,
839 unsigned int C_inc1,
unsigned int C_inc2,
840 unsigned int C_internal_size1,
unsigned int C_internal_size2,
842 unsigned int op_type)
// row_gid reduces to threadIdx.x and col_gid to blockIdx.x, matching the row
// stride (blockDim.x) and column stride (gridDim.x) of the loops below.
844 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
845 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Element-wise quotient A = B / C. The guarding condition is missing from
// this excerpt (presumably op_type == 1 -- TODO confirm).
849 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
850 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
851 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
852 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
853 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// op_type == 0: element-wise product A = B * C.
855 else if (op_type == 0)
857 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
858 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
859 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
860 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
861 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// Fragment of a kernel template computing A(row, col) = abs(B(row, col)) over
// the A_size1 x A_size2 range. The declarator line, the A and B pointer
// parameters and the braces are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
871 template<
typename NumericT>
874 unsigned int A_start1,
unsigned int A_start2,
875 unsigned int A_inc1,
unsigned int A_inc2,
876 unsigned int A_size1,
unsigned int A_size2,
877 unsigned int A_internal_size1,
unsigned int A_internal_size2,
880 unsigned int B_start1,
unsigned int B_start2,
881 unsigned int B_inc1,
unsigned int B_inc2,
882 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// Other fragments in this file use '%' for row_gid and '/' for col_gid, which
// matches the strides -- confirm which is intended.
884 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
885 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
887 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
888 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
889 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Fragment of a kernel template computing A(row, col) = acos(B(row, col)) over
// the A_size1 x A_size2 range. The declarator line, the A and B pointer
// parameters and the braces are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
894 template<
typename NumericT>
897 unsigned int A_start1,
unsigned int A_start2,
898 unsigned int A_inc1,
unsigned int A_inc2,
899 unsigned int A_size1,
unsigned int A_size2,
900 unsigned int A_internal_size1,
unsigned int A_internal_size2,
903 unsigned int B_start1,
unsigned int B_start2,
904 unsigned int B_inc1,
unsigned int B_inc2,
905 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// Other fragments in this file use '%' for row_gid and '/' for col_gid, which
// matches the strides -- confirm which is intended.
907 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
908 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
910 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
911 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
912 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Fragment of a kernel template computing A(row, col) = asin(B(row, col)) over
// the A_size1 x A_size2 range. The declarator line, the A and B pointer
// parameters and the braces are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
917 template<
typename NumericT>
920 unsigned int A_start1,
unsigned int A_start2,
921 unsigned int A_inc1,
unsigned int A_inc2,
922 unsigned int A_size1,
unsigned int A_size2,
923 unsigned int A_internal_size1,
unsigned int A_internal_size2,
926 unsigned int B_start1,
unsigned int B_start2,
927 unsigned int B_inc1,
unsigned int B_inc2,
928 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// Other fragments in this file use '%' for row_gid and '/' for col_gid, which
// matches the strides -- confirm which is intended.
930 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
931 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
933 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
934 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
935 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Fragment of a kernel template computing A(row, col) = atan(B(row, col)) over
// the A_size1 x A_size2 range. The declarator line, the A and B pointer
// parameters and the braces are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
940 template<
typename NumericT>
943 unsigned int A_start1,
unsigned int A_start2,
944 unsigned int A_inc1,
unsigned int A_inc2,
945 unsigned int A_size1,
unsigned int A_size2,
946 unsigned int A_internal_size1,
unsigned int A_internal_size2,
949 unsigned int B_start1,
unsigned int B_start2,
950 unsigned int B_inc1,
unsigned int B_inc2,
951 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// Other fragments in this file use '%' for row_gid and '/' for col_gid, which
// matches the strides -- confirm which is intended.
953 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
954 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
956 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
957 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
958 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Fragment of a kernel template computing A(row, col) = ceil(B(row, col)) over
// the A_size1 x A_size2 range. The declarator line, the A and B pointer
// parameters and the braces are not visible in this excerpt.
// Operands are addressed as
//   (row * inc1 + start1) + (col * inc2 + start2) * internal_size1
// (column index scaled by internal_size1, i.e. column-major storage).
963 template<
typename NumericT>
966 unsigned int A_start1,
unsigned int A_start2,
967 unsigned int A_inc1,
unsigned int A_inc2,
968 unsigned int A_size1,
unsigned int A_size2,
969 unsigned int A_internal_size1,
unsigned int A_internal_size2,
972 unsigned int B_start1,
unsigned int B_start2,
973 unsigned int B_inc1,
unsigned int B_inc2,
974 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// row_gid reduces to blockIdx.x and col_gid to threadIdx.x.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x,
// so every element is visited exactly once only when gridDim.x == blockDim.x.
// Other fragments in this file use '%' for row_gid and '/' for col_gid, which
// matches the strides -- confirm which is intended.
976 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
977 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
979 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
980 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
981 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = cos(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
986 template<
typename NumericT>
989 unsigned int A_start1,
unsigned int A_start2,
990 unsigned int A_inc1,
unsigned int A_inc2,
991 unsigned int A_size1,
unsigned int A_size2,
992 unsigned int A_internal_size1,
unsigned int A_internal_size2,
995 unsigned int B_start1,
unsigned int B_start2,
996 unsigned int B_inc1,
unsigned int B_inc2,
997 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
999 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1000 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1002 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1003 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1004 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = cosh(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1009 template<
typename NumericT>
1012 unsigned int A_start1,
unsigned int A_start2,
1013 unsigned int A_inc1,
unsigned int A_inc2,
1014 unsigned int A_size1,
unsigned int A_size2,
1015 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1018 unsigned int B_start1,
unsigned int B_start2,
1019 unsigned int B_inc1,
unsigned int B_inc2,
1020 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1022 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1023 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1025 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1026 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1027 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = exp(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1032 template<
typename NumericT>
1035 unsigned int A_start1,
unsigned int A_start2,
1036 unsigned int A_inc1,
unsigned int A_inc2,
1037 unsigned int A_size1,
unsigned int A_size2,
1038 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1041 unsigned int B_start1,
unsigned int B_start2,
1042 unsigned int B_inc1,
unsigned int B_inc2,
1043 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1045 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1046 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1048 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1049 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1050 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = fabs(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1055 template<
typename NumericT>
1058 unsigned int A_start1,
unsigned int A_start2,
1059 unsigned int A_inc1,
unsigned int A_inc2,
1060 unsigned int A_size1,
unsigned int A_size2,
1061 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1064 unsigned int B_start1,
unsigned int B_start2,
1065 unsigned int B_inc1,
unsigned int B_inc2,
1066 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1068 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1069 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1071 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1072 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1073 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = floor(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1078 template<
typename NumericT>
1081 unsigned int A_start1,
unsigned int A_start2,
1082 unsigned int A_inc1,
unsigned int A_inc2,
1083 unsigned int A_size1,
unsigned int A_size2,
1084 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1087 unsigned int B_start1,
unsigned int B_start2,
1088 unsigned int B_inc1,
unsigned int B_inc2,
1089 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1091 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1092 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1094 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1095 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1096 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = log(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1101 template<
typename NumericT>
1104 unsigned int A_start1,
unsigned int A_start2,
1105 unsigned int A_inc1,
unsigned int A_inc2,
1106 unsigned int A_size1,
unsigned int A_size2,
1107 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1110 unsigned int B_start1,
unsigned int B_start2,
1111 unsigned int B_inc1,
unsigned int B_inc2,
1112 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1114 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1115 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1117 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1118 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1119 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = log10(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1124 template<
typename NumericT>
1127 unsigned int A_start1,
unsigned int A_start2,
1128 unsigned int A_inc1,
unsigned int A_inc2,
1129 unsigned int A_size1,
unsigned int A_size2,
1130 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1133 unsigned int B_start1,
unsigned int B_start2,
1134 unsigned int B_inc1,
unsigned int B_inc2,
1135 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1137 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1138 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1140 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1141 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1142 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = sin(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1147 template<
typename NumericT>
1150 unsigned int A_start1,
unsigned int A_start2,
1151 unsigned int A_inc1,
unsigned int A_inc2,
1152 unsigned int A_size1,
unsigned int A_size2,
1153 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1156 unsigned int B_start1,
unsigned int B_start2,
1157 unsigned int B_inc1,
unsigned int B_inc2,
1158 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1160 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1161 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1163 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1164 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1165 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = sinh(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1170 template<
typename NumericT>
1173 unsigned int A_start1,
unsigned int A_start2,
1174 unsigned int A_inc1,
unsigned int A_inc2,
1175 unsigned int A_size1,
unsigned int A_size2,
1176 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1179 unsigned int B_start1,
unsigned int B_start2,
1180 unsigned int B_inc1,
unsigned int B_inc2,
1181 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1183 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1184 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1186 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1187 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1188 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = sqrt(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1193 template<
typename NumericT>
1196 unsigned int A_start1,
unsigned int A_start2,
1197 unsigned int A_inc1,
unsigned int A_inc2,
1198 unsigned int A_size1,
unsigned int A_size2,
1199 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1202 unsigned int B_start1,
unsigned int B_start2,
1203 unsigned int B_inc1,
unsigned int B_inc2,
1204 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1206 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1207 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1209 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1210 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1211 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = tan(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1216 template<
typename NumericT>
1219 unsigned int A_start1,
unsigned int A_start2,
1220 unsigned int A_inc1,
unsigned int A_inc2,
1221 unsigned int A_size1,
unsigned int A_size2,
1222 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1225 unsigned int B_start1,
unsigned int B_start2,
1226 unsigned int B_inc1,
unsigned int B_inc2,
1227 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1229 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1230 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1232 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1233 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1234 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of an element-wise kernel: A(row, col) = tanh(B(row, col)).
// The kernel name line, the A / B pointer parameters and the braces are not part of this
// listing; the leading numbers appear to be line numbers of the original header.
1239 template<
typename NumericT>
1242 unsigned int A_start1,
unsigned int A_start2,
1243 unsigned int A_inc1,
unsigned int A_inc2,
1244 unsigned int A_size1,
unsigned int A_size2,
1245 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1248 unsigned int B_start1,
unsigned int B_start2,
1249 unsigned int B_inc1,
unsigned int B_inc2,
1250 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Since threadIdx.x < blockDim.x, the quotient is blockIdx.x and the remainder is threadIdx.x.
1252 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1253 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// NOTE(review): columns start at the thread index but advance by gridDim.x, and rows start at
// the block index but advance by blockDim.x. For arbitrary sizes every element is reached only
// if gridDim.x == blockDim.x -- confirm against the launch configuration.
1255 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1256 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
// Element (row, col) lives at (row * inc1 + start1) + (col * inc2 + start2) * internal_size1,
// i.e. the column offset is scaled by internal_size1 (column-major addressing).
1257 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Listing fragment of a matrix-vector product kernel:
// result(row) = sum over col of A(row, col) * v(col).
// The kernel name line, the A / v / result pointer parameters, v_inc, the dot_prod
// declaration and the braces are not part of this listing.
1266 template<
typename NumericT>
1269 unsigned int A_row_start,
1270 unsigned int A_col_start,
1271 unsigned int A_row_inc,
1272 unsigned int A_col_inc,
1273 unsigned int A_row_size,
1274 unsigned int A_col_size,
1275 unsigned int A_internal_rows,
1276 unsigned int A_internal_cols,
1278 unsigned int v_start,
1280 unsigned int v_size,
1282 unsigned int result_start,
1283 unsigned int result_inc,
1284 unsigned int result_size)
// Grid-stride loop over rows: each thread starts at its global id and advances by the
// total number of threads in the grid.
1287 for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
row < A_row_size;
row += gridDim.x * blockDim.x)
// The thread walks all columns of its row sequentially; the column offset of A is scaled
// by A_internal_rows (column-major addressing).
1290 for (
unsigned int col = 0; col < A_col_size; ++col)
1291 dot_prod += A[(
row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
// presumably dot_prod is reset for every row on a line missing from this listing -- verify
1292 result[
row * result_inc + result_start] =
dot_prod;
// Listing fragment of a transposed matrix-vector product kernel: for every column of A
// (called `row` below), result(row) = sum over col of A(col, row) * v(col).
// The kernel name line, the A / v / result pointer parameters, v_inc, the `work` array and
// dot_prod declarations, the reduction loop over `stride`, any synchronisation and the
// braces are not part of this listing.
1297 template<
typename NumericT>
1300 unsigned int A_row_start,
1301 unsigned int A_col_start,
1302 unsigned int A_row_inc,
1303 unsigned int A_col_inc,
1304 unsigned int A_row_size,
1305 unsigned int A_col_size,
1306 unsigned int A_internal_rows,
1307 unsigned int A_internal_cols,
1309 unsigned int v_start,
1311 unsigned int v_size,
1313 unsigned int result_start,
1314 unsigned int result_inc,
1315 unsigned int result_size)
// Since threadIdx.x < blockDim.x, row_gid equals blockIdx.x and col_gid equals threadIdx.x.
1319 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1320 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1321 unsigned int lid = threadIdx.x;
// One block per column of A, advancing by the number of blocks.
1323 for (
unsigned int row = row_gid;
row < A_col_size;
row += gridDim.x)
// The threads of the block split the entries of that column among themselves.
1326 for (
unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
// `row` scales the column offset (times A_internal_rows), `col` the row offset.
1327 dot_prod += A[(
row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
// Pairwise accumulation step of the per-thread partial sums held in `work`
// (presumably a __shared__ array; its declaration is not visible here).
1333 work[lid] += work[lid+
stride];
// After the reduction, work[0] is taken as the total for this column.
1337 result[
row * result_inc + result_start] = work[0];
// Listing fragment of an update kernel: A(row, col) += tmp * vec2(col).
// The kernel name line, the parameters other than those shown, the statements guarded by
// the two options2 tests, the computation of tmp and the braces are not part of this listing.
1354 template<
typename NumericT>
1357 unsigned int A_start1,
unsigned int A_start2,
1358 unsigned int A_inc1,
unsigned int A_inc2,
1359 unsigned int A_size1,
unsigned int A_size2,
1360 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1363 unsigned int options2,
// Bits 0 and 1 of options2 select optional processing; what they select is not visible here.
1376 if (options2 & (1 << 0))
1378 if (options2 & (1 << 1))
// Since threadIdx.x < blockDim.x, row_gid equals blockIdx.x and col_gid equals threadIdx.x.
1381 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1382 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Blocks stride over rows, threads of a block stride over columns.
1384 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1387 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
// presumably tmp holds the scaled per-row factor computed on a line missing here -- verify
1388 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
// Listing fragment of an update kernel: A(row, col) += tmp * vec2(col).
// The kernel name line, the parameters other than those shown, the statements guarded by
// the two options2 tests, the computation of tmp and the braces are not part of this listing.
1394 template<
typename NumericT>
1397 unsigned int A_start1,
unsigned int A_start2,
1398 unsigned int A_inc1,
unsigned int A_inc2,
1399 unsigned int A_size1,
unsigned int A_size2,
1400 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1403 unsigned int options2,
// Bits 0 and 1 of options2 select optional processing; what they select is not visible here.
1416 if (options2 & (1 << 0))
1418 if (options2 & (1 << 1))
// Since threadIdx.x < blockDim.x, row_gid equals blockIdx.x and col_gid equals threadIdx.x.
1421 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1422 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Blocks stride over rows, threads of a block stride over columns.
1424 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1427 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
// presumably tmp holds the scaled per-row factor computed on a line missing here -- verify
1428 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
// Listing fragment of a kernel that copies the diagonal of A into D and the element right
// of each diagonal entry into S (shifted by one); A(r, c) is read at A[r * stride + c].
// The kernel name line, the parameter list, the statement guarded by the thread-0 test,
// the loop condition and the braces are not part of this listing.
1433 template <
typename T>
// The diagonal has min(size1, size2) entries.
1442 unsigned int size =
min(size1, size2);
// True only for the very first thread of the grid.
1443 if(blockIdx.x * blockDim.x + threadIdx.x == 0)
// The loop starts at the global thread id and advances by the total number of threads.
1446 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1448 i += gridDim.x * blockDim.x)
1450 D[i] = A[i*stride + i];
// Zero is stored when column i + 1 does not exist.
// NOTE(review): S is written at i + 1, so it needs one element more than the largest i
// processed -- confirm the allocation at the call site.
1451 S[i+1] = (i + 1 <
size2) ? A[i*stride + (i + 1)] : 0;
// Listing fragment of a kernel that copies the diagonal of A into D and the element right
// of each diagonal entry into S (shifted by one); A(r, c) is read at A[r + c * stride].
// The kernel name line, the parameter list, the statement guarded by the thread-0 test,
// the loop condition and the braces are not part of this listing.
1455 template <
typename T>
// The diagonal has min(size1, size2) entries.
1464 unsigned int size =
min(size1, size2);
// True only for the very first thread of the grid.
1465 if(blockIdx.x * blockDim.x + threadIdx.x == 0)
// The loop starts at the global thread id and advances by the total number of threads.
1468 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1470 i += gridDim.x * blockDim.x)
// The diagonal index i * stride + i is the same for both storage orders.
1472 D[i] = A[i*stride + i];
// Zero is stored when column i + 1 does not exist.
// NOTE(review): S is written at i + 1, so it needs one element more than the largest i
// processed -- confirm the allocation at the call site.
1473 S[i+1] = (i + 1 <
size2) ? A[i + (i + 1) *
stride] : 0;
// Listing fragment of a copy kernel: V[i - row_start] = A[i * stride + col_start] for
// i in [row_start, size), i.e. the lower part of column col_start when `stride` is the
// row pitch of A.
// The kernel name line, the remaining parameters and the braces are not part of this listing.
1479 template<
typename T>
1483 unsigned int row_start,
1484 unsigned int col_start,
// x is the global thread id, sz the total number of threads (grid-stride loop below).
1488 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1489 unsigned int sz = gridDim.x * blockDim.x;
1491 for(
unsigned int i = row_start + x; i <
size; i += sz)
// V is filled from index 0, hence the subtraction of row_start.
1493 V[i - row_start] = A[i * stride + col_start];
// Listing fragment of a copy kernel: V[i - row_start] = A[i + col_start * stride] for
// i in [row_start, size), i.e. the lower part of column col_start when `stride` is the
// column pitch of A.
// The kernel name line, the remaining parameters and the braces are not part of this listing.
1497 template<
typename T>
1501 unsigned int row_start,
1502 unsigned int col_start,
// x is the global thread id, sz the total number of threads (grid-stride loop below).
1506 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1507 unsigned int sz = gridDim.x * blockDim.x;
1509 for(
unsigned int i = row_start + x; i <
size; i += sz)
// V is filled from index 0, hence the subtraction of row_start.
1511 V[i - row_start] = A[i + col_start *
stride];
// Listing fragment of a copy kernel: V[i - col_start] = A[row_start * stride + i] for
// i in [col_start, size), i.e. the right part of row row_start when `stride` is the
// row pitch of A.
// The kernel name line, the remaining parameters and the braces are not part of this listing.
1515 template<
typename T>
1519 unsigned int row_start,
1520 unsigned int col_start,
// x is the global thread id, sz the total number of threads (grid-stride loop below).
1524 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1525 unsigned int sz = gridDim.x * blockDim.x;
1527 for(
unsigned int i = col_start + x; i <
size; i += sz)
// V is filled from index 0, hence the subtraction of col_start.
1529 V[i - col_start] = A[row_start * stride + i];
// Listing fragment of a copy kernel: V[i - col_start] = A[row_start + i * stride] for
// i in [col_start, size), i.e. the right part of row row_start when `stride` is the
// column pitch of A.
// The kernel name line, the remaining parameters and the braces are not part of this listing.
1534 template<
typename T>
1538 unsigned int row_start,
1539 unsigned int col_start,
// x is the global thread id, sz the total number of threads (grid-stride loop below).
1543 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1544 unsigned int sz = gridDim.x * blockDim.x;
1546 for(
unsigned int i = col_start + x; i <
size; i += sz)
// V is filled from index 0, hence the subtraction of col_start.
1548 V[i - col_start] = A[row_start + i *
stride];
// Listing fragment of a kernel that applies A <- A - 2 * V * (V^T A) to rows
// j in [row_start, size1) of every processed column i; A(j, i) is read at A[j * stride + i].
// The kernel name line, the remaining parameters, the loop condition on i, the declaration
// of ss and the braces are not part of this listing.
1555 template<
typename T>
1559 unsigned int row_start,
1560 unsigned int col_start,
// One thread per column, starting at col_start + global thread id and advancing by the
// total number of threads.
1567 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1569 i += gridDim.x * blockDim.x)
// First pass: ss accumulates the inner product of V with column i.
// presumably ss is reset per column on a line missing from this listing -- verify
1572 for(
unsigned int j = row_start; j <
size1; j++)
1573 ss = ss +(V[j] * A[j * stride + i]);
// Second pass: subtract twice the projection onto V.
1575 for(
unsigned int j = row_start; j <
size1; j++)
1576 A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
// Listing fragment of a kernel that applies A <- A - 2 * V * (V^T A) to rows
// j in [row_start, size1) of every processed column i; A(j, i) is read at A[j + i * stride].
// The kernel name line, the remaining parameters, the loop condition on i, the declaration
// of ss and the braces are not part of this listing.
1580 template<
typename T>
1584 unsigned int row_start,
1585 unsigned int col_start,
// One thread per column, starting at col_start + global thread id and advancing by the
// total number of threads.
1592 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1594 i += gridDim.x * blockDim.x)
// First pass: ss accumulates the inner product of V with column i.
// presumably ss is reset per column on a line missing from this listing -- verify
1597 for(
unsigned int j = row_start; j <
size1; j++)
1598 ss = ss +(V[j] * A[j + i * stride]);
// Second pass: subtract twice the projection onto V.
1600 for(
unsigned int j = row_start; j <
size1; j++)
1601 A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
// Listing fragment of a kernel that applies A <- A - 2 * (A V) * V^T row by row;
// A(i, j) is read at A[i * stride + j].
// The kernel name line, the remaining parameters, the declaration of ss, the block-wide
// reduction producing sum_Av, any synchronisation and the braces are not part of this listing.
1607 template<
typename T>
1611 unsigned int row_start,
1612 unsigned int col_start,
// One partial sum per thread; indexed by threadIdx.x, so blockDim.x must not exceed 128.
1617 __shared__ T sums[128];
// One block per row, starting at row_start + blockIdx.x and advancing by the number of blocks.
1620 for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
// The threads of the block split the columns of row i among themselves.
1623 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1624 ss = ss + (V[j] * A[i * stride + j]);
1625 sums[threadIdx.x] = ss;
// presumably sum_Av is the block-wide total of `sums`, computed on lines missing here -- verify
1633 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1634 A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
// Listing fragment of a kernel that applies A <- A - 2 * (A V) * V^T row by row;
// A(i, j) is read at A[i + j * stride].
// The kernel name line, the remaining parameters, the declaration of ss, the block-wide
// reduction producing sum_Av, any synchronisation and the braces are not part of this listing.
1638 template<
typename T>
1642 unsigned int row_start,
1643 unsigned int col_start,
// One partial sum per thread; indexed by threadIdx.x, so blockDim.x must not exceed 128.
1648 __shared__ T sums[128];
// One block per row, starting at row_start + blockIdx.x and advancing by the number of blocks.
1651 for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
// The threads of the block split the columns of row i among themselves.
1654 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1655 ss = ss + (V[j] * A[i + j *
stride]);
1656 sums[threadIdx.x] = ss;
// presumably sum_Av is the block-wide total of `sums`, computed on lines missing here -- verify
1664 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1665 A[i + j *
stride] = A[i + j *
stride] - (2 * V[j] * sum_Av);
// Listing fragment of a helper that accumulates sums[th_Idx + step] into sums[th_Idx],
// with step starting at half of bl_Dim.
// The function name line, the `sums` parameter, the loop that changes `step`, any guard
// on th_Idx, any synchronisation and the braces are not part of this listing.
1671 template<
typename T>
1674 unsigned int th_Idx,
1675 unsigned int bl_Dim)
// Initial distance between the two entries that get combined.
1677 unsigned int step = bl_Dim >> 1;
1682 sums[th_Idx] += sums[th_Idx +
step];
// Listing fragment of a kernel that applies QL <- QL - 2 * (QL V) * V^T row by row on a
// size1 x size1 matrix; QL(i, j) is read at QL[i * strideQ + j].
// The kernel name line, the remaining parameters, the declaration of ss, the block-wide
// reduction producing sum_Qv, any synchronisation and the braces are not part of this listing.
1689 template <
typename T>
1694 unsigned int strideQ)
// One partial sum per thread; indexed by threadIdx.x, so blockDim.x must not exceed 128.
1696 __shared__ T sums[128];
// One block per row, advancing by the number of blocks.
1698 for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
// The threads of the block split the columns of row i among themselves.
1701 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1702 ss = ss + (V[j] * QL[i * strideQ + j]);
1703 sums[threadIdx.x] = ss;
// presumably sum_Qv is the block-wide total of `sums`, computed on lines missing here -- verify
1711 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1712 QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
// Listing fragment of a kernel that applies QL <- QL - 2 * (QL V) * V^T row by row on a
// size1 x size1 matrix; QL(i, j) is read at QL[i + j * strideQ].
// The kernel name line, the remaining parameters, the declaration of ss, the block-wide
// reduction producing sum_Qv, any synchronisation and the braces are not part of this listing.
1716 template <
typename T>
1721 unsigned int strideQ)
// One partial sum per thread; indexed by threadIdx.x, so blockDim.x must not exceed 128.
1723 __shared__ T sums[128];
// One block per row, advancing by the number of blocks.
1725 for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
// The threads of the block split the columns of row i among themselves.
1728 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1729 ss = ss + (V[j] * QL[i + j * strideQ]);
1730 sums[threadIdx.x] = ss;
// presumably sum_Qv is the block-wide total of `sums`, computed on lines missing here -- verify
1738 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1739 QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
// Listing fragment of a kernel that applies a chain of plane rotations, with coefficients
// cs[i] / ss[i] for i = end_i down to start_i, to the entries i and i + 1 of line j of
// matr; element (i, j) is read at matr[i + j * stride].
// The kernel name line, the remaining parameters, the guards on j, the synchronisation
// between staging and use of the coefficients, and the braces are not part of this listing.
1744 template <
typename T>
1751 unsigned int start_i,
// One thread per index j.
1754 unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
// Staging buffers for one batch of coefficients; indexed by threadIdx.x below, so
// blockDim.x must not exceed 256.
1755 __shared__ T cs_lcl[256];
1756 __shared__ T ss_lcl[256];
// x carries the running value that is rotated downwards; threads with j >= size start from 0.
1758 T x = (j <
size) ? matr[(end_i + 1) + j *
stride] : 0;
// Number of rotations, and number of batches of at most blockDim.x rotations (ceiling division).
1760 unsigned int elems_num = end_i - start_i + 1;
1761 unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1763 for(
unsigned int block_id = 0; block_id < block_num; block_id++)
// Size of the current batch (the last batch may be shorter).
1765 unsigned int to =
min(elems_num - block_id * blockDim.x, blockDim.x);
// Each thread stages one coefficient pair, walking from end_i towards start_i.
1767 if(threadIdx.x < to)
1769 cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1770 ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
// NOTE(review): no j < size guard is visible around the matr accesses below; it may sit on
// a line missing from this listing -- confirm.
1775 for(
unsigned int ind = 0; ind < to; ind++)
1777 unsigned int i = end_i - (ind + block_id * blockDim.x);
1778 T z = matr[i + j *
stride];
1779 T cs_val = cs_lcl[ind];
1780 T ss_val = ss_lcl[ind];
// Rotate the pair (x, z): the first component is stored at i + 1, the second is carried on.
1781 matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
1782 x = -x * ss_val + z * cs_val;
// The value left after the last rotation lands at start_i.
1788 matr[(start_i) + j * stride] = x;
// Listing fragment of a kernel that applies a chain of plane rotations, with coefficients
// cs[i] / ss[i] for i = end_i down to start_i, to the entries i and i + 1 of line j of
// matr; element (i, j) is read at matr[i * stride + j].
// The kernel name line, the remaining parameters, the guards on j, the synchronisation
// between staging and use of the coefficients, and the braces are not part of this listing.
1791 template <
typename T>
1798 unsigned int start_i,
// One thread per index j.
1801 unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
// Staging buffers for one batch of coefficients; indexed by threadIdx.x below, so
// blockDim.x must not exceed 256.
1802 __shared__ T cs_lcl[256];
1803 __shared__ T ss_lcl[256];
// x carries the running value that is rotated downwards; threads with j >= size start from 0.
1805 T x = (j <
size) ? matr[(end_i + 1) *stride + j] : 0;
// Number of rotations, and number of batches of at most blockDim.x rotations (ceiling division).
1807 unsigned int elems_num = end_i - start_i + 1;
1808 unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1810 for(
unsigned int block_id = 0; block_id < block_num; block_id++)
// Size of the current batch (the last batch may be shorter).
1812 unsigned int to =
min(elems_num - block_id * blockDim.x, blockDim.x);
// Each thread stages one coefficient pair, walking from end_i towards start_i.
1814 if(threadIdx.x < to)
1816 cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1817 ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
// NOTE(review): no j < size guard is visible around the matr accesses below; it may sit on
// a line missing from this listing -- confirm.
1822 for(
unsigned int ind = 0; ind < to; ind++)
1824 unsigned int i = end_i - (ind + block_id * blockDim.x);
1825 T z = matr[i *stride + j];
1826 T cs_val = cs_lcl[ind];
1827 T ss_val = ss_lcl[ind];
// Rotate the pair (x, z): the first component is stored at i + 1, the second is carried on.
1828 matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
1829 x = -x * ss_val + z * cs_val;
// The value left after the last rotation lands at start_i.
1835 matr[(start_i) * stride + j] = x;
__global__ void matrix_col_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void house_update_A_right_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void convert_col_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
__global__ void house_update_A_left_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void matrix_col_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void bidiag_pack_column_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void house_update_A_left_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
__global__ void matrix_col_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
__global__ void matrix_col_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void house_update_QL_row_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void scaled_rank1_update_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void matrix_col_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_m_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
__global__ void matrix_col_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void matrix_col_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__device__ void col_reduce_lcl_array(T *sums, unsigned int th_Idx, unsigned int bl_Dim)
__global__ void matrix_col_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void element_op_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void house_update_A_right_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void matrix_col_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void element_op_int_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_col_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void trans_vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void house_update_QL_column_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void givens_next_column_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void bidiag_pack_row_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
NumericT min(std::vector< NumericT > const &v1)