1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
// Fragment of a function template over DestNumericT / SrcNumericT. The line that names the
// function and declares the destination pointer A is absent from this extract (the embedded
// original line numbers jump from 33 to 36), so A's type is not described here.
// Visible behavior: element-wise copy of B into A.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
// A_internal_size1 and B_internal_size1 are not used by the visible statements.
// NOTE(review): presumably A points to DestNumericT, making the store a type conversion — confirm.
33 template<
typename DestNumericT,
typename SrcNumericT>
36 unsigned int A_start1,
unsigned int A_start2,
37 unsigned int A_inc1,
unsigned int A_inc2,
38 unsigned int A_size1,
unsigned int A_size2,
39 unsigned int A_internal_size1,
unsigned int A_internal_size2,
41 const SrcNumericT * B,
42 unsigned int B_start1,
unsigned int B_start2,
43 unsigned int B_inc1,
unsigned int B_inc2,
44 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Starting row / column of this thread: quotient and remainder of the flat x-index by blockDim.x.
46 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
47 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows advance by gridDim.x, columns by blockDim.x, until A_size1 / A_size2 is reached.
49 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
50 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
51 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
// Fragment of a function template over NumericT. The function name, the declarations of the
// pointers A and B, and whatever parameters follow B_stride2 (the list ends on a comma) are
// absent from this extract.
// Visible behavior: element (row, col) of A is stored into element (col, row) of B, i.e. a
// transposing copy. Rows start at blockIdx.x and advance by gridDim.x; columns start at
// threadIdx.x and advance by blockDim.x.
55 template<
typename NumericT>
58 unsigned int A_start1,
unsigned int A_start2,
59 unsigned int A_internal_size1,
unsigned int A_internal_size2,
60 unsigned int A_size1,
unsigned int A_size2,
61 unsigned int A_stride1,
unsigned int A_stride2,
64 unsigned int B_start1,
unsigned int B_start2,
65 unsigned int B_internal_size1,
unsigned int B_internal_size2,
66 unsigned int B_stride1,
unsigned int B_stride2,
69 for(
unsigned int row = blockIdx.x;
row<A_size1;
row+=gridDim.x)
71 for(
unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
// First store: the first index is scaled by internal_size2 on both sides.
74 B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 *
row)] = A[(A_start1 + A_stride1 *
row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
// Second store: the second index is scaled by internal_size1 on both sides.
// NOTE(review): the embedded numbering skips original lines 73 and 75, so a selector
// between the two stores is presumably missing from this extract — confirm against the full file.
76 B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 *
row) + (A_start2 + A_stride2 * col) * A_internal_size1];
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B and alpha are absent from this extract, so how alpha is obtained is not described here.
// Visible behavior: A(row, col) = B(row, col) / alpha when bit 1 of options2 is set; the
// second loop nest computes B(row, col) * alpha.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
86 template<
typename NumericT>
89 unsigned int A_start1,
unsigned int A_start2,
90 unsigned int A_inc1,
unsigned int A_inc2,
91 unsigned int A_size1,
unsigned int A_size2,
92 unsigned int A_internal_size1,
unsigned int A_internal_size2,
95 unsigned int options2,
97 unsigned int B_start1,
unsigned int B_start2,
98 unsigned int B_inc1,
unsigned int B_inc2,
99 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Tests bit 0 of options2; the statement it guards is not present in this extract.
102 if (options2 & (1 << 0))
// Starting row / column of this thread: quotient and remainder of the flat x-index by blockDim.x.
105 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
106 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: divide by alpha.
108 if (options2 & (1 << 1))
110 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
111 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
112 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
// Multiply by alpha. NOTE(review): presumably the else-branch of the bit-1 test; the 'else' is not visible here.
116 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
117 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
118 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B and alpha are absent from this extract; the visible lines are identical to those of
// the preceding fragment, so whatever distinguishes the two definitions is not shown.
// Visible behavior: A(row, col) = B(row, col) / alpha when bit 1 of options2 is set; the
// second loop nest computes B(row, col) * alpha.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
123 template<
typename NumericT>
126 unsigned int A_start1,
unsigned int A_start2,
127 unsigned int A_inc1,
unsigned int A_inc2,
128 unsigned int A_size1,
unsigned int A_size2,
129 unsigned int A_internal_size1,
unsigned int A_internal_size2,
132 unsigned int options2,
134 unsigned int B_start1,
unsigned int B_start2,
135 unsigned int B_inc1,
unsigned int B_inc2,
136 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Tests bit 0 of options2; the statement it guards is not present in this extract.
139 if (options2 & (1 << 0))
// Starting row / column of this thread: quotient and remainder of the flat x-index by blockDim.x.
142 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
143 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: divide by alpha.
145 if (options2 & (1 << 1))
147 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
148 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
149 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
// Multiply by alpha. NOTE(review): presumably the else-branch of the bit-1 test; the 'else' is not visible here.
153 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
154 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
155 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract, so they are not described here.
// Visible behavior: A(row, col) = B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Four loop nests cover the four combinations; bit 1 of options2 selects
// the operator applied to B and bit 1 of options3 the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
165 template<
typename NumericT>
168 unsigned int A_start1,
unsigned int A_start2,
169 unsigned int A_inc1,
unsigned int A_inc2,
170 unsigned int A_size1,
unsigned int A_size2,
171 unsigned int A_internal_size1,
unsigned int A_internal_size2,
174 unsigned int options2,
176 unsigned int B_start1,
unsigned int B_start2,
177 unsigned int B_inc1,
unsigned int B_inc2,
178 unsigned int B_internal_size1,
unsigned int B_internal_size2,
181 unsigned int options3,
183 unsigned int C_start1,
unsigned int C_start2,
184 unsigned int C_inc1,
unsigned int C_inc2,
185 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
188 if (options2 & (1 << 0))
192 if (options3 & (1 << 0))
195 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
196 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
198 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
200 if (options3 & (1 << 1))
202 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
203 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
204 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
205 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
206 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
210 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
211 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
212 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
213 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
214 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
219 if (options3 & (1 << 1))
221 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
222 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
223 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
224 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
225 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
229 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
230 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
231 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
232 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
233 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// fragment, so whatever distinguishes the two definitions is not shown.
// Visible behavior: A(row, col) = B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
240 template<
typename NumericT>
243 unsigned int A_start1,
unsigned int A_start2,
244 unsigned int A_inc1,
unsigned int A_inc2,
245 unsigned int A_size1,
unsigned int A_size2,
246 unsigned int A_internal_size1,
unsigned int A_internal_size2,
249 unsigned int options2,
251 unsigned int B_start1,
unsigned int B_start2,
252 unsigned int B_inc1,
unsigned int B_inc2,
253 unsigned int B_internal_size1,
unsigned int B_internal_size2,
256 unsigned int options3,
258 unsigned int C_start1,
unsigned int C_start2,
259 unsigned int C_inc1,
unsigned int C_inc2,
260 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
263 if (options2 & (1 << 0))
267 if (options3 & (1 << 0))
270 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
271 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
273 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
275 if (options3 & (1 << 1))
277 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
278 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
279 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
280 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
281 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
285 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
286 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
287 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
288 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
289 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
294 if (options3 & (1 << 1))
296 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
297 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
298 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
299 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
300 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
304 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
305 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
306 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
307 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
308 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// fragments, so whatever distinguishes the definitions is not shown.
// Visible behavior: A(row, col) = B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
314 template<
typename NumericT>
317 unsigned int A_start1,
unsigned int A_start2,
318 unsigned int A_inc1,
unsigned int A_inc2,
319 unsigned int A_size1,
unsigned int A_size2,
320 unsigned int A_internal_size1,
unsigned int A_internal_size2,
323 unsigned int options2,
325 unsigned int B_start1,
unsigned int B_start2,
326 unsigned int B_inc1,
unsigned int B_inc2,
327 unsigned int B_internal_size1,
unsigned int B_internal_size2,
330 unsigned int options3,
332 unsigned int C_start1,
unsigned int C_start2,
333 unsigned int C_inc1,
unsigned int C_inc2,
334 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
337 if (options2 & (1 << 0))
341 if (options3 & (1 << 0))
344 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
345 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
347 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
349 if (options3 & (1 << 1))
351 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
352 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
353 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
354 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
355 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
359 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
360 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
361 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
362 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
363 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
368 if (options3 & (1 << 1))
370 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
371 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
372 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
373 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
374 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
378 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
379 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
380 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
381 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
382 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// fragments, so whatever distinguishes the definitions is not shown.
// Visible behavior: A(row, col) = B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
389 template<
typename NumericT>
392 unsigned int A_start1,
unsigned int A_start2,
393 unsigned int A_inc1,
unsigned int A_inc2,
394 unsigned int A_size1,
unsigned int A_size2,
395 unsigned int A_internal_size1,
unsigned int A_internal_size2,
398 unsigned int options2,
400 unsigned int B_start1,
unsigned int B_start2,
401 unsigned int B_inc1,
unsigned int B_inc2,
402 unsigned int B_internal_size1,
unsigned int B_internal_size2,
405 unsigned int options3,
407 unsigned int C_start1,
unsigned int C_start2,
408 unsigned int C_inc1,
unsigned int C_inc2,
409 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
412 if (options2 & (1 << 0))
416 if (options3 & (1 << 0))
419 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
420 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
422 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
424 if (options3 & (1 << 1))
426 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
427 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
428 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
429 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
430 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
434 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
435 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
436 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
437 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
438 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
443 if (options3 & (1 << 1))
445 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
446 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
447 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
448 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
449 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
453 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
454 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
455 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
456 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
457 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract, so they are not described here.
// Visible behavior: A(row, col) += B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Unlike the plain-assignment fragments, the result is accumulated into A.
// Bit 1 of options2 selects the operator applied to B, bit 1 of options3 the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
468 template<
typename NumericT>
471 unsigned int A_start1,
unsigned int A_start2,
472 unsigned int A_inc1,
unsigned int A_inc2,
473 unsigned int A_size1,
unsigned int A_size2,
474 unsigned int A_internal_size1,
unsigned int A_internal_size2,
477 unsigned int options2,
479 unsigned int B_start1,
unsigned int B_start2,
480 unsigned int B_inc1,
unsigned int B_inc2,
481 unsigned int B_internal_size1,
unsigned int B_internal_size2,
484 unsigned int options3,
486 unsigned int C_start1,
unsigned int C_start2,
487 unsigned int C_inc1,
unsigned int C_inc2,
488 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
491 if (options2 & (1 << 0))
495 if (options3 & (1 << 0))
498 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
499 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
501 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
503 if (options3 & (1 << 1))
505 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
506 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
507 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
508 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
509 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
513 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
514 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
515 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
516 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
517 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
522 if (options3 & (1 << 1))
524 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
525 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
526 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
527 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
528 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
532 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
533 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
534 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
535 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
536 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// accumulate fragment, so whatever distinguishes the two definitions is not shown.
// Visible behavior: A(row, col) += B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
543 template<
typename NumericT>
546 unsigned int A_start1,
unsigned int A_start2,
547 unsigned int A_inc1,
unsigned int A_inc2,
548 unsigned int A_size1,
unsigned int A_size2,
549 unsigned int A_internal_size1,
unsigned int A_internal_size2,
552 unsigned int options2,
554 unsigned int B_start1,
unsigned int B_start2,
555 unsigned int B_inc1,
unsigned int B_inc2,
556 unsigned int B_internal_size1,
unsigned int B_internal_size2,
559 unsigned int options3,
561 unsigned int C_start1,
unsigned int C_start2,
562 unsigned int C_inc1,
unsigned int C_inc2,
563 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
566 if (options2 & (1 << 0))
570 if (options3 & (1 << 0))
573 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
574 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
576 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
578 if (options3 & (1 << 1))
580 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
581 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
582 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
583 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
584 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
588 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
589 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
590 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
591 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
592 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
597 if (options3 & (1 << 1))
599 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
600 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
601 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
602 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
603 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
607 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
608 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
609 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
610 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
611 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// accumulate fragments, so whatever distinguishes the definitions is not shown.
// Visible behavior: A(row, col) += B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
617 template<
typename NumericT>
620 unsigned int A_start1,
unsigned int A_start2,
621 unsigned int A_inc1,
unsigned int A_inc2,
622 unsigned int A_size1,
unsigned int A_size2,
623 unsigned int A_internal_size1,
unsigned int A_internal_size2,
626 unsigned int options2,
628 unsigned int B_start1,
unsigned int B_start2,
629 unsigned int B_inc1,
unsigned int B_inc2,
630 unsigned int B_internal_size1,
unsigned int B_internal_size2,
633 unsigned int options3,
635 unsigned int C_start1,
unsigned int C_start2,
636 unsigned int C_inc1,
unsigned int C_inc2,
637 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
640 if (options2 & (1 << 0))
644 if (options3 & (1 << 0))
647 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
648 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
650 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
652 if (options3 & (1 << 1))
654 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
655 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
656 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
657 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
658 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
662 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
663 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
664 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
665 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
666 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
671 if (options3 & (1 << 1))
673 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
674 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
675 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
676 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
677 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
681 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
682 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
683 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
684 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
685 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B, C, alpha and beta are absent from this extract; the visible lines match the preceding
// accumulate fragments, so whatever distinguishes the definitions is not shown.
// Visible behavior: A(row, col) += B(row, col) <op> alpha + C(row, col) <op> beta, where each
// <op> is '/' or '*'. Bit 1 of options2 selects the operator applied to B, bit 1 of options3
// the operator applied to C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
692 template<
typename NumericT>
695 unsigned int A_start1,
unsigned int A_start2,
696 unsigned int A_inc1,
unsigned int A_inc2,
697 unsigned int A_size1,
unsigned int A_size2,
698 unsigned int A_internal_size1,
unsigned int A_internal_size2,
701 unsigned int options2,
703 unsigned int B_start1,
unsigned int B_start2,
704 unsigned int B_inc1,
unsigned int B_inc2,
705 unsigned int B_internal_size1,
unsigned int B_internal_size2,
708 unsigned int options3,
710 unsigned int C_start1,
unsigned int C_start2,
711 unsigned int C_inc1,
unsigned int C_inc2,
712 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit-0 tests of options2 / options3; the statements they guard are not present in this extract.
715 if (options2 & (1 << 0))
719 if (options3 & (1 << 0))
722 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
723 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 set: B is divided by alpha.
725 if (options2 & (1 << 1))
// Bit 1 of options3 set: C is divided by beta.
727 if (options3 & (1 << 1))
729 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
730 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
731 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
732 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
733 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B / alpha + C * beta. NOTE(review): presumably the else-branch of the options3 test; the 'else' is not visible.
737 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
738 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
739 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
740 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
741 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// From here B is multiplied by alpha (presumably the else-branch of the options2 bit-1 test).
// Bit 1 of options3 set: C is divided by beta.
746 if (options3 & (1 << 1))
748 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
749 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
750 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
751 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
752 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
// B * alpha + C * beta.
756 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
757 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
758 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
759 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
760 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
// Fragment of a function template over NumericT. The function name, the declaration of A and
// the declaration of alpha (the parameter list ends on a comma) are absent from this extract.
// Visible behavior: stores alpha into every visited element of A, addressed as
// (row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
769 template<
typename NumericT>
772 unsigned int A_start1,
unsigned int A_start2,
773 unsigned int A_inc1,
unsigned int A_inc2,
774 unsigned int A_size1,
unsigned int A_size2,
775 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Starting row / column of this thread: quotient and remainder of the flat x-index by blockDim.x.
778 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
779 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
781 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
782 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
783 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
// Fragment of a function template over NumericT. The function name, the declaration of A and
// the declaration of alpha (the parameter list ends on a comma) are absent from this extract.
// Visible behavior: stores alpha into the element whose row and column index are both `row`,
// i.e. the diagonal entry, for each visited row below A_size1.
// A_size2 and A_internal_size1 are not used by the visible statements.
787 template<
typename NumericT>
790 unsigned int A_start1,
unsigned int A_start2,
791 unsigned int A_inc1,
unsigned int A_inc2,
792 unsigned int A_size1,
unsigned int A_size2,
793 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Flat x-index of this thread across the whole launch.
796 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
// One thread per diagonal entry; the step is the total thread count in x (blockDim.x * gridDim.x).
798 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
799 A[(
row * A_inc1 + A_start1) * A_internal_size2 +
row * A_inc2 + A_start2] = alpha;
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B and C are absent from this extract.
// Visible behavior: element-wise binary operation A(row, col) = f(B(row, col), C(row, col)):
//   - first loop nest: pow(B, C) (its guarding condition is not visible here),
//   - op_type == 1:    B / C,
//   - op_type == 0:    B * C.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
806 template<
typename NumericT>
809 unsigned int A_start1,
unsigned int A_start2,
810 unsigned int A_inc1,
unsigned int A_inc2,
811 unsigned int A_size1,
unsigned int A_size2,
812 unsigned int A_internal_size1,
unsigned int A_internal_size2,
815 unsigned int B_start1,
unsigned int B_start2,
816 unsigned int B_inc1,
unsigned int B_inc2,
817 unsigned int B_internal_size1,
unsigned int B_internal_size2,
820 unsigned int C_start1,
unsigned int C_start2,
821 unsigned int C_inc1,
unsigned int C_inc2,
822 unsigned int C_internal_size1,
unsigned int C_internal_size2,
824 unsigned int op_type)
826 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
827 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Power branch. NOTE(review): the 'if' that the later 'else if' chain hangs off is missing
// from this extract; presumably it tests op_type against a third value — confirm.
831 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
832 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
833 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
834 = pow(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
835 C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
// Division branch.
837 else if (op_type == 1)
839 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
840 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
841 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
842 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
843 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Multiplication branch.
845 else if (op_type == 0)
847 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
848 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
849 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
850 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
851 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Fragment of a function template over NumericT. The function name and the declarations of
// A, B and C are absent from this extract.
// Visible behavior: element-wise binary operation A(row, col) = f(B(row, col), C(row, col)):
//   - first loop nest: B / C (its guarding condition is not visible here),
//   - op_type == 0:    B * C.
// Unlike the preceding fragment there is no pow branch.
// Each operand X is addressed as (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
855 template<
typename NumericT>
858 unsigned int A_start1,
unsigned int A_start2,
859 unsigned int A_inc1,
unsigned int A_inc2,
860 unsigned int A_size1,
unsigned int A_size2,
861 unsigned int A_internal_size1,
unsigned int A_internal_size2,
864 unsigned int B_start1,
unsigned int B_start2,
865 unsigned int B_inc1,
unsigned int B_inc2,
866 unsigned int B_internal_size1,
unsigned int B_internal_size2,
869 unsigned int C_start1,
unsigned int C_start2,
870 unsigned int C_inc1,
unsigned int C_inc2,
871 unsigned int C_internal_size1,
unsigned int C_internal_size2,
873 unsigned int op_type)
875 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
876 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Division branch. NOTE(review): the 'if' that the following 'else if' hangs off is missing
// from this extract; presumably it tests op_type == 1 — confirm.
880 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
881 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
882 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
883 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
884 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Multiplication branch.
886 else if (op_type == 0)
888 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
889 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
890 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
891 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
892 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
// Fragment of a function template over NumericT. The function name and the declarations of
// A and B are absent from this extract.
// Visible behavior: A(row, col) = abs(B(row, col)) for every visited element.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
// NOTE(review): which abs overload is selected depends on NumericT, which is not fixed here — confirm for floating-point types.
901 template<
typename NumericT>
904 unsigned int A_start1,
unsigned int A_start2,
905 unsigned int A_inc1,
unsigned int A_inc2,
906 unsigned int A_size1,
unsigned int A_size2,
907 unsigned int A_internal_size1,
unsigned int A_internal_size2,
910 unsigned int B_start1,
unsigned int B_start2,
911 unsigned int B_inc1,
unsigned int B_inc2,
912 unsigned int B_internal_size1,
unsigned int B_internal_size2)
914 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
915 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
917 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
918 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
919 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a function template over NumericT. The function name and the declarations of
// A and B are absent from this extract.
// Visible behavior: A(row, col) = acos(B(row, col)) for every visited element.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
924 template<
typename NumericT>
927 unsigned int A_start1,
unsigned int A_start2,
928 unsigned int A_inc1,
unsigned int A_inc2,
929 unsigned int A_size1,
unsigned int A_size2,
930 unsigned int A_internal_size1,
unsigned int A_internal_size2,
933 unsigned int B_start1,
unsigned int B_start2,
934 unsigned int B_inc1,
unsigned int B_inc2,
935 unsigned int B_internal_size1,
unsigned int B_internal_size2)
937 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
938 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
940 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
941 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
942 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a function template over NumericT. The function name and the declarations of
// A and B are absent from this extract.
// Visible behavior: A(row, col) = asin(B(row, col)) for every visited element.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
947 template<
typename NumericT>
950 unsigned int A_start1,
unsigned int A_start2,
951 unsigned int A_inc1,
unsigned int A_inc2,
952 unsigned int A_size1,
unsigned int A_size2,
953 unsigned int A_internal_size1,
unsigned int A_internal_size2,
956 unsigned int B_start1,
unsigned int B_start2,
957 unsigned int B_inc1,
unsigned int B_inc2,
958 unsigned int B_internal_size1,
unsigned int B_internal_size2)
960 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
961 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
963 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
964 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
965 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
// Fragment of a function template over NumericT. The function name and the declarations of
// A and B are absent from this extract.
// Visible behavior: A(row, col) = atan(B(row, col)) for every visited element.
// Both operands are addressed as (row * inc1 + start1) * internal_size2 + col * inc2 + start2.
// Rows start at row_gid and advance by gridDim.x; columns start at col_gid and advance by blockDim.x.
970 template<
typename NumericT>
973 unsigned int A_start1,
unsigned int A_start2,
974 unsigned int A_inc1,
unsigned int A_inc2,
975 unsigned int A_size1,
unsigned int A_size2,
976 unsigned int A_internal_size1,
unsigned int A_internal_size2,
979 unsigned int B_start1,
unsigned int B_start2,
980 unsigned int B_inc1,
unsigned int B_inc2,
981 unsigned int B_internal_size1,
unsigned int B_internal_size2)
983 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
984 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
986 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
987 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
988 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
/**
 * @brief Element-wise ceiling on a row-major matrix range: A(i,j) = ceil(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_ceil_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise cosine on a row-major matrix range: A(i,j) = cos(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_cos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise hyperbolic cosine on a row-major matrix range: A(i,j) = cosh(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_cosh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise exponential on a row-major matrix range: A(i,j) = exp(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_exp_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise floating-point absolute value on a row-major matrix range: A(i,j) = fabs(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_fabs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise floor on a row-major matrix range: A(i,j) = floor(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_floor_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise natural logarithm on a row-major matrix range: A(i,j) = log(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_log_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise base-10 logarithm on a row-major matrix range: A(i,j) = log10(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_log10_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise sine on a row-major matrix range: A(i,j) = sin(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_sin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise hyperbolic sine on a row-major matrix range: A(i,j) = sinh(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_sinh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise square root on a row-major matrix range: A(i,j) = sqrt(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_sqrt_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise tangent on a row-major matrix range: A(i,j) = tan(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_tan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/**
 * @brief Element-wise hyperbolic tangent on a row-major matrix range: A(i,j) = tanh(B(i,j)).
 *
 * Restored to compilable form: the kernel declarator, the two pointer
 * parameters and the braces had been dropped from this listing and stray
 * listing line numbers were fused into the code. The signature follows the
 * entry for this kernel in the listing's cross-reference index.
 *
 * Addressing: element (i,j) of X is read/written at
 *   X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
 * A_size1 x A_size2 elements are processed. A_internal_size1 and
 * B_internal_size1 are part of the signature but are not read here.
 *
 * Launch layout: 1D grid of 1D blocks. For such a launch row_gid evaluates to
 * blockIdx.x and col_gid to threadIdx.x (barring 32-bit wrap-around of the
 * flat index), so blocks step over rows by gridDim.x and threads step over
 * columns by blockDim.x. Every element is covered for any grid/block size.
 *
 * @param A  Destination buffer (device memory).
 * @param B  Source buffer (device memory).
 */
template<typename NumericT>
__global__ void matrix_row_element_tanh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Split the flat thread index into a starting row and a starting column.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
// Fragment of a matrix-vector product kernel over a row-major matrix range.
// Visible behaviour: for each assigned row, a thread accumulates
//   A(row, col) * v[v_start + v_inc * col]
// over its share of the columns into dot_prod; partial sums held in work[] are
// combined pairwise, and work[0] is stored to
//   result[row * result_inc + result_start].
//
// NOTE(review): this listing is incomplete. The kernel declarator, the pointer
// parameters (A, v, result), the v_inc parameter, the declarations of work,
// dot_prod and stride, the loop around the pairwise add, any barriers/guards
// and all braces are not visible here. The cross-reference index at the end of
// this listing records a vec_mul_row_kernel signature with this parameter list;
// presumably this fragment is that kernel -- TODO confirm against the full source.
// The leading numbers on each line are listing line numbers, not code.
1296 template<
typename NumericT>
// Start offsets, increments (strides) and logical extents of the matrix range.
1299 unsigned int A_row_start,
1300 unsigned int A_col_start,
1301 unsigned int A_row_inc,
1302 unsigned int A_col_inc,
1303 unsigned int A_row_size,
1304 unsigned int A_col_size,
// A_internal_cols is the row pitch used in the index computation below;
// A_internal_rows is not read in the visible code.
1305 unsigned int A_internal_rows,
1306 unsigned int A_internal_cols,
// v_size and result_size are not read in the visible code.
1308 unsigned int v_start,
1310 unsigned int v_size,
1312 unsigned int result_start,
1313 unsigned int result_inc,
1314 unsigned int result_size)
// For a 1D launch row_gid evaluates to blockIdx.x and col_gid to threadIdx.x.
1318 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1319 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1320 unsigned int lid = threadIdx.x;
// Rows are distributed over blocks with step gridDim.x.
1322 for (
unsigned int row = row_gid;
row < A_row_size;
row += gridDim.x)
// Columns of the current row are distributed over the block's threads.
1325 for (
unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
1326 dot_prod += A[(
row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
// Pairwise combination of partial sums; presumably executed inside a
// halving-stride loop with block synchronization -- not visible, TODO confirm.
1332 work[lid] += work[lid+
stride];
// Final value for this row is taken from work[0].
1336 result[
row * result_inc + result_start] = work[0];
// Fragment of a transposed matrix-vector product kernel over a row-major
// matrix range. Visible behaviour: each thread owns output index `row`
// (which walks the COLUMNS of A, bounded by A_col_size) and sums
//   A(col, row) * v[v_start + v_inc * col]   for col in [0, A_row_size),
// then stores the sum to result[row * result_inc + result_start].
//
// NOTE(review): this listing is incomplete. The kernel declarator, the pointer
// parameters (A, v, result), the v_inc parameter, the declaration of dot_prod
// and all braces are not visible here. The cross-reference index at the end of
// this listing records a trans_vec_mul_row_kernel signature with this parameter
// list; presumably this fragment is that kernel -- TODO confirm against the
// full source. The leading numbers on each line are listing line numbers, not code.
1341 template<
typename NumericT>
// Start offsets, increments (strides) and logical extents of the matrix range.
1344 unsigned int A_row_start,
1345 unsigned int A_col_start,
1346 unsigned int A_row_inc,
1347 unsigned int A_col_inc,
1348 unsigned int A_row_size,
1349 unsigned int A_col_size,
// A_internal_cols is the row pitch used in the index computation below;
// A_internal_rows is not read in the visible code.
1350 unsigned int A_internal_rows,
1351 unsigned int A_internal_cols,
// v_size and result_size are not read in the visible code.
1353 unsigned int v_start,
1355 unsigned int v_size,
1357 unsigned int result_start,
1358 unsigned int result_inc,
1359 unsigned int result_size)
// One output element per thread; the loop advances by the total thread count
// of the launch (gridDim.x * blockDim.x).
1361 for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
row < A_col_size;
row += gridDim.x * blockDim.x)
// Serial walk down one column of A: `col` here indexes the rows of A.
1364 for (
unsigned int col = 0; col < A_row_size; ++col)
1365 dot_prod += A[(
row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
1366 result[
row * result_inc + result_start] =
dot_prod;
// Fragment of a rank-1 update kernel over a row-major matrix range.
// Visible behaviour: A(row, col) += tmp * vec2[col * inc2 + start2] for every
// element of the A_size1 x A_size2 range.
//
// NOTE(review): this listing is incomplete. The kernel declarator, the pointer
// parameters, the scalar parameter, the vec1/vec2 index parameters, the
// statements guarded by the two options2 tests, the computation of tmp and all
// braces are not visible here. The cross-reference index at the end of this
// listing records a scaled_rank1_update_row_kernel signature taking
// `NumericT val`; presumably this fragment is that kernel -- TODO confirm
// against the full source. The leading numbers are listing line numbers, not code.
1383 template<
typename NumericT>
// Start offsets, increments (strides) and logical extents of the matrix range.
1386 unsigned int A_start1,
unsigned int A_start2,
1387 unsigned int A_inc1,
unsigned int A_inc2,
1388 unsigned int A_size1,
unsigned int A_size2,
// A_internal_size2 is the row pitch used below; A_internal_size1 is not read
// in the visible code.
1389 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Bit flags: bits 0 and 1 are tested below; what each bit selects is not
// visible here (presumably adjustments to the scalar factor -- TODO confirm).
1392 unsigned int options2,
1405 if (options2 & (1 << 0))
1407 if (options2 & (1 << 1))
// For a 1D launch row_gid evaluates to blockIdx.x and col_gid to threadIdx.x.
1410 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1411 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows are distributed over blocks with step gridDim.x.
1413 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
// Columns are distributed over the block's threads; tmp is defined in lines
// missing from this listing (presumably per row -- TODO confirm).
1416 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1417 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 +
start2];
// Second fragment of a rank-1 update kernel over a row-major matrix range; the
// visible code is the same as in the preceding fragment:
// A(row, col) += tmp * vec2[col * inc2 + start2] for every element of the
// A_size1 x A_size2 range.
//
// NOTE(review): this listing is incomplete. The kernel declarator, the pointer
// parameters, the scalar parameter, the vec1/vec2 index parameters, the
// statements guarded by the two options2 tests, the computation of tmp and all
// braces are not visible here. Presumably this is an overload of
// scaled_rank1_update_row_kernel that differs only in how the scalar is passed
// -- TODO confirm against the full source; its signature is not recorded in the
// cross-reference index. The leading numbers are listing line numbers, not code.
1423 template<
typename NumericT>
// Start offsets, increments (strides) and logical extents of the matrix range.
1426 unsigned int A_start1,
unsigned int A_start2,
1427 unsigned int A_inc1,
unsigned int A_inc2,
1428 unsigned int A_size1,
unsigned int A_size2,
// A_internal_size2 is the row pitch used below; A_internal_size1 is not read
// in the visible code.
1429 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Bit flags: bits 0 and 1 are tested below; what each bit selects is not
// visible here (presumably adjustments to the scalar factor -- TODO confirm).
1432 unsigned int options2,
1445 if (options2 & (1 << 0))
1447 if (options2 & (1 << 1))
// For a 1D launch row_gid evaluates to blockIdx.x and col_gid to threadIdx.x.
1450 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1451 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows are distributed over blocks with step gridDim.x.
1453 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
// Columns are distributed over the block's threads; tmp is defined in lines
// missing from this listing (presumably per row -- TODO confirm).
1456 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1457 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 +
start2];
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
result_of::size_type< T >::type start2(T const &obj)
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void convert_row_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)