1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
65 template <
typename StringType>
70 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
71 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
72 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
73 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
77 source.append(
" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
78 source.append(
" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
79 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
80 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
86 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
88 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] ");
91 source.append(
" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
93 source.append(
" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
96 source.append(
"* alpha ");
98 source.append(
"/ alpha ");
102 source.append(
"+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
104 source.append(
"+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
106 source.append(
"* beta");
108 source.append(
"/ beta");
114 source.append(
" A[row * A_internal_size2 + col] ");
116 source.append(
" A[row + col * A_internal_size1] ");
119 source.append(
" B[row * B_internal_size2 + col] ");
121 source.append(
" B[row + col * B_internal_size1] ");
124 source.append(
"* alpha ");
126 source.append(
"/ alpha ");
130 source.append(
"+ C[row * C_internal_size2 + col] ");
132 source.append(
"+ C[row + col * C_internal_size2] ");
134 source.append(
"* beta");
136 source.append(
"/ beta");
139 source.append(
"; \n");
142 template <
typename StringType>
145 source.append(
"__kernel void am");
152 source.append(
"_cpu");
154 source.append(
"_gpu");
157 source.append(
"_cpu");
159 source.append(
"_gpu");
160 source.append(
"( \n");
161 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
162 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
163 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
164 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
165 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
168 source.append(
" "); source.append(numeric_string); source.append(
" fac2, \n");
172 source.append(
" __global "); source.append(numeric_string); source.append(
" * fac2, \n");
174 source.append(
" unsigned int options2, \n");
175 source.append(
" __global const "); source.append(numeric_string); source.append(
" * B, \n");
176 source.append(
" unsigned int B_start1, unsigned int B_start2, \n");
177 source.append(
" unsigned int B_inc1, unsigned int B_inc2, \n");
178 source.append(
" unsigned int B_internal_size1, unsigned int B_internal_size2");
182 source.append(
", \n\n");
185 source.append(
" "); source.append(numeric_string); source.append(
" fac3, \n");
189 source.append(
" __global "); source.append(numeric_string); source.append(
" * fac3, \n");
191 source.append(
" unsigned int options3, \n");
192 source.append(
" __global const "); source.append(numeric_string); source.append(
" * C, \n");
193 source.append(
" unsigned int C_start1, unsigned int C_start2, \n");
194 source.append(
" unsigned int C_inc1, unsigned int C_inc2, \n");
195 source.append(
" unsigned int C_internal_size1, unsigned int C_internal_size2 \n");
197 source.append(
") { \n");
201 source.append(
" "); source.append(numeric_string); source.append(
" alpha = fac2; \n");
205 source.append(
" "); source.append(numeric_string); source.append(
" alpha = fac2[0]; \n");
207 source.append(
" if (options2 & (1 << 0)) \n");
208 source.append(
" alpha = -alpha; \n");
209 source.append(
" \n");
213 source.append(
" "); source.append(numeric_string); source.append(
" beta = fac3; \n");
217 source.append(
" "); source.append(numeric_string); source.append(
" beta = fac3[0]; \n");
221 source.append(
" if (options3 & (1 << 0)) \n");
222 source.append(
" beta = -beta; \n");
223 source.append(
" \n");
225 source.append(
" if (options2 & (1 << 1)) { \n");
228 source.append(
" if (options3 & (1 << 1)) {\n");
230 source.append(
" } else {\n");
232 source.append(
" } \n");
236 source.append(
" } else { \n");
239 source.append(
" if (options3 & (1 << 1)) {\n");
241 source.append(
" } else {\n");
243 source.append(
" } \n");
247 source.append(
" } \n");
248 source.append(
"} \n");
251 template <
typename StringType>
278 template <
typename StringType>
281 source.append(
"__kernel void assign_cpu( \n");
282 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
283 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
284 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
285 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
286 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
287 source.append(
" "); source.append(numeric_string); source.append(
" alpha) \n");
288 source.append(
"{ \n");
291 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
292 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
293 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
294 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
295 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
299 source.append(
" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
300 source.append(
" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
301 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
302 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
303 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
305 source.append(
"} \n");
308 template <
typename StringType>
311 source.append(
"__kernel void diagonal_assign_cpu( \n");
312 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
313 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
314 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
315 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
316 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
317 source.append(
" "); source.append(numeric_string); source.append(
" alpha) \n");
318 source.append(
"{ \n");
319 source.append(
" for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
321 source.append(
" A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
323 source.append(
" A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
324 source.append(
"} \n");
327 template <
typename StringType>
330 source.append(
"__kernel void element_op( \n");
331 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
332 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
333 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
334 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
335 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
336 source.append(
" __global "); source.append(numeric_string); source.append(
" * B, \n");
337 source.append(
" unsigned int B_start1, unsigned int B_start2, \n");
338 source.append(
" unsigned int B_inc1, unsigned int B_inc2, \n");
339 source.append(
" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
340 source.append(
" __global "); source.append(numeric_string); source.append(
" * C, \n");
341 source.append(
" unsigned int C_start1, unsigned int C_start2, \n");
342 source.append(
" unsigned int C_inc1, unsigned int C_inc2, \n");
343 source.append(
" unsigned int C_internal_size1, unsigned int C_internal_size2, \n");
344 source.append(
" unsigned int op_type) \n");
345 source.append(
"{ \n");
348 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
349 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
350 source.append(
" if (op_type == 2) {");
351 if (numeric_string ==
"float" || numeric_string ==
"double")
353 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
354 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
355 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
356 source.append(
" pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
357 source.append(
" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
359 source.append(
" } else if (op_type == 1) {");
360 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
361 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
362 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
363 source.append(
" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
364 source.append(
" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
365 source.append(
" } else if (op_type == 0) {");
366 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
367 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
368 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
369 source.append(
" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
370 source.append(
" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
375 source.append(
" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
376 source.append(
" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
377 source.append(
" if (op_type == 2) {");
378 if (numeric_string ==
"float" || numeric_string ==
"double")
380 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
381 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
382 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
383 source.append(
" pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], \n");
384 source.append(
" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); \n");
386 source.append(
" } else if (op_type == 1) {");
387 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
388 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
389 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
390 source.append(
" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / \n");
391 source.append(
" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
392 source.append(
" } else if (op_type == 0) {");
393 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
394 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
395 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
396 source.append(
" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * \n");
397 source.append(
" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
400 source.append(
"} \n");
404 template<
typename StringT>
408 source.append(
"__kernel void fft_direct(__global "); source.append(numeric_string); source.append(
"2 *input, \n");
409 source.append(
" __global "); source.append(numeric_string); source.append(
"2 *output, \n");
410 source.append(
" unsigned int size, \n");
411 source.append(
" unsigned int stride, \n");
412 source.append(
" unsigned int batch_num, \n");
413 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
414 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
415 source.append(
" \n");
416 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
417 source.append(
" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
418 source.append(
" "); source.append(numeric_string); source.append(
"2 f = 0.0f; \n");
419 source.append(
" \n");
420 source.append(
" for (unsigned int n = 0; n < size; n++) { \n");
421 source.append(
" "); source.append(numeric_string); source.append(
"2 in = ");
423 source.append(
"input[batch_id * stride + n]; \n");
425 source.append(
"input[n * stride + batch_id]; \n");
426 source.append(
" \n");
427 source.append(
" "); source.append(numeric_string); source.append(
" sn, cs; \n");
428 source.append(
" "); source.append(numeric_string); source.append(
" arg = sign * 2 * NUM_PI * k / size * n; \n");
429 source.append(
" sn = sincos(arg, &cs); \n");
430 source.append(
" \n");
431 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
432 source.append(
" f = f + ("); source.append(numeric_string); source.append(
"2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
433 source.append(
" } \n");
434 source.append(
" \n");
436 source.append(
" output[batch_id * stride + k] = f; \n");
438 source.append(
" output[k * stride + batch_id] = f; \n");
439 source.append(
" } \n");
440 source.append(
" } \n");
441 source.append(
"} \n");
443 source.append(
" \n");
445 source.append(
"__kernel void fft_radix2(__global "); source.append(numeric_string); source.append(
"2* input, \n");
446 source.append(
" unsigned int s, \n");
447 source.append(
" unsigned int bit_size, \n");
448 source.append(
" unsigned int size, \n");
449 source.append(
" unsigned int stride, \n");
450 source.append(
" unsigned int batch_num, \n");
451 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
452 source.append(
" \n");
453 source.append(
" unsigned int ss = 1 << s; \n");
454 source.append(
" unsigned int half_size = size >> 1; \n");
455 source.append(
" \n");
456 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
457 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
458 source.append(
" \n");
459 source.append(
" unsigned int glb_id = get_global_id(0); \n");
460 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
462 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
463 source.append(
" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
464 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
465 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
469 source.append(
" unsigned int offset = batch_id * stride + pos; \n");
470 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
471 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss]; \n");
475 source.append(
" unsigned int offset = pos * stride + batch_id; \n");
476 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = input[offset]; \n");
477 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = input[offset + ss * stride]; \n");
480 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
482 source.append(
" sn = sincos(arg, &cs); \n");
484 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
486 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
489 source.append(
" input[offset + ss] = in1 - tmp; \n");
491 source.append(
" input[offset + ss * stride] = in1 - tmp; \n");
492 source.append(
" input[offset] = in1 + tmp; \n");
493 source.append(
" } \n");
494 source.append(
" } \n");
495 source.append(
"} \n");
497 source.append(
" \n");
499 source.append(
" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
500 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
501 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
502 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
503 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
504 source.append(
" v = (v >> 16) | (v << 16); \n");
505 source.append(
" \n");
506 source.append(
" v = v >> (32 - bit_size); \n");
507 source.append(
" \n");
508 source.append(
" return v; \n");
509 source.append(
" } \n");
511 source.append(
" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append(
"2* input, \n");
512 source.append(
" __local "); source.append(numeric_string); source.append(
"2* lcl_input, \n");
513 source.append(
" unsigned int bit_size, \n");
514 source.append(
" unsigned int size, \n");
515 source.append(
" unsigned int stride, \n");
516 source.append(
" unsigned int batch_num, \n");
517 source.append(
" "); source.append(numeric_string); source.append(
" sign) { \n");
519 source.append(
" unsigned int grp_id = get_group_id(0); \n");
520 source.append(
" unsigned int grp_num = get_num_groups(0); \n");
522 source.append(
" unsigned int lcl_sz = get_local_size(0); \n");
523 source.append(
" unsigned int lcl_id = get_local_id(0); \n");
524 source.append(
" const "); source.append(numeric_string); source.append(
" NUM_PI = 3.14159265358979323846; \n");
526 source.append(
" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
529 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
530 source.append(
" unsigned int v = get_reorder_num(p, bit_size); \n");
532 source.append(
" lcl_input[v] = input[batch_id * stride + p]; \n");
534 source.append(
" lcl_input[v] = input[p * stride + batch_id]; \n");
535 source.append(
" } \n");
537 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
540 source.append(
" for (unsigned int s = 0; s < bit_size; s++) { \n");
541 source.append(
" unsigned int ss = 1 << s; \n");
543 source.append(
" "); source.append(numeric_string); source.append(
" cs, sn; \n");
545 source.append(
" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
546 source.append(
" unsigned int group = (tid & (ss - 1)); \n");
547 source.append(
" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
549 source.append(
" "); source.append(numeric_string); source.append(
"2 in1 = lcl_input[pos]; \n");
550 source.append(
" "); source.append(numeric_string); source.append(
"2 in2 = lcl_input[pos + ss]; \n");
552 source.append(
" "); source.append(numeric_string); source.append(
" arg = group * sign * NUM_PI / ss; \n");
554 source.append(
" sn = sincos(arg, &cs); \n");
555 source.append(
" "); source.append(numeric_string); source.append(
"2 ex = ("); source.append(numeric_string); source.append(
"2)(cs, sn); \n");
557 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = ("); source.append(numeric_string); source.append(
"2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
559 source.append(
" lcl_input[pos + ss] = in1 - tmp; \n");
560 source.append(
" lcl_input[pos] = in1 + tmp; \n");
561 source.append(
" } \n");
563 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
564 source.append(
" } \n");
567 source.append(
" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
569 source.append(
" input[batch_id * stride + p] = lcl_input[p]; \n");
571 source.append(
" input[p * stride + batch_id] = lcl_input[p]; \n");
572 source.append(
" } \n");
573 source.append(
" } \n");
574 source.append(
" } \n");
576 source.append(
" \n");
582 source.append(
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
583 source.append(
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
584 source.append(
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
585 source.append(
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
586 source.append(
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
587 source.append(
" v = (v >> 16) | (v << 16); \n");
589 source.append(
" v = v >> (32 - bit_size); \n");
591 source.append(
" return v; \n");
592 source.append(
"} \n");
594 source.append(
"__kernel void fft_reorder(__global "); source.append(numeric_string); source.append(
"2* input, \n");
595 source.append(
" unsigned int bit_size, \n");
596 source.append(
" unsigned int size, \n");
597 source.append(
" unsigned int stride, \n");
598 source.append(
" int batch_num) { \n");
600 source.append(
" unsigned int glb_id = get_global_id(0); \n");
601 source.append(
" unsigned int glb_sz = get_global_size(0); \n");
603 source.append(
" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
604 source.append(
" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
605 source.append(
" unsigned int v = get_reorder_num_2(i, bit_size); \n");
607 source.append(
" if (i < v) {\n");
610 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[batch_id * stride + i]; \n");
611 source.append(
" input[batch_id * stride + i] = input[batch_id * stride + v]; \n");
612 source.append(
" input[batch_id * stride + v] = tmp; \n");
616 source.append(
" "); source.append(numeric_string); source.append(
"2 tmp = input[i * stride + batch_id]; \n");
617 source.append(
" input[i * stride + batch_id] = input[v * stride + batch_id]; \n");
618 source.append(
" input[v * stride + batch_id] = tmp; \n");
620 source.append(
" } \n");
621 source.append(
" } \n");
622 source.append(
" } \n");
623 source.append(
"} \n");
626 template<
typename StringT>
629 source.append(
"__kernel void lu_factorize( \n");
630 source.append(
" __global "); source.append(numeric_string); source.append(
" * matrix, \n");
631 source.append(
" unsigned int matrix_rows, \n");
632 source.append(
" unsigned int matrix_cols, \n");
633 source.append(
" unsigned int matrix_internal_rows, \n");
634 source.append(
" unsigned int matrix_internal_cols) \n");
635 source.append(
"{ \n");
636 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
640 source.append(
" unsigned rowi; \n");
641 source.append(
" unsigned rowk; \n");
642 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
643 source.append(
" { \n");
644 source.append(
" rowi = i * matrix_internal_cols; \n");
645 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
646 source.append(
" { \n");
647 source.append(
" rowk = k * matrix_internal_cols; \n");
648 source.append(
" if (get_global_id(0) == 0) \n");
649 source.append(
" matrix[rowi + k] /= matrix[rowk + k]; \n");
651 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
652 source.append(
" temp = matrix[rowi + k]; \n");
655 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
656 source.append(
" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
660 source.append(
" for (unsigned int i=1; i<matrix_rows; ++i) \n");
661 source.append(
" { \n");
662 source.append(
" for (unsigned int k=0; k<i; ++k) \n");
663 source.append(
" { \n");
665 source.append(
" if (get_global_id(0) == 0) \n");
666 source.append(
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
668 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
669 source.append(
" temp = matrix[i + k*matrix_internal_rows]; \n");
672 source.append(
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
673 source.append(
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
681 template<
typename StringT>
684 source.append(
"__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append(
"cpu") : source.append(
"gpu"); source.append(
"( \n");
685 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
686 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
687 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
688 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
689 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
692 source.append(
" "); source.append(numeric_string); source.append(
" val, \n");
694 source.append(
" __global const "); source.append(numeric_string); source.append(
" *val, \n");
696 source.append(
" unsigned int options2, \n");
698 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec1, \n");
699 source.append(
" unsigned int start1, \n");
700 source.append(
" unsigned int inc1, \n");
701 source.append(
" unsigned int size1, \n");
703 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vec2, \n");
704 source.append(
" unsigned int start2, \n");
705 source.append(
" unsigned int inc2, \n");
706 source.append(
" unsigned int size2) \n");
707 source.append(
"{ \n");
710 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val; \n");
712 source.append(
" "); source.append(numeric_string); source.append(
" alpha = val[0]; \n");
714 source.append(
" if (options2 & (1 << 0)) \n");
715 source.append(
" alpha = -alpha; \n");
717 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
718 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
720 source.append(
" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
721 source.append(
" { \n");
722 source.append(
" "); source.append(numeric_string); source.append(
" tmp = vec1[row * inc1 + start1];");
723 source.append(
" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
724 source.append(
" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
726 source.append(
" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
728 source.append(
" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
729 source.append(
" } \n");
730 source.append(
"} \n");
733 template <
typename StringType>
736 source.append(
"__kernel void trans_vec_mul( \n");
737 source.append(
" __global const "); source.append(numeric_string); source.append(
" * A, \n");
738 source.append(
" unsigned int A_row_start, unsigned int A_col_start, \n");
739 source.append(
" unsigned int A_row_inc, unsigned int A_col_inc, \n");
740 source.append(
" unsigned int A_row_size, unsigned int A_col_size, \n");
741 source.append(
" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
742 source.append(
" __global const "); source.append(numeric_string); source.append(
" * v, \n");
743 source.append(
" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
744 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
745 source.append(
" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
746 source.append(
" __local "); source.append(numeric_string); source.append(
" * work) \n");
747 source.append(
"{ \n");
750 source.append(
" for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
751 source.append(
" { \n");
752 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
753 source.append(
" for (unsigned int col = 0; col < A_row_size; ++col) \n");
754 source.append(
" dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
755 source.append(
" result[row * result_inc + result_start] = dot_prod; \n");
759 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
760 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
761 source.append(
" unsigned int lid = get_local_id(0); \n");
763 source.append(
" for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
764 source.append(
" { \n");
765 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
766 source.append(
" for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
767 source.append(
" dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
768 source.append(
" work[lid] = dot_prod; \n");
770 source.append(
" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
771 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
772 source.append(
" if(lid < stride) \n");
773 source.append(
" work[lid] += work[lid+stride]; \n");
774 source.append(
" } \n");
776 source.append(
" if(lid == 0) \n");
777 source.append(
" result[row * result_inc + result_start] = work[0]; \n");
779 source.append(
" } \n");
780 source.append(
"} \n");
783 template<
typename StringT>
786 source.append(
"__kernel void triangular_substitute_inplace( \n");
787 source.append(
" __global "); source.append(numeric_string); source.append(
" * A, \n");
788 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
789 source.append(
" unsigned int A_inc1, unsigned int A_inc2, \n");
790 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
791 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
792 source.append(
" __global "); source.append(numeric_string); source.append(
" * v, \n");
793 source.append(
" unsigned int v_start, \n");
794 source.append(
" unsigned int v_inc, \n");
795 source.append(
" unsigned int v_size, \n");
796 source.append(
" unsigned int options) \n");
797 source.append(
"{ \n");
798 source.append(
" "); source.append(numeric_string); source.append(
" temp; \n");
799 source.append(
" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
800 source.append(
" unsigned int transposed_access_A = (options & (1 << 1)); \n");
801 source.append(
" unsigned int is_lower_solve = (options & (1 << 2)); \n");
802 source.append(
" unsigned int row; \n");
803 source.append(
" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n");
804 source.append(
" { \n");
805 source.append(
" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
806 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
807 source.append(
" if (!unit_diagonal_flag) \n");
808 source.append(
" { \n");
809 source.append(
" if (get_global_id(0) == 0) \n");
811 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
813 source.append(
" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
814 source.append(
" } \n");
816 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
818 source.append(
" temp = v[row * v_inc + v_start]; \n");
820 source.append(
" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
821 source.append(
" elim < (is_lower_solve ? A_size1 : row); \n");
822 source.append(
" elim += get_global_size(0)) \n");
825 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
826 source.append(
" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
830 source.append(
" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
831 source.append(
" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
833 source.append(
" } \n");
834 source.append(
"} \n");
837 template <
typename StringT>
840 source.append(
"__kernel void trans_kernel(\n");
841 source.append(
" __global const ");source.append(numeric_string);source.append(
" * A, \n");
842 source.append(
" unsigned int A_start1, unsigned int A_start2, \n");
843 source.append(
" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
844 source.append(
" unsigned int A_size1, unsigned int A_size2, \n");
845 source.append(
" unsigned int A_stride1, unsigned int A_stride2, \n");
846 source.append(
" __global ");source.append(numeric_string);source.append(
" * B, \n");
847 source.append(
" unsigned int B_start1, unsigned int B_start2, \n");
848 source.append(
" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
849 source.append(
" unsigned int B_stride1, unsigned int B_stride2) \n");
850 source.append(
"{ \n");
851 source.append(
" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
852 source.append(
" { \n");
853 source.append(
" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
854 source.append(
" { \n");
856 source.append(
" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
858 source.append(
" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
859 source.append(
" } \n");
860 source.append(
" } \n");
861 source.append(
"} \n");
864 template <
typename StringType>
867 source.append(
"__kernel void vec_mul( \n");
868 source.append(
" __global const "); source.append(numeric_string); source.append(
" * A, \n");
869 source.append(
" unsigned int A_row_start, unsigned int A_col_start, \n");
870 source.append(
" unsigned int A_row_inc, unsigned int A_col_inc, \n");
871 source.append(
" unsigned int A_row_size, unsigned int A_col_size, \n");
872 source.append(
" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
873 source.append(
" __global const "); source.append(numeric_string); source.append(
" * v, \n");
874 source.append(
" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
875 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
876 source.append(
" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
877 source.append(
" __local "); source.append(numeric_string); source.append(
" * work) \n");
878 source.append(
"{ \n");
881 source.append(
" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
882 source.append(
" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
883 source.append(
" unsigned int lid = get_local_id(0); \n");
885 source.append(
" for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
886 source.append(
" { \n");
887 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
888 source.append(
" for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
889 source.append(
" dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
890 source.append(
" work[lid] = dot_prod; \n");
892 source.append(
" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
893 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
894 source.append(
" if(lid < stride) \n");
895 source.append(
" work[lid] += work[lid+stride]; \n");
896 source.append(
" } \n");
898 source.append(
" if(lid == 0) \n");
899 source.append(
" result[row * result_inc + result_start] = work[0]; \n");
904 source.append(
" for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
905 source.append(
" { \n");
906 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
907 source.append(
" for (unsigned int col = 0; col < A_col_size; ++col) \n");
908 source.append(
" dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
909 source.append(
" result[row * result_inc + result_start] = dot_prod; \n");
911 source.append(
" } \n");
912 source.append(
"} \n");
925 template <
typename NumericT,
typename F>
939 static std::map<cl_context, bool> init_done;
943 source.reserve(8192);
945 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
958 #ifdef VIENNACL_BUILD_INFO
959 std::cout <<
"Creating program " << prog_name << std::endl;
961 ctx.add_program(source, prog_name);
962 init_done[ctx.handle().get()] =
true;
968 template<
typename NumericT>
976 std::pair<bool, cl_context> key(is_row_major, h);
977 if (handlers_map.find(key) == handlers_map.end())
981 namespace ds = viennacl::device_specific;
984 handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
987 ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'N');
988 ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'N');
989 ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device,
'N',
'T');
990 ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device,
'T',
'T');
1005 handler.add(
"prod_NN", ds::matrix_product_template(matrix_product_params_NN,
'N',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
false, beta, &C));
1006 handler.add(
"prod_TN", ds::matrix_product_template(matrix_product_params_TN,
'T',
'N'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
false, beta, &C));
1007 handler.add(
"prod_NT", ds::matrix_product_template(matrix_product_params_NT,
'N',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
false, &B,
true, beta, &C));
1008 handler.add(
"prod_TT", ds::matrix_product_template(matrix_product_params_TT,
'T',
'T'),
scheduler::preset::mat_mat_prod(alpha, &A,
true, &B,
true, beta, &C));
1017 template<
typename NumericT,
typename LayoutT>
1027 static std::map<cl_context, bool> init_done;
1035 source.reserve(8192);
1037 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1043 if (numeric_string ==
"float" || numeric_string ==
"double")
1046 generate_lu(source, numeric_string, is_row_major);
1052 #ifdef VIENNACL_BUILD_INFO
1053 std::cout <<
"Creating program " << prog_name << std::endl;
1055 ctx.add_program(source, prog_name);
1056 init_done[ctx.handle().get()] =
true;
1064 template<
typename StringT>
1067 source.append(
" __kernel void convert_row_" + dest_type +
"_" + src_type +
"( \n");
1068 source.append(
" __global " + dest_type +
" * dest, \n");
1069 source.append(
" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
1070 source.append(
" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
1071 source.append(
" __global const " + src_type +
" * src, \n");
1072 source.append(
" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
1073 source.append(
" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
1074 source.append(
" { \n");
1075 source.append(
" for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
1076 source.append(
" for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
1077 source.append(
" dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
1078 source.append(
" } \n");
1081 template<
typename StringT>
1084 source.append(
" __kernel void convert_col_" + dest_type +
"_" + src_type +
"( \n");
1085 source.append(
" __global " + dest_type +
" * dest, \n");
1086 source.append(
" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
1087 source.append(
" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
1088 source.append(
" __global const " + src_type +
" * src, \n");
1089 source.append(
" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
1090 source.append(
" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
1091 source.append(
" { \n");
1092 source.append(
" for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
1093 source.append(
" for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
1094 source.append(
" dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
1095 source.append(
" } \n");
1098 template<
typename StringT>
1112 return "matrix_convert";
1117 static std::map<cl_context, bool> init_done;
1121 source.reserve(4096);
1177 #ifdef VIENNACL_BUILD_INFO
1178 std::cout <<
"Creating program " << prog_name << std::endl;
viennacl::ocl::device const & current_device() const
Returns the current device.
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
void generate_fft(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_triangular_substitute_inplace(StringT &source, std::string const &numeric_string, bool is_row_major)
void generate_ambm_impl(StringType &source, std::string const &numeric_string, ambm_config const &cfg)
Helper class for checking whether a matrix has a row-major layout.
void append_double_precision_pragma< double >(viennacl::ocl::context const &ctx, std::string &source)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Provides OpenCL-related utilities.
A class representing a compute device (e.g. a GPU)
void generate_element_op(StringType &source, std::string const &numeric_string, bool is_row_major)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
static void init(viennacl::ocl::context &ctx)
void generate_vec_mul(StringT &source, std::string const &numeric_string)
static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context &ctx)
static void apply(viennacl::ocl::context const &)
bool with_stride_and_range
viennacl::ocl::program & add_program(cl_program p, std::string const &prog_name)
Adds a program to the context.
const OCL_TYPE & get() const
void generate_ambm(StringType &source, std::string const &numeric_string, bool is_row_major)
static std::string program_name()
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
void generate_trans_kernel(StringT &source, std::string const &numeric_string, bool is_row_major)
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
void generate_trans_vec_mul(StringType &source, std::string const &numeric_string, bool is_row_major)
static std::string program_name()
Configuration struct for generating OpenCL kernels for linear combinations of matrices.
void generate_ambm_impl2(StringType &source, ambm_config const &cfg, bool mult_alpha, bool mult_beta)
void generate_lu(StringT &source, std::string const &numeric_string, bool is_row_major)
statement mat_mat_prod(NumericT alpha, viennacl::matrix_base< NumericT > const *A, bool A_trans, viennacl::matrix_base< NumericT > const *B, bool B_trans, NumericT beta, viennacl::matrix_base< NumericT > const *C)
Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type v...
Main kernel class for vector conversion routines (e.g. convert vector to vector).
void generate_scaled_rank1_update(StringT &source, std::string const &numeric_string, bool is_row_major, bool alpha_on_cpu)
Representation of an OpenCL kernel in ViennaCL.
std::string type_to_string(viennacl::row_major)
void generate_matrix_convert_col(StringT &source, std::string const &dest_type, std::string const &src_type)
A tag for column-major storage of a dense matrix.
ambm_scalar_type
Enumeration for the scalar type in ambm-like operations.
void generate_diagonal_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
void generate_matrix_convert(StringT &source, std::string const &dest_type, std::string const &src_type)
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
static void init(viennacl::ocl::context &ctx)
Helper class for converting a type to its string representation.
A tag for row-major storage of a dense matrix.
static std::string program_name()
Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.
void generate_matrix_convert_row(StringT &source, std::string const &dest_type, std::string const &src_type)