ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
matrix.hpp
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/scheduler/preset.hpp"
22 #include "viennacl/tools/tools.hpp"
23 #include "viennacl/ocl/kernel.hpp"
24 #include "viennacl/ocl/platform.hpp"
25 #include "viennacl/ocl/utils.hpp"
26 
27 #include "viennacl/device_specific/builtin_database/matrix_product.hpp"
28 #include "viennacl/device_specific/execution_handler.hpp"
29 
30 /** @file viennacl/linalg/opencl/kernels/matrix.hpp
31  *  @brief OpenCL kernel file for dense matrix operations */
32 namespace viennacl
33 {
34 namespace linalg
35 {
36 namespace opencl
37 {
38 namespace kernels
39 {
40 
41 //////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
42 
43 /** @brief Enumeration for the scalar type in ambm-like operations */
44 enum ambm_scalar_type
45 {
46  VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
47  VIENNACL_AMBM_CPU,
48  VIENNACL_AMBM_GPU
49 };
50 
51 /** @brief Configuration struct for generating OpenCL kernels for linear combinations of matrices */
52 struct ambm_config
53 {
54  ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_CPU) {}
55 
56  bool with_stride_and_range;
57  bool is_row_major;
58  std::string assign_op;
59  ambm_scalar_type a;
60  ambm_scalar_type b;
61 };
62 
63 
64 // emits only the loop nest together with the assignment statement (no kernel boilerplate)
65 template <typename StringType>
66 void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
67 {
68  if (cfg.is_row_major)
69  {
70  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
71  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
72  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
73  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
74  }
75  else
76  {
77  source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
78  source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
79  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
80  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
81  }
82 
83  if (cfg.with_stride_and_range)
84  {
85  if (cfg.is_row_major)
86  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
87  else
88  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] ");
89  source.append(cfg.assign_op);
90  if (cfg.is_row_major)
91  source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
92  else
93  source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
94 
95  if (mult_alpha)
96  source.append("* alpha ");
97  else
98  source.append("/ alpha ");
99  if (cfg.b != VIENNACL_AMBM_NONE)
100  {
101  if (cfg.is_row_major)
102  source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
103  else
104  source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
105  if (mult_beta)
106  source.append("* beta");
107  else
108  source.append("/ beta");
109  }
110  }
111  else
112  {
113  if (cfg.is_row_major)
114  source.append(" A[row * A_internal_size2 + col] ");
115  else
116  source.append(" A[row + col * A_internal_size1] ");
117  source.append(cfg.assign_op);
118  if (cfg.is_row_major)
119  source.append(" B[row * B_internal_size2 + col] ");
120  else
121  source.append(" B[row + col * B_internal_size1] ");
122 
123  if (mult_alpha)
124  source.append("* alpha ");
125  else
126  source.append("/ alpha ");
127  if (cfg.b != VIENNACL_AMBM_NONE)
128  {
129  if (cfg.is_row_major)
130  source.append("+ C[row * C_internal_size2 + col] ");
131  else
132  source.append("+ C[row + col * C_internal_size1] ");
133  if (mult_beta)
134  source.append("* beta");
135  else
136  source.append("/ beta");
137  }
138  }
139  source.append("; \n");
140 }
141 
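// Note: the emitted kernel name encodes the configuration, e.g. "am_cpu", "am_gpu",
// "ambm_cpu_gpu" or "ambm_m_gpu_gpu": "bm" is appended when a second source matrix
// contributes, "_m" when the assignment operator is "+=", and "_cpu"/"_gpu" indicate
// whether the respective scaling factor is passed by value or as a device buffer.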
142 template <typename StringType>
143 void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
144 {
145  source.append("__kernel void am");
146  if (cfg.b != VIENNACL_AMBM_NONE)
147  source.append("bm");
148  if (cfg.assign_op != "=")
149  source.append("_m");
150 
151  if (cfg.a == VIENNACL_AMBM_CPU)
152  source.append("_cpu");
153  else if (cfg.a == VIENNACL_AMBM_GPU)
154  source.append("_gpu");
155 
156  if (cfg.b == VIENNACL_AMBM_CPU)
157  source.append("_cpu");
158  else if (cfg.b == VIENNACL_AMBM_GPU)
159  source.append("_gpu");
160  source.append("( \n");
161  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
162  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
163  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
164  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
165  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
166  if (cfg.a == VIENNACL_AMBM_CPU)
167  {
168  source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
169  }
170  else if (cfg.a == VIENNACL_AMBM_GPU)
171  {
172  source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
173  }
174  source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
175  source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n");
176  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
177  source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
178  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2");
179 
180  if (cfg.b != VIENNACL_AMBM_NONE)
181  {
182  source.append(", \n\n");
183  if (cfg.b == VIENNACL_AMBM_CPU)
184  {
185  source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
186  }
187  else if (cfg.b == VIENNACL_AMBM_GPU)
188  {
189  source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
190  }
191  source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
192  source.append(" __global const "); source.append(numeric_string); source.append(" * C, \n");
193  source.append(" unsigned int C_start1, unsigned int C_start2, \n");
194  source.append(" unsigned int C_inc1, unsigned int C_inc2, \n");
195  source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2 \n");
196  }
197  source.append(") { \n");
198 
199  if (cfg.a == VIENNACL_AMBM_CPU)
200  {
201  source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
202  }
203  else if (cfg.a == VIENNACL_AMBM_GPU)
204  {
205  source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
206  }
207  source.append(" if (options2 & (1 << 0)) \n");
208  source.append(" alpha = -alpha; \n");
209  source.append(" \n");
210 
211  if (cfg.b == VIENNACL_AMBM_CPU)
212  {
213  source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
214  }
215  else if (cfg.b == VIENNACL_AMBM_GPU)
216  {
217  source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
218  }
219  if (cfg.b != VIENNACL_AMBM_NONE)
220  {
221  source.append(" if (options3 & (1 << 0)) \n");
222  source.append(" beta = -beta; \n");
223  source.append(" \n");
224  }
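 // Bit 1 of options2/options3 selects division by the scaling factor instead of
 // multiplication, hence all four multiply/divide combinations of the loop body are emitted below.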
225  source.append(" if (options2 & (1 << 1)) { \n");
226  if (cfg.b != VIENNACL_AMBM_NONE)
227  {
228  source.append(" if (options3 & (1 << 1)) {\n");
229  generate_ambm_impl2(source, cfg, false, false);
230  source.append(" } else {\n");
231  generate_ambm_impl2(source, cfg, false, true);
232  source.append(" } \n");
233  }
234  else
235  generate_ambm_impl2(source, cfg, false, true);
236  source.append(" } else { \n");
237  if (cfg.b != VIENNACL_AMBM_NONE)
238  {
239  source.append(" if (options3 & (1 << 1)) {\n");
240  generate_ambm_impl2(source, cfg, true, false);
241  source.append(" } else {\n");
242  generate_ambm_impl2(source, cfg, true, true);
243  source.append(" } \n");
244  }
245  else
246  generate_ambm_impl2(source, cfg, true, true);
247  source.append(" } \n");
248  source.append("} \n");
249 }
250 
251 template <typename StringType>
252 void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
253 {
254  ambm_config cfg;
255  cfg.assign_op = "=";
256  cfg.with_stride_and_range = true;
257  cfg.is_row_major = is_row_major;
258 
259  // am
260  cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
261  cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
262 
263  // ambm
264  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
265  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
266  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
267  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
268 
269  // ambm_m
270  cfg.assign_op = "+=";
271 
272  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
273  cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
274  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
275  cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
276 }
277 
278 template <typename StringType>
279 void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
280 {
281  source.append("__kernel void assign_cpu( \n");
282  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
283  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
284  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
285  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
286  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
287  source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
288  source.append("{ \n");
289  if (is_row_major)
290  {
291  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
292  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
293  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
294  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
295  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
296  }
297  else
298  {
299  source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
300  source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
301  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
302  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
303  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
304  }
305  source.append("} \n");
306 }
307 
308 template <typename StringType>
309 void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
310 {
311  source.append("__kernel void diagonal_assign_cpu( \n");
312  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
313  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
314  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
315  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
316  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
317  source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
318  source.append("{ \n");
319  source.append(" for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
320  if (is_row_major)
321  source.append(" A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
322  else
323  source.append(" A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
324  source.append("} \n");
325 }
326 
327 template <typename StringType>
328 void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
329 {
330  source.append("__kernel void element_op( \n");
331  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
332  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
333  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
334  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
335  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
336  source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
337  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
338  source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
339  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
340  source.append(" __global "); source.append(numeric_string); source.append(" * C, \n");
341  source.append(" unsigned int C_start1, unsigned int C_start2, \n");
342  source.append(" unsigned int C_inc1, unsigned int C_inc2, \n");
343  source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2, \n");
344  source.append(" unsigned int op_type) \n"); //0: product, 1: division, 2: pow
345  source.append("{ \n");
346  if (is_row_major)
347  {
348  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
349  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
350  source.append(" if (op_type == 2) {");
351  if (numeric_string == "float" || numeric_string == "double")
352  {
353  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
354  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
355  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
356  source.append(" pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
357  source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
358  }
359  source.append(" } else if (op_type == 1) {");
360  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
361  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
362  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
363  source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
364  source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
365  source.append(" } else if (op_type == 0) {");
366  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
367  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
368  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
369  source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
370  source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
371  source.append(" }");
372  }
373  else
374  {
375  source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
376  source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
377  source.append(" if (op_type == 2) {");
378  if (numeric_string == "float" || numeric_string == "double")
379  {
380  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
381  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
382  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
383  source.append(" pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], \n");
384  source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); \n");
385  }
386  source.append(" } else if (op_type == 1) {");
387  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
388  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
389  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
390  source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / \n");
391  source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
392  source.append(" } else if (op_type == 0) {");
393  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
394  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
395  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
396  source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * \n");
397  source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
398  source.append(" }");
399  }
400  source.append("} \n");
401 }
402 
403 
404 template<typename StringT>
405 void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
406 {
407  // naive fourier transform (quadratic complexity, use for reference only)
408  source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
409  source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
410  source.append(" unsigned int size, \n");
411  source.append(" unsigned int stride, \n");
412  source.append(" unsigned int batch_num, \n");
413  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
414  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
415  source.append(" \n");
416  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
417  source.append(" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
418  source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
419  source.append(" \n");
420  source.append(" for (unsigned int n = 0; n < size; n++) { \n");
421  source.append(" "); source.append(numeric_string); source.append("2 in = ");
422  if (is_row_major)
423  source.append("input[batch_id * stride + n]; \n"); //input index here
424  else
425  source.append("input[n * stride + batch_id]; \n"); //input index here
426  source.append(" \n");
427  source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n");
428  source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
429  source.append(" sn = sincos(arg, &cs); \n");
430  source.append(" \n");
431  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
432  source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
433  source.append(" } \n");
434  source.append(" \n");
435  if (is_row_major)
436  source.append(" output[batch_id * stride + k] = f; \n"); // output index here
437  else
438  source.append(" output[k * stride + batch_id] = f; \n"); // output index here
439  source.append(" } \n");
440  source.append(" } \n");
441  source.append("} \n");
442 
443  source.append(" \n");
444 
445  source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
446  source.append(" unsigned int s, \n");
447  source.append(" unsigned int bit_size, \n");
448  source.append(" unsigned int size, \n");
449  source.append(" unsigned int stride, \n");
450  source.append(" unsigned int batch_num, \n");
451  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
452  source.append(" \n");
453  source.append(" unsigned int ss = 1 << s; \n");
454  source.append(" unsigned int half_size = size >> 1; \n");
455  source.append(" \n");
456  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
457  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
458  source.append(" \n");
459  source.append(" unsigned int glb_id = get_global_id(0); \n");
460  source.append(" unsigned int glb_sz = get_global_size(0); \n");
461 
462  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
463  source.append(" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
464  source.append(" unsigned int group = (tid & (ss - 1)); \n");
465  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
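 // butterfly indexing: 'ss' is the half-length of the current sub-transform, 'group' the
 // butterfly index within it, and 'pos' the first element of the pair (its partner lies
 // ss entries further, or ss*stride entries for column-major storage).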
466 
467  if (is_row_major)
468  {
469  source.append(" unsigned int offset = batch_id * stride + pos; \n");
470  source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
471  source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
472  }
473  else
474  {
475  source.append(" unsigned int offset = pos * stride + batch_id; \n");
476  source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
477  source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
478  }
479 
480  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
481 
482  source.append(" sn = sincos(arg, &cs); \n");
483 
484  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
485 
486  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
487 
488  if (is_row_major)
489  source.append(" input[offset + ss] = in1 - tmp; \n");//index
490  else
491  source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index
492  source.append(" input[offset] = in1 + tmp; \n");//index
493  source.append(" } \n");
494  source.append(" } \n");
495  source.append("} \n");
496 
497  source.append(" \n");
498 
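// The device helper emitted next reverses the lowest 'bit_size' bits of its argument
// (bit-reversal permutation) by swapping successively larger bit groups and shifting the result down.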
499  source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
500  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
501  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
502  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
503  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
504  source.append(" v = (v >> 16) | (v << 16); \n");
505  source.append(" \n");
506  source.append(" v = v >> (32 - bit_size); \n");
507  source.append(" \n");
508  source.append(" return v; \n");
509  source.append(" } \n");
510 
511  source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
512  source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
513  source.append(" unsigned int bit_size, \n");
514  source.append(" unsigned int size, \n");
515  source.append(" unsigned int stride, \n");
516  source.append(" unsigned int batch_num, \n");
517  source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
518 
519  source.append(" unsigned int grp_id = get_group_id(0); \n");
520  source.append(" unsigned int grp_num = get_num_groups(0); \n");
521 
522  source.append(" unsigned int lcl_sz = get_local_size(0); \n");
523  source.append(" unsigned int lcl_id = get_local_id(0); \n");
524  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
525 
526  source.append(" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
527  //unsigned int base_offset = stride * batch_id;
528  //copy chunk of global memory to local
529  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
530  source.append(" unsigned int v = get_reorder_num(p, bit_size); \n");
531  if (is_row_major)
532  source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index
533  else
534  source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index
535  source.append(" } \n");
536 
537  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
538 
539  //performs Cooley-Tukey FFT on local array
540  source.append(" for (unsigned int s = 0; s < bit_size; s++) { \n");
541  source.append(" unsigned int ss = 1 << s; \n");
542 
543  source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
544 
545  source.append(" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
546  source.append(" unsigned int group = (tid & (ss - 1)); \n");
547  source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
548 
549  source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
550  source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
551 
552  source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
553 
554  source.append(" sn = sincos(arg, &cs); \n");
555  source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
556 
557  source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
558 
559  source.append(" lcl_input[pos + ss] = in1 - tmp; \n");
560  source.append(" lcl_input[pos] = in1 + tmp; \n");
561  source.append(" } \n");
562 
563  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
564  source.append(" } \n");
565 
566  //copy local array back to global memory
567  source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
568  if (is_row_major)
569  source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index
570  else
571  source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index
572  source.append(" } \n");
573  source.append(" } \n");
574  source.append(" } \n");
575 
576  source.append(" \n");
577 
578  //
579  // Performs reordering of the input data in bit-reversal order.
580  // It would probably be better to do this on the host side.
581  //
582  source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
583  source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
584  source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
585  source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
586  source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
587  source.append(" v = (v >> 16) | (v << 16); \n");
588 
589  source.append(" v = v >> (32 - bit_size); \n");
590 
591  source.append(" return v; \n");
592  source.append("} \n");
593 
594  source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
595  source.append(" unsigned int bit_size, \n");
596  source.append(" unsigned int size, \n");
597  source.append(" unsigned int stride, \n");
598  source.append(" int batch_num) { \n");
599 
600  source.append(" unsigned int glb_id = get_global_id(0); \n");
601  source.append(" unsigned int glb_sz = get_global_size(0); \n");
602 
603  source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
604  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
605  source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n");
606 
607  source.append(" if (i < v) {\n");
608  if (is_row_major)
609  {
610  source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
611  source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
612  source.append(" input[batch_id * stride + v] = tmp; \n"); //index
613  }
614  else
615  {
616  source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
617  source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
618  source.append(" input[v * stride + batch_id] = tmp; \n"); //index
619  }
620  source.append(" } \n");
621  source.append(" } \n");
622  source.append(" } \n");
623  source.append("} \n");
624 }
625 
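// Emits an in-place LU factorization kernel without pivoting. Note that
// barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes the work items of a single work-group,
// so the generated kernel is presumably launched with exactly one work-group.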
626 template<typename StringT>
627 void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
628 {
629  source.append("__kernel void lu_factorize( \n");
630  source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n");
631  source.append(" unsigned int matrix_rows, \n");
632  source.append(" unsigned int matrix_cols, \n");
633  source.append(" unsigned int matrix_internal_rows, \n");
634  source.append(" unsigned int matrix_internal_cols) \n");
635  source.append("{ \n");
636  source.append(" "); source.append(numeric_string); source.append(" temp; \n");
637 
638  if (is_row_major)
639  {
640  source.append(" unsigned rowi; \n");
641  source.append(" unsigned rowk; \n");
642  source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
643  source.append(" { \n");
644  source.append(" rowi = i * matrix_internal_cols; \n");
645  source.append(" for (unsigned int k=0; k<i; ++k) \n");
646  source.append(" { \n");
647  source.append(" rowk = k * matrix_internal_cols; \n");
648  source.append(" if (get_global_id(0) == 0) \n");
649  source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n");
650 
651  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
652  source.append(" temp = matrix[rowi + k]; \n");
653 
654  //parallel subtraction:
655  source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
656  source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
657  }
658  else
659  {
660  source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
661  source.append(" { \n");
662  source.append(" for (unsigned int k=0; k<i; ++k) \n");
663  source.append(" { \n");
664 
665  source.append(" if (get_global_id(0) == 0) \n");
666  source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
667 
668  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
669  source.append(" temp = matrix[i + k*matrix_internal_rows]; \n");
670 
671  //parallel subtraction:
672  source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
673  source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
674  }
675  source.append(" }");
676  source.append(" }");
677  source.append("}");
678 }
679 
680 
681 template<typename StringT>
682 void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
683 {
684  source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n");
685  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
686  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
687  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
688  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
689  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
690 
691  if (alpha_on_cpu) {
692  source.append(" "); source.append(numeric_string); source.append(" val, \n");
693  } else {
694  source.append(" __global const "); source.append(numeric_string); source.append(" *val, \n");
695  }
696  source.append(" unsigned int options2, \n");
697 
698  source.append(" __global const "); source.append(numeric_string); source.append(" * vec1, \n");
699  source.append(" unsigned int start1, \n");
700  source.append(" unsigned int inc1, \n");
701  source.append(" unsigned int size1, \n");
702 
703  source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
704  source.append(" unsigned int start2, \n");
705  source.append(" unsigned int inc2, \n");
706  source.append(" unsigned int size2) \n");
707  source.append("{ \n");
708 
709  if (alpha_on_cpu) {
710  source.append(" "); source.append(numeric_string); source.append(" alpha = val; \n");
711  } else {
712  source.append(" "); source.append(numeric_string); source.append(" alpha = val[0]; \n");
713  }
714  source.append(" if (options2 & (1 << 0)) \n");
715  source.append(" alpha = -alpha; \n");
716 
717  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
718  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
719 
720  source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
721  source.append(" { \n");
722  source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];");
723  source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
724  source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
725  if (is_row_major)
726  source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
727  else
728  source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
729  source.append(" } \n");
730  source.append("} \n");
731 }
732 
733 template <typename StringType>
734 void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
735 {
736  source.append("__kernel void trans_vec_mul( \n");
737  source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
738  source.append(" unsigned int A_row_start, unsigned int A_col_start, \n");
739  source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n");
740  source.append(" unsigned int A_row_size, unsigned int A_col_size, \n");
741  source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
742  source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n");
743  source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
744  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
745  source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
746  source.append(" __local "); source.append(numeric_string); source.append(" * work) \n");
747  source.append("{ \n");
748  if (is_row_major)
749  {
750  source.append(" for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
751  source.append(" { \n");
752  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
753  source.append(" for (unsigned int col = 0; col < A_row_size; ++col) \n");
754  source.append(" dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
755  source.append(" result[row * result_inc + result_start] = dot_prod; \n");
756  }
757  else
758  {
759  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
760  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
761  source.append(" unsigned int lid = get_local_id(0); \n");
762 
763  source.append(" for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
764  source.append(" { \n");
765  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
766  source.append(" for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
767  source.append(" dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
768  source.append(" work[lid] = dot_prod; \n");
769 
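 // tree reduction of the per-work-item partial sums in local memory
 // (assumes the local work size is a power of two)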
770  source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
771  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
772  source.append(" if(lid < stride) \n");
773  source.append(" work[lid] += work[lid+stride]; \n");
774  source.append(" } \n");
775 
776  source.append(" if(lid == 0) \n");
777  source.append(" result[row * result_inc + result_start] = work[0]; \n");
778  }
779  source.append(" } \n");
780  source.append("} \n");
781 }
782 
783 template<typename StringT>
784 void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
785 {
786  source.append("__kernel void triangular_substitute_inplace( \n");
787  source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
788  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
789  source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
790  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
791  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
792  source.append(" __global "); source.append(numeric_string); source.append(" * v, \n");
793  source.append(" unsigned int v_start, \n");
794  source.append(" unsigned int v_inc, \n");
795  source.append(" unsigned int v_size, \n");
796  source.append(" unsigned int options) \n");
797  source.append("{ \n");
798  source.append(" "); source.append(numeric_string); source.append(" temp; \n");
799  source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
800  source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n");
801  source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n");
802  source.append(" unsigned int row; \n");
803  source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n"); //Note: A required to be square
804  source.append(" { \n");
805  source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
806  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
807  source.append(" if (!unit_diagonal_flag) \n");
808  source.append(" { \n");
809  source.append(" if (get_global_id(0) == 0) \n");
810  if (is_row_major)
811  source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
812  else
813  source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
814  source.append(" } \n");
815 
816  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
817 
818  source.append(" temp = v[row * v_inc + v_start]; \n");
819 
820  source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
821  source.append(" elim < (is_lower_solve ? A_size1 : row); \n");
822  source.append(" elim += get_global_size(0)) \n");
823  if (is_row_major)
824  {
825  source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
826  source.append(" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
827  }
828  else
829  {
830  source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
831  source.append(" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
832  }
833  source.append(" } \n");
834  source.append("} \n");
835 }
836 
837 template <typename StringT>
838 void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
839 {
840  source.append("__kernel void trans_kernel(\n");
841  source.append(" __global const ");source.append(numeric_string);source.append(" * A, \n");
842  source.append(" unsigned int A_start1, unsigned int A_start2, \n");
843  source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
844  source.append(" unsigned int A_size1, unsigned int A_size2, \n");
845  source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
846  source.append(" __global ");source.append(numeric_string);source.append(" * B, \n");
847  source.append(" unsigned int B_start1, unsigned int B_start2, \n");
848  source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
849  source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");
850  source.append("{ \n");
851  source.append(" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
852  source.append(" { \n");
853  source.append(" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
854  source.append(" { \n");
855  if(is_row_major)
856  source.append(" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
857  else
858  source.append(" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
859  source.append(" } \n");
860  source.append(" } \n");
861  source.append("} \n");
862 }
863 
864 template <typename StringType>
865 void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
866 {
867  source.append("__kernel void vec_mul( \n");
868  source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
869  source.append(" unsigned int A_row_start, unsigned int A_col_start, \n");
870  source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n");
871  source.append(" unsigned int A_row_size, unsigned int A_col_size, \n");
872  source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
873  source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n");
874  source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
875  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
876  source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
877  source.append(" __local "); source.append(numeric_string); source.append(" * work) \n");
878  source.append("{ \n");
879  if (is_row_major)
880  {
881  source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
882  source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
883  source.append(" unsigned int lid = get_local_id(0); \n");
884 
885  source.append(" for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
886  source.append(" { \n");
887  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
888  source.append(" for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
889  source.append(" dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
890  source.append(" work[lid] = dot_prod; \n");
891 
892  source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
893  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
894  source.append(" if(lid < stride) \n");
895  source.append(" work[lid] += work[lid+stride]; \n");
896  source.append(" } \n");
897 
898  source.append(" if(lid == 0) \n");
899  source.append(" result[row * result_inc + result_start] = work[0]; \n");
900 
901  }
902  else
903  {
904  source.append(" for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
905  source.append(" { \n");
906  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
907  source.append(" for (unsigned int col = 0; col < A_col_size; ++col) \n");
908  source.append(" dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
909  source.append(" result[row * result_inc + result_start] = dot_prod; \n");
910  }
911  source.append(" } \n");
912  source.append("} \n");
913 }
914 
915 namespace detail
916 {
917  inline std::string type_to_string(viennacl::row_major) { return "row"; }
918  inline std::string type_to_string(viennacl::column_major) { return "col"; }
919 }
920 
921 //////////////////////////// Part 2: Main kernel class ////////////////////////////////////
922 
923 // main kernel class
924 /** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
925 template <typename NumericT, typename F>
926 struct matrix
927 {
928  static std::string program_name()
929  {
930  return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
931  }
932 
933  static void init(viennacl::ocl::context & ctx)
934  {
935  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
936  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
937  bool is_row_major = viennacl::is_row_major<F>::value;
938 
939  static std::map<cl_context, bool> init_done;
940  if (!init_done[ctx.handle().get()])
941  {
942  std::string source;
943  source.reserve(8192);
944 
945  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
946 
947  // fully parametrized kernels:
948  generate_ambm(source, numeric_string, is_row_major);
949 
950  // kernels with mostly predetermined skeleton:
951  generate_assign_cpu(source, numeric_string, is_row_major);
952  generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
953  generate_element_op(source, numeric_string, is_row_major);
954  generate_trans_vec_mul(source, numeric_string, is_row_major);
955  generate_vec_mul(source, numeric_string, is_row_major);
956 
957  std::string prog_name = program_name();
958  #ifdef VIENNACL_BUILD_INFO
959  std::cout << "Creating program " << prog_name << std::endl;
960  #endif
961  ctx.add_program(source, prog_name);
962  init_done[ctx.handle().get()] = true;
963  } //if
964  } //init
965 };
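// Illustrative usage sketch (not part of this header): host code first ensures the program is
// compiled for the active context and then fetches one of the generated kernels by name, e.g.
//
//   typedef viennacl::linalg::opencl::kernels::matrix<float, viennacl::row_major> KernelClass;
//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
//   KernelClass::init(ctx);
//   viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "assign_cpu");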
966 
967 /** @brief Main kernel class for generating OpenCL programs for dense matrix-matrix products. */
968 template<typename NumericT>
969 class matrix_prod
970 {
971 public:
972  static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
973  {
974  static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
975  cl_context h = ctx.handle().get();
976  std::pair<bool, cl_context> key(is_row_major, h);
977  if (handlers_map.find(key) == handlers_map.end())
978  {
979  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
980 
981  namespace ds = viennacl::device_specific;
982  viennacl::ocl::device const & device = ctx.current_device();
983  std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
984  handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
985  ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
986 
987  ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
988  ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
989  ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
990  ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
991 
993  if (is_row_major)
995  else
997 
998  //Dummy types. The values don't matter for the kernel generation.
1002  NumericT alpha = 1;
1003  NumericT beta = 0;
1004 
1005  handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
1006  handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
1007  handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
1008  handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
1009 
1010  }
1011  return viennacl::device_specific::at(handlers_map, key);
1012  }
1013 };
1014 
1015 // main kernel class
1016 /** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
1017 template<typename NumericT, typename LayoutT>
1018 struct matrix_legacy
1019 {
1020  static std::string program_name()
1021  {
1022  return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_legacy_" + detail::type_to_string(LayoutT());
1023  }
1024 
1025  static void init(viennacl::ocl::context & ctx)
1026  {
1027  static std::map<cl_context, bool> init_done;
1028  if (!init_done[ctx.handle().get()])
1029  {
1030  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1031  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1032  bool is_row_major = viennacl::is_row_major<LayoutT>::value;
1033 
1034  std::string source;
1035  source.reserve(8192);
1036 
1037  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1038 
1039  // kernels with mostly predetermined skeleton:
1040  generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
1041  generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
1042 
1043  if (numeric_string == "float" || numeric_string == "double")
1044  {
1045  generate_fft(source, numeric_string, is_row_major);
1046  generate_lu(source, numeric_string, is_row_major);
1047  generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
1048  generate_trans_kernel(source, numeric_string, is_row_major);
1049  }
1050 
1051  std::string prog_name = program_name();
1052  #ifdef VIENNACL_BUILD_INFO
1053  std::cout << "Creating program " << prog_name << std::endl;
1054  #endif
1055  ctx.add_program(source, prog_name);
1056  init_done[ctx.handle().get()] = true;
1057  } //if
1058  } //init
1059 };
1060 
1061 
1062 
1063 
1064 template<typename StringT>
1065 void generate_matrix_convert_row(StringT & source, std::string const & dest_type, std::string const & src_type)
1066 {
1067  source.append(" __kernel void convert_row_" + dest_type + "_" + src_type + "( \n");
1068  source.append(" __global " + dest_type + " * dest, \n");
1069  source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
1070  source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
1071  source.append(" __global const " + src_type + " * src, \n");
1072  source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
1073  source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
1074  source.append(" { \n");
1075  source.append(" for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
1076  source.append(" for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
1077  source.append(" dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
1078  source.append(" } \n");
1079 }
1080 
1081 template<typename StringT>
1082 void generate_matrix_convert_col(StringT & source, std::string const & dest_type, std::string const & src_type)
1083 {
1084  source.append(" __kernel void convert_col_" + dest_type + "_" + src_type + "( \n");
1085  source.append(" __global " + dest_type + " * dest, \n");
1086  source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
1087  source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
1088  source.append(" __global const " + src_type + " * src, \n");
1089  source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
1090  source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
1091  source.append(" { \n");
1092  source.append(" for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
1093  source.append(" for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
1094  source.append(" dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
1095  source.append(" } \n");
1096 }
1097 
1098 template<typename StringT>
1099 void generate_matrix_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
1100 {
1101  generate_matrix_convert_row(source, dest_type, src_type);
1102  generate_matrix_convert_col(source, dest_type, src_type);
1103 }
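// The conversion kernels emitted above are named "convert_row_<dest>_<src>" and
// "convert_col_<dest>_<src>", e.g. "convert_row_float_int" for assigning an int matrix
// to a float matrix in row-major storage.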
1104 
1105 /** @brief Main kernel class for the matrix conversion routines (e.g. converting a matrix<int> to a matrix<float>). */
1106 class matrix_convert
1107 {
1108 
1109 public:
1110  static std::string program_name()
1111  {
1112  return "matrix_convert";
1113  }
1114 
1115  static void init(viennacl::ocl::context & ctx)
1116  {
1117  static std::map<cl_context, bool> init_done;
1118  if (!init_done[ctx.handle().get()])
1119  {
1120  std::string source;
1121  source.reserve(4096);
1122 
1123  // int
1129 
1130  // unsigned int
1136 
1137  // long
1143 
1144  // unsigned long
1150 
1151  // float
1157 
1158  if (ctx.current_device().double_support())
1159  {
1161 
1167 
1174  }
1175 
1176  std::string prog_name = program_name();
1177  #ifdef VIENNACL_BUILD_INFO
1178  std::cout << "Creating program " << prog_name << std::endl;
1179  #endif
1180  ctx.add_program(source, prog_name);
1181  init_done[ctx.handle().get()] = true;
1182  } //if
1183  } //init
1184 
1185 };
1186 
1187 
1188 } // namespace kernels
1189 } // namespace opencl
1190 } // namespace linalg
1191 } // namespace viennacl
1192 #endif
1193 