ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
coordinate_matrix.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 #include "viennacl/tools/tools.hpp"
22 #include "viennacl/ocl/kernel.hpp"
24 #include "viennacl/ocl/utils.hpp"
25 
27 
30 namespace viennacl
31 {
32 namespace linalg
33 {
34 namespace opencl
35 {
36 namespace kernels
37 {
38 
40 
41 template<typename StringT>
42 void generate_coordinate_matrix_vec_mul(StringT & source, std::string const & numeric_string)
43 {
44  source.append("__kernel void vec_mul( \n");
45  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
46  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
47  source.append(" __global const uint * group_boundaries, \n");
48  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
49  source.append(" uint4 layout_x, \n");
50  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
51  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
52  source.append(" uint4 layout_result, \n");
53  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
54  source.append(" __local unsigned int * shared_rows, \n");
55  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
56  source.append("{ \n");
57  source.append(" uint2 tmp; \n");
58  source.append(" "); source.append(numeric_string); source.append(" val; \n");
59  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
60  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
61  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
62 
63  source.append(" uint local_index = 0; \n");
64 
65  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
66  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
67 
68  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
69  source.append(" val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");
70 
71  //check for carry from previous loop run:
72  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
73  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
74  source.append(" val += inter_results[get_local_size(0)-1]; \n");
75  source.append(" else if (beta != 0) \n");
76  source.append(" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] += alpha * inter_results[get_local_size(0)-1]; \n");
77  source.append(" else \n");
78  source.append(" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = alpha * inter_results[get_local_size(0)-1]; \n");
79  source.append(" } \n");
80 
81  //segmented parallel reduction begin
82  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
83  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
84  source.append(" inter_results[get_local_id(0)] = val; \n");
85  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
86  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
87 
88  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
89  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
90  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
91  source.append(" inter_results[get_local_id(0)] += left; \n");
92  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
93  source.append(" } \n");
94  //segmented parallel reduction end
95 
96  source.append(" if (local_index < group_end - 1 && get_local_id(0) < get_local_size(0) - 1 && \n");
97  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
98  source.append(" if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
99  source.append(" else result[tmp.x * layout_result.y + layout_result.x] = alpha * inter_results[get_local_id(0)]; \n");
100  source.append(" } \n");
101 
102  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
103  source.append(" } \n"); //for k
104 
105  source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
106  source.append(" if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
107  source.append(" else result[tmp.x * layout_result.y + layout_result.x] = alpha * inter_results[get_local_id(0)]; \n");
108  source.append(" } \n");
109  source.append("} \n");
110 
111 }
112 
113 namespace detail
114 {
116  template<typename StringT>
117  void generate_coordinate_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
118  bool B_transposed, bool B_row_major, bool C_row_major)
119  {
120  source.append("__kernel void ");
121  source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
122  source.append("( \n");
123  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
124  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
125  source.append(" __global const uint * group_boundaries, \n");
126  source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
127  source.append(" unsigned int d_mat_row_start, \n");
128  source.append(" unsigned int d_mat_col_start, \n");
129  source.append(" unsigned int d_mat_row_inc, \n");
130  source.append(" unsigned int d_mat_col_inc, \n");
131  source.append(" unsigned int d_mat_row_size, \n");
132  source.append(" unsigned int d_mat_col_size, \n");
133  source.append(" unsigned int d_mat_internal_rows, \n");
134  source.append(" unsigned int d_mat_internal_cols, \n");
135  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
136  source.append(" unsigned int result_row_start, \n");
137  source.append(" unsigned int result_col_start, \n");
138  source.append(" unsigned int result_row_inc, \n");
139  source.append(" unsigned int result_col_inc, \n");
140  source.append(" unsigned int result_row_size, \n");
141  source.append(" unsigned int result_col_size, \n");
142  source.append(" unsigned int result_internal_rows, \n");
143  source.append(" unsigned int result_internal_cols, \n");
144  source.append(" __local unsigned int * shared_rows, \n");
145  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
146  source.append("{ \n");
147  source.append(" uint2 tmp; \n");
148  source.append(" "); source.append(numeric_string); source.append(" val; \n");
149  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
150  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
151  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
152 
153  source.append(" uint local_index = 0; \n");
154 
155  source.append(" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
156  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
157  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
158 
159  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
160  if (B_transposed && B_row_major)
161  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + tmp.y * d_mat_col_inc ] : 0; \n");
162  else if (B_transposed && !B_row_major)
163  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
164  else if (!B_transposed && B_row_major)
165  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
166  else
167  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
168 
169  //check for carry from previous loop run:
170  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
171  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
172  source.append(" val += inter_results[get_local_size(0)-1]; \n");
173  source.append(" else \n");
174  if (C_row_major)
175  source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
176  else
177  source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
178  source.append(" } \n");
179 
180  //segmented parallel reduction begin
181  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
182  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
183  source.append(" inter_results[get_local_id(0)] = val; \n");
184  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
185  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
186 
187  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
188  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
189  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
190  source.append(" inter_results[get_local_id(0)] += left; \n");
191  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
192  source.append(" } \n");
193  //segmented parallel reduction end
194 
195  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
196  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
197  if (C_row_major)
198  source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
199  else
200  source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
201  source.append(" } \n");
202 
203  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
204  source.append(" } \n"); //for k
205 
206  source.append(" if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
207  if (C_row_major)
208  source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
209  else
210  source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
211  source.append(" } \n"); //for result_col
212  source.append("} \n");
213 
214  }
215 }
216 
217 template<typename StringT>
218 void generate_coordinate_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
219 {
220  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
221  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
222  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
223  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, true);
224 
225  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
226  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
227  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
228  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
229 }
230 
231 template<typename StringT>
232 void generate_coordinate_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
233 {
234  source.append("__kernel void row_info_extractor( \n");
235  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
236  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
237  source.append(" __global const uint * group_boundaries, \n");
238  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
239  source.append(" unsigned int option, \n");
240  source.append(" __local unsigned int * shared_rows, \n");
241  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
242  source.append("{ \n");
243  source.append(" uint2 tmp; \n");
244  source.append(" "); source.append(numeric_string); source.append(" val; \n");
245  source.append(" uint last_index = get_local_size(0) - 1; \n");
246  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
247  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
248  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : ("); source.append(numeric_string); source.append(")0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
249 
250  source.append(" uint local_index = 0; \n");
251 
252  source.append(" for (uint k = 0; k < k_end; ++k) \n");
253  source.append(" { \n");
254  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
255 
256  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
257  source.append(" val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");
258 
259  //check for carry from previous loop run:
260  source.append(" if (get_local_id(0) == 0 && k > 0) \n");
261  source.append(" { \n");
262  source.append(" if (tmp.x == shared_rows[last_index]) \n");
263  source.append(" { \n");
264  source.append(" switch (option) \n");
265  source.append(" { \n");
266  source.append(" case 0: \n"); //inf-norm
267  source.append(" case 3: \n"); //diagonal entry
268  source.append(" val = max(val, fabs(inter_results[last_index])); \n");
269  source.append(" break; \n");
270 
271  source.append(" case 1: \n"); //1-norm
272  source.append(" val = fabs(val) + inter_results[last_index]; \n");
273  source.append(" break; \n");
274 
275  source.append(" case 2: \n"); //2-norm
276  source.append(" val = sqrt(val * val + inter_results[last_index]); \n");
277  source.append(" break; \n");
278 
279  source.append(" default: \n");
280  source.append(" break; \n");
281  source.append(" } \n");
282  source.append(" } \n");
283  source.append(" else \n");
284  source.append(" { \n");
285  source.append(" switch (option) \n");
286  source.append(" { \n");
287  source.append(" case 0: \n"); //inf-norm
288  source.append(" case 1: \n"); //1-norm
289  source.append(" case 3: \n"); //diagonal entry
290  source.append(" result[shared_rows[last_index]] = inter_results[last_index]; \n");
291  source.append(" break; \n");
292 
293  source.append(" case 2: \n"); //2-norm
294  source.append(" result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
295  source.append(" default: \n");
296  source.append(" break; \n");
297  source.append(" } \n");
298  source.append(" } \n");
299  source.append(" } \n");
300 
301  //segmented parallel reduction begin
302  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
303  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
304  source.append(" switch (option) \n");
305  source.append(" { \n");
306  source.append(" case 0: \n");
307  source.append(" case 3: \n");
308  source.append(" inter_results[get_local_id(0)] = val; \n");
309  source.append(" break; \n");
310  source.append(" case 1: \n");
311  source.append(" inter_results[get_local_id(0)] = fabs(val); \n");
312  source.append(" break; \n");
313  source.append(" case 2: \n");
314  source.append(" inter_results[get_local_id(0)] = val * val; \n");
315  source.append(" default: \n");
316  source.append(" break; \n");
317  source.append(" } \n");
318  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
319 
320  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
321  source.append(" { \n");
322  source.append(" "); source.append(numeric_string); source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
323  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
324  source.append(" switch (option) \n");
325  source.append(" { \n");
326  source.append(" case 0: \n"); //inf-norm
327  source.append(" case 3: \n"); //diagonal entry
328  source.append(" inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
329  source.append(" break; \n");
330 
331  source.append(" case 1: \n"); //1-norm
332  source.append(" inter_results[get_local_id(0)] += left; \n");
333  source.append(" break; \n");
334 
335  source.append(" case 2: \n"); //2-norm
336  source.append(" inter_results[get_local_id(0)] += left; \n");
337  source.append(" break; \n");
338 
339  source.append(" default: \n");
340  source.append(" break; \n");
341  source.append(" } \n");
342  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
343  source.append(" } \n");
344  //segmented parallel reduction end
345 
346  source.append(" if (get_local_id(0) != last_index && \n");
347  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
348  source.append(" inter_results[get_local_id(0)] != 0) \n");
349  source.append(" { \n");
350  source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
351  source.append(" } \n");
352 
353  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
354  source.append(" } \n"); //for k
355 
356  source.append(" if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
357  source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
358  source.append("} \n");
359 }
360 
362 
363 // main kernel class
365 template<typename NumericT>
367 {
368  static std::string program_name()
369  {
370  return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
371  }
372 
373  static void init(viennacl::ocl::context & ctx)
374  {
375  static std::map<cl_context, bool> init_done;
376  if (!init_done[ctx.handle().get()])
377  {
379  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
380 
381  std::string source;
382  source.reserve(1024);
383 
384  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
385 
386  generate_coordinate_matrix_vec_mul(source, numeric_string);
388  generate_coordinate_matrix_row_info_extractor(source, numeric_string);
389 
390  std::string prog_name = program_name();
391  #ifdef VIENNACL_BUILD_INFO
392  std::cout << "Creating program " << prog_name << std::endl;
393  #endif
394  ctx.add_program(source, prog_name);
395  init_done[ctx.handle().get()] = true;
396  } //if
397  } //init
398 };
399 
400 } // namespace kernels
401 } // namespace opencl
402 } // namespace linalg
403 } // namespace viennacl
404 #endif
405 
static void init(viennacl::ocl::context &ctx)
Implements a OpenCL platform within ViennaCL.
Various little tools used here and there in ViennaCL.
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Definition: common.hpp:49
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:55
Main kernel class for generating OpenCL kernels for coordinate_matrix.
Provides OpenCL-related utilities.
void generate_coordinate_matrix_dense_matrix_mul(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a compressed_matrix, B and C dense.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:611
Common implementations shared by OpenCL-based operations.
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:191
void generate_coordinate_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_vec_mul(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_coordinate_matrix_row_info_extractor(StringT &source, std::string const &numeric_string)