ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
compressed_matrix.hpp
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/utils.hpp"

namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

template<typename StringT>
void generate_compressed_matrix_block_trans_lu_backward(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_jumper_U, \n"); //U part (note that U is transposed in memory)
  source.append(" __global const unsigned int * column_indices_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start; \n");
  source.append(" unsigned int row_stop; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  source.append(" if (col_start >= col_stop) \n");
  source.append(" return; \n");

  //backward elimination, using U and diagonal_U
  source.append(" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
  source.append(" { \n");
  source.append(" unsigned int col = (col_stop - iter) - 1; \n");
  source.append(" result_entry = result[col] / diagonal_U[col]; \n");
  source.append(" row_start = row_jumper_U[col]; \n");
  source.append(" row_stop = row_jumper_U[col + 1]; \n");
  source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append(" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  //divide result vector by diagonal:
  source.append(" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
  source.append(" result[col] /= diagonal_U[col]; \n");
  source.append("} \n");
}

template<typename StringT>
void generate_compressed_matrix_block_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_jumper_L, \n"); //L part (note that L is transposed in memory)
  source.append(" __global const unsigned int * column_indices_L, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start = row_jumper_L[col_start]; \n");
  source.append(" unsigned int row_stop; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  source.append(" if (col_start >= col_stop) \n");
  source.append(" return; \n");

  //forward elimination, using L:
  source.append(" for (unsigned int col = col_start; col < col_stop; ++col) \n");
  source.append(" { \n");
  source.append(" result_entry = result[col]; \n");
  source.append(" row_stop = row_jumper_L[col + 1]; \n");
  source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append(" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
  source.append(" row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append("} \n");
}

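Both block kernels above solve one independent ILU block per work-group, with block_offsets holding a (start, stop) index pair per block. That layout is easy to miss when reading the kernels, so here is a standalone C++ sketch of how such an offsets array would be laid out for a uniform split into num_blocks pieces; the helper name is hypothetical and this is illustrative only, not ViennaCL's actual block-setup code:

#include <vector>

// Hypothetical helper: block b covers unknowns
// [block_offsets[2*b], block_offsets[2*b+1]), as consumed by the kernels above.
std::vector<unsigned int> uniform_block_offsets(unsigned int size, unsigned int num_blocks)
{
  std::vector<unsigned int> block_offsets(2 * num_blocks);
  for (unsigned int b = 0; b < num_blocks; ++b)
  {
    block_offsets[2*b]     = (size * b)       / num_blocks;  // first unknown of block b
    block_offsets[2*b + 1] = (size * (b + 1)) / num_blocks;  // one past the last unknown
  }
  return block_offsets;
}
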
namespace detail
{
  template<typename StringT>
  void generate_compressed_matrix_dense_matrix_mult(StringT & source, std::string const & numeric_string,
                                                    bool B_transposed, bool B_row_major, bool C_row_major)
  {
    source.append("__kernel void ");
    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
    source.append("( \n");
    source.append(" __global const unsigned int * sp_mat_row_indices, \n");
    source.append(" __global const unsigned int * sp_mat_col_indices, \n");
    source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
    source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
    source.append(" unsigned int d_mat_row_start, \n");
    source.append(" unsigned int d_mat_col_start, \n");
    source.append(" unsigned int d_mat_row_inc, \n");
    source.append(" unsigned int d_mat_col_inc, \n");
    source.append(" unsigned int d_mat_row_size, \n");
    source.append(" unsigned int d_mat_col_size, \n");
    source.append(" unsigned int d_mat_internal_rows, \n");
    source.append(" unsigned int d_mat_internal_cols, \n");
    source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
    source.append(" unsigned int result_row_start, \n");
    source.append(" unsigned int result_col_start, \n");
    source.append(" unsigned int result_row_inc, \n");
    source.append(" unsigned int result_col_inc, \n");
    source.append(" unsigned int result_row_size, \n");
    source.append(" unsigned int result_col_size, \n");
    source.append(" unsigned int result_internal_rows, \n");
    source.append(" unsigned int result_internal_cols) { \n");

    // split work rows (sparse matrix rows) to thread groups
    source.append(" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");

    source.append(" unsigned int row_start = sp_mat_row_indices[row]; \n");
    source.append(" unsigned int row_end = sp_mat_row_indices[row+1]; \n");

    // split result cols between threads in a thread group
    source.append(" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");

    source.append(" "); source.append(numeric_string); source.append(" r = 0; \n");

    source.append(" for (unsigned int k = row_start; k < row_end; k ++) { \n");

    source.append(" unsigned int j = sp_mat_col_indices[k]; \n");
    source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");

    source.append(" "); source.append(numeric_string);
    if (B_transposed && B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
    else if (B_transposed && !B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
    else if (!B_transposed && B_row_major)
      source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
    else
      source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
    source.append(" r += x * y; \n");
    source.append(" } \n");

    if (C_row_major)
      source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
    else
      source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
    source.append(" } \n");
    source.append(" } \n");

    source.append("} \n");

  }
}

template<typename StringT>
void generate_compressed_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
{
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, true);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, true);

  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, true);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, false);
  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, true);
}

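The wrapper above instantiates the detail:: generator eight times, once per combination of (B_transposed, B_row_major, C_row_major), so a single program string ends up holding every layout variant. As a quick usage sketch (a standalone translation unit, assuming ViennaCL with OpenCL support is on the include path; the main() wrapper is illustrative, not part of ViennaCL), collecting the generated source is just:

#include <iostream>
#include <string>
#include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"

int main()
{
  std::string src;
  // emits eight __kernel definitions, one per layout combination
  viennacl::linalg::opencl::kernels::generate_compressed_matrix_dense_matrix_multiplication(src, "float");
  std::cout << src << std::endl;
}
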
template<typename StringT>
void generate_compressed_matrix_jacobi(StringT & source, std::string const & numeric_string)
{

  source.append(" __kernel void jacobi( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" "); source.append(numeric_string); source.append(" weight, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * old_result, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * new_result, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * rhs, \n");
  source.append(" unsigned int size) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum, diag=1; \n");
  source.append(" int col; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" sum = 0; \n");
  source.append(" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
  source.append(" { \n");
  source.append(" col = column_indices[j]; \n");
  source.append(" if (i == col) \n");
  source.append(" diag = elements[j]; \n");
  source.append(" else \n");
  source.append(" sum += elements[j] * old_result[col]; \n");
  source.append(" } \n");
  source.append(" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
  source.append(" } \n");
  source.append(" } \n");

}

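The generated jacobi kernel performs one damped Jacobi sweep, x_new = w * D^-1 * (b - (A - D) * x_old) + (1 - w) * x_old, with one work-item per matrix row. A serial C++ reference over the same CSR arrays mirrors it almost line by line; this is an illustrative sketch with hypothetical names, single precision, not part of ViennaCL:

#include <vector>

// Serial reference of one weighted Jacobi sweep over CSR data.
void jacobi_sweep(std::vector<unsigned int> const & row_indices,
                  std::vector<unsigned int> const & column_indices,
                  std::vector<float> const & elements,
                  float weight,
                  std::vector<float> const & old_result,
                  std::vector<float> & new_result,
                  std::vector<float> const & rhs)
{
  for (unsigned int i = 0; i < new_result.size(); ++i)
  {
    float sum = 0;
    float diag = 1;  // fallback (like the kernel) if a row has no stored diagonal
    for (unsigned int j = row_indices[i]; j < row_indices[i+1]; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == i)
        diag = elements[j];                     // diagonal entry D_ii
      else
        sum += elements[j] * old_result[col];   // off-diagonal contribution
    }
    new_result[i] = weight * (rhs[i] - sum) / diag + (1 - weight) * old_result[i];
  }
}
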
template<typename StringT>
void generate_compressed_matrix_lu_backward(StringT & source, std::string const & numeric_string)
{
  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = size-1; \n");
  source.append(" unsigned int row_at_window_start = size-1; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  source.append(" "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[size-1]; \n");

  source.append(" unsigned int i = loop_end + get_local_id(0); \n");
  source.append(" while (1) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data from back to front:
  source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append(" { \n");
  source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

  source.append(" if (i+k >= nnz) \n");
  source.append(" continue; \n");

  source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" else if (col_index_buffer[k] == current_row) \n");
  source.append(" diagonal_entry = element_buffer[k]; \n");

  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
  source.append(" if (current_row > 0) //load next row's data \n");
  source.append(" { \n");
  source.append(" --current_row; \n");
  source.append(" next_row = row_indices[current_row]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");


  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" if (i < get_local_size(0)) \n");
  source.append(" break; \n");

  source.append(" i -= get_local_size(0); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  source.append(" "); source.append(numeric_string); source.append(" diagonal_entry; \n");
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) \n"); //load next row's data
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");
  source.append(" else if (col_index_buffer[k] == current_row) \n");
  source.append(" diagonal_entry = element_buffer[k]; \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

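lu_forward and lu_backward are plain sparse triangular substitutions; the 128-entry __local buffers and the window bookkeeping exist only to keep global-memory loads coalesced while work-item 0 performs the inherently sequential elimination. Stripped of that machinery, the forward solve reduces to the following serial sketch (hypothetical names, single precision, assuming each row stores its diagonal; not part of ViennaCL). The backward solve is the same loop run from row size-1 down to 0:

#include <vector>

// Serial reference of lu_forward: solve L*y = z in place, where `vector`
// holds z on entry and y on exit.
void lu_forward_serial(std::vector<unsigned int> const & row_indices,
                       std::vector<unsigned int> const & column_indices,
                       std::vector<float> const & elements,
                       std::vector<float> & vector)
{
  unsigned int size = static_cast<unsigned int>(row_indices.size()) - 1;
  for (unsigned int row = 0; row < size; ++row)
  {
    float entry = vector[row];
    float diagonal = 1;  // assumption: overwritten by the stored diagonal below
    for (unsigned int j = row_indices[row]; j < row_indices[row+1]; ++j)
    {
      unsigned int col = column_indices[j];
      if (col < row)
        entry -= elements[j] * vector[col];  // subtract already-computed unknowns
      else if (col == row)
        diagonal = elements[j];
    }
    vector[row] = entry / diagonal;
  }
}
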
template<typename StringT>
void generate_compressed_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void row_info_extractor( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int option \n");
  source.append(" ) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" value = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");

  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value = max(value, fabs(elements[i])); \n");
  source.append(" break; \n");

  source.append(" case 1: \n"); //1-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += fabs(elements[i]); \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += elements[i] * elements[i]; \n");
  source.append(" value = sqrt(value); \n");
  source.append(" break; \n");

  source.append(" case 3: \n"); //diagonal entry
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" { \n");
  source.append(" if (column_indices[i] == row) \n");
  source.append(" { \n");
  source.append(" value = elements[i]; \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" break; \n");

  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" result[row] = value; \n");
  source.append(" } \n");
  source.append("} \n");

}

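The option argument selects the per-row quantity written to result: 0 gives the row-wise inf-norm, 1 the 1-norm, 2 the 2-norm, and 3 the diagonal entry. For instance, option 1 corresponds to this serial sketch (hypothetical helper, single precision, illustrative only):

#include <cmath>
#include <vector>

// Serial reference of row_info_extractor with option == 1 (row-wise 1-norm).
std::vector<float> row_one_norms(std::vector<unsigned int> const & row_indices,
                                 std::vector<float> const & elements)
{
  std::vector<float> result(row_indices.size() - 1, 0.0f);
  for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
    for (unsigned int i = row_indices[row]; i < row_indices[row+1]; ++i)
      result[row] += std::fabs(elements[i]);  // column indices are only needed for option 3
  return result;
}
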
template<typename StringT>
void generate_compressed_matrix_trans_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations, where U is stored transposed in memory (backward substitution)
  source.append("__kernel void trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_trans_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations, where L is stored transposed in memory (forward substitution)
  source.append("__kernel void trans_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_inc = 0; \n");
  source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append(" ++row_index_inc; \n");
  source.append(" row_index = row_at_window_start + row_index_inc; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //forward elimination
  source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index > row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}

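The trans_* kernels process one nonzero per work-item, so each work-item must recover the row that owns its flat element index i; the row_index_lookahead buffer performs that search collectively over the current window. The underlying lookup is simply a scan of the CSR row pointers, as in this serial sketch (hypothetical name, forward direction; the backward kernels scan downward from the window start in the same way):

#include <vector>

// Serial sketch of the row lookup done via row_index_lookahead: find the row
// that owns nonzero index i, starting from a known lower bound.
unsigned int row_of_nonzero(std::vector<unsigned int> const & row_indices,
                            unsigned int i,
                            unsigned int row_at_window_start)
{
  unsigned int row = row_at_window_start;
  // advance while the next row still starts at or before element i
  while (i >= row_indices[row + 1])
    ++row;
  return row;
}
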
template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for unit upper triangular U, stored transposed in memory (backward substitution)
  source.append("__kernel void trans_unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");
  source.append("} \n");

}


template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for unit lower triangular L, stored transposed in memory (forward substitution)
  source.append("__kernel void trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_inc = 0; \n");
  source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append(" ++row_index_inc; \n");
  source.append(" row_index = row_at_window_start + row_index_inc; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //forward elimination
  source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" if ( (row_index == row) && (col_index > row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for unit lower triangular L, stored transposed in memory (one row at a time)
  source.append("__kernel void trans_unit_lu_forward_slow( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = 0; row < size; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" unsigned int row_start = row_indices[row]; \n");
  source.append(" unsigned int row_stop = row_indices[row + 1]; \n");
  source.append(" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int col_index = column_indices[entry_index]; \n");
  source.append(" if (col_index > row) \n");
  source.append(" vector[col_index] -= result_entry * elements[entry_index]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append("} \n");

}

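Because L is stored transposed (row `row` of the CSR data lists the entries of column `row` of L), forward substitution becomes a scatter: once vector[row] is final, its contribution is subtracted from every entry it influences. The slow kernel above expresses exactly that pattern; a serial equivalent (illustrative sketch with hypothetical names, not part of ViennaCL) is:

#include <vector>

// Serial reference of trans_unit_lu_forward_slow: solve L*y = z in place for
// unit lower triangular L stored transposed.
void trans_unit_lu_forward_serial(std::vector<unsigned int> const & row_indices,
                                  std::vector<unsigned int> const & column_indices,
                                  std::vector<float> const & elements,
                                  std::vector<float> & vector)
{
  unsigned int size = static_cast<unsigned int>(row_indices.size()) - 1;
  for (unsigned int row = 0; row < size; ++row)
  {
    float result_entry = vector[row];  // final: earlier rows already scattered into it
    for (unsigned int k = row_indices[row]; k < row_indices[row + 1]; ++k)
    {
      unsigned int col_index = column_indices[k];
      if (col_index > row)  // strictly lower part of L, seen as upper part of L^T
        vector[col_index] -= result_entry * elements[k];
    }
  }
}
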
template<typename StringT>
void generate_compressed_matrix_unit_lu_backward(StringT & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = size-1; \n");
  source.append(" unsigned int row_at_window_start = size-1; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[size-1]; \n");

  source.append(" unsigned int i = loop_end + get_local_id(0); \n");
  source.append(" while (1) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data from back to front:
  source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append(" { \n");
  source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

  source.append(" if (i+k >= nnz) \n");
  source.append(" continue; \n");

  source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" if (current_row > 0) \n"); //load next row's data
  source.append(" { \n");
  source.append(" --current_row; \n");
  source.append(" next_row = row_indices[current_row]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");


  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" if (i < get_local_size(0)) \n");
  source.append(" break; \n");

  source.append(" i -= get_local_size(0); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_unit_lu_forward(StringT & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) //load next row's data \n");
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } //for i \n");
  source.append("} \n");

}

template<typename StringT>
void generate_compressed_matrix_vec_mul_nvidia(StringT & source, std::string const & numeric_string, unsigned int subwarp_size, bool with_alpha_beta)
{
  std::stringstream ss;
  ss << subwarp_size;

  if (with_alpha_beta)
    source.append("__kernel void vec_mul_nvidia_alpha_beta( \n");
  else
    source.append("__kernel void vec_mul_nvidia( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result \n");
  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
  source.append(") { \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, layout_result.z); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) \n");
  if (with_alpha_beta)
    source.append(" result[row * layout_result.y + layout_result.x] = alpha * shared_elements[get_local_id(0)] + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
  else
    source.append(" result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append("} \n");

}

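The line shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k] is a butterfly reduction: after log2(subwarp_size) XOR steps, every lane of a subwarp holds the full partial sum for its row. It relies on the lock-step execution of lanes within an NVIDIA warp, hence the _nvidia suffix. A small host-side simulation of one 4-lane subwarp (standalone, illustrative only) shows the pattern:

#include <iostream>

// Simulates the XOR butterfly used above for a hypothetical 4-lane subwarp.
// After the loop, every lane holds the total 1+2+3+4 = 10.
int main()
{
  const unsigned int subwarp_size = 4;
  float lanes[subwarp_size] = {1, 2, 3, 4};

  for (unsigned int k = 1; k < subwarp_size; k *= 2)
  {
    float next[subwarp_size];
    for (unsigned int id = 0; id < subwarp_size; ++id)  // all lanes step together
      next[id] = lanes[id] + lanes[id ^ k];
    for (unsigned int id = 0; id < subwarp_size; ++id)
      lanes[id] = next[id];
  }

  for (unsigned int id = 0; id < subwarp_size; ++id)
    std::cout << "lane " << id << ": " << lanes[id] << "\n";  // prints 10 four times
}
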
966 template<typename StringT>
967 void generate_compressed_matrix_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
968 {
969  if (with_alpha_beta)
970  source.append("__kernel void vec_mul_alpha_beta( \n");
971  else
972  source.append("__kernel void vec_mul( \n");
973  source.append(" __global const unsigned int * row_indices, \n");
974  source.append(" __global const unsigned int * column_indices, \n");
975  source.append(" __global const unsigned int * row_blocks, \n");
976  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
977  source.append(" unsigned int num_blocks, \n");
978  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
979  source.append(" uint4 layout_x, \n");
980  if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
981  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
982  source.append(" uint4 layout_result \n");
983  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
984  source.append(") { \n");
985  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
986 
987  source.append(" unsigned int row_start = row_blocks[get_group_id(0)]; \n");
988  source.append(" unsigned int row_stop = row_blocks[get_group_id(0) + 1]; \n");
989  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
990  source.append(" unsigned int element_start = row_indices[row_start]; \n");
991  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");
992 
993  source.append(" if (rows_to_process > 4) { \n"); // CSR stream
994  // load to shared buffer:
995  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
996  source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
997 
998  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
999 
1000  // use one thread per row to sum:
1001  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
1002  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
1003  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
1004  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
1005  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
1006  source.append(" dot_prod += shared_elements[i]; \n");
1007  if (with_alpha_beta)
1008  source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
1009  else
1010  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1011  source.append(" } \n");
1012  source.append(" } \n");
1013 
1014  // use multiple threads for the summation
1015  source.append(" else if (rows_to_process > 1) \n"); // CSR stream with local reduction
1016  source.append(" {\n");
1017  // load to shared buffer:
1018  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
1019  source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
1020 
1021  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1022 
1023  // sum each row separately using a reduction:
1024  source.append(" for (unsigned int row = row_start; row < row_stop; ++row)\n");
1025  source.append(" {\n");
1026  source.append(" unsigned int current_row_start = row_indices[row] - element_start;\n");
1027  source.append(" unsigned int current_row_stop = row_indices[row + 1] - element_start;\n");
1028  source.append(" unsigned int thread_base_id = current_row_start + get_local_id(0);\n");
1029 
1030  // sum whatever exceeds the current buffer:
1031  source.append(" for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
1032  source.append(" shared_elements[thread_base_id] += shared_elements[j];\n");
1033 
1034  // reduction
1035  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
1036  source.append(" {\n");
1037  source.append(" barrier(CLK_LOCAL_MEM_FENCE);\n");
1038  source.append(" if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
1039  source.append(" shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
1040  source.append(" }\n");
1041  source.append(" "); source.append(numeric_string); source.append(" row_result = 0; \n");
1042  source.append(" if (current_row_stop > current_row_start)\n");
1043  source.append(" row_result = shared_elements[current_row_start]; \n");
1044  source.append(" if (get_local_id(0) == 0)\n");
1045  if (with_alpha_beta)
1046  source.append(" result[row * layout_result.y + layout_result.x] = alpha * row_result + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0);\n");
1047  else
1048  source.append(" result[row * layout_result.y + layout_result.x] = row_result;\n");
1049  source.append(" }\n");
1050  source.append(" }\n");
1051 
1052 
1053  source.append(" else \n"); // CSR vector for a single row
1054  source.append(" { \n");
1055  // load and sum to shared buffer:
1056  source.append(" shared_elements[get_local_id(0)] = 0; \n");
1057  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
1058  source.append(" shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
1059 
1060  // reduction to obtain final result
1061  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
1062  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1063  source.append(" if (get_local_id(0) < stride) \n");
1064  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
1065  source.append(" } \n");
1066 
1067  source.append(" if (get_local_id(0) == 0) \n");
1068  if (with_alpha_beta)
1069  source.append(" result[row_start * layout_result.y + layout_result.x] = alpha * shared_elements[0] + ((beta != 0) ? beta * result[row_start * layout_result.y + layout_result.x] : 0); \n");
1070  else
1071  source.append(" result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
1072  source.append(" } \n");
1073 
1074  source.append("} \n");
1075 
1076 }
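// Taken together, the adaptive kernel above picks one of three strategies per shared-memory
// window, depending on how many rows fall into it: one work item per row (CSR stream), a
// parallel reduction per row (CSR stream with local reduction), or the whole work group on a
// single long row (CSR vector). All three paths compute the same result; as a hedged
// reference (ignoring the start/stride layout encoded in layout_x and layout_result), the
// sequential semantics of the _alpha_beta variants are:
//
//   for each row i:
//     dot = 0;
//     for (j = row_indices[i]; j < row_indices[i+1]; ++j)
//       dot += elements[j] * x[column_indices[j]];
//     result[i] = alpha * dot + (beta != 0 ? beta * result[i] : 0);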
1077 
1078 
1079 template<typename StringT>
1080 void generate_compressed_matrix_vec_mul4(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
1081 {
1082  if (with_alpha_beta)
1083  source.append("__kernel void vec_mul4_alpha_beta( \n");
1084  else
1085  source.append("__kernel void vec_mul4( \n");
1086  source.append(" __global const unsigned int * row_indices, \n");
1087  source.append(" __global const uint4 * column_indices, \n");
1088  source.append(" __global const "); source.append(numeric_string); source.append("4 * elements, \n");
1089  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
1090  source.append(" uint4 layout_x, \n");
1091  if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
1092  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1093  source.append(" uint4 layout_result \n");
1094  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
1095  source.append(") { \n");
1096  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
1097  source.append(" unsigned int start, next_stop; \n");
1098  source.append(" uint4 col_idx; \n");
1099  source.append(" "); source.append(numeric_string); source.append("4 tmp_vec; \n");
1100  source.append(" "); source.append(numeric_string); source.append("4 tmp_entries; \n");
1101 
1102  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1103  source.append(" { \n");
1104  source.append(" dot_prod = 0; \n");
1105  source.append(" start = row_indices[row] / 4; \n");
1106  source.append(" next_stop = row_indices[row+1] / 4; \n");
1107 
1108  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
1109  source.append(" { \n");
1110  source.append(" col_idx = column_indices[i]; \n");
1111 
1112  source.append(" tmp_entries = elements[i]; \n");
1113  source.append(" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
1114  source.append(" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
1115  source.append(" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
1116  source.append(" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
1117 
1118  source.append(" dot_prod += dot(tmp_entries, tmp_vec); \n");
1119  source.append(" } \n");
1120  if (with_alpha_beta)
1121  source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
1122  else
1123  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1124  source.append(" } \n");
1125  source.append("} \n");
1126 }
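// Note: vec_mul4 relies on a padded CSR layout in which each row holds a multiple of four
// entries (presumably zero-padded on the host side), so that row_indices[row] is divisible
// by 4 and the divisions above yield valid offsets into the uint4/vector-typed arrays. The
// 4-wide loads of column indices and matrix entries reduce the number of memory transactions
// per row.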
1127 
1128 template<typename StringT>
1129 void generate_compressed_matrix_vec_mul8(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
1130 {
1131  if (with_alpha_beta)
1132  source.append("__kernel void vec_mul8_alpha_beta( \n");
1133  else
1134  source.append("__kernel void vec_mul8( \n");
1135  source.append(" __global const unsigned int * row_indices, \n");
1136  source.append(" __global const uint8 * column_indices, \n");
1137  source.append(" __global const "); source.append(numeric_string); source.append("8 * elements, \n");
1138  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
1139  source.append(" uint4 layout_x, \n");
1140  if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
1141  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1142  source.append(" uint4 layout_result \n");
1143  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
1144  source.append(") { \n");
1145  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
1146  source.append(" unsigned int start, next_stop; \n");
1147  source.append(" uint8 col_idx; \n");
1148  source.append(" "); source.append(numeric_string); source.append("8 tmp_vec; \n");
1149  source.append(" "); source.append(numeric_string); source.append("8 tmp_entries; \n");
1150 
1151  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
1152  source.append(" { \n");
1153  source.append(" dot_prod = 0; \n");
1154  source.append(" start = row_indices[row] / 8; \n");
1155  source.append(" next_stop = row_indices[row+1] / 8; \n");
1156 
1157  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
1158  source.append(" { \n");
1159  source.append(" col_idx = column_indices[i]; \n");
1160 
1161  source.append(" tmp_entries = elements[i]; \n");
1162  source.append(" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
1163  source.append(" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
1164  source.append(" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
1165  source.append(" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
1166  source.append(" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
1167  source.append(" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
1168  source.append(" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
1169  source.append(" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
1170 
1171  source.append(" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
1172  source.append(" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
1173  source.append(" } \n");
1174  if (with_alpha_beta)
1175  source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
1176  else
1177  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1178  source.append(" } \n");
1179  source.append("} \n");
1180 }
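// The 8-wide variant of the same scheme. The dot product is accumulated in two halves
// (tmp_entries.lo/.hi) because OpenCL's built-in dot() is only defined for vector widths
// of up to 4 components.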
1181 
1182 template<typename StringT>
1183 void generate_compressed_matrix_vec_mul_cpu(StringT & source, std::string const & numeric_string)
1184 {
1185  source.append("__kernel void vec_mul_cpu( \n");
1186  source.append(" __global const unsigned int * row_indices, \n");
1187  source.append(" __global const unsigned int * column_indices, \n");
1188  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1189  source.append(" __global const "); source.append(numeric_string); source.append(" * vector, \n");
1190  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
1191  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1192  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
1193  source.append(" unsigned int size) \n");
1194  source.append("{ \n");
1195  source.append(" unsigned int work_per_item = max((uint) ((size - 1) / get_global_size(0) + 1), (uint) 1); \n"); // ceiling division, so trailing rows are not skipped when size is not a multiple of the global size
1196  source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1197  source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
1198  source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1199  source.append(" { \n");
1200  source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
1201  source.append(" unsigned int row_end = row_indices[row+1]; \n");
1202  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
1203  source.append(" dot_prod += elements[i] * vector[column_indices[i]]; \n");
1204  source.append(" result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0); \n");
1205  source.append(" } \n");
1206  source.append("} \n");
1207 
1208 }
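// vec_mul_cpu assigns each work item a contiguous chunk of rows and uses neither local
// memory nor barriers, which suits CPU OpenCL targets where the work items of a group are
// executed sequentially. Note that init() below currently does not register this kernel
// (the corresponding generator call is commented out).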
1209 
1210 
1211 
1212 /** @brief OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
1213   *
1214   * Each work group computes the maximum number of nonzeros in any row of A within its range of rows.
1215   */
1216 template<typename StringT>
1217 void generate_compressed_matrix_compressed_matrix_prod_1(StringT & source)
1218 {
1219  source.append("__kernel void spgemm_stage1( \n");
1220  source.append(" __global const unsigned int * A_row_indices, \n");
1221  source.append(" __global const unsigned int * A_column_indices, \n");
1222  source.append(" unsigned int A_size1, \n");
1223  source.append(" __global unsigned int * group_nnz_array) \n");
1224  source.append("{ \n");
1225  source.append(" unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
1226  source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1227  source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
1228  source.append(" unsigned int max_A_nnz = 0; \n");
1229  source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1230  source.append(" max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
1231 
1232  // load and sum to shared buffer:
1233  source.append(" __local unsigned int shared_nnz[256]; \n");
1234  source.append(" shared_nnz[get_local_id(0)] = max_A_nnz; \n");
1235 
1236  // reduction to obtain final result
1237  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
1238  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1239  source.append(" if (get_local_id(0) < stride) \n");
1240  source.append(" shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
1241  source.append(" } \n");
1242 
1243  source.append(" if (get_local_id(0) == 0) \n");
1244  source.append(" group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
1245  source.append("} \n");
1246 }
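// Hedged note on how stage 1 is presumably consumed: the per-group maxima in group_nnz_array
// are reduced once more on the host to obtain the global maximum row length of A, which then
// decides whether the decomposition A = A_2 * G_1 below is needed, e.g.:
//
//   unsigned int max_nnz = 0;
//   for (std::size_t i = 0; i < group_nnz_array.size(); ++i)
//     max_nnz = std::max<unsigned int>(max_nnz, group_nnz_array[i]);
//
// Also note that the fixed __local array of 256 entries implies a work group size of at most 256.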
1247 
1248 
1249 /** @brief OpenCL kernel for decomposing A in C = A * B, such that A = A_2 * G_1 with G_1 containing at most 32 nonzeros per row.
1250   *
1251   * Computes the number of chunks of size max_per_row required for each row of A.
1252   */
1253 template<typename StringT>
1254 void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT & source)
1255 {
1256  source.append("__kernel void spgemm_decompose_1( \n");
1257  source.append(" __global const unsigned int * A_row_indices, \n");
1258  source.append(" unsigned int A_size1, \n");
1259  source.append(" unsigned int max_per_row, \n");
1260  source.append(" __global unsigned int * chunks_per_row) \n");
1261  source.append("{ \n");
1262  source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
1263  source.append(" unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
1264  source.append(" chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
1265  source.append(" } \n");
1266  source.append("} \n");
1267 }
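// chunks_per_row[row] is the ceiling of nnz(row) / max_per_row, with empty rows still
// counting as one chunk. A prefix sum over this array (presumably carried out separately)
// yields the chunk offsets passed as new_row_buffer to spgemm_A2 and spgemm_G1 below.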
1268 
1269 
1270 /** @brief OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row.
1271   *
1272   * Sets up the unit entries of A_2 based on the chunk offsets in new_row_buffer.
1273   */
1274 template<typename StringT>
1275 void generate_compressed_matrix_compressed_matrix_prod_A2(StringT & source, std::string const & numeric_string)
1276 {
1277  source.append("__kernel void spgemm_A2( \n");
1278  source.append(" __global unsigned int *A2_row_indices, \n");
1279  source.append(" __global unsigned int *A2_col_indices, \n");
1280  source.append(" __global "); source.append(numeric_string); source.append(" *A2_elements, \n");
1281  source.append(" unsigned int A2_size1, \n");
1282  source.append(" __global const unsigned int *new_row_buffer) \n");
1283  source.append("{ \n");
1284  source.append(" for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
1285  source.append(" unsigned int index_start = new_row_buffer[i]; \n");
1286  source.append(" unsigned int index_stop = new_row_buffer[i+1]; \n");
1287 
1288  source.append(" A2_row_indices[i] = index_start; \n");
1289 
1290  source.append(" for (unsigned int j = index_start; j < index_stop; ++j) { \n");
1291  source.append(" A2_col_indices[j] = j; \n");
1292  source.append(" A2_elements[j] = 1; \n");
1293  source.append(" } \n");
1294  source.append(" } \n");
1295 
1296  source.append(" if (get_global_id(0) == 0) \n");
1297  source.append(" A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
1298  source.append("} \n");
1299 }
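// A_2 acts as the "gather" factor of the decomposition: row i of A_2 carries one unit entry
// per chunk of the original row i of A, with the column index equal to the chunk's global
// position in G_1, so that the product A_2 * G_1 reproduces A exactly.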
1300 
1301 /** @brief OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row.
1302   *
1303   * Copies the entries of A and splits each row into chunks of at most max_per_row entries.
1304   */
1305 template<typename StringT>
1306 void generate_compressed_matrix_compressed_matrix_prod_G1(StringT & source, std::string const & numeric_string)
1307 {
1308  source.append("__kernel void spgemm_G1( \n");
1309  source.append(" __global unsigned int *G1_row_indices, \n");
1310  source.append(" __global unsigned int *G1_col_indices, \n");
1311  source.append(" __global "); source.append(numeric_string); source.append(" *G1_elements, \n");
1312  source.append(" unsigned int G1_size1, \n");
1313  source.append(" __global const unsigned int *A_row_indices, \n");
1314  source.append(" __global const unsigned int *A_col_indices, \n");
1315  source.append(" __global const "); source.append(numeric_string); source.append(" *A_elements, \n");
1316  source.append(" unsigned int A_size1, \n");
1317  source.append(" unsigned int A_nnz, \n");
1318  source.append(" unsigned int max_per_row, \n");
1319  source.append(" __global const unsigned int *new_row_buffer) \n");
1320  source.append("{ \n");
1321 
1322  // Part 1: Copy column indices and entries:
1323  source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
1324  source.append(" G1_col_indices[i] = A_col_indices[i]; \n");
1325  source.append(" G1_elements[i] = A_elements[i]; \n");
1326  source.append(" } \n");
1327 
1328  // Part 2: Derive new row indices:
1329  source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
1330  source.append(" unsigned int old_start = A_row_indices[i]; \n");
1331  source.append(" unsigned int new_start = new_row_buffer[i]; \n");
1332  source.append(" unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
1333 
1334  source.append(" for (unsigned int j=0; j<row_chunks; ++j) \n");
1335  source.append(" G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
1336  source.append(" } \n");
1337 
1338  // write last entry in row_buffer with thread 0:
1339  source.append(" if (get_global_id(0) == 0) \n");
1340  source.append(" G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
1341  source.append("} \n");
1342 }
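// G_1 reuses the column indices and values of A unchanged; only the row pointer array is
// rewritten so that each original row is split into chunks of at most max_per_row entries.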
1343 
1344 
1345 
1346 /** @brief OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
1347   *
1348   * Computes the number of nonzeros per row of C = A * B by merging the column-index
1349   * fronts of the rows of B selected by the current row of A.
1350   */
1351 template<typename StringT>
1352 void generate_compressed_matrix_compressed_matrix_prod_2(StringT & source)
1353 {
1354  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1355  source.append("__kernel void spgemm_stage2( \n");
1356  source.append(" __global const unsigned int * A_row_indices, \n");
1357  source.append(" __global const unsigned int * A_col_indices, \n");
1358  source.append(" unsigned int A_size1, \n");
1359  source.append(" __global const unsigned int * B_row_indices, \n");
1360  source.append(" __global const unsigned int * B_col_indices, \n");
1361  source.append(" unsigned int B_size2, \n");
1362  source.append(" __global unsigned int * C_row_indices) \n");
1363  source.append("{ \n");
1364  source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1365  source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1366  source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1367  source.append(" __local unsigned int shared_front[32]; \n");
1368 
1369  source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1370  source.append(" { \n");
1371  source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
1372  source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1373 
1374  source.append(" unsigned int my_row_B = row_A_start + get_local_id(0); \n");
1375  source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1376  source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1377  source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1378 
1379  source.append(" unsigned int num_nnz = 0; \n");
1380  source.append(" if (row_A_end - row_A_start > 1) { \n"); // rows of A with zero or one nonzero take the faster path below
1381 
1382  source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1383  source.append(" while (1) { \n");
1384 
1385  // determine minimum index via reduction:
1386  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1387  source.append(" shared_front[get_local_id(0)] = current_front_index; \n");
1388  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1389  source.append(" if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
1390  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1391  source.append(" if (get_local_id(0) < 8) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
1392  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1393  source.append(" if (get_local_id(0) < 4) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
1394  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1395  source.append(" if (get_local_id(0) < 2) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
1396  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1397  source.append(" if (get_local_id(0) < 1) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
1398  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1399 
1400  source.append(" if (shared_front[0] == B_size2) break; \n");
1401 
1402  // update front:
1403  source.append(" if (current_front_index == shared_front[0]) { \n");
1404  source.append(" ++row_B_start; \n");
1405  source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1406  source.append(" } \n");
1407 
1408  source.append(" ++num_nnz; \n");
1409  source.append(" } \n");
1410  source.append(" } else { num_nnz = row_B_end - row_B_start; }\n");
1411 
1412  // write number of entries found:
1413  source.append(" if (get_local_id(0) == 0) \n");
1414  source.append(" C_row_indices[row_C] = num_nnz; \n");
1415 
1416  source.append(" } \n");
1417 
1418  source.append("} \n");
1419 
1420 }
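// Algorithm sketch: each of the 32 work items walks one row of B, selected by one nonzero of
// the current row of A. In every iteration, the smallest pending column index across all
// fronts is determined by the five-step min-reduction above, and each distinct front value
// contributes exactly one nonzero to the output row. Note that C_row_indices temporarily
// holds per-row nonzero counts at this point; since spgemm_stage3 reads C_row_indices[row_C]
// as an offset, an exclusive scan over the array presumably runs between the two stages.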
1421 
1422 
1423 /** @brief OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
1424   *
1425   * Computes the entries of C = A * B using the row offsets obtained from the second stage.
1426   */
1427 template<typename StringT>
1428 void generate_compressed_matrix_compressed_matrix_prod_3(StringT & source, std::string const & numeric_string)
1429 {
1430  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
1431  source.append("__kernel void spgemm_stage3( \n");
1432  source.append(" __global const unsigned int * A_row_indices, \n");
1433  source.append(" __global const unsigned int * A_col_indices, \n");
1434  source.append(" __global const "); source.append(numeric_string); source.append(" * A_elements, \n");
1435  source.append(" unsigned int A_size1, \n");
1436  source.append(" __global const unsigned int * B_row_indices, \n");
1437  source.append(" __global const unsigned int * B_col_indices, \n");
1438  source.append(" __global const "); source.append(numeric_string); source.append(" * B_elements, \n");
1439  source.append(" unsigned int B_size2, \n");
1440  source.append(" __global unsigned int * C_row_indices, \n");
1441  source.append(" __global unsigned int * C_col_indices, \n");
1442  source.append(" __global "); source.append(numeric_string); source.append(" * C_elements) \n");
1443  source.append("{ \n");
1444  source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
1445  source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
1446  source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
1447  source.append(" __local unsigned int shared_front[32]; \n");
1448  source.append(" __local "); source.append(numeric_string); source.append(" shared_front_values[32]; \n");
1449  source.append(" unsigned int local_id = get_local_id(0); \n");
1450 
1451  source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
1452  source.append(" { \n");
1453  source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
1454  source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
1455 
1456  source.append(" unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); // single row is a special case
1457  source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
1458  source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
1459  source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
1460 
1461  source.append(" "); source.append(numeric_string); source.append(" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
1462  source.append(" unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
1463 
1464  source.append(" if (row_A_end - row_A_start > 1) { \n"); // rows of A with zero or one nonzero take the faster path below
1465 
1466  source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1467  source.append(" "); source.append(numeric_string); source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1468 
1469  source.append(" unsigned int index_buffer = 0; \n");
1470  source.append(" "); source.append(numeric_string); source.append(" value_buffer = 0; \n");
1471  source.append(" unsigned int buffer_size = 0; \n");
1472 
1473  source.append(" while (1) { \n");
1474 
1475  // determine minimum index via reduction:
1476  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1477  source.append(" shared_front[local_id] = current_front_index; \n");
1478  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1479  source.append(" if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
1480  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1481  source.append(" if (local_id < 8) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
1482  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1483  source.append(" if (local_id < 4) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
1484  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1485  source.append(" if (local_id < 2) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
1486  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1487  source.append(" if (local_id < 1) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
1488  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1489 
1490  source.append(" if (shared_front[0] == B_size2) break; \n");
1491 
1492  // compute output value via reduction:
1493  source.append(" shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
1494  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1495  source.append(" if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
1496  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1497  source.append(" if (local_id < 8) shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
1498  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1499  source.append(" if (local_id < 4) shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
1500  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1501  source.append(" if (local_id < 2) shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
1502  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1503  source.append(" if (local_id < 1) shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
1504  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1505 
1506  // update front:
1507  source.append(" if (current_front_index == shared_front[0]) { \n");
1508  source.append(" ++row_B_start; \n");
1509  source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
1510  source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
1511  source.append(" } \n");
1512 
1513  // write current front to register buffer:
1514  source.append(" index_buffer = (local_id == buffer_size) ? shared_front[0] : index_buffer; \n");
1515  source.append(" value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer; \n");
1516  source.append(" ++buffer_size; \n");
1517 
1518  // flush register buffer via a coalesced write once full:
1519  source.append(" if (buffer_size == get_local_size(0)) { \n");
1520  source.append(" C_col_indices[index_in_C] = index_buffer; \n");
1521  source.append(" C_elements[index_in_C] = value_buffer; \n");
1522  source.append(" } \n");
1523 
1524  // the following should be in the previous if-conditional, but a bug in NVIDIA drivers 34x.yz requires this slight rewrite
1525  source.append(" index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
1526  source.append(" buffer_size = (buffer_size == get_local_size(0)) ? 0 : buffer_size; \n");
1527 
1528  source.append(" } \n");
1529 
1530  // write remaining entries in register buffer:
1531  source.append(" if (local_id < buffer_size) { \n");
1532  source.append(" C_col_indices[index_in_C] = index_buffer; \n");
1533  source.append(" C_elements[index_in_C] = value_buffer; \n");
1534  source.append(" } \n");
1535 
1536  // copy to C in coalesced manner:
1537  source.append(" } else { \n");
1538  source.append(" for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
1539  source.append(" C_col_indices[index_in_C] = B_col_indices[i]; \n");
1540  source.append(" C_elements[index_in_C] = val_A * B_elements[i]; \n");
1541  source.append(" index_in_C += get_local_size(0); \n");
1542  source.append(" } \n");
1543  source.append(" } \n");
1544 
1545  source.append(" } \n");
1546 
1547  source.append("} \n");
1548 
1549 }
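// The register buffer above avoids one global write per merge step: each work item keeps the
// output entry whose position within the current 32-entry window matches its local id, so a
// full buffer is flushed as a single coalesced write of 32 column indices and values.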
1550 
1551 
1552 template<typename StringT>
1553 void generate_compressed_matrix_compressed_matrix_prod(StringT & source, std::string const & numeric_string)
1554 {
1555  generate_compressed_matrix_compressed_matrix_prod_1(source);
1556  generate_compressed_matrix_compressed_matrix_prod_decompose_1(source);
1557  generate_compressed_matrix_compressed_matrix_prod_A2(source, numeric_string);
1558  generate_compressed_matrix_compressed_matrix_prod_G1(source, numeric_string);
1559  generate_compressed_matrix_compressed_matrix_prod_2(source);
1560  generate_compressed_matrix_compressed_matrix_prod_3(source, numeric_string);
1561 }
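// Pipeline for C = A * B: spgemm_stage1 bounds the row lengths of A; if that bound exceeds
// what the 32-wide merge in stages 2 and 3 can handle, spgemm_decompose_1 / spgemm_A2 /
// spgemm_G1 split A into A_2 * G_1 with short rows (presumably with max_per_row = 32);
// spgemm_stage2 then counts the nonzeros of each row of C, and spgemm_stage3 fills C.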
1562 
1563 template<typename StringT>
1564 void generate_compressed_matrix_assign_to_dense(StringT & source, std::string const & numeric_string)
1565 {
1566 
1567  source.append(" __kernel void assign_to_dense( \n");
1568  source.append(" __global const unsigned int * row_indices, \n");
1569  source.append(" __global const unsigned int * column_indices, \n");
1570  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1571  source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
1572  source.append(" unsigned int B_row_start, \n");
1573  source.append(" unsigned int B_col_start, \n");
1574  source.append(" unsigned int B_row_inc, \n");
1575  source.append(" unsigned int B_col_inc, \n");
1576  source.append(" unsigned int B_row_size, \n");
1577  source.append(" unsigned int B_col_size, \n");
1578  source.append(" unsigned int B_internal_rows, \n");
1579  source.append(" unsigned int B_internal_cols) { \n");
1580 
1581  source.append(" for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
1582  source.append(" { \n");
1583  source.append(" unsigned int row_end = row_indices[i+1]; \n");
1584  source.append(" for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
1585  source.append(" { \n");
1586  source.append(" B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
1587  source.append(" } \n");
1588  source.append(" } \n");
1589  source.append(" } \n");
1590 
1591 }
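// assign_to_dense writes only the stored nonzeros of A; all other positions of B keep their
// previous values, so B should be zero-initialized beforehand if an exact dense copy of A is
// wanted. The index expression addresses a row-major B with padded internal dimensions:
// (B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc.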
1592 
1593 
1595 
1596 // main kernel class
1597 /** @brief Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers). */
1598 template<typename NumericT>
1599 struct compressed_matrix
1600 {
1601  static std::string program_name()
1602  {
1603  return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
1604  }
1605 
1606  static void init(viennacl::ocl::context & ctx)
1607  {
1608  static std::map<cl_context, bool> init_done;
1609  if (!init_done[ctx.handle().get()])
1610  {
1611  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1612  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1613 
1614  std::string source;
1615  source.reserve(1024);
1616 
1617  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1618 
1619  if (numeric_string == "float" || numeric_string == "double")
1620  {
1621  generate_compressed_matrix_jacobi(source, numeric_string);
1622  }
1623  generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
1624  generate_compressed_matrix_row_info_extractor(source, numeric_string);
1625  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1626  {
1627  generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, true);
1628  generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, false);
1629  }
1630  generate_compressed_matrix_vec_mul(source, numeric_string, true);
1631  generate_compressed_matrix_vec_mul(source, numeric_string, false);
1632  generate_compressed_matrix_vec_mul4(source, numeric_string, true);
1633  generate_compressed_matrix_vec_mul4(source, numeric_string, false);
1634  generate_compressed_matrix_vec_mul8(source, numeric_string, true);
1635  generate_compressed_matrix_vec_mul8(source, numeric_string, false);
1636  //generate_compressed_matrix_vec_mul_cpu(source, numeric_string);
1637  generate_compressed_matrix_compressed_matrix_prod(source, numeric_string);
1638  generate_compressed_matrix_assign_to_dense(source, numeric_string);
1639 
1640  std::string prog_name = program_name();
1641  #ifdef VIENNACL_BUILD_INFO
1642  std::cout << "Creating program " << prog_name << std::endl;
1643  #endif
1644  ctx.add_program(source, prog_name);
1645  init_done[ctx.handle().get()] = true;
1646  } //if
1647  } //init
1648 };
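// Hedged usage sketch (context/kernel lookup as provided by the viennacl/ocl layer; the
// kernel name here is an assumption by analogy with vec_mul4_alpha_beta above):
//
//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
//   viennacl::linalg::opencl::kernels::compressed_matrix<float>::init(ctx); // compiles once per context
//   viennacl::ocl::kernel & k = ctx.get_kernel(
//       viennacl::linalg::opencl::kernels::compressed_matrix<float>::program_name(),
//       "vec_mul_alpha_beta");
//
// In typical application code these kernels are not used directly; they back high-level
// operations such as viennacl::linalg::prod().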
1649 
1650 
1651 /** @brief Main kernel class for triangular solver OpenCL kernels for compressed_matrix. */
1652 template<typename NumericT>
1653 struct compressed_matrix_solve
1654 {
1655  static std::string program_name()
1656  {
1657  return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix_solve";
1658  }
1659 
1660  static void init(viennacl::ocl::context & ctx)
1661  {
1662  static std::map<cl_context, bool> init_done;
1663  if (!init_done[ctx.handle().get()])
1664  {
1665  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1666  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1667 
1668  std::string source;
1669  source.reserve(1024);
1670 
1671  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1672 
1673  if (numeric_string == "float" || numeric_string == "double")
1674  {
1675  generate_compressed_matrix_block_trans_lu_backward(source, numeric_string);
1676  generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string);
1677  generate_compressed_matrix_lu_backward(source, numeric_string);
1678  generate_compressed_matrix_lu_forward(source, numeric_string);
1679  generate_compressed_matrix_trans_lu_backward(source, numeric_string);
1680  generate_compressed_matrix_trans_lu_forward(source, numeric_string);
1681  generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string);
1682  generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string);
1683  generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string);
1684  generate_compressed_matrix_unit_lu_backward(source, numeric_string);
1685  generate_compressed_matrix_unit_lu_forward(source, numeric_string);
1686  }
1687 
1688  std::string prog_name = program_name();
1689  #ifdef VIENNACL_BUILD_INFO
1690  std::cout << "Creating program " << prog_name << std::endl;
1691  #endif
1692  ctx.add_program(source, prog_name);
1693  init_done[ctx.handle().get()] = true;
1694  } //if
1695  } //init
1696 };
1697 
1698 } // namespace kernels
1699 } // namespace opencl
1700 } // namespace linalg
1701 } // namespace viennacl
1702 #endif
1703 