iterative.hpp
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/tools/tools.hpp"

#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/utils.hpp"

/** @file viennacl/linalg/opencl/kernels/iterative.hpp
 *  @brief OpenCL kernel file for specialized iterative solver kernels */
namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

template<typename StringT>
void generate_pipelined_cg_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r = r[i]; \n");
  source.append(" \n");
  source.append(" result[i] += alpha * value_p; \n");
  source.append(" value_r -= alpha * Ap[i]; \n");
  source.append(" value_p = value_r + beta * value_p; \n");
  source.append(" \n");
  source.append(" p[i] = value_p; \n");
  source.append(" r[i] = value_r; \n");
  source.append(" inner_prod_contrib += value_r * value_r; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}

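// A minimal usage sketch (illustrative, not part of this header): the generated
// source is typically collected into one string and compiled through the
// viennacl::ocl layer. The program name "my_cg_kernels" and the launch
// configuration below are hypothetical; the argument order must match the
// kernel signature assembled above.
//
//   std::string src;
//   generate_pipelined_cg_vector_update(src, "float");
//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
//   ctx.add_program(src, "my_cg_kernels");
//   viennacl::ocl::kernel & k = ctx.get_kernel("my_cg_kernels", "cg_vector_update");
//   k.local_work_size(0, 128);
//   k.global_work_size(0, 128 * 128);
//   viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer,
//                            cl_uint(size), viennacl::ocl::local_mem(128 * sizeof(float))));
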
template<typename StringT>
void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
{
  std::stringstream ss;
  ss << subwarp_size;

  source.append("__kernel void cg_csr_blocked_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) { \n");
  source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n");
}

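// The "^ k" loop emitted above is a butterfly (XOR) reduction within each
// subwarp of subwarp_size threads: after log2(subwarp_size) rounds every slot
// of the subwarp holds the full row sum, with no divergent gather by a single
// designated thread. It relies on the subwarp executing in lockstep, which is
// why subwarp_size must be a power of two no larger than the hardware
// warp/wavefront. A sequential host-side model of one subwarp (illustrative
// only; the double buffer stands in for the lockstep read-before-write):
//
//   void butterfly_reduce(std::vector<float> & v)   // v.size() == subwarp_size
//   {
//     for (unsigned int k = 1; k < v.size(); k *= 2) {
//       std::vector<float> old(v);                  // all reads happen "before" writes
//       for (unsigned int i = 0; i < v.size(); ++i)
//         v[i] = old[i] + old[i ^ k];
//     }
//   }
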
template<typename StringT>
void generate_compressed_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
  source.append("{ \n");

  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
  source.append(" unsigned int row_start = row_blocks[block_id]; \n");
  source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
  source.append(" unsigned int element_start = row_indices[row_start]; \n");
  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");

  source.append(" if (rows_to_process > 1) { \n"); // CSR stream
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // use one thread per row to sum:
  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
  source.append(" dot_prod += shared_elements[i]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" else \n"); // CSR vector for a single row
  source.append(" { \n");
  // load and sum to shared buffer:
  source.append(" shared_elements[get_local_id(0)] = 0; \n");
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");

  // reduction to obtain final result
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
  source.append(" } \n");

  source.append(" if (get_local_id(0) == 0) { \n");
  source.append(" Ap[row_start] = shared_elements[0]; \n");
  source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
  source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n");

}

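// cg_csr_prod follows a CSR-adaptive scheme: row_blocks partitions the rows so
// that each block's nonzeros fit into the shared buffer. Blocks spanning
// several rows take the "CSR stream" path above (stage all products in local
// memory, one thread sums each row); a block holding a single long row takes
// the "CSR vector" path (the whole work group reduces that row). A sketch of
// one plausible way to build such a partition on the host; the exact logic
// ViennaCL uses inside compressed_matrix may differ:
//
//   std::vector<unsigned int> row_blocks(1, 0);
//   unsigned int nnz_in_block = 0;
//   for (unsigned int row = 0; row < num_rows; ++row) {
//     unsigned int nnz_row = row_indices[row + 1] - row_indices[row];
//     if (nnz_in_block + nnz_row > shared_buffer_size && nnz_in_block > 0) {
//       row_blocks.push_back(row);   // close the block before it overflows
//       nnz_in_block = 0;
//     }
//     nnz_in_block += nnz_row;       // an oversized row forms a block by itself
//   }
//   row_blocks.push_back(num_rows);
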
template<typename StringT>
void generate_coordinate_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}

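// cg_coo_prod performs a segmented reduction: inter_results accumulates partial
// sums while shared_rows tags each slot with its row index, so sums only
// combine within one row; a row straddling two chunks is patched up by thread 0
// at the start of the next iteration (the "carry" branch above). A sequential
// model of the reduction sweep (illustrative; the copy models the barriers):
//
//   for (unsigned int stride = 1; stride < n; stride *= 2) {
//     std::vector<float> old(val, val + n);
//     for (unsigned int i = stride; i < n; ++i)
//       if (rows[i] == rows[i - stride])
//         val[i] = old[i] + old[i - stride];
//   }
//   // afterwards, the last slot of each row segment holds that row's sum
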
template<typename StringT>
void generate_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}

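// ELL stores the matrix as a dense, zero-padded slab of items_per_row entries
// per row, laid out column-major: entry j of row r sits at
// j * internal_row_num + r, which is the "offset += internal_row_num" stride
// walked by cg_ell_prod above (and by the HYB and BiCGStab variants below).
// Adjacent threads thus read adjacent addresses in elements[] and coords[]
// (coalesced), and padded zeros are skipped by the "(val != 0)" guard.
// A host-side indexing sketch (illustrative helper, not part of ViennaCL):
//
//   float ell_row_times_p(std::vector<float> const & elements,
//                         std::vector<unsigned int> const & coords,
//                         std::vector<float> const & p,
//                         unsigned int row, unsigned int items_per_row,
//                         unsigned int internal_row_num)
//   {
//     float sum = 0;
//     for (unsigned int j = 0; j < items_per_row; ++j) {
//       float val = elements[j * internal_row_num + row];
//       if (val != 0) sum += p[coords[j * internal_row_num + row]] * val;
//     }
//     return sum;   // one entry of A * p in ELL storage
//   }
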
template<typename StringT>
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_sliced_ell_prod( \n");
  source.append(" __global const unsigned int * columns_per_block, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * block_start, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int block_size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
  source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
  source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
  source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
  source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");

  source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint row = block_idx * block_size + id_in_block; \n");
  source.append(" uint offset = block_start[block_idx]; \n");
  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
  source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
  source.append(" sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" if (row < size) {\n");
  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}

template<typename StringT>
void generate_hyb_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void cg_hyb_prod( \n");
  source.append(" const __global int* ell_coords, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
  source.append(" const __global uint* csr_rows, \n");
  source.append(" const __global uint* csr_cols, \n");
  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
  source.append(" sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
  source.append(" } \n");

  source.append(" uint col_begin = csr_rows[row]; \n");
  source.append(" uint col_end = csr_rows[row + 1]; \n");

  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" } \n");
  source.append("} \n \n");
}


template<typename StringT>
void generate_pipelined_bicgstab_update_s(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_update_s( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int chunk_size, \n");
  source.append(" unsigned int chunk_offset, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_Ap_in_r0) \n");
  source.append("{ \n");

  source.append(" "); source.append(numeric_string); source.append(" alpha = 0; \n");

  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // compute alpha from reduced values:
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");

  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" \n");
  source.append(" value_s = r[i] - alpha * Ap[i]; \n");
  source.append(" inner_prod_contrib += value_s * value_s; \n");
  source.append(" \n");
  source.append(" s[i] = value_s; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");

  source.append("} \n");

}


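// Layout of inner_prod_buffer as used by the pipelined BiCGStab kernels: one
// partial value per work group, chunk_size entries per slice. Slice 0 holds
// the <r, r0*> partials (written by bicgstab_vector_update), slice 1 (offset
// chunk_size) <Ap, Ap>, slice 2 (offset 2*chunk_size) <p, Ap>, and the r0Ap
// partials go to buffer_offset, which bicgstab_update_s above expects at
// 3*chunk_size. The kernel re-reduces slices 0 and 3 on the device to form
// alpha = <r, r0*> / <Ap, r0*> without a host round trip; this assumes the
// work group has at least as many threads as there are partial entries.
// A host-side sketch for folding one slice (illustrative helper only):
//
//   float slice_sum(std::vector<float> const & buf,
//                   unsigned int chunk_size, unsigned int slice,
//                   unsigned int num_groups)
//   {
//     float s = 0;
//     for (unsigned int i = 0; i < num_groups; ++i)
//       s += buf[slice * chunk_size + i];
//     return s;
//   }
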
template<typename StringT>
void generate_pipelined_bicgstab_vector_update(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_vector_update( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" "); source.append(numeric_string); source.append(" omega, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * s, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * residual, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * As, \n");
  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" const * r0star, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r_r0star = 0; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_residual = residual[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_As = As[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_Ap = Ap[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" value_r0star = r0star[i]; \n");
  source.append(" \n");
  source.append(" value_result += alpha * value_p + omega * value_s; \n");
  source.append(" value_residual = value_s - omega * value_As; \n");
  source.append(" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
  source.append(" \n");
  source.append(" result[i] = value_result; \n");
  source.append(" residual[i] = value_residual; \n");
  source.append(" p[i] = value_p; \n");
  source.append(" inner_prod_r_r0star += value_residual * value_r0star; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) \n ");
  source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");

  source.append("} \n");
}

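// The kernel generated above fuses the three BiCGStab vector recurrences of one
// iteration into a single pass (one read and one write per vector entry) and
// simultaneously starts the reduction of <r, r0*> needed by the next iteration:
//
//   x_{i+1} = x_i + alpha * p_i + omega * s_i
//   r_{i+1} = s_i - omega * (A s_i)
//   p_{i+1} = r_{i+1} + beta * (p_i - omega * (A p_i))
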
template<typename StringT>
void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
{
  std::stringstream ss;
  ss << subwarp_size;

  source.append("__kernel void bicgstab_csr_blocked_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
  source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
  source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
  source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");

  source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
  source.append(" row < block_stop; \n");
  source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
  source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");

  source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
  source.append(" #pragma unroll \n");
  source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");

  source.append(" if (id_in_row == 0) { \n");
  source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * shared_elements[get_local_id(0)]; \n");
  source.append(" } \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n");
}

template<typename StringT>
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_csr_prod( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const unsigned int * row_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int num_blocks, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
  source.append(" unsigned int row_start = row_blocks[block_id]; \n");
  source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
  source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
  source.append(" unsigned int element_start = row_indices[row_start]; \n");
  source.append(" unsigned int element_stop = row_indices[row_stop]; \n");

  source.append(" if (rows_to_process > 1) { \n"); // CSR stream
  // load to shared buffer:
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // use one thread per row to sum:
  source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
  source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
  source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
  source.append(" dot_prod += shared_elements[i]; \n");
  source.append(" Ap[row] = dot_prod; \n");
  source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
  source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" else \n"); // CSR vector for a single row
  source.append(" { \n");
  // load and sum to shared buffer:
  source.append(" shared_elements[get_local_id(0)] = 0; \n");
  source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
  source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");

  // reduction to obtain final result
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
  source.append(" } \n");

  source.append(" if (get_local_id(0) == 0) { \n");
  source.append(" Ap[row_start] = shared_elements[0]; \n");
  source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
  source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
  source.append(" inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}

template<typename StringT>
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_coo_prod( \n");
  source.append(" __global const uint2 * coords, \n"); //(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");

  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else {\n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
  source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
  source.append(" } \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
  source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
  source.append(" Ap[tmp.x] = Ap_entry; \n");
  source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
  source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
  source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
  source.append(" } \n");

  source.append("} \n \n");

}

template<typename StringT>
void generate_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bicgstab_ell_prod( \n");
  source.append(" __global const unsigned int * coords, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int internal_row_num, \n");
  source.append(" unsigned int items_per_row, \n");
  source.append(" unsigned int aligned_items_per_row, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
  source.append(" unsigned int size, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
  source.append(" unsigned int buffer_size, \n");
  source.append(" unsigned int buffer_offset, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint glb_sz = get_global_size(0); \n");

  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");

  source.append(" uint offset = row; \n");
  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
  source.append(" sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" } \n");

  source.append(" Ap[row] = sum; \n");
  source.append(" inner_prod_ApAp += sum * sum; \n");
  source.append(" inner_prod_pAp += p[row] * sum; \n");
  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
  source.append(" } \n");

  // parallel reduction in work group
  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
  source.append(" } ");
  source.append(" } ");

  // write results to result array
  source.append(" if (get_local_id(0) == 0) { \n ");
  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1014  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1015  source.append(" } \n");
1016  source.append("} \n \n");
1017 }
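
bicgstab_ell_prod walks the padded ELL storage column-major: slot j of row r lives at index j*internal_row_num + r, so consecutive work items read consecutive memory addresses (coalesced access), and padding slots hold zeros. A serial sketch of the per-row indexing (helper name and double precision are illustrative, not ViennaCL API):

    #include <vector>

    // Sketch of the ELL indexing used by bicgstab_ell_prod (illustrative only).
    double ell_row_times_p(std::vector<unsigned int> const & coords,
                           std::vector<double> const & elements,
                           std::vector<double> const & p,
                           unsigned int row, unsigned int items_per_row,
                           unsigned int internal_row_num)
    {
      double sum = 0;
      unsigned int offset = row;                  // slot 0 of this row
      for (unsigned int j = 0; j < items_per_row; ++j, offset += internal_row_num)
        if (elements[offset] != 0)                // padding slots hold zeros
          sum += p[coords[offset]] * elements[offset];
      return sum;
    }
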
1018 
1019 template<typename StringT>
1020 void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
1021 {
1022  source.append("__kernel void bicgstab_sliced_ell_prod( \n");
1023  source.append(" __global const unsigned int * columns_per_block, \n");
1024  source.append(" __global const unsigned int * column_indices, \n");
1025  source.append(" __global const unsigned int * block_start, \n");
1026  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1027  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1028  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1029  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
1030  source.append(" unsigned int size, \n");
1031  source.append(" unsigned int block_size, \n");
1032  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1033  source.append(" unsigned int buffer_size, \n");
1034  source.append(" unsigned int buffer_offset, \n");
1035  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1036  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1037  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
1038  source.append("{ \n");
1039  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
1040  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
1041  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
1042  source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
1043  source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
1044  source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
1045  source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
1046  source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
1047 
1048  source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
1049  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
1050 
1051  source.append(" uint row = block_idx * block_size + id_in_block; \n");
1052  source.append(" uint offset = block_start[block_idx]; \n");
1053  source.append(" uint num_columns = columns_per_block[block_idx]; \n");
1054  source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
1055  source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
1056  source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
1057  source.append(" sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
1058  source.append(" } \n");
1059 
1060  source.append(" if (row < size) {\n");
1061  source.append(" Ap[row] = sum; \n");
1062  source.append(" inner_prod_ApAp += sum * sum; \n");
1063  source.append(" inner_prod_pAp += p[row] * sum; \n");
1064  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
1065  source.append(" } \n");
1066  source.append(" } \n");
1067 
1068  // parallel reduction in work group
1069  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1070  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1071  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1072  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1073  source.append(" { \n");
1074  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1075  source.append(" if (get_local_id(0) < stride) { \n");
1076  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1077  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1078  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1079  source.append(" } ");
1080  source.append(" } ");
1081 
1082  // write results to result array
1083  source.append(" if (get_local_id(0) == 0) { \n ");
1084  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1085  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1086  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1087  source.append(" } \n");
1088  source.append("} \n \n");
1089 }
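
In the sliced-ELL kernel, each group of block_size consecutive rows forms a slice with its own padded column count (columns_per_block) and storage offset (block_start), so the padding adapts per slice rather than to the globally longest row. A serial sketch of the per-row addressing (helper name and containers illustrative, not ViennaCL API):

    #include <vector>

    // Illustrative serial read of one row in the sliced-ELL layout used by
    // bicgstab_sliced_ell_prod.
    double sliced_ell_row_times_p(std::vector<unsigned int> const & columns_per_block,
                                  std::vector<unsigned int> const & column_indices,
                                  std::vector<unsigned int> const & block_start,
                                  std::vector<double> const & elements,
                                  std::vector<double> const & p,
                                  unsigned int row, unsigned int block_size)
    {
      unsigned int block_idx   = row / block_size;
      unsigned int id_in_block = row % block_size;
      double sum = 0;
      for (unsigned int j = 0; j < columns_per_block[block_idx]; ++j) {
        unsigned int idx = block_start[block_idx] + j * block_size + id_in_block;
        if (elements[idx] != 0)                   // padding entries are zero
          sum += p[column_indices[idx]] * elements[idx];
      }
      return sum;
    }
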
1090 
1091 template<typename StringT>
1092 void generate_hyb_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
1093 {
1094  source.append("__kernel void bicgstab_hyb_prod( \n");
1095  source.append(" const __global int* ell_coords, \n");
1096  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
1097  source.append(" const __global uint* csr_rows, \n");
1098  source.append(" const __global uint* csr_cols, \n");
1099  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
1100  source.append(" unsigned int internal_row_num, \n");
1101  source.append(" unsigned int items_per_row, \n");
1102  source.append(" unsigned int aligned_items_per_row, \n");
1103  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1104  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1105  source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
1106  source.append(" unsigned int size, \n");
1107  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1108  source.append(" unsigned int buffer_size, \n");
1109  source.append(" unsigned int buffer_offset, \n");
1110  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1111  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1112  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
1113  source.append("{ \n");
1114  source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
1115  source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
1116  source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
1117  source.append(" uint glb_id = get_global_id(0); \n");
1118  source.append(" uint glb_sz = get_global_size(0); \n");
1119 
1120  source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
1121  source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
1122 
1123  source.append(" uint offset = row; \n");
1124  source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
1125  source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
1126  source.append(" sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
1127  source.append(" } \n");
1128 
1129  source.append(" uint col_begin = csr_rows[row]; \n");
1130  source.append(" uint col_end = csr_rows[row + 1]; \n");
1131 
1132  source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
1133  source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
1134  source.append(" } \n");
1135 
1136  source.append(" Ap[row] = sum; \n");
1137  source.append(" inner_prod_ApAp += sum * sum; \n");
1138  source.append(" inner_prod_pAp += p[row] * sum; \n");
1139  source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
1140  source.append(" } \n");
1141 
1142  // parallel reduction in work group
1143  source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
1144  source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
1145  source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
1146  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1147  source.append(" { \n");
1148  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1149  source.append(" if (get_local_id(0) < stride) { \n");
1150  source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
1151  source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
1152  source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
1153  source.append(" } ");
1154  source.append(" } ");
1155 
1156  // write results to result array
1157  source.append(" if (get_local_id(0) == 0) { \n ");
1158  source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
1159  source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
1160  source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
1161  source.append(" } \n");
1162  source.append("} \n \n");
1163 }
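
bicgstab_hyb_prod combines both formats: the regular part of each row sits in an ELL block, the overflow entries in a CSR remainder, and both contributions are summed per row before the fused inner products are accumulated. A serial per-row sketch (helper name and double precision illustrative, not ViennaCL API):

    #include <vector>

    // Serial sketch of the HYB row product in bicgstab_hyb_prod.
    double hyb_row_times_p(std::vector<int> const & ell_coords,
                           std::vector<double> const & ell_elements,
                           std::vector<unsigned int> const & csr_rows,
                           std::vector<unsigned int> const & csr_cols,
                           std::vector<double> const & csr_elements,
                           std::vector<double> const & p,
                           unsigned int row, unsigned int items_per_row,
                           unsigned int internal_row_num)
    {
      double sum = 0;
      unsigned int offset = row;                  // ELL part, column-major
      for (unsigned int j = 0; j < items_per_row; ++j, offset += internal_row_num)
        if (ell_elements[offset] != 0)
          sum += p[ell_coords[offset]] * ell_elements[offset];
      for (unsigned int k = csr_rows[row]; k < csr_rows[row + 1]; ++k)  // CSR remainder
        sum += p[csr_cols[k]] * csr_elements[k];
      return sum;
    }
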
1164 
1166 
1167 
1168 template <typename StringType>
1169 void generate_pipelined_gmres_gram_schmidt_stage1(StringType & source, std::string const & numeric_string, bool is_nvidia)
1170 {
1171  source.append("__kernel void gmres_gram_schmidt_1( \n");
1172  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
1173  source.append(" unsigned int size, \n");
1174  source.append(" unsigned int internal_size, \n");
1175  source.append(" unsigned int k, \n");
1176  source.append(" __global "); source.append(numeric_string); source.append(" * vi_in_vk_buffer, \n");
1177  source.append(" unsigned int chunk_size) \n");
1178  source.append("{ \n");
1179 
1180  source.append(" __local "); source.append(numeric_string); source.append(" shared_array[7*128]; \n");
1181  if (!is_nvidia) // using thread-local (private) accumulators costs about 2x performance on NVIDIA GPUs, but is faster on AMD GPUs
1182  {
1183  source.append(" "); source.append(numeric_string); source.append(" vi_in_vk[7]; \n");
1184  }
1185  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
1186 
1187  source.append(" unsigned int k_base = 0; \n");
1188  source.append(" while (k_base < k) { \n");
1189  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1190 
1191  if (is_nvidia)
1192  {
1193  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1194  source.append(" shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
1195  }
1196  else
1197  {
1198  source.append(" vi_in_vk[0] = 0;\n");
1199  source.append(" vi_in_vk[1] = 0;\n");
1200  source.append(" vi_in_vk[2] = 0;\n");
1201  source.append(" vi_in_vk[3] = 0;\n");
1202  source.append(" vi_in_vk[4] = 0;\n");
1203  source.append(" vi_in_vk[5] = 0;\n");
1204  source.append(" vi_in_vk[6] = 0;\n");
1205  }
1206  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1207  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
1208  source.append(" \n");
1209  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1210  if (is_nvidia)
1211  source.append(" shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1212  else
1213  source.append(" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1214  source.append(" } \n");
1215 
1216  // parallel reduction in work group
1217  if (!is_nvidia)
1218  {
1219  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1220  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
1221  }
1222  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1223  source.append(" { \n");
1224  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1225  source.append(" if (get_local_id(0) < stride) { \n");
1226  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1227  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1228  source.append(" } ");
1229  source.append(" } ");
1230 
1231  // write results to result array
1232  source.append(" if (get_local_id(0) == 0) \n ");
1233  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1234  source.append(" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
1235 
1236  source.append(" k_base += vecs_in_iteration; \n");
1237  source.append(" } \n");
1238 
1239  source.append("} \n");
1240 
1241 }
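
gmres_gram_schmidt_1 computes the classical Gram-Schmidt coefficients <v_i, v_k> for all i < k, processing the v_i in chunks of at most 7 per sweep over v_k so the partial sums fit the fixed 7*128-entry local-memory array. Each work group writes one partial sum per (i, chunk) slot into vi_in_vk_buffer; the cross-group summation happens in stage 2. A serial reference of the math (helper name and containers illustrative, not ViennaCL API):

    #include <vector>

    // All inner products <v_i, v_k>, i < k; basis vectors are stored
    // column-major with stride internal_size, as in the kernel above.
    std::vector<double> vi_dot_vk(std::vector<double> const & krylov_basis,
                                  unsigned int size, unsigned int internal_size,
                                  unsigned int k)
    {
      std::vector<double> result(k, 0.0);
      for (unsigned int i = 0; i < size; ++i) {
        double value_vk = krylov_basis[i + k * internal_size];
        for (unsigned int j = 0; j < k; ++j)
          result[j] += value_vk * krylov_basis[i + j * internal_size];
      }
      return result;
    }
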
1242 
1243 template <typename StringType>
1244 void generate_pipelined_gmres_gram_schmidt_stage2(StringType & source, std::string const & numeric_string)
1245 {
1246  source.append("__kernel void gmres_gram_schmidt_2( \n");
1247  source.append(" __global "); source.append(numeric_string); source.append(" * krylov_basis, \n");
1248  source.append(" unsigned int size, \n");
1249  source.append(" unsigned int internal_size, \n");
1250  source.append(" unsigned int k, \n");
1251  source.append(" __global "); source.append(numeric_string); source.append(" const * vi_in_vk_buffer, \n");
1252  source.append(" unsigned int chunk_size, \n");
1253  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
1254  source.append(" unsigned int krylov_dim, \n");
1255  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1256  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
1257  source.append("{ \n");
1258 
1259  source.append(" "); source.append(numeric_string); source.append(" vk_dot_vk = 0; \n");
1260  source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
1261 
1262  source.append(" unsigned int k_base = 0; \n");
1263  source.append(" while (k_base < k) { \n");
1264  source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
1265 
1266  // parallel reduction in work group for <v_i, v_k>
1267  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1268  source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
1269  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1270  source.append(" { \n");
1271  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1272  source.append(" if (get_local_id(0) < stride) { \n");
1273  source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1274  source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1275  source.append(" } ");
1276  source.append(" } ");
1277  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1278 
1279  // v_k -= <v_i, v_k> v_i:
1280  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1281  source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
1282  source.append(" \n");
1283  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1284  source.append(" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
1285  source.append(" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1286  source.append(" krylov_basis[i + k * internal_size] = value_vk; \n");
1287  source.append(" } \n");
1288 
1289  // write to R: (to avoid thread divergence, all threads write the same value)
1290  source.append(" if (get_group_id(0) == 0) \n");
1291  source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1292  source.append(" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
1293  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1294 
1295  source.append(" k_base += vecs_in_iteration; \n");
1296  source.append(" } \n");
1297 
1298  // parallel reduction in work group for <v_k, v_k>
1299  source.append(" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1300  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1301  source.append(" { \n");
1302  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1303  source.append(" if (get_local_id(0) < stride) \n");
1304  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1305  source.append(" } ");
1306 
1307  // write results to result array
1308  source.append(" if (get_local_id(0) == 0) \n ");
1309  source.append(" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1310 
1311  source.append("} \n");
1312 }
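
gmres_gram_schmidt_2 finishes the classical Gram-Schmidt step: it reduces the per-group partials from stage 1 into the final <v_i, v_k>, subtracts the projections from v_k, stores the coefficients into column k of R, and accumulates <v_k, v_k> for the subsequent normalization. A serial sketch of the same math (helper name and containers illustrative, not ViennaCL API):

    #include <vector>

    // Orthogonalize v_k against v_0..v_{k-1} given the precomputed
    // inner products from stage 1; returns <v_k, v_k> afterwards.
    double orthogonalize_vk(std::vector<double> & krylov_basis,
                            std::vector<double> const & vi_in_vk,   // <v_i, v_k>, i < k
                            std::vector<double> & R, unsigned int krylov_dim,
                            unsigned int size, unsigned int internal_size,
                            unsigned int k)
    {
      double vk_dot_vk = 0;
      for (unsigned int i = 0; i < size; ++i) {
        double value_vk = krylov_basis[i + k * internal_size];
        for (unsigned int j = 0; j < k; ++j)
          value_vk -= vi_in_vk[j] * krylov_basis[i + j * internal_size];
        vk_dot_vk += value_vk * value_vk;
        krylov_basis[i + k * internal_size] = value_vk;
      }
      for (unsigned int j = 0; j < k; ++j)   // column k of R
        R[j + k * krylov_dim] = vi_in_vk[j];
      return vk_dot_vk;
    }
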
1313 
1314 template <typename StringType>
1315 void generate_pipelined_gmres_normalize_vk(StringType & source, std::string const & numeric_string)
1316 {
1317  source.append("__kernel void gmres_normalize_vk( \n");
1318  source.append(" __global "); source.append(numeric_string); source.append(" * vk, \n");
1319  source.append(" unsigned int vk_offset, \n");
1320  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1321  source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
1322  source.append(" unsigned int R_offset, \n");
1323  source.append(" __global "); source.append(numeric_string); source.append(" const * inner_prod_buffer, \n");
1324  source.append(" unsigned int chunk_size, \n");
1325  source.append(" __global "); source.append(numeric_string); source.append(" * r_dot_vk_buffer, \n");
1326  source.append(" unsigned int chunk_offset, \n");
1327  source.append(" unsigned int size, \n");
1328  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
1329  source.append("{ \n");
1330 
1331  source.append(" "); source.append(numeric_string); source.append(" norm_vk = 0; \n");
1332 
1333  // parallel reduction in work group to compute <vk, vk>
1334  source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1335  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1336  source.append(" { \n");
1337  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1338  source.append(" if (get_local_id(0) < stride) \n");
1339  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1340  source.append(" } ");
1341 
1342  // compute alpha from reduced values:
1343  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1344  source.append(" norm_vk = sqrt(shared_array[0]); \n");
1345 
1346  source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
1347  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1348  source.append(" "); source.append(numeric_string); source.append(" value_vk = vk[i + vk_offset] / norm_vk; \n");
1349  source.append(" \n");
1350  source.append(" inner_prod_contrib += residual[i] * value_vk; \n");
1351  source.append(" \n");
1352  source.append(" vk[i + vk_offset] = value_vk; \n");
1353  source.append(" } \n");
1354  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1355 
1356  // parallel reduction in work group
1357  source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1358  source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1359  source.append(" { \n");
1360  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
1361  source.append(" if (get_local_id(0) < stride) \n");
1362  source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1363  source.append(" } ");
1364 
1365  // write results to result array
1366  source.append(" if (get_local_id(0) == 0) \n ");
1367  source.append(" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1368  source.append(" if (get_global_id(0) == 0) \n ");
1369  source.append(" R_buffer[R_offset] = norm_vk; \n");
1370 
1371  source.append("} \n");
1372 
1373 }
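
gmres_normalize_vk turns the reduced <v_k, v_k> into the norm, scales v_k to unit length, stores the norm into R, and simultaneously accumulates <residual, v_k>, which the pipelined GMRES needs for the right-hand side of its small least-squares problem. A serial sketch of the math (names illustrative, not ViennaCL API; the kernel obtains <v_k, v_k> from inner_prod_buffer rather than recomputing it):

    #include <cmath>
    #include <vector>

    // Normalize vk in place, record its norm, and return <residual, vk>.
    double normalize_vk(std::vector<double> & vk,
                        std::vector<double> const & residual,
                        double & R_entry, unsigned int size)
    {
      double norm_vk = 0;
      for (unsigned int i = 0; i < size; ++i)
        norm_vk += vk[i] * vk[i];
      norm_vk = std::sqrt(norm_vk);

      double r_dot_vk = 0;
      for (unsigned int i = 0; i < size; ++i) {
        vk[i] /= norm_vk;
        r_dot_vk += residual[i] * vk[i];
      }
      R_entry = norm_vk;
      return r_dot_vk;
    }
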
1374 
1375 template <typename StringType>
1376 void generate_pipelined_gmres_update_result(StringType & source, std::string const & numeric_string)
1377 {
1378  source.append("__kernel void gmres_update_result( \n");
1379  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
1380  source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
1381  source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
1382  source.append(" unsigned int size, \n");
1383  source.append(" unsigned int internal_size, \n");
1384  source.append(" __global "); source.append(numeric_string); source.append(" const * coefficients, \n");
1385  source.append(" unsigned int k) \n");
1386  source.append("{ \n");
1387 
1388  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1389  source.append(" "); source.append(numeric_string); source.append(" value_result = result[i] + coefficients[0] * residual[i]; \n");
1390  source.append(" \n");
1391  source.append(" for (unsigned int j = 1; j < k; ++j) \n");
1392  source.append(" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1393  source.append(" \n");
1394  source.append(" result[i] = value_result; \n");
1395  source.append(" } \n");
1396 
1397  source.append("} \n");
1398 }
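
gmres_update_result applies the correction x += eta_0 * r + sum_{j=1}^{k-1} eta_j * v_{j-1}: coefficients[0] multiplies the initial residual, while the remaining coefficients multiply the stored Krylov basis vectors. A serial mirror of the kernel loop (names illustrative, not ViennaCL API):

    #include <vector>

    // Accumulate the GMRES solution update from the solved least-squares
    // coefficients and the Krylov basis (column-major, stride internal_size).
    void update_result(std::vector<double> & result,
                       std::vector<double> const & residual,
                       std::vector<double> const & krylov_basis,
                       std::vector<double> const & coefficients,
                       unsigned int size, unsigned int internal_size, unsigned int k)
    {
      for (unsigned int i = 0; i < size; ++i) {
        double value = result[i] + coefficients[0] * residual[i];
        for (unsigned int j = 1; j < k; ++j)
          value += coefficients[j] * krylov_basis[i + (j - 1) * internal_size];
        result[i] = value;
      }
    }
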
1399 
1400 
1401 template <typename StringType>
1402 void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType & source, std::string const & numeric_string)
1403 {
1404  source.append("__kernel void gmres_csr_blocked_prod( \n");
1405  source.append(" __global const unsigned int * row_indices, \n");
1406  source.append(" __global const unsigned int * column_indices, \n");
1407  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1408  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1409  source.append(" unsigned int offset_p, \n");
1410  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1411  source.append(" unsigned int offset_Ap, \n");
1412  source.append(" unsigned int size, \n");
1413  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1414  source.append(" unsigned int buffer_size, \n");
1415  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1416  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1417  source.append("{ \n");
1418  source.append(" cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1419  source.append("} \n \n");
1420 
1421 }
1422 
1423 template <typename StringType>
1424 void generate_compressed_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1425 {
1426  source.append("__kernel void gmres_csr_prod( \n");
1427  source.append(" __global const unsigned int * row_indices, \n");
1428  source.append(" __global const unsigned int * column_indices, \n");
1429  source.append(" __global const unsigned int * row_blocks, \n");
1430  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1431  source.append(" unsigned int num_blocks, \n");
1432  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1433  source.append(" unsigned int offset_p, \n");
1434  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1435  source.append(" unsigned int offset_Ap, \n");
1436  source.append(" unsigned int size, \n");
1437  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1438  source.append(" unsigned int buffer_size, \n");
1439  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1440  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
1441  source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
1442  source.append("{ \n");
1443  source.append(" cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
1444  source.append("} \n \n");
1445 
1446 }
1447 
1448 template <typename StringType>
1449 void generate_coordinate_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1450 {
1451  source.append("__kernel void gmres_coo_prod( \n");
1452  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
1453  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1454  source.append(" __global const uint * group_boundaries, \n");
1455  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1456  source.append(" unsigned int offset_p, \n");
1457  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1458  source.append(" unsigned int offset_Ap, \n");
1459  source.append(" unsigned int size, \n");
1460  source.append(" __local unsigned int * shared_rows, \n");
1461  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
1462  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1463  source.append(" unsigned int buffer_size, \n");
1464  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1465  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1466  source.append("{ \n");
1467  source.append(" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1468  source.append("} \n \n");
1469 
1470 }
1471 
1472 
1473 template <typename StringType>
1474 void generate_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1475 {
1476  source.append("__kernel void gmres_ell_prod( \n");
1477  source.append(" __global const unsigned int * coords, \n");
1478  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1479  source.append(" unsigned int internal_row_num, \n");
1480  source.append(" unsigned int items_per_row, \n");
1481  source.append(" unsigned int aligned_items_per_row, \n");
1482  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1483  source.append(" unsigned int offset_p, \n");
1484  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1485  source.append(" unsigned int offset_Ap, \n");
1486  source.append(" unsigned int size, \n");
1487  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1488  source.append(" unsigned int buffer_size, \n");
1489  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1490  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1491  source.append("{ \n");
1492  source.append(" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1493  source.append("} \n \n");
1494 }
1495 
1496 template <typename StringType>
1497 void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1498 {
1499  source.append("__kernel void gmres_sliced_ell_prod( \n");
1500  source.append(" __global const unsigned int * columns_per_block, \n");
1501  source.append(" __global const unsigned int * column_indices, \n");
1502  source.append(" __global const unsigned int * block_start, \n");
1503  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
1504  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1505  source.append(" unsigned int offset_p, \n");
1506  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1507  source.append(" unsigned int offset_Ap, \n");
1508  source.append(" unsigned int size, \n");
1509  source.append(" unsigned int block_size, \n");
1510  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1511  source.append(" unsigned int buffer_size, \n");
1512  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1513  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1514  source.append("{ \n");
1515  source.append(" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, block_size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1516  source.append("} \n \n");
1517 }
1518 
1519 template <typename StringType>
1520 void generate_hyb_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
1521 {
1522  source.append("__kernel void gmres_hyb_prod( \n");
1523  source.append(" const __global int* ell_coords, \n");
1524  source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
1525  source.append(" const __global uint* csr_rows, \n");
1526  source.append(" const __global uint* csr_cols, \n");
1527  source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
1528  source.append(" unsigned int internal_row_num, \n");
1529  source.append(" unsigned int items_per_row, \n");
1530  source.append(" unsigned int aligned_items_per_row, \n");
1531  source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
1532  source.append(" unsigned int offset_p, \n");
1533  source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
1534  source.append(" unsigned int offset_Ap, \n");
1535  source.append(" unsigned int size, \n");
1536  source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
1537  source.append(" unsigned int buffer_size, \n");
1538  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
1539  source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
1540  source.append("{ \n");
1541  source.append(" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1542  source.append("} \n \n");
1543 }
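
All gmres_*_prod kernels above are thin wrappers: the CG kernels are emitted into the same program string earlier, so their bodies are callable as device functions, and the GMRES variants merely shift the p and Ap base pointers by the offsets of the current Krylov vectors before delegating. The offsets travel as separate arguments because a host-side cl_mem handle carries no pointer arithmetic; adding the offset on the device is the portable way to address a sub-vector. Schematically:

    //   __kernel void gmres_ell_prod(..., p, offset_p, Ap, offset_Ap, ...)
    //   {
    //     cg_ell_prod(..., p + offset_p, Ap + offset_Ap, ...);
    //   }
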
1544 
1545 
1546 
1547 
1549 
1550 // main kernel class
1551 /** @brief Main kernel class for generating specialized OpenCL kernels for fast iterative solvers. */
1552 template<typename NumericT>
1553 struct iterative
1554 {
1555  static std::string program_name()
1556  {
1557  return viennacl::ocl::type_to_string<NumericT>::apply() + "_iterative";
1558  }
1559 
1560  static void init(viennacl::ocl::context & ctx)
1561  {
1562  static std::map<cl_context, bool> init_done;
1563  if (!init_done[ctx.handle().get()])
1564  {
1565  viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
1566  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
1567 
1568  std::string source;
1569  source.reserve(1024);
1570 
1571  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1572 
1573  generate_pipelined_cg_vector_update(source, numeric_string);
1574  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1575  generate_compressed_matrix_pipelined_cg_blocked_prod(source, numeric_string, 16);
1576  generate_compressed_matrix_pipelined_cg_prod(source, numeric_string);
1577  generate_coordinate_matrix_pipelined_cg_prod(source, numeric_string);
1578  generate_ell_matrix_pipelined_cg_prod(source, numeric_string);
1579  generate_sliced_ell_matrix_pipelined_cg_prod(source, numeric_string);
1580  generate_hyb_matrix_pipelined_cg_prod(source, numeric_string);
1581 
1582  generate_pipelined_bicgstab_update_s(source, numeric_string);
1583  generate_pipelined_bicgstab_vector_update(source, numeric_string);
1584  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1585  generate_compressed_matrix_pipelined_bicgstab_blocked_prod(source, numeric_string, 16);
1586  generate_compressed_matrix_pipelined_bicgstab_prod(source, numeric_string);
1587  generate_coordinate_matrix_pipelined_bicgstab_prod(source, numeric_string);
1588  generate_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
1589  generate_sliced_ell_matrix_pipelined_bicgstab_prod(source, numeric_string);
1590  generate_hyb_matrix_pipelined_bicgstab_prod(source, numeric_string);
1591 
1592  generate_pipelined_gmres_gram_schmidt_stage1(source, numeric_string, ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id); // NVIDIA GPUs require special treatment here
1593  generate_pipelined_gmres_gram_schmidt_stage2(source, numeric_string);
1594  generate_pipelined_gmres_normalize_vk(source, numeric_string);
1595  generate_pipelined_gmres_update_result(source, numeric_string);
1596  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
1597  generate_compressed_matrix_pipelined_gmres_blocked_prod(source, numeric_string);
1598  generate_compressed_matrix_pipelined_gmres_prod(source, numeric_string);
1599  generate_coordinate_matrix_pipelined_gmres_prod(source, numeric_string);
1600  generate_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1601  generate_sliced_ell_matrix_pipelined_gmres_prod(source, numeric_string);
1602  generate_hyb_matrix_pipelined_gmres_prod(source, numeric_string);
1603 
1604  std::string prog_name = program_name();
1605  #ifdef VIENNACL_BUILD_INFO
1606  std::cout << "Creating program " << prog_name << std::endl;
1607  #endif
1608  ctx.add_program(source, prog_name);
1609  init_done[ctx.handle().get()] = true;
1610  } //if
1611  } //init
1612 };
1613 
1614 } // namespace kernels
1615 } // namespace opencl
1616 } // namespace linalg
1617 } // namespace viennacl
1618 #endif
1619 
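
For completeness, a usage sketch showing how the kernels generated by this file are typically compiled and retrieved, assuming the OpenCL backend is active (error handling and kernel arguments omitted):

    #include "viennacl/linalg/opencl/kernels/iterative.hpp"
    #include "viennacl/ocl/backend.hpp"

    void run_cg_vector_update_example()
    {
      // Compile the program for the current context (no-op if already done):
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();
      viennacl::linalg::opencl::kernels::iterative<float>::init(ctx);

      // Fetch one of the kernels generated above by name:
      viennacl::ocl::kernel & k =
          ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<float>::program_name(),
                         "cg_vector_update");
      (void)k; // in real code: set arguments, then viennacl::ocl::enqueue(k(...));
    }
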