1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
36 template<
typename StringT>
39 source.append(
"__kernel void level_scheduling_substitute( \n");
40 source.append(
" __global const unsigned int * row_index_array, \n");
41 source.append(
" __global const unsigned int * row_indices, \n");
42 source.append(
" __global const unsigned int * column_indices, \n");
43 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
44 source.append(
" __global "); source.append(numeric_string); source.append(
" * vec, \n");
45 source.append(
" unsigned int size) \n");
46 source.append(
"{ \n");
47 source.append(
" for (unsigned int row = get_global_id(0); \n");
48 source.append(
" row < size; \n");
49 source.append(
" row += get_global_size(0)) \n");
50 source.append(
" { \n");
51 source.append(
" unsigned int eq_row = row_index_array[row]; \n");
52 source.append(
" "); source.append(numeric_string); source.append(
" vec_entry = vec[eq_row]; \n");
53 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
55 source.append(
" for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
56 source.append(
" vec_entry -= vec[column_indices[j]] * elements[j]; \n");
58 source.append(
" vec[eq_row] = vec_entry; \n");
59 source.append(
" } \n");
60 source.append(
"} \n");
66 template<
typename StringT>
69 source.append(
"__kernel void extract_L_1( \n");
70 source.append(
" __global unsigned int const *A_row_indices, \n");
71 source.append(
" __global unsigned int const *A_col_indices, \n");
72 source.append(
" unsigned int A_size1, \n");
73 source.append(
" __global unsigned int *L_row_indices) { \n");
75 source.append(
" for (unsigned int row = get_global_id(0); \n");
76 source.append(
" row < A_size1; \n");
77 source.append(
" row += get_global_size(0)) \n");
78 source.append(
" { \n");
79 source.append(
" unsigned int row_begin = A_row_indices[row]; \n");
80 source.append(
" unsigned int row_end = A_row_indices[row+1]; \n");
82 source.append(
" unsigned int num_entries_L = 0; \n");
83 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
84 source.append(
" unsigned int col = A_col_indices[j]; \n");
85 source.append(
" if (col <= row) ++num_entries_L; \n");
86 source.append(
" } \n");
88 source.append(
" L_row_indices[row] = num_entries_L; \n");
89 source.append(
" } \n");
90 source.append(
"} \n");
93 template<
typename StringT>
96 source.append(
"__kernel void extract_L_2( \n");
97 source.append(
" __global unsigned int const *A_row_indices, \n");
98 source.append(
" __global unsigned int const *A_col_indices, \n");
99 source.append(
" __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
100 source.append(
" unsigned int A_size1, \n");
101 source.append(
" __global unsigned int const *L_row_indices, \n");
102 source.append(
" __global unsigned int *L_col_indices, \n");
103 source.append(
" __global "); source.append(numeric_string); source.append(
" *L_elements) { \n");
105 source.append(
" for (unsigned int row = get_global_id(0); \n");
106 source.append(
" row < A_size1; \n");
107 source.append(
" row += get_global_size(0)) \n");
108 source.append(
" { \n");
109 source.append(
" unsigned int row_begin = A_row_indices[row]; \n");
110 source.append(
" unsigned int row_end = A_row_indices[row+1]; \n");
112 source.append(
" unsigned int index_L = L_row_indices[row]; \n");
113 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
114 source.append(
" unsigned int col = A_col_indices[j]; \n");
115 source.append(
" "); source.append(numeric_string); source.append(
" value = A_elements[j]; \n");
117 source.append(
" if (col <= row) { \n");
118 source.append(
" L_col_indices[index_L] = col; \n");
119 source.append(
" L_elements[index_L] = value; \n");
120 source.append(
" ++index_L; \n");
121 source.append(
" } \n");
122 source.append(
" } \n");
124 source.append(
" } \n");
125 source.append(
"} \n");
129 template<
typename StringT>
132 source.append(
"__kernel void icc_chow_patel_sweep_kernel( \n");
133 source.append(
" __global unsigned int const *L_row_indices, \n");
134 source.append(
" __global unsigned int const *L_col_indices, \n");
135 source.append(
" __global "); source.append(numeric_string); source.append(
" *L_elements, \n");
136 source.append(
" __global "); source.append(numeric_string); source.append(
" const *L_backup, \n");
137 source.append(
" unsigned int L_size1, \n");
139 source.append(
" __global "); source.append(numeric_string); source.append(
" const *aij_L) { \n");
141 source.append(
" for (unsigned int row = get_global_id(0); \n");
142 source.append(
" row < L_size1; \n");
143 source.append(
" row += get_global_size(0)) \n");
144 source.append(
" { \n");
149 source.append(
" unsigned int row_Li_start = L_row_indices[row]; \n");
150 source.append(
" unsigned int row_Li_end = L_row_indices[row + 1]; \n");
152 source.append(
" for (unsigned int i = row_Li_start; i < row_Li_end; ++i) { \n");
153 source.append(
" unsigned int col = L_col_indices[i]; \n");
155 source.append(
" unsigned int row_Lj_start = L_row_indices[col]; \n");
156 source.append(
" unsigned int row_Lj_end = L_row_indices[col + 1]; \n");
158 source.append(
" unsigned int index_Lj = row_Lj_start; \n");
159 source.append(
" unsigned int col_Lj = L_col_indices[index_Lj]; \n");
161 source.append(
" "); source.append(numeric_string); source.append(
" s = aij_L[i]; \n");
162 source.append(
" for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) { \n");
163 source.append(
" unsigned int col_Li = L_col_indices[index_Li]; \n");
165 source.append(
" while (col_Lj < col_Li) { \n");
166 source.append(
" ++index_Lj; \n");
167 source.append(
" col_Lj = L_col_indices[index_Lj]; \n");
168 source.append(
" } \n");
170 source.append(
" if (col_Lj == col_Li) \n");
171 source.append(
" s -= L_backup[index_Li] * L_backup[index_Lj]; \n");
172 source.append(
" } \n");
175 source.append(
" L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); \n");
176 source.append(
" } \n");
178 source.append(
" } \n");
179 source.append(
"} \n");
185 template<
typename StringT>
188 source.append(
"__kernel void extract_LU_1( \n");
189 source.append(
" __global unsigned int const *A_row_indices, \n");
190 source.append(
" __global unsigned int const *A_col_indices, \n");
191 source.append(
" unsigned int A_size1, \n");
192 source.append(
" __global unsigned int *L_row_indices, \n");
193 source.append(
" __global unsigned int *U_row_indices) { \n");
195 source.append(
" for (unsigned int row = get_global_id(0); \n");
196 source.append(
" row < A_size1; \n");
197 source.append(
" row += get_global_size(0)) \n");
198 source.append(
" { \n");
199 source.append(
" unsigned int row_begin = A_row_indices[row]; \n");
200 source.append(
" unsigned int row_end = A_row_indices[row+1]; \n");
202 source.append(
" unsigned int num_entries_L = 0; \n");
203 source.append(
" unsigned int num_entries_U = 0; \n");
204 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
205 source.append(
" unsigned int col = A_col_indices[j]; \n");
206 source.append(
" if (col <= row) ++num_entries_L; \n");
207 source.append(
" if (col >= row) ++num_entries_U; \n");
208 source.append(
" } \n");
210 source.append(
" L_row_indices[row] = num_entries_L; \n");
211 source.append(
" U_row_indices[row] = num_entries_U; \n");
212 source.append(
" } \n");
213 source.append(
"} \n");
216 template<
typename StringT>
219 source.append(
"__kernel void extract_LU_2( \n");
220 source.append(
" __global unsigned int const *A_row_indices, \n");
221 source.append(
" __global unsigned int const *A_col_indices, \n");
222 source.append(
" __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
223 source.append(
" unsigned int A_size1, \n");
224 source.append(
" __global unsigned int const *L_row_indices, \n");
225 source.append(
" __global unsigned int *L_col_indices, \n");
226 source.append(
" __global "); source.append(numeric_string); source.append(
" *L_elements, \n");
227 source.append(
" __global unsigned int const *U_row_indices, \n");
228 source.append(
" __global unsigned int *U_col_indices, \n");
229 source.append(
" __global "); source.append(numeric_string); source.append(
" *U_elements) { \n");
231 source.append(
" for (unsigned int row = get_global_id(0); \n");
232 source.append(
" row < A_size1; \n");
233 source.append(
" row += get_global_size(0)) \n");
234 source.append(
" { \n");
235 source.append(
" unsigned int row_begin = A_row_indices[row]; \n");
236 source.append(
" unsigned int row_end = A_row_indices[row+1]; \n");
238 source.append(
" unsigned int index_L = L_row_indices[row]; \n");
239 source.append(
" unsigned int index_U = U_row_indices[row]; \n");
240 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
241 source.append(
" unsigned int col = A_col_indices[j]; \n");
242 source.append(
" "); source.append(numeric_string); source.append(
" value = A_elements[j]; \n");
244 source.append(
" if (col <= row) { \n");
245 source.append(
" L_col_indices[index_L] = col; \n");
246 source.append(
" L_elements[index_L] = value; \n");
247 source.append(
" ++index_L; \n");
248 source.append(
" } \n");
249 source.append(
" if (col >= row) { \n");
250 source.append(
" U_col_indices[index_U] = col; \n");
251 source.append(
" U_elements[index_U] = value; \n");
252 source.append(
" ++index_U; \n");
253 source.append(
" } \n");
254 source.append(
" } \n");
256 source.append(
" } \n");
257 source.append(
"} \n");
260 template<
typename StringT>
263 source.append(
"__kernel void ilu_scale_kernel_1( \n");
264 source.append(
" __global unsigned int const *A_row_indices, \n");
265 source.append(
" __global unsigned int const *A_col_indices, \n");
266 source.append(
" __global "); source.append(numeric_string); source.append(
" const *A_elements, \n");
267 source.append(
" unsigned int A_size1, \n");
268 source.append(
" __global "); source.append(numeric_string); source.append(
" *D_elements) { \n");
270 source.append(
" for (unsigned int row = get_global_id(0); \n");
271 source.append(
" row < A_size1; \n");
272 source.append(
" row += get_global_size(0)) \n");
273 source.append(
" { \n");
274 source.append(
" unsigned int row_begin = A_row_indices[row]; \n");
275 source.append(
" unsigned int row_end = A_row_indices[row+1]; \n");
277 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
278 source.append(
" unsigned int col = A_col_indices[j]; \n");
280 source.append(
" if (col == row) { \n");
281 source.append(
" D_elements[row] = 1 / sqrt(fabs(A_elements[j])); \n");
282 source.append(
" break; \n");
283 source.append(
" } \n");
284 source.append(
" } \n");
286 source.append(
" } \n");
287 source.append(
"} \n");
290 template<
typename StringT>
293 source.append(
"__kernel void ilu_scale_kernel_2( \n");
294 source.append(
" __global unsigned int const *R_row_indices, \n");
295 source.append(
" __global unsigned int const *R_col_indices, \n");
296 source.append(
" __global "); source.append(numeric_string); source.append(
" *R_elements, \n");
297 source.append(
" unsigned int R_size1, \n");
298 source.append(
" __global "); source.append(numeric_string); source.append(
" const *D_elements) { \n");
300 source.append(
" for (unsigned int row = get_global_id(0); \n");
301 source.append(
" row < R_size1; \n");
302 source.append(
" row += get_global_size(0)) \n");
303 source.append(
" { \n");
304 source.append(
" unsigned int row_begin = R_row_indices[row]; \n");
305 source.append(
" unsigned int row_end = R_row_indices[row+1]; \n");
307 source.append(
" "); source.append(numeric_string); source.append(
" D_row = D_elements[row]; \n");
308 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) \n");
309 source.append(
" R_elements[j] *= D_row * D_elements[R_col_indices[j]]; \n");
311 source.append(
" } \n");
312 source.append(
"} \n");
315 template<
typename StringT>
318 source.append(
"__kernel void ilu_chow_patel_sweep_kernel( \n");
319 source.append(
" __global unsigned int const *L_row_indices, \n");
320 source.append(
" __global unsigned int const *L_col_indices, \n");
321 source.append(
" __global "); source.append(numeric_string); source.append(
" *L_elements, \n");
322 source.append(
" __global "); source.append(numeric_string); source.append(
" const *L_backup, \n");
323 source.append(
" unsigned int L_size1, \n");
325 source.append(
" __global "); source.append(numeric_string); source.append(
" const *aij_L, \n");
327 source.append(
" __global unsigned int const *U_trans_row_indices, \n");
328 source.append(
" __global unsigned int const *U_trans_col_indices, \n");
329 source.append(
" __global "); source.append(numeric_string); source.append(
" *U_trans_elements, \n");
330 source.append(
" __global "); source.append(numeric_string); source.append(
" const *U_trans_backup, \n");
332 source.append(
" __global "); source.append(numeric_string); source.append(
" const *aij_U_trans) { \n");
334 source.append(
" for (unsigned int row = get_global_id(0); \n");
335 source.append(
" row < L_size1; \n");
336 source.append(
" row += get_global_size(0)) \n");
337 source.append(
" { \n");
342 source.append(
" unsigned int row_L_start = L_row_indices[row]; \n");
343 source.append(
" unsigned int row_L_end = L_row_indices[row + 1]; \n");
345 source.append(
" for (unsigned int j = row_L_start; j < row_L_end; ++j) { \n");
346 source.append(
" unsigned int col = L_col_indices[j]; \n");
348 source.append(
" if (col == row) continue; \n");
350 source.append(
" unsigned int row_U_start = U_trans_row_indices[col]; \n");
351 source.append(
" unsigned int row_U_end = U_trans_row_indices[col + 1]; \n");
353 source.append(
" unsigned int index_U = row_U_start; \n");
354 source.append(
" unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; \n");
356 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
357 source.append(
" for (unsigned int k = row_L_start; k < j; ++k) { \n");
358 source.append(
" unsigned int col_L = L_col_indices[k]; \n");
360 source.append(
" while (col_U < col_L) { \n");
361 source.append(
" ++index_U; \n");
362 source.append(
" col_U = U_trans_col_indices[index_U]; \n");
363 source.append(
" } \n");
365 source.append(
" if (col_U == col_L) \n");
366 source.append(
" sum += L_backup[k] * U_trans_backup[index_U]; \n");
367 source.append(
" } \n");
370 source.append(
" L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; \n");
371 source.append(
" } \n");
376 source.append(
" unsigned int row_U_start = U_trans_row_indices[row]; \n");
377 source.append(
" unsigned int row_U_end = U_trans_row_indices[row + 1]; \n");
379 source.append(
" for (unsigned int j = row_U_start; j < row_U_end; ++j) { \n");
380 source.append(
" unsigned int col = U_trans_col_indices[j]; \n");
382 source.append(
" row_L_start = L_row_indices[col]; \n");
383 source.append(
" row_L_end = L_row_indices[col + 1]; \n");
386 source.append(
" unsigned int index_L = row_L_start; \n");
387 source.append(
" unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; \n");
388 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
389 source.append(
" for (unsigned int k = row_U_start; k < j; ++k) { \n");
390 source.append(
" unsigned int col_U = U_trans_col_indices[k]; \n");
393 source.append(
" while (col_L < col_U) { \n");
394 source.append(
" ++index_L; \n");
395 source.append(
" col_L = L_col_indices[index_L]; \n");
396 source.append(
" } \n");
398 source.append(
" if (col_U == col_L) \n");
399 source.append(
" sum += L_backup[index_L] * U_trans_backup[k]; \n");
400 source.append(
" } \n");
403 source.append(
" U_trans_elements[j] = aij_U_trans[j] - sum; \n");
404 source.append(
" } \n");
406 source.append(
" } \n");
407 source.append(
"} \n");
411 template<
typename StringT>
414 source.append(
"__kernel void ilu_form_neumann_matrix_kernel( \n");
415 source.append(
" __global unsigned int const *R_row_indices, \n");
416 source.append(
" __global unsigned int const *R_col_indices, \n");
417 source.append(
" __global "); source.append(numeric_string); source.append(
" *R_elements, \n");
418 source.append(
" unsigned int R_size1, \n");
419 source.append(
" __global "); source.append(numeric_string); source.append(
" *D_elements) { \n");
421 source.append(
" for (unsigned int row = get_global_id(0); \n");
422 source.append(
" row < R_size1; \n");
423 source.append(
" row += get_global_size(0)) \n");
424 source.append(
" { \n");
425 source.append(
" unsigned int row_begin = R_row_indices[row]; \n");
426 source.append(
" unsigned int row_end = R_row_indices[row+1]; \n");
429 source.append(
" "); source.append(numeric_string); source.append(
" diag = D_elements[row]; \n");
430 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
431 source.append(
" unsigned int col = R_col_indices[j]; \n");
432 source.append(
" if (col == row) { \n");
433 source.append(
" diag = R_elements[j]; \n");
434 source.append(
" R_elements[j] = 0; \n");
435 source.append(
" break; \n");
436 source.append(
" } \n");
437 source.append(
" } \n");
438 source.append(
" D_elements[row] = diag; \n");
441 source.append(
" for (unsigned int j=row_begin; j<row_end; ++j) \n");
442 source.append(
" R_elements[j] /= -diag; \n");
444 source.append(
" } \n");
445 source.append(
"} \n");
452 template<
class NumericT>
462 static std::map<cl_context, bool> init_done;
469 source.reserve(1024);
471 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
474 if (numeric_string ==
"float" || numeric_string ==
"double")
491 #ifdef VIENNACL_BUILD_INFO
492 std::cout <<
"Creating program " << prog_name << std::endl;
494 ctx.add_program(source, prog_name);
495 init_done[ctx.handle().get()] =
true;
void generate_ilu_level_scheduling_substitute(StringT &source, std::string const &numeric_string)
void generate_ilu_extract_LU_2(StringT &source, std::string const &numeric_string)
void generate_icc_extract_L_2(StringT &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_ilu_extract_LU_1(StringT &source)
static void apply(viennacl::ocl::context const &)
void generate_icc_extract_L_1(StringT &source)
const OCL_TYPE & get() const
Main kernel class for generating OpenCL kernels for incomplete LU factorization preconditioners.
void generate_ilu_chow_patel_sweep_kernel(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
void generate_ilu_scale_kernel_1(StringT &source, std::string const &numeric_string)
void generate_ilu_scale_kernel_2(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_icc_chow_patel_sweep_kernel(StringT &source, std::string const &numeric_string)
void generate_ilu_form_neumann_matrix_kernel(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
static std::string program_name()