1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
41 template<
typename StringT>
45 source.append(
"__kernel void vec_mul_alpha_beta( \n");
47 source.append(
"__kernel void vec_mul( \n");
48 source.append(
" const __global int* ell_coords, \n");
49 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
50 source.append(
" const __global uint* csr_rows, \n");
51 source.append(
" const __global uint* csr_cols, \n");
52 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
53 source.append(
" const __global "); source.append(numeric_string); source.append(
" * x, \n");
54 source.append(
" uint4 layout_x, \n");
55 if (with_alpha_beta) { source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n"); }
56 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
57 source.append(
" uint4 layout_result, \n");
58 if (with_alpha_beta) { source.append(
" "); source.append(numeric_string); source.append(
" beta, \n"); }
59 source.append(
" unsigned int row_num, \n");
60 source.append(
" unsigned int internal_row_num, \n");
61 source.append(
" unsigned int items_per_row, \n");
62 source.append(
" unsigned int aligned_items_per_row) \n");
63 source.append(
"{ \n");
64 source.append(
" uint glb_id = get_global_id(0); \n");
65 source.append(
" uint glb_sz = get_global_size(0); \n");
67 source.append(
" for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
68 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
70 source.append(
" uint offset = row_id; \n");
71 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
72 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
74 source.append(
" if (val != ("); source.append(numeric_string); source.append(
")0) { \n");
75 source.append(
" int col = ell_coords[offset]; \n");
76 source.append(
" sum += (x[col * layout_x.y + layout_x.x] * val); \n");
77 source.append(
" } \n");
79 source.append(
" } \n");
81 source.append(
" uint col_begin = csr_rows[row_id]; \n");
82 source.append(
" uint col_end = csr_rows[row_id + 1]; \n");
84 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
85 source.append(
" sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
86 source.append(
" } \n");
89 source.append(
" result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
91 source.append(
" result[row_id * layout_result.y + layout_result.x] = sum; \n");
92 source.append(
" } \n");
93 source.append(
"} \n");
98 template<
typename StringT>
100 bool B_transposed,
bool B_row_major,
bool C_row_major)
102 source.append(
"__kernel void ");
104 source.append(
"( \n");
105 source.append(
" const __global int* ell_coords, \n");
106 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
107 source.append(
" const __global uint* csr_rows, \n");
108 source.append(
" const __global uint* csr_cols, \n");
109 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
110 source.append(
" unsigned int row_num, \n");
111 source.append(
" unsigned int internal_row_num, \n");
112 source.append(
" unsigned int items_per_row, \n");
113 source.append(
" unsigned int aligned_items_per_row, \n");
114 source.append(
" __global const "); source.append(numeric_string); source.append(
"* d_mat, \n");
115 source.append(
" unsigned int d_mat_row_start, \n");
116 source.append(
" unsigned int d_mat_col_start, \n");
117 source.append(
" unsigned int d_mat_row_inc, \n");
118 source.append(
" unsigned int d_mat_col_inc, \n");
119 source.append(
" unsigned int d_mat_row_size, \n");
120 source.append(
" unsigned int d_mat_col_size, \n");
121 source.append(
" unsigned int d_mat_internal_rows, \n");
122 source.append(
" unsigned int d_mat_internal_cols, \n");
123 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
124 source.append(
" unsigned int result_row_start, \n");
125 source.append(
" unsigned int result_col_start, \n");
126 source.append(
" unsigned int result_row_inc, \n");
127 source.append(
" unsigned int result_col_inc, \n");
128 source.append(
" unsigned int result_row_size, \n");
129 source.append(
" unsigned int result_col_size, \n");
130 source.append(
" unsigned int result_internal_rows, \n");
131 source.append(
" unsigned int result_internal_cols) { \n");
133 source.append(
" uint glb_id = get_global_id(0); \n");
134 source.append(
" uint glb_sz = get_global_size(0); \n");
136 source.append(
" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
137 source.append(
" for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
138 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
140 source.append(
" uint offset = row_id; \n");
141 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
142 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
144 source.append(
" if (val != ("); source.append(numeric_string); source.append(
")0) { \n");
145 source.append(
" int col = ell_coords[offset]; \n");
146 if (B_transposed && B_row_major)
147 source.append(
" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ] * val; \n");
148 else if (B_transposed && !B_row_major)
149 source.append(
" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
150 else if (!B_transposed && B_row_major)
151 source.append(
" sum += d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] * val; \n");
153 source.append(
" sum += d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
154 source.append(
" } \n");
156 source.append(
" } \n");
158 source.append(
" uint col_begin = csr_rows[row_id]; \n");
159 source.append(
" uint col_end = csr_rows[row_id + 1]; \n");
161 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
162 if (B_transposed && B_row_major)
163 source.append(
" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + csr_cols[item_id] * d_mat_col_inc ] * csr_elements[item_id]; \n");
164 else if (B_transposed && !B_row_major)
165 source.append(
" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
166 else if (!B_transposed && B_row_major)
167 source.append(
" sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] * csr_elements[item_id]; \n");
169 source.append(
" sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
170 source.append(
" } \n");
173 source.append(
" result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
175 source.append(
" result[ (result_row_start + row_id * result_row_inc) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
176 source.append(
" } \n");
177 source.append(
" } \n");
178 source.append(
"} \n");
182 template<
typename StringT>
200 template<
typename NumericT>
210 static std::map<cl_context, bool> init_done;
217 source.reserve(1024);
219 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
226 #ifdef VIENNACL_BUILD_INFO
227 std::cout <<
"Creating program " << prog_name << std::endl;
229 ctx.add_program(source, prog_name);
230 init_done[ctx.handle().get()] =
true;
void generate_hyb_vec_mul(StringT &source, std::string const &numeric_string, bool with_alpha_beta)
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Common implementations shared by OpenCL-based operations.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
static std::string program_name()
void generate_hyb_matrix_dense_matrix_mul(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Representation of an OpenCL kernel in ViennaCL.
static void init(viennacl::ocl::context &ctx)
Helper class for converting a type to its string representation.
void generate_hyb_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Main kernel class for generating OpenCL kernels for hyb_matrix.