1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
38 template <
typename StringType>
41 source.append(
"__kernel void scan_1(__global "); source.append(numeric_string); source.append(
"* X, \n");
42 source.append(
" unsigned int startX, \n");
43 source.append(
" unsigned int incX, \n");
44 source.append(
" unsigned int sizeX, \n");
46 source.append(
" __global "); source.append(numeric_string); source.append(
"* Y, \n");
47 source.append(
" unsigned int startY, \n");
48 source.append(
" unsigned int incY, \n");
50 source.append(
" unsigned int scan_offset, \n");
51 source.append(
" __global "); source.append(numeric_string); source.append(
"* carries) { \n");
53 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_buffer[256]; \n");
54 source.append(
" "); source.append(numeric_string); source.append(
" my_value; \n");
56 source.append(
" unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
57 source.append(
" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
58 source.append(
" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
59 source.append(
" unsigned int block_offset = 0; \n");
62 source.append(
" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");
65 source.append(
" my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");
68 source.append(
" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
69 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
70 source.append(
" shared_buffer[get_local_id(0)] = my_value; \n");
71 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
72 source.append(
" if (get_local_id(0) >= stride) \n");
73 source.append(
" my_value += shared_buffer[get_local_id(0) - stride]; \n");
74 source.append(
" } \n");
75 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
76 source.append(
" shared_buffer[get_local_id(0)] = my_value; \n");
77 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
80 source.append(
" if (scan_offset > 0) \n");
81 source.append(
" my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
83 source.append(
" if (i < sizeX) \n");
84 source.append(
" Y[i * incY + startY] = block_offset + my_value; \n");
86 source.append(
" block_offset += shared_buffer[get_local_size(0)-1]; \n");
87 source.append(
" } \n");
90 source.append(
" if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");
92 source.append(
"} \n");
95 template <
typename StringType>
98 source.append(
"__kernel void scan_2(__global "); source.append(numeric_string); source.append(
"* carries) { \n");
100 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_buffer[256]; \n");
103 source.append(
" "); source.append(numeric_string); source.append(
" my_carry = carries[get_local_id(0)]; \n");
106 source.append(
" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
107 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
108 source.append(
" shared_buffer[get_local_id(0)] = my_carry; \n");
109 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
110 source.append(
" if (get_local_id(0) >= stride) \n");
111 source.append(
" my_carry += shared_buffer[get_local_id(0) - stride]; \n");
112 source.append(
" } \n");
113 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
114 source.append(
" shared_buffer[get_local_id(0)] = my_carry; \n");
115 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
118 source.append(
" carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
120 source.append(
"} \n");
123 template <
typename StringType>
126 source.append(
"__kernel void scan_3(__global "); source.append(numeric_string); source.append(
" * Y, \n");
127 source.append(
" unsigned int startY, \n");
128 source.append(
" unsigned int incY, \n");
129 source.append(
" unsigned int sizeY, \n");
131 source.append(
" __global "); source.append(numeric_string); source.append(
"* carries) { \n");
133 source.append(
" unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
134 source.append(
" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
135 source.append(
" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
137 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_offset; \n");
139 source.append(
" if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
140 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
142 source.append(
" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
143 source.append(
" if (i < sizeY) \n");
144 source.append(
" Y[i * incY + startY] += shared_offset; \n");
146 source.append(
"} \n");
154 template<
typename NumericT>
164 static std::map<cl_context, bool> init_done;
171 source.reserve(1024);
173 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
180 #ifdef VIENNACL_BUILD_INFO
181 std::cout <<
"Creating program " << prog_name << std::endl;
183 ctx.add_program(source, prog_name);
184 init_done[ctx.handle().get()] =
true;
void generate_scan_kernel_3(StringType &source, std::string const &numeric_string)
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
static void init(viennacl::ocl::context &ctx)
Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices...
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
static std::string program_name()
Representation of an OpenCL kernel in ViennaCL.
void generate_scan_kernel_1(StringType &source, std::string const &numeric_string)
void generate_scan_kernel_2(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.