1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
/** @brief Appends the OpenCL source of the Bluestein post-processing kernel to 'source'.
*
* The generated kernel 'bluestein_post' multiplies every entry Z[i] (complex, stored as a
* two-component vector) by the chirp factor (cos(a), sin(a)) with a = -pi * (i*i mod 2*size) / size
* and stores the product in out[i].
*
* Fix over the previous kernel text: i*i used to be evaluated in 32-bit 'unsigned int', which wraps
* for i >= 65536 and yields a wrong remainder whenever 2*size is not a power of two. The product is
* now formed in 64-bit 'ulong' before the modulo. This needs 64-bit integer support on the device.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" unsigned int double_size = size << 1; \n");
  source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");

  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  // 64-bit product: a 32-bit i*i wraps for i >= 65536 and corrupts the remainder
  source.append(" unsigned int rm = (unsigned int)(((ulong)i * i) % double_size); \n");
  source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");

  source.append(" sn_a = sincos(angle, &cs_a); \n");

  source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
  source.append(" out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the Bluestein pre-processing kernel to 'source'.
*
* The generated kernel 'bluestein_pre' computes, for every i < size, the angle
* a = pi * (i*i mod 2*size) / size and the chirp factors a_i = (cos(a), -sin(a)) and
* b_i = (cos(a), sin(a)). It writes A[i] = input[i] * a_i (complex product), B[i] = b_i and,
* for i != 0, the mirrored entry B[ext_size - i] = b_i.
*
* Fix over the previous kernel text: i*i used to be evaluated in 32-bit 'unsigned int', which wraps
* for i >= 65536 and yields a wrong remainder whenever 2*size is not a power of two. The product is
* now formed in 64-bit 'ulong' before the modulo. This needs 64-bit integer support on the device.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *A, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *B, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int ext_size \n");
  source.append(" ) { \n");
  source.append(" unsigned int glb_id = get_global_id(0); \n");
  source.append(" unsigned int glb_sz = get_global_size(0); \n");

  source.append(" unsigned int double_size = size << 1; \n");

  source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
  source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");

  source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
  // 64-bit product: a 32-bit i*i wraps for i >= 65536 and corrupts the remainder
  source.append(" unsigned int rm = (unsigned int)(((ulong)i * i) % double_size); \n");
  source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");

  source.append(" sn_a = sincos(-angle, &cs_a); \n");

  source.append(" "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
  source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");

  source.append(" A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
  source.append(" B[i] = b_i; \n");

  // B is filled symmetrically; index 0 has no mirror partner
  source.append(" if (i) \n");
  source.append(" B[ext_size - i] = b_i; \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel extracting the real part of a complex array.
*
* The generated kernel 'complex_to_real' copies the .x component of each two-component entry
* in[i] to out[i] for all i < size.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *out, \n");
  source.append(" unsigned int size) { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
  source.append(" out[i] = in[i].x; \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel dividing a complex array by a real scalar.
*
* The generated kernel 'fft_div_vec_scalar' performs input1[i] /= factor in place for all i < size,
* where input1 holds two-component entries and factor is a scalar of type numeric_string.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append(" unsigned int size, \n");
  source.append(" "); source.append(numeric_string); source.append(" factor) { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
  source.append(" input1[i] /= factor; \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel for the elementwise product of two complex vectors.
*
* The generated kernel 'fft_mult_vec' writes output[i] = input1[i] * input2[i] (complex
* multiplication on two-component entries) for all i < size.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append(" __global const "); source.append(numeric_string); source.append("2 *input2, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
  source.append(" unsigned int size) { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
  source.append(" "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");

  // (a + ib)(c + id) = (ac - bd) + i(ad + bc)
  source.append(" output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel embedding a real-valued vector into a complex one.
*
* The generated kernel 'real_to_complex' writes, for all i < size, a two-component value to out[i]
* whose .x component is in[i] and whose other component is zero.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
  source.append(" unsigned int size) { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append("2 val = 0; \n");
  source.append(" val.x = in[i]; \n");
  source.append(" out[i] = val; \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel reversing the entries of a vector in place.
*
* The generated kernel 'reverse_inplace' swaps vec[i] with vec[size - i - 1] for all
* i < size / 2, so for odd sizes the middle element is left untouched.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
  source.append(" for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");

  source.append(" vec[i] = val2; \n");
  source.append(" vec[size - i - 1] = val1; \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of a simplistic out-of-place matrix transpose kernel.
*
* The generated kernel 'transpose' treats 'input' as a row_num x col_num array of two-component
* entries stored row by row: the entry at linear index i = row * col_num + col is written to
* output[col * row_num + row].
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_transpose(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
  source.append(" unsigned int row_num, \n");
  source.append(" unsigned int col_num) { \n");
  source.append(" unsigned int size = row_num * col_num; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
  source.append(" unsigned int row = i / col_num; \n");
  source.append(" unsigned int col = i - row*col_num; \n");

  source.append(" unsigned int new_pos = col * row_num + row; \n");

  source.append(" output[new_pos] = input[i]; \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of a simplistic in-place matrix transpose kernel.
*
* The generated kernel 'transpose_inplace' maps each linear index i = row * col_num + col to
* new_pos = col * row_num + row and swaps input[i] with input[new_pos] whenever i < new_pos.
* Because entries are exchanged pairwise, this yields a transpose only when the mapping is its own
* inverse, i.e. for square matrices (row_num == col_num).
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
  source.append(" unsigned int row_num, \n");
  source.append(" unsigned int col_num) { \n");
  source.append(" unsigned int size = row_num * col_num; \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
  source.append(" unsigned int row = i / col_num; \n");
  source.append(" unsigned int col = i - row*col_num; \n");

  source.append(" unsigned int new_pos = col * row_num + row; \n");

  // only the lower-indexed partner of each pair performs the swap
  source.append(" if (i < new_pos) { \n");
  source.append(" "); source.append(numeric_string); source.append("2 val = input[i]; \n");
  source.append(" input[i] = input[new_pos]; \n");
  source.append(" input[new_pos] = val; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel computing a matrix-vector product with a Vandermonde matrix.
*
* The generated kernel 'vandermonde_prod' evaluates, for every i < size,
* result[i] = sum_{j < size} vander[i]^j * vector[j], building the powers of vander[i] incrementally.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *vector, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *result, \n");
  source.append(" uint size) { \n");
  source.append(" for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
  source.append(" "); source.append(numeric_string); source.append(" pwr = 1; \n");
  source.append(" "); source.append(numeric_string); source.append(" val = 0; \n");

  source.append(" for (uint j = 0; j < size; j++) { \n");
  source.append(" val = val + pwr * vector[j]; \n");
  source.append(" pwr *= mul; \n");
  source.append(" } \n");

  source.append(" result[i] = val; \n");
  source.append(" } \n");
  source.append("} \n");
}
/** @brief Appends the OpenCL source of the kernel zeroing two complex vectors in one launch.
*
* The generated kernel 'zero2' assigns 0 to input1[i] and input2[i] for all i < size.
*
* @param source          String-like object (must provide append()) that receives the kernel source
* @param numeric_string  OpenCL scalar type name used in the kernel, e.g. "float" or "double"
*/
template<typename StringT>
void generate_fft_zero2(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
  source.append(" __global "); source.append(numeric_string); source.append("2 *input2, \n");
  source.append(" unsigned int size) { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
  source.append(" input1[i] = 0; \n");
  source.append(" input2[i] = 0; \n");
  source.append(" } \n");
  source.append("} \n");
}
// Fragment of the per-numeric-type kernel program setup (templated on NumericT).
// NOTE(review): the embedded listing numbers jump (259 -> 269 -> 276 -> ... -> 301), so the struct
// header, the enclosing function signature, the declarations of 'source', 'numeric_string' and
// 'prog_name', the body of the 'if' below, the closing '#endif' and all braces are not visible here.
// Restore them from the upstream file before compiling.
259 template<
typename NumericT>
// Function-local static table keyed by the raw cl_context; see the final statement, which sets the
// entry for the current context to true.
269 static std::map<cl_context, bool> init_done;
// Reserve capacity up front for the accumulated program source string.
276 source.reserve(8192);
// Presumably prepends the fp64 extension pragma when NumericT requires it - confirm against the
// helper's definition, which is not part of this listing.
278 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Guard on the scalar type name; the guarded statements are missing from this listing
// (presumably the calls to the generate_fft_* functions - TODO confirm).
281 if (numeric_string ==
"float" || numeric_string ==
"double")
// Optional diagnostic output, compiled in only when VIENNACL_BUILD_INFO is defined.
297 #ifdef VIENNACL_BUILD_INFO
298 std::cout <<
"Creating program " << prog_name << std::endl;
// Hand the accumulated source to the context under the name 'prog_name', then record this
// context handle in init_done.
300 ctx.add_program(source, prog_name);
301 init_done[ctx.handle().get()] =
true;
void generate_fft_div_vec_scalar(StringT &source, std::string const &numeric_string)
OpenCL kernel generation code for dividing a complex number by a real number.
Main kernel class for generating OpenCL kernels for the fast Fourier transform.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
static std::string program_name()
void generate_fft_bluestein_post(StringT &source, std::string const &numeric_string)
Provides OpenCL-related utilities.
void generate_fft_reverse_inplace(StringT &source, std::string const &numeric_string)
Reverses the entries in a vector.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_fft_complex_to_real(StringT &source, std::string const &numeric_string)
Extract real part of a complex number array.
static void apply(viennacl::ocl::context const &)
const OCL_TYPE & get() const
void generate_fft_zero2(StringT &source, std::string const &numeric_string)
Zero two complex vectors (to avoid kernel launch overhead)
void generate_fft_mult_vec(StringT &source, std::string const &numeric_string)
Elementwise product of two complex vectors.
void generate_fft_transpose_inplace(StringT &source, std::string const &numeric_string)
Simplistic inplace matrix transpose function.
void generate_fft_bluestein_pre(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_fft_vandermonde_prod(StringT &source, std::string const &numeric_string)
Computes the matrix vector product with a Vandermonde matrix.
void generate_fft_real_to_complex(StringT &source, std::string const &numeric_string)
Embeds a real-valued vector into a complex one.
Helper class for converting a type to its string representation.
static void init(viennacl::ocl::context &ctx)
void generate_fft_transpose(StringT &source, std::string const &numeric_string)
Simplistic matrix transpose function.