1 #ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
47 namespace FFT_DATA_ORDER
86 v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
87 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
88 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
89 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
90 v = (v >> 16) | (v << 16);
91 v = v >> (32 - bit_size);
95 template<
typename NumericT,
unsigned int AlignmentV>
99 #ifdef VIENNACL_WITH_OPENMP
100 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
102 for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
105 input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
109 template<
typename NumericT>
113 #ifdef VIENNACL_WITH_OPENMP
114 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
116 for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
119 input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
123 template<
typename NumericT,
unsigned int AlignmentV>
127 #ifdef VIENNACL_WITH_OPENMP
128 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
130 for (
long i2 = 0; i2 < long(size); i2++)
133 in(i * 2) =
static_cast<NumericT>(std::real(input_complex[i]));
134 in(i * 2 + 1) =
static_cast<NumericT>(std::imag(input_complex[i]));
138 template<
typename NumericT>
142 #ifdef VIENNACL_WITH_OPENMP
143 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
145 for (
long i2 = 0; i2 < long(size * 2); i2 += 2)
148 input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
152 template<
typename NumericT>
155 #ifdef VIENNACL_WITH_OPENMP
156 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
158 for (
long i2 = 0; i2 < long(size); i2++)
161 in[i * 2] =
static_cast<NumericT>(std::real(input_complex[i]));
162 in[i * 2 + 1] =
static_cast<NumericT>(std::imag(input_complex[i]));
166 template<
typename NumericT>
170 std::vector<NumericT> temp(2 * size);
171 #ifdef VIENNACL_WITH_OPENMP
172 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
174 for (
long i2 = 0; i2 < long(size); i2++)
177 temp[i * 2] =
static_cast<NumericT>(std::real(input_complex[i]));
178 temp[i * 2 + 1] =
static_cast<NumericT>(std::imag(input_complex[i]));
183 template<
typename NumericT>
186 #ifdef VIENNACL_WITH_OPENMP
187 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
189 for (
long i2 = 0; i2 < long(size); i2++)
204 template<
typename NumericT>
205 void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
210 #ifdef VIENNACL_WITH_OPENMP
213 for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
218 std::complex<NumericT> f = 0;
221 std::complex<NumericT> input;
223 input = input_complex[batch_id * stride + n];
225 input = input_complex[n * stride + batch_id];
230 std::complex<NumericT> ex(cs, sn);
231 std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
232 input.real() * ex.imag() + input.imag() * ex.real());
236 output[batch_id * stride + k] = f;
238 output[k * stride + batch_id] = f;
250 template<
typename NumericT,
unsigned int AlignmentV>
257 std::vector<std::complex<NumericT> > input_complex(size * batch_num);
258 std::vector<std::complex<NumericT> > output(size * batch_num);
262 fft_direct(&input_complex[0], &output[0], size, stride, batch_num,
sign, data_order);
273 template<
typename NumericT,
unsigned int AlignmentV>
284 std::vector<std::complex<NumericT> > input_complex(size_mat);
285 std::vector<std::complex<NumericT> > output(size_mat);
287 NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in);
288 NumericT * data_B = detail::extract_raw_pointer<NumericT>(out);
292 fft_direct(&input_complex[0], &output[0], size, stride, batch_num,
sign, data_order);
301 template<
typename NumericT,
unsigned int AlignmentV>
306 std::vector<std::complex<NumericT> > input(size * batch_num);
308 #ifdef VIENNACL_WITH_OPENMP
309 #pragma omp parallel for
311 for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
321 std::complex<NumericT> tmp = input[batch_id * stride + i];
322 input[batch_id * stride + i] = input[batch_id * stride + v];
323 input[batch_id * stride + v] = tmp;
327 std::complex<NumericT> tmp = input[i * stride + batch_id];
328 input[i * stride + batch_id] = input[v * stride + batch_id];
329 input[v * stride + batch_id] = tmp;
341 template<
typename NumericT,
unsigned int AlignmentV>
347 NumericT * data = detail::extract_raw_pointer<NumericT>(in);
352 std::vector<std::complex<NumericT> > input(size_mat);
356 #ifdef VIENNACL_WITH_OPENMP
357 #pragma omp parallel for
359 for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
369 std::complex<NumericT> tmp = input[batch_id * stride + i];
370 input[batch_id * stride + i] = input[batch_id * stride + v];
371 input[batch_id * stride + v] = tmp;
374 std::complex<NumericT> tmp = input[i * stride + batch_id];
375 input[i * stride + batch_id] = input[v * stride + batch_id];
376 input[v * stride + batch_id] = tmp;
388 template<
typename NumericT>
400 #ifdef VIENNACL_WITH_OPENMP
401 #pragma omp parallel for
403 for (
long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
406 for (
vcl_size_t tid = 0; tid < half_size; tid++)
410 std::complex<NumericT> in1;
411 std::complex<NumericT> in2;
415 offset = batch_id * stride + pos;
416 in1 = input_complex[offset];
417 in2 = input_complex[offset + ss];
421 offset = pos * stride + batch_id;
422 in1 = input_complex[offset];
423 in2 = input_complex[offset + ss *
stride];
428 std::complex<NumericT> ex(cs, sn);
429 std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
430 in2.real() * ex.imag() + in2.imag() * ex.real());
432 input_complex[offset + ss] = in1 - tmp;
434 input_complex[offset + ss *
stride] = in1 - tmp;
435 input_complex[offset] = in1 + tmp;
446 template<
typename NumericT>
454 for (
vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
456 #ifdef VIENNACL_WITH_OPENMP
457 #pragma omp parallel for
459 for (
long p2 = 0; p2 < long(size); p2 += 1)
465 lcl_input[v] = input_complex[batch_id * stride + p];
467 lcl_input[v] = input_complex[p * stride + batch_id];
473 #ifdef VIENNACL_WITH_OPENMP
474 #pragma omp parallel for
476 for (
long tid2 = 0; tid2 < long(size)/2; tid2++)
480 vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
482 std::complex<NumericT> in1 = lcl_input[pos];
483 std::complex<NumericT> in2 = lcl_input[pos + ss];
489 std::complex<NumericT> ex(cs, sn);
491 std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
492 in2.real() * ex.imag() + in2.imag() * ex.real());
494 lcl_input[pos + ss] = in1 - tmp;
495 lcl_input[pos] = in1 + tmp;
499 #ifdef VIENNACL_WITH_OPENMP
500 #pragma omp parallel for
503 for (
long p2 = 0; p2 < long(size); p2 += 1)
507 input_complex[batch_id * stride + p] = lcl_input[p];
509 input_complex[p * stride + batch_id] = lcl_input[p];
524 template<
typename NumericT,
unsigned int AlignmentV>
532 std::vector<std::complex<NumericT> > input_complex(size * batch_num);
533 std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
542 viennacl::linalg::host_based::reorder<NumericT>(in,
size,
stride, bit_size, batch_num, data_order);
557 template<
typename NumericT,
unsigned int AlignmentV>
565 NumericT * data = detail::extract_raw_pointer<NumericT>(in);
571 std::vector<std::complex<NumericT> > input_complex(size_mat);
577 std::vector<std::complex<NumericT> > lcl_input(size_mat);
582 viennacl::linalg::host_based::reorder<NumericT>(in,
size,
stride, bit_size, batch_num, data_order);
598 template<
typename NumericT,
unsigned int AlignmentV>
609 std::vector<std::complex<NumericT> > input_complex(size);
610 std::vector<std::complex<NumericT> > output_complex(size);
612 std::vector<std::complex<NumericT> > A_complex(ext_size);
613 std::vector<std::complex<NumericT> > B_complex(ext_size);
614 std::vector<std::complex<NumericT> > Z_complex(ext_size);
617 #ifdef VIENNACL_WITH_OPENMP
618 #pragma omp parallel for
620 for (
long i2 = 0; i2 < long(ext_size); i2++)
630 #ifdef VIENNACL_WITH_OPENMP
631 #pragma omp parallel for
633 for (
long i2 = 0; i2 < long(size); i2++)
642 std::complex<NumericT> a_i(cs_a, sn_a);
643 std::complex<NumericT> b_i(cs_a, -sn_a);
645 A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
646 input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
651 B_complex[ext_size - i] = b_i;
662 #ifdef VIENNACL_WITH_OPENMP
663 #pragma omp parallel for
665 for (
long i2 = 0; i2 < long(size); i2++)
672 std::complex<NumericT> b_i(cs_a, sn_a);
673 output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
674 Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
683 template<
typename NumericT,
unsigned int AlignmentV>
689 input[i] /= norm_factor;
696 template<
typename NumericT,
unsigned int AlignmentV>
703 std::vector<std::complex<NumericT> > input1_complex(size);
704 std::vector<std::complex<NumericT> > input2_complex(size);
705 std::vector<std::complex<NumericT> > output_complex(size);
709 #ifdef VIENNACL_WITH_OPENMP
710 #pragma omp parallel for
712 for (
long i2 = 0; i2 < long(size); i2++)
715 std::complex<NumericT> in1 = input1_complex[i];
716 std::complex<NumericT> in2 = input2_complex[i];
717 output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(),
718 in1.real() * in2.imag() + in1.imag() * in2.real());
726 template<
typename NumericT,
unsigned int AlignmentV>
734 NumericT * data = detail::extract_raw_pointer<NumericT>(input);
736 std::vector<std::complex<NumericT> > input_complex(size);
739 #ifdef VIENNACL_WITH_OPENMP
740 #pragma omp parallel for
742 for (
long i2 = 0; i2 < long(size); i2++)
751 std::complex<NumericT> val = input_complex[i];
752 input_complex[i] = input_complex[new_pos];
753 input_complex[new_pos] = val;
763 template<
typename NumericT,
unsigned int AlignmentV>
772 NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input);
773 NumericT * data_B = detail::extract_raw_pointer<NumericT>(output);
775 std::vector<std::complex<NumericT> > input_complex(size);
778 std::vector<std::complex<NumericT> > output_complex(size);
779 #ifdef VIENNACL_WITH_OPENMP
780 #pragma omp parallel for
782 for (
long i2 = 0; i2 < long(size); i2++)
788 output_complex[new_pos] = input_complex[i];
796 template<
typename NumericT>
800 NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in);
801 NumericT * data_out = detail::extract_raw_pointer<NumericT>(out);
803 #ifdef VIENNACL_WITH_OPENMP
804 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
806 for (
long i2 = 0; i2 < long(size); i2++)
809 data_out[2*i ] = data_in[i];
817 template<
typename NumericT>
821 NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in);
822 NumericT * data_out = detail::extract_raw_pointer<NumericT>(out);
824 #ifdef VIENNACL_WITH_OPENMP
825 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
827 for (
long i = 0; i < long(size); i++)
828 data_out[i] = data_in[2*i];
834 template<
typename NumericT>
839 #ifdef VIENNACL_WITH_OPENMP
840 #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
842 for (
long i2 = 0; i2 < long(size); i2++)
848 in[size - i - 1] = val1;
void fft_radix2_local(std::complex< NumericT > *input_complex, std::complex< NumericT > *lcl_input, vcl_size_t batch_num, vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 algorithm for computing Fourier transformation. Kernel for computing bigger amount of data...
Implementation of the dense matrix class.
void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
void reverse(viennacl::vector_base< NumericT > &in)
Reverse vector to opposite order and save it in input vector.
void radix2(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 1D algorithm for computing Fourier transformation.
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
void real_to_complex(viennacl::vector_base< NumericT > const &in, viennacl::vector_base< NumericT > &out, vcl_size_t size)
Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imagina...
void multiply_complex(viennacl::vector< NumericT, AlignmentV > const &input1, viennacl::vector< NumericT, AlignmentV > const &input2, viennacl::vector< NumericT, AlignmentV > &output)
Complex multiplikation of two vectors.
void fft_direct(std::complex< NumericT > *input_complex, std::complex< NumericT > *output, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct algoritm kenrnel.
vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
void copy_to_complex_array(std::complex< NumericT > *input_complex, viennacl::vector< NumericT, AlignmentV > const &in, vcl_size_t size)
void bluestein(viennacl::vector< NumericT, AlignmentV > &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t)
Bluestein's algorithm for computing Fourier transformation.
void direct(viennacl::vector< NumericT, AlignmentV > const &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct 1D algorithm for computing Fourier transformation.
void copy_to_vector(std::complex< NumericT > *input_complex, viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void convolve_i(viennacl::vector< SCALARTYPE, ALIGNMENT > &input1, viennacl::vector< SCALARTYPE, ALIGNMENT > &input2, viennacl::vector< SCALARTYPE, ALIGNMENT > &output)
vcl_size_t next_power_2(vcl_size_t n)
void transpose(viennacl::matrix< NumericT, viennacl::row_major, AlignmentV > &input)
Inplace transpose of matrix.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
size_type size() const
Returns the length of the vector (cf. std::vector)
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
vcl_size_t num_bits(vcl_size_t size)
void fft_radix2(std::complex< NumericT > *input_complex, vcl_size_t batch_num, vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 algorithm for computing Fourier transformation. Kernel for computing smaller amount of data...
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void normalize(viennacl::vector< NumericT, AlignmentV > &input)
Normalize vector with his own size.
const vcl_size_t MAX_LOCAL_POINTS_NUM
void complex_to_real(viennacl::vector_base< NumericT > const &in, viennacl::vector_base< NumericT > &out, vcl_size_t size)
Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imagina...
void reorder(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
ScalarType fft(std::vector< ScalarType > &in, std::vector< ScalarType > &out, unsigned int, unsigned int, unsigned int batch_size)
SCALARTYPE sign(SCALARTYPE val)
Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU...