1 #ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
80 inline __host__ __device__ float2
operator+(float2 a, float2 b)
82 return make_float2(a.x + b.x, a.y + b.y);
86 inline __host__ __device__ float2
operator-(float2 a, float2 b)
88 return make_float2(a.x - b.x, a.y - b.y);
91 template<
typename SCALARTYPE>
92 inline __device__ float2
operator/(float2 a,SCALARTYPE b)
94 return make_float2(a.x/b, a.y/b);
98 inline __device__ float2
operator*(float2 in1, float2 in2)
100 return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
104 inline __host__ __device__ double2
operator+(double2 a, double2 b)
106 return make_double2(a.x + b.x, a.y + b.y);
110 inline __host__ __device__ double2
operator-(double2 a, double2 b)
112 return make_double2(a.x - b.x, a.y - b.y);
116 template<
typename SCALARTYPE>
117 inline __host__ __device__ double2
operator/(double2 a,SCALARTYPE b)
119 return make_double2(a.x/b, a.y/b);
123 inline __host__ __device__ double2
operator*(double2 in1, double2 in2)
125 return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
130 v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
131 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
132 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
133 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
134 v = (v >> 16) | (v << 16);
135 v = v >> (32 - bit_size);
139 template<
typename Numeric2T,
typename NumericT>
141 const Numeric2T * input,
145 unsigned int batch_num,
150 const NumericT NUM_PI(3.14159265358979323846);
152 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
154 for (
unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
160 for (
unsigned int n = 0; n <
size; n++)
164 in = input[batch_id * stride + n];
166 in = input[n * stride + batch_id];
169 NumericT arg = sign * 2 * NUM_PI * k / size * n;
177 tmp.x = in.x * ex.x - in.y * ex.y;
178 tmp.y = in.x * ex.y + in.y * ex.x;
183 output[batch_id * stride + k] = f;
185 output[k * stride + batch_id] = f;
196 template<
typename NumericT,
unsigned int AlignmentV>
205 fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
207 static_cast<unsigned int>(size),
208 static_cast<unsigned int>(
stride),
209 static_cast<unsigned int>(batch_num),
221 template<
typename NumericT,
unsigned int AlignmentV>
230 fft_direct<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
232 static_cast<unsigned int>(size),
233 static_cast<unsigned int>(
stride),
234 static_cast<unsigned int>(batch_num),
240 template<
typename NumericT>
242 unsigned int bit_size,
245 unsigned int batch_num,
249 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
250 unsigned int glb_sz = gridDim.x * blockDim.x;
252 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
254 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
262 NumericT tmp = input[batch_id * stride + i];
263 input[batch_id * stride + i] = input[batch_id * stride + v];
264 input[batch_id * stride + v] = tmp;
268 NumericT tmp = input[i * stride + batch_id];
269 input[i * stride + batch_id] = input[v * stride + batch_id];
270 input[v * stride + batch_id] = tmp;
281 template<
typename NumericT,
unsigned int AlignmentV>
289 static_cast<unsigned int>(bits_datasize),
290 static_cast<unsigned int>(
size),
291 static_cast<unsigned int>(stride),
292 static_cast<unsigned int>(batch_num),
293 static_cast<bool>(data_order));
297 template<
typename Numeric2T,
typename NumericT>
299 unsigned int bit_size,
302 unsigned int batch_num,
306 __shared__ Numeric2T lcl_input[1024];
307 unsigned int grp_id = blockIdx.x;
308 unsigned int grp_num = gridDim.x;
310 unsigned int lcl_sz = blockDim.x;
311 unsigned int lcl_id = threadIdx.x;
312 const NumericT NUM_PI(3.14159265358979323846);
314 for (
unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
316 for (
unsigned int p = lcl_id; p <
size; p += lcl_sz)
320 lcl_input[v] = input[batch_id * stride + p];
322 lcl_input[v] = input[p * stride + batch_id];
328 for (
unsigned int s = 0; s < bit_size; s++)
330 unsigned int ss = 1 << s;
332 for (
unsigned int tid = lcl_id; tid <
size; tid += lcl_sz)
334 unsigned int group = (tid & (ss - 1));
335 unsigned int pos = ((tid >> s) << (s + 1)) + group;
337 Numeric2T in1 = lcl_input[pos];
338 Numeric2T in2 = lcl_input[pos + ss];
340 NumericT arg = group * sign * NUM_PI / ss;
349 tmp.x = in2.x * ex.x - in2.y * ex.y;
350 tmp.y = in2.x * ex.y + in2.y * ex.x;
352 lcl_input[pos + ss] = in1 - tmp;
353 lcl_input[pos] = in1 + tmp;
359 for (
unsigned int p = lcl_id; p <
size; p += lcl_sz)
362 input[batch_id * stride + p] = lcl_input[p];
364 input[p * stride + batch_id] = lcl_input[p];
370 template<
typename Numeric2T,
typename NumericT>
373 unsigned int bit_size,
376 unsigned int batch_num,
381 unsigned int ss = 1 << s;
382 unsigned int half_size = size >> 1;
385 const NumericT NUM_PI(3.14159265358979323846);
387 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
388 unsigned int glb_sz = gridDim.x * blockDim.x;
390 for (
unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
392 for (
unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
394 unsigned int group = (tid & (ss - 1));
395 unsigned int pos = ((tid >> s) << (s + 1)) + group;
401 offset = batch_id * stride + pos;
403 in2 = input[offset + ss];
407 offset = pos * stride + batch_id;
409 in2 = input[offset + ss *
stride];
412 NumericT arg = group * sign * NUM_PI / ss;
422 tmp.x = in2.x * ex.x - in2.y * ex.y;
423 tmp.y = in2.x * ex.y + in2.y * ex.x;
426 input[offset + ss] = in1 - tmp;
428 input[offset + ss *
stride] = in1 - tmp;
429 input[offset] = in1 + tmp;
441 template<
typename NumericT,
unsigned int AlignmentV>
452 fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(in)),
453 static_cast<unsigned int>(bit_size),
454 static_cast<unsigned int>(
size),
455 static_cast<unsigned int>(stride),
456 static_cast<unsigned int>(batch_num),
457 static_cast<NumericT>(
sign),
458 static_cast<bool>(data_order));
464 static_cast<unsigned int>(bit_size),
465 static_cast<unsigned int>(
size),
466 static_cast<unsigned int>(stride),
467 static_cast<unsigned int>(batch_num),
468 static_cast<bool>(data_order));
474 static_cast<unsigned int>(
step),
475 static_cast<unsigned int>(bit_size),
476 static_cast<unsigned int>(size),
477 static_cast<unsigned int>(
stride),
478 static_cast<unsigned int>(batch_num),
480 static_cast<bool>(data_order));
493 template<
typename NumericT,
unsigned int AlignmentV>
504 fft_radix2_local<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(in)),
505 static_cast<unsigned int>(bit_size),
506 static_cast<unsigned int>(
size),
507 static_cast<unsigned int>(stride),
508 static_cast<unsigned int>(batch_num),
510 static_cast<bool>(data_order));
516 static_cast<unsigned int>(bit_size),
517 static_cast<unsigned int>(
size),
518 static_cast<unsigned int>(stride),
519 static_cast<unsigned int>(batch_num),
520 static_cast<bool>(data_order));
525 static_cast<unsigned int>(
step),
526 static_cast<unsigned int>(bit_size),
527 static_cast<unsigned int>(size),
528 static_cast<unsigned int>(
stride),
529 static_cast<unsigned int>(batch_num),
531 static_cast<bool>(data_order));
537 template<
typename Numeric2T,
typename NumericT>
540 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
541 unsigned int glb_sz =gridDim.x * blockDim.x;
543 unsigned int double_size = size << 1;
545 const NumericT NUM_PI(3.14159265358979323846);
547 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
549 unsigned int rm = i * i % (double_size);
558 out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
559 out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
563 template<
typename Numeric2T,
typename NumericT>
564 __global__
void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
565 unsigned int size,
unsigned int ext_size,
NumericT sign)
567 unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
568 unsigned int glb_sz = gridDim.x * blockDim.x;
570 unsigned int double_size = size << 1;
573 const NumericT NUM_PI(3.14159265358979323846);
575 for (
unsigned int i = glb_id; i <
size; i += glb_sz)
577 unsigned int rm = i * i % (double_size);
591 A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
592 A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
597 B[ext_size - i] = b_i;
601 template<
typename NumericT>
604 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
621 template<
typename NumericT,
unsigned int AlignmentV>
636 static_cast<unsigned int>(ext_size));
642 static_cast<unsigned int>(size),
643 static_cast<unsigned int>(ext_size),
651 static_cast<unsigned int>(size),
656 template<
typename NumericT>
662 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
666 output[i] = in1 * in2;
673 template<
typename NumericT,
unsigned int AlignmentV>
682 fft_mult_vec<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(input1)),
685 static_cast<unsigned int>(size));
689 template<
typename Numeric2T,
typename NumericT>
692 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
693 input1[i] = input1[i]/factor;
699 template<
typename NumericT,
unsigned int AlignmentV>
706 fft_div_vec_scalar<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(input)),
707 static_cast<unsigned int>(size),
712 template<
typename NumericT>
715 unsigned int row_num,
716 unsigned int col_num)
718 unsigned int size = row_num * col_num;
719 for (
unsigned int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
721 unsigned int row = i / col_num;
722 unsigned int col = i - row*col_num;
723 unsigned int new_pos = col * row_num +
row;
724 output[new_pos] = input[i];
731 template<
typename NumericT,
unsigned int AlignmentV>
737 transpose<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(input)),
745 template<
typename NumericT>
748 unsigned int row_num,
749 unsigned int col_num)
751 unsigned int size = row_num * col_num;
752 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
754 unsigned int row = i / col_num;
755 unsigned int col = i - row*col_num;
756 unsigned int new_pos = col * row_num +
row;
760 input[i] = input[new_pos];
761 input[new_pos] = val;
769 template<
typename NumericT,
unsigned int AlignmentV>
774 transpose_inplace<<<128,128>>>(
reinterpret_cast<numeric2_type *
>(
viennacl::cuda_arg(input)),
781 template<
typename RealT,
typename ComplexT>
784 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
796 template<
typename NumericT>
804 static_cast<unsigned int>(size));
808 template<
typename ComplexT,
typename RealT>
811 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
818 template<
typename NumericT>
824 complex_to_real<<<128,128>>>(
reinterpret_cast<const numeric2_type *
>(
viennacl::cuda_arg(in)),
826 static_cast<unsigned int>(
size));
831 template<
typename NumericT>
834 for (
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x)
839 vec[size - i - 1] = val1;
846 template<
typename NumericT>
Helper class for checking whether a matrix has a row-major layout.
Implementation of the dense matrix class.
__global__ void bluestein_post(Numeric2T *Z, Numeric2T *out, unsigned int size, NumericT sign)
__global__ void real_to_complex(const RealT *in, ComplexT *out, unsigned int size)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
__global__ void fft_direct(const Numeric2T *input, Numeric2T *output, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
This file provides the forward declarations for the main types used within ViennaCL.
__global__ void fft_mult_vec(const NumericT *input1, const NumericT *input2, NumericT *output, unsigned int size)
__host__ __device__ float2 operator+(float2 a, float2 b)
const vcl_size_t MAX_LOCAL_POINTS_NUM
vcl_size_t num_bits(vcl_size_t size)
__global__ void reverse_inplace(NumericT *vec, unsigned int size)
__global__ void transpose(const NumericT *input, NumericT *output, unsigned int row_num, unsigned int col_num)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
void radix2(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Radix-2 1D algorithm for computing Fourier transformation.
void bluestein(viennacl::vector< NumericT, AlignmentV > &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t)
Bluestein's algorithm for computing Fourier transformation.
__global__ void fft_div_vec_scalar(Numeric2T *input1, unsigned int size, NumericT factor)
__global__ void fft_reorder(NumericT *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, bool is_row_major)
__device__ float2 operator*(float2 in1, float2 in2)
__device__ float2 operator/(float2 a, SCALARTYPE b)
void normalize(viennacl::vector< NumericT, AlignmentV > &input)
Normalize vector on with his own size.
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
void convolve_i(viennacl::vector< SCALARTYPE, ALIGNMENT > &input1, viennacl::vector< SCALARTYPE, ALIGNMENT > &input2, viennacl::vector< SCALARTYPE, ALIGNMENT > &output)
__global__ void zero2(NumericT *input1, NumericT *input2, unsigned int size)
__global__ void fft_radix2(Numeric2T *input, unsigned int s, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
Common routines for CUDA execution.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
__device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
size_type size() const
Returns the length of the vector (cf. std::vector)
__global__ void bluestein_pre(Numeric2T *input, Numeric2T *A, Numeric2T *B, unsigned int size, unsigned int ext_size, NumericT sign)
Implementations of Fast Furier Transformation using a plain single-threaded or OpenMP-enabled executi...
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
__global__ void transpose_inplace(NumericT *input, unsigned int row_num, unsigned int col_num)
NumericT * cuda_arg(scalar< NumericT > &obj)
Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version.
void reverse(viennacl::vector_base< NumericT > &in)
Reverse vector to oposite order and save it in input vector.
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void multiply_complex(viennacl::vector< NumericT, AlignmentV > const &input1, viennacl::vector< NumericT, AlignmentV > const &input2, viennacl::vector< NumericT, AlignmentV > &output)
Mutiply two complex vectors and store result in output.
__global__ void complex_to_real(const ComplexT *in, RealT *out, unsigned int size)
__host__ __device__ float2 operator-(float2 a, float2 b)
void reorder(viennacl::vector< NumericT, AlignmentV > &in, vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
void direct(viennacl::vector< NumericT, AlignmentV > const &in, viennacl::vector< NumericT, AlignmentV > &out, vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign=NumericT(-1), viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order=viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
Direct 1D algorithm for computing Fourier transformation.
vcl_size_t next_power_2(vcl_size_t n)
__global__ void fft_radix2_local(Numeric2T *input, unsigned int bit_size, unsigned int size, unsigned int stride, unsigned int batch_num, NumericT sign, bool is_row_major)
Implementation of the ViennaCL scalar class.
ScalarType fft(std::vector< ScalarType > &in, std::vector< ScalarType > &out, unsigned int, unsigned int, unsigned int batch_size)
SCALARTYPE sign(SCALARTYPE val)
Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU...