ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
fft_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/fft_operations.hpp
    @brief Implementations of Fast Fourier Transformation using CUDA
*/

#include <cmath>
#include <viennacl/matrix.hpp>
#include <viennacl/vector.hpp>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/fft_operations.hpp"
#include "viennacl/linalg/host_based/nmf_operations.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{
  namespace fft
  {
    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;

    // Number of bits needed to index 'size' elements, i.e. the exponent of
    // the smallest power of two that is >= size.
    inline vcl_size_t num_bits(vcl_size_t size)
    {
      vcl_size_t bits_datasize = 0;
      vcl_size_t ds = 1;

      while (ds < size)
      {
        ds = ds << 1;
        bits_datasize++;
      }

      return bits_datasize;
    }

    // Rounds n up to the next power of two (n itself if it is already one).
    inline vcl_size_t next_power_2(vcl_size_t n)
    {
      n = n - 1;

      vcl_size_t power = 1;

      while (power < sizeof(vcl_size_t) * 8)
      {
        n = n | (n >> power);
        power *= 2;
      }

      return n + 1;
    }

  } //namespace fft
} //namespace detail
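
/* Editorial example (not part of the original file): num_bits() returns the
   exponent of the smallest power of two >= size, and next_power_2() rounds up
   to that power of two by smearing the highest set bit into all lower bits:

     viennacl::linalg::cuda::detail::fft::num_bits(5);       // 3 (2^3 = 8)
     viennacl::linalg::cuda::detail::fft::num_bits(8);       // 3
     viennacl::linalg::cuda::detail::fft::next_power_2(5);   // 8
     viennacl::linalg::cuda::detail::fft::next_power_2(8);   // 8 (unchanged)
*/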

// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
  return make_float2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
  return make_float2(a.x - b.x, a.y - b.y);
}

// division by a scalar
template<typename SCALARTYPE>
inline __device__ float2 operator/(float2 a, SCALARTYPE b)
{
  return make_float2(a.x/b, a.y/b);
}

// complex multiplication
inline __device__ float2 operator*(float2 in1, float2 in2)
{
  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

// addition
inline __host__ __device__ double2 operator+(double2 a, double2 b)
{
  return make_double2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ double2 operator-(double2 a, double2 b)
{
  return make_double2(a.x - b.x, a.y - b.y);
}

// division by a scalar
template<typename SCALARTYPE>
inline __host__ __device__ double2 operator/(double2 a, SCALARTYPE b)
{
  return make_double2(a.x/b, a.y/b);
}

// complex multiplication
inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
{
  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

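
/* Editorial note: the overloads above treat float2/double2 as complex numbers
   (.x = real part, .y = imaginary part). In particular, operator* implements
   (a + bi)(c + di) = (ac - bd) + (ad + bc)i, matching the twiddle-factor
   products written out explicitly in the kernels below. */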
// Reverses the lowest bit_size bits of v (full 32-bit bit-reversal, then a shift back down).
inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
{
  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
  v = (v >> 16) | (v << 16);
  v = v >> (32 - bit_size);
  return v;
}
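
/* Editorial example: for bit_size = 3, get_reorder_num() behaves as follows:

     get_reorder_num(1, 3);   // 0b001 -> 0b100 = 4
     get_reorder_num(3, 3);   // 0b011 -> 0b110 = 6
     get_reorder_num(4, 3);   // 0b100 -> 0b001 = 1
*/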

template<typename Numeric2T, typename NumericT>
__global__ void fft_direct(
    const Numeric2T * input,
    Numeric2T * output,
    unsigned int size,
    unsigned int stride,
    unsigned int batch_num,
    NumericT sign,
    bool is_row_major)
{
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
    {
      Numeric2T f;
      f.x = 0;
      f.y = 0;

      for (unsigned int n = 0; n < size; n++)
      {
        Numeric2T in;
        if (!is_row_major)
          in = input[batch_id * stride + n];   // batch occupies a contiguous block
        else
          in = input[n * stride + batch_id];   // batch is a column with spacing 'stride'

        NumericT sn, cs;
        NumericT arg = sign * 2 * NUM_PI * k / size * n;
        sn = sin(arg);
        cs = cos(arg);

        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;
        Numeric2T tmp;
        tmp.x = in.x * ex.x - in.y * ex.y;
        tmp.y = in.x * ex.y + in.y * ex.x;
        f = f + tmp;
      }

      if (!is_row_major)
        output[batch_id * stride + k] = f;
      else
        output[k * stride + batch_id] = f;
    }
  }
}
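
/* Editorial note: fft_direct evaluates the discrete Fourier transform directly
   from its definition,

     f_k = sum_{n=0}^{size-1} x_n * exp(sign * 2*pi*i * k * n / size),

   so each of the 'size' outputs sums over all 'size' inputs: O(size^2) work,
   but valid for arbitrary (non-power-of-two) sizes. */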

/** @brief Direct 1D algorithm for computing the Fourier transformation.
 *
 * Works for any size of input data; evaluating the DFT sum has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::vector<NumericT, AlignmentV> const & in,
            viennacl::vector<NumericT, AlignmentV>       & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  // numeric2_type maps float -> float2 and double -> double2 (helper from viennacl/linalg/cuda/common.hpp)
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}
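
/* Editorial usage sketch (assumes the interleaved complex storage used
   throughout this file, i.e. n complex values occupy 2*n scalars, and
   stride = n for a single contiguous batch):

     std::size_t n = 100;                            // any length works here
     viennacl::vector<float> in(2 * n), out(2 * n);
     // ... fill 'in' with interleaved (re, im) pairs ...
     viennacl::linalg::cuda::direct(in, out, n, n, 1);   // forward FFT, 1 batch
*/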

/** @brief Direct 2D algorithm for computing the Fourier transformation of a matrix.
 *
 * Works for any size of input data; evaluating the DFT sum has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>       & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}

template<typename NumericT>
__global__ void fft_reorder(NumericT * input,
                            unsigned int bit_size,
                            unsigned int size,
                            unsigned int stride,
                            unsigned int batch_num,
                            bool is_row_major)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int i = glb_id; i < size; i += glb_sz)
    {
      unsigned int v = get_reorder_num(i, bit_size);

      if (i < v)
      {
        if (!is_row_major)
        {
          NumericT tmp = input[batch_id * stride + i];
          input[batch_id * stride + i] = input[batch_id * stride + v];
          input[batch_id * stride + v] = tmp;
        }
        else
        {
          NumericT tmp = input[i * stride + batch_id];
          input[i * stride + batch_id] = input[v * stride + batch_id];
          input[v * stride + batch_id] = tmp;
        }
      }
    }
  }
}
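
/* Editorial note: the guard 'if (i < v)' makes each index pair (i, v) swap
   exactly once, and leaves fixed points of the bit-reversal permutation
   (i == v) untouched, so the reordering is a correct in-place permutation. */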

/***
 * This function reorders the input data: indices are sorted in bit-reversal order.
 * Such reordering has to be applied before an in-place radix-2 FFT.
 */
template<typename NumericT, unsigned int AlignmentV>
void reorder(viennacl::vector<NumericT, AlignmentV> & in,
             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                           static_cast<unsigned int>(bits_datasize),
                           static_cast<unsigned int>(size),
                           static_cast<unsigned int>(stride),
                           static_cast<unsigned int>(batch_num),
                           static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2_local(Numeric2T * input,
                                 unsigned int bit_size,
                                 unsigned int size,
                                 unsigned int stride,
                                 unsigned int batch_num,
                                 NumericT sign,
                                 bool is_row_major)
{
  __shared__ Numeric2T lcl_input[1024];
  unsigned int grp_id = blockIdx.x;
  unsigned int grp_num = gridDim.x;

  unsigned int lcl_sz = blockDim.x;
  unsigned int lcl_id = threadIdx.x;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
  {
    // copy the batch into shared memory, already in bit-reversal order
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      unsigned int v = get_reorder_num(p, bit_size);
      if (!is_row_major)
        lcl_input[v] = input[batch_id * stride + p];
      else
        lcl_input[v] = input[p * stride + batch_id];
    }

    __syncthreads();

    // perform the Cooley-Tukey FFT on the local array
    for (unsigned int s = 0; s < bit_size; s++)
    {
      unsigned int ss = 1 << s;
      NumericT cs, sn;
      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
      {
        unsigned int group = (tid & (ss - 1));
        unsigned int pos = ((tid >> s) << (s + 1)) + group;

        Numeric2T in1 = lcl_input[pos];
        Numeric2T in2 = lcl_input[pos + ss];

        NumericT arg = group * sign * NUM_PI / ss;

        sn = sin(arg);
        cs = cos(arg);
        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;

        Numeric2T tmp;
        tmp.x = in2.x * ex.x - in2.y * ex.y;
        tmp.y = in2.x * ex.y + in2.y * ex.x;

        lcl_input[pos + ss] = in1 - tmp;
        lcl_input[pos]      = in1 + tmp;
      }
      __syncthreads();
    }

    // copy the local array back to global memory
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      if (!is_row_major)
        input[batch_id * stride + p] = lcl_input[p];
      else
        input[p * stride + batch_id] = lcl_input[p];
    }

  }
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2(Numeric2T * input,
                           unsigned int s,
                           unsigned int bit_size,
                           unsigned int size,
                           unsigned int stride,
                           unsigned int batch_num,
                           NumericT sign,
                           bool is_row_major)
{
  unsigned int ss = 1 << s;
  unsigned int half_size = size >> 1;

  NumericT cs, sn;
  const NumericT NUM_PI(3.14159265358979323846);

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
    {
      unsigned int group = (tid & (ss - 1));
      unsigned int pos = ((tid >> s) << (s + 1)) + group;
      Numeric2T in1;
      Numeric2T in2;
      unsigned int offset;
      if (!is_row_major)
      {
        offset = batch_id * stride + pos;
        in1 = input[offset];
        in2 = input[offset + ss];
      }
      else
      {
        offset = pos * stride + batch_id;
        in1 = input[offset];
        in2 = input[offset + ss * stride];
      }

      NumericT arg = group * sign * NUM_PI / ss;

      sn = sin(arg);
      cs = cos(arg);

      Numeric2T ex;
      ex.x = cs;
      ex.y = sn;

      Numeric2T tmp;
      tmp.x = in2.x * ex.x - in2.y * ex.y;
      tmp.y = in2.x * ex.y + in2.y * ex.x;

      if (!is_row_major)
        input[offset + ss] = in1 - tmp;
      else
        input[offset + ss * stride] = in1 - tmp;
      input[offset] = in1 + tmp;
    }
  }
}
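
/* Editorial example: at stage s the butterfly pairs elements ss = 2^s apart,
   and tid runs over the size/2 butterflies. For size = 8, s = 1 (ss = 2):

     tid = 0 -> group 0, pos 0: combines elements (0, 2)
     tid = 1 -> group 1, pos 1: combines elements (1, 3)
     tid = 2 -> group 0, pos 4: combines elements (4, 6)
     tid = 3 -> group 1, pos 5: combines elements (5, 7)

   with twiddle factor exp(sign * i * pi * group / ss). */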

/** @brief Radix-2 1D algorithm for computing the Fourier transformation.
 *
 * Works only on power-of-two input sizes; this Cooley-Tukey implementation has O(n log n) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::vector<NumericT, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  static_cast<NumericT>(sign),
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");

    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}
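
/* Editorial usage sketch: for power-of-two lengths the radix-2 path replaces
   the O(n^2) direct transform with O(n log n) work; the transform is in-place:

     std::size_t n = 256;                       // must be a power of two
     viennacl::vector<float> v(2 * n);          // interleaved complex data
     // ... fill 'v' ...
     viennacl::linalg::cuda::radix2(v, n, n, 1);    // forward FFT, 1 batch
*/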

/** @brief Radix-2 2D algorithm for computing the Fourier transformation of a matrix.
 *
 * Works only on power-of-two input sizes; this Cooley-Tukey implementation has O(n log n) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  sign,
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;
  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * (-NUM_PI);

    sn_a = sin(angle);
    cs_a = cos(angle);

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = sn_a;
    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
                              unsigned int size, unsigned int ext_size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;

  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * NUM_PI;

    sn_a = sin(-angle);
    cs_a = cos(-angle);

    Numeric2T a_i;
    a_i.x = cs_a;
    a_i.y = sn_a;

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = -sn_a;

    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
    B[i] = b_i;

    // divergent conditional write (to be fixed): mirrors the chirp so the convolution wraps around
    if (i)
      B[ext_size - i] = b_i;
  }
}
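
/* Editorial note: Bluestein's trick rewrites the DFT exponent via
   n*k = (n^2 + k^2 - (k - n)^2) / 2, so that

     X_k = e^{-pi*i*k^2/N} * sum_n (x_n * e^{-pi*i*n^2/N}) * e^{+pi*i*(k-n)^2/N}.

   bluestein_pre builds the chirp-premultiplied sequence A and the convolution
   kernel B (mirrored at ext_size - i so the convolution is cyclic), and
   bluestein_post applies the final chirp factor to the convolution result Z. */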

template<typename NumericT>
__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    input1[i].x = 0;
    input1[i].y = 0;

    input2[i].x = 0;
    input2[i].y = 0;
  }
}

/** @brief Bluestein's algorithm for computing the Fourier transformation of arbitrary length.
 *
 * Uses additional memory, but works for any size of input data.
 * The serial implementation has roughly O(n log n) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = in.size() >> 1;
  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);

  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);

  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                     static_cast<unsigned int>(ext_size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");

  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(ext_size),
                             NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");

  viennacl::linalg::convolve_i(A, B, Z);

  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
                              reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                              static_cast<unsigned int>(size),
                              NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
}
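
/* Editorial usage sketch: bluestein() handles lengths that are not powers of
   two by embedding the transform into a cyclic convolution of power-of-two
   length next_power_2(2*size - 1):

     std::size_t n = 1000;                      // arbitrary length
     viennacl::vector<float> in(2 * n), out(2 * n);
     // ... fill 'in' ...
     viennacl::linalg::cuda::bluestein(in, out, 0);   // third argument is unused
*/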

template<typename NumericT>
__global__ void fft_mult_vec(const NumericT * input1,
                             const NumericT * input2,
                             NumericT * output,
                             unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT in1 = input1[i];
    NumericT in2 = input2[i];
    output[i] = in1 * in2;   // complex multiplication via the operator* overloads above
  }
}

/** @brief Multiplies two complex vectors pointwise and stores the result in output. */
template<typename NumericT, unsigned int AlignmentV>
void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
                      viennacl::vector<NumericT, AlignmentV> const & input2,
                      viennacl::vector<NumericT, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = input1.size() / 2;

  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
                            reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
                            reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                            static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
}
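
/* Editorial note: because complex values are stored interleaved, a ViennaCL
   vector of 2*n scalars holds n complex entries; multiply_complex() therefore
   derives the element count as input1.size() / 2 and multiplies pointwise as
   complex numbers. This pointwise product is the frequency-domain step of the
   convolution used by bluestein(). */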

template<typename Numeric2T, typename NumericT>
__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    input1[i] = input1[i] / factor;
}

/** @brief Normalizes a complex vector by dividing each entry by the number of complex elements. */
template<typename NumericT, unsigned int AlignmentV>
void normalize(viennacl::vector<NumericT, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  vcl_size_t size = input.size() >> 1;
  NumericT norm_factor = static_cast<NumericT>(size);
  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                  static_cast<unsigned int>(size),
                                  norm_factor);
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
}

template<typename NumericT>
__global__ void transpose(const NumericT * input,
                          NumericT * output,
                          unsigned int row_num,
                          unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    output[new_pos] = input[i];
  }
}
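
/* Editorial example: transpose() maps the flat row-major index
   i = row * col_num + col to new_pos = col * row_num + row. For a 2x3 matrix,
   the element at (row 0, col 2) has i = 2 and moves to new_pos = 2*2 + 0 = 4,
   i.e. position (row 2, col 0) of the transposed 3x2 matrix. */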

/** @brief Transposes a complex matrix into a second matrix. */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
                         reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                         static_cast<unsigned int>(input.internal_size1() >> 1),
                         static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
}

template<typename NumericT>
__global__ void transpose_inplace(
    NumericT * input,
    unsigned int row_num,
    unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    if (i < new_pos)
    {
      NumericT val = input[i];
      input[i] = input[new_pos];
      input[new_pos] = val;
    }
  }
}

/** @brief In-place transpose of a complex matrix. */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                 static_cast<unsigned int>(input.internal_size1() >> 1),
                                 static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
}

template<typename RealT, typename ComplexT>
__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    ComplexT val;
    val.x = in[i];
    val.y = 0;
    out[i] = val;
  }
}

/** @brief Creates a complex vector from a real vector (real parts at even indices 2*k, zero imaginary parts at odd indices 2*k+1). */
template<typename NumericT>
void real_to_complex(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
                               reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
}

template<typename ComplexT, typename RealT>
__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    out[i] = in[i].x;
}

/** @brief Extracts the real parts of a complex vector into a real vector. */
template<typename NumericT>
void complex_to_real(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type  numeric2_type;

  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                               viennacl::cuda_arg(out),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
}

template<typename NumericT>
__global__ void reverse_inplace(NumericT * vec, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
  {
    NumericT val1 = vec[i];
    NumericT val2 = vec[size - i - 1];
    vec[i] = val2;
    vec[size - i - 1] = val1;
  }
}

/** @brief Reverses the entries of a vector in place. */
template<typename NumericT>
void reverse(viennacl::vector_base<NumericT> & in)
{
  vcl_size_t size = in.size();
  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
}
853 
854 } //namespace cuda
855 } //namespace linalg
856 } //namespace viennacl
857 
858 #endif /* FFT_OPERATIONS_HPP_ */