ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations_col.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28 namespace linalg
29 {
30 namespace cuda
31 {
32 
33 template<typename DestNumericT, typename SrcNumericT>
34 __global__ void convert_col_kernel(DestNumericT * A,
35  unsigned int A_start1, unsigned int A_start2,
36  unsigned int A_inc1, unsigned int A_inc2,
37  unsigned int A_size1, unsigned int A_size2,
38  unsigned int A_internal_size1, unsigned int A_internal_size2,
39 
40  const SrcNumericT * B,
41  unsigned int B_start1, unsigned int B_start2,
42  unsigned int B_inc1, unsigned int B_inc2,
43  unsigned int B_internal_size1, unsigned int B_internal_size2)
44 {
45  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
46  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
47 
48  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
49  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
50  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
51 }
52 
53 //
54 // am
55 //
56 
57 // alpha on CPU
58 template<typename NumericT>
59 __global__ void am_col_kernel(NumericT * A,
60  unsigned int A_start1, unsigned int A_start2,
61  unsigned int A_inc1, unsigned int A_inc2,
62  unsigned int A_size1, unsigned int A_size2,
63  unsigned int A_internal_size1, unsigned int A_internal_size2,
64 
65  NumericT fac2,
66  unsigned int options2,
67  const NumericT * B,
68  unsigned int B_start1, unsigned int B_start2,
69  unsigned int B_inc1, unsigned int B_inc2,
70  unsigned int B_internal_size1, unsigned int B_internal_size2)
71 {
72  NumericT alpha = fac2;
73  if (options2 & (1 << 0))
74  alpha = -alpha;
75 
76  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
77  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
78 
79  if (options2 & (1 << 1))
80  {
81  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
82  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
83  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
84  }
85  else
86  {
87  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
88  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
89  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
90  }
91 }
92 
93 // alpha on GPU
94 template<typename NumericT>
95 __global__ void am_col_kernel(NumericT * A,
96  unsigned int A_start1, unsigned int A_start2,
97  unsigned int A_inc1, unsigned int A_inc2,
98  unsigned int A_size1, unsigned int A_size2,
99  unsigned int A_internal_size1, unsigned int A_internal_size2,
100 
101  const NumericT * fac2,
102  unsigned int options2,
103  const NumericT * B,
104  unsigned int B_start1, unsigned int B_start2,
105  unsigned int B_inc1, unsigned int B_inc2,
106  unsigned int B_internal_size1, unsigned int B_internal_size2)
107 {
108  NumericT alpha = *fac2;
109  if (options2 & (1 << 0))
110  alpha = -alpha;
111 
112  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
113  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
114 
115  if (options2 & (1 << 1))
116  {
117  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
118  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
119  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
120  }
121  else
122  {
123  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
124  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
125  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
126  }
127 }
128 
129 
130 //
131 // ambm
132 //
133 
134 // alpha and beta on CPU
135 template<typename NumericT>
136 __global__ void ambm_col_kernel(NumericT * A,
137  unsigned int A_start1, unsigned int A_start2,
138  unsigned int A_inc1, unsigned int A_inc2,
139  unsigned int A_size1, unsigned int A_size2,
140  unsigned int A_internal_size1, unsigned int A_internal_size2,
141 
142  NumericT fac2,
143  unsigned int options2,
144  const NumericT * B,
145  unsigned int B_start1, unsigned int B_start2,
146  unsigned int B_inc1, unsigned int B_inc2,
147  unsigned int B_internal_size1, unsigned int B_internal_size2,
148 
149  NumericT fac3,
150  unsigned int options3,
151  const NumericT * C,
152  unsigned int C_start1, unsigned int C_start2,
153  unsigned int C_inc1, unsigned int C_inc2,
154  unsigned int C_internal_size1, unsigned int C_internal_size2)
155 {
156  NumericT alpha = fac2;
157  if (options2 & (1 << 0))
158  alpha = -alpha;
159 
160  NumericT beta = fac3;
161  if (options3 & (1 << 0))
162  beta = -beta;
163 
164  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
165  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
166 
167  if (options2 & (1 << 1))
168  {
169  if (options3 & (1 << 1))
170  {
171  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
172  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
173  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
174  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
175  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
176  }
177  else
178  {
179  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
180  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
181  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
182  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
183  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
184  }
185  }
186  else
187  {
188  if (options3 & (1 << 1))
189  {
190  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
191  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
192  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
193  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
194  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
195  }
196  else
197  {
198  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
199  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
200  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
201  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
202  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
203  }
204  }
205 }
206 
207 
208 // alpha on CPU, beta on GPU
209 template<typename NumericT>
210 __global__ void ambm_col_kernel(NumericT * A,
211  unsigned int A_start1, unsigned int A_start2,
212  unsigned int A_inc1, unsigned int A_inc2,
213  unsigned int A_size1, unsigned int A_size2,
214  unsigned int A_internal_size1, unsigned int A_internal_size2,
215 
216  NumericT fac2,
217  unsigned int options2,
218  const NumericT * B,
219  unsigned int B_start1, unsigned int B_start2,
220  unsigned int B_inc1, unsigned int B_inc2,
221  unsigned int B_internal_size1, unsigned int B_internal_size2,
222 
223  const NumericT * fac3,
224  unsigned int options3,
225  const NumericT * C,
226  unsigned int C_start1, unsigned int C_start2,
227  unsigned int C_inc1, unsigned int C_inc2,
228  unsigned int C_internal_size1, unsigned int C_internal_size2)
229 {
230  NumericT alpha = fac2;
231  if (options2 & (1 << 0))
232  alpha = -alpha;
233 
234  NumericT beta = *fac3;
235  if (options3 & (1 << 0))
236  beta = -beta;
237 
238  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
239  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
240 
241  if (options2 & (1 << 1))
242  {
243  if (options3 & (1 << 1))
244  {
245  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
246  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
247  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
248  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
249  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
250  }
251  else
252  {
253  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
254  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
255  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
256  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
257  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
258  }
259  }
260  else
261  {
262  if (options3 & (1 << 1))
263  {
264  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
265  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
266  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
267  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
268  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
269  }
270  else
271  {
272  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
273  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
274  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
275  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
276  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
277  }
278  }
279 }
280 
281 // alpha on GPU, beta on CPU
282 template<typename NumericT>
283 __global__ void ambm_col_kernel(NumericT * A,
284  unsigned int A_start1, unsigned int A_start2,
285  unsigned int A_inc1, unsigned int A_inc2,
286  unsigned int A_size1, unsigned int A_size2,
287  unsigned int A_internal_size1, unsigned int A_internal_size2,
288 
289  const NumericT * fac2,
290  unsigned int options2,
291  const NumericT * B,
292  unsigned int B_start1, unsigned int B_start2,
293  unsigned int B_inc1, unsigned int B_inc2,
294  unsigned int B_internal_size1, unsigned int B_internal_size2,
295 
296  NumericT fac3,
297  unsigned int options3,
298  const NumericT * C,
299  unsigned int C_start1, unsigned int C_start2,
300  unsigned int C_inc1, unsigned int C_inc2,
301  unsigned int C_internal_size1, unsigned int C_internal_size2)
302 {
303  NumericT alpha = *fac2;
304  if (options2 & (1 << 0))
305  alpha = -alpha;
306 
307  NumericT beta = fac3;
308  if (options3 & (1 << 0))
309  beta = -beta;
310 
311  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
312  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
313 
314  if (options2 & (1 << 1))
315  {
316  if (options3 & (1 << 1))
317  {
318  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
319  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
320  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
321  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
322  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
323  }
324  else
325  {
326  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
327  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
328  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
329  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
330  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
331  }
332  }
333  else
334  {
335  if (options3 & (1 << 1))
336  {
337  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
338  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
339  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
340  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
341  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
342  }
343  else
344  {
345  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
346  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
347  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
348  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
349  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
350  }
351  }
352 }
353 
354 
355 // alpha and beta on GPU
356 template<typename NumericT>
357 __global__ void ambm_col_kernel(
358  NumericT * A,
359  unsigned int A_start1, unsigned int A_start2,
360  unsigned int A_inc1, unsigned int A_inc2,
361  unsigned int A_size1, unsigned int A_size2,
362  unsigned int A_internal_size1, unsigned int A_internal_size2,
363 
364  const NumericT * fac2,
365  unsigned int options2,
366  const NumericT * B,
367  unsigned int B_start1, unsigned int B_start2,
368  unsigned int B_inc1, unsigned int B_inc2,
369  unsigned int B_internal_size1, unsigned int B_internal_size2,
370 
371  const NumericT * fac3,
372  unsigned int options3,
373  const NumericT * C,
374  unsigned int C_start1, unsigned int C_start2,
375  unsigned int C_inc1, unsigned int C_inc2,
376  unsigned int C_internal_size1, unsigned int C_internal_size2)
377 {
378  NumericT alpha = *fac2;
379  if (options2 & (1 << 0))
380  alpha = -alpha;
381 
382  NumericT beta = *fac3;
383  if (options3 & (1 << 0))
384  beta = -beta;
385 
386  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
387  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
388 
389  if (options2 & (1 << 1))
390  {
391  if (options3 & (1 << 1))
392  {
393  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
394  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
395  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
396  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
397  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
398  }
399  else
400  {
401  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
402  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
403  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
404  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
405  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
406  }
407  }
408  else
409  {
410  if (options3 & (1 << 1))
411  {
412  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
413  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
414  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
415  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
416  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
417  }
418  else
419  {
420  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
421  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
422  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
423  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
424  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
425  }
426  }
427 }
428 
429 
430 //
431 // ambm_m
432 //
433 
434 // alpha and beta on CPU
435 template<typename NumericT>
436 __global__ void ambm_m_col_kernel(
437  NumericT * A,
438  unsigned int A_start1, unsigned int A_start2,
439  unsigned int A_inc1, unsigned int A_inc2,
440  unsigned int A_size1, unsigned int A_size2,
441  unsigned int A_internal_size1, unsigned int A_internal_size2,
442 
443  NumericT fac2,
444  unsigned int options2,
445  const NumericT * B,
446  unsigned int B_start1, unsigned int B_start2,
447  unsigned int B_inc1, unsigned int B_inc2,
448  unsigned int B_internal_size1, unsigned int B_internal_size2,
449 
450  NumericT fac3,
451  unsigned int options3,
452  const NumericT * C,
453  unsigned int C_start1, unsigned int C_start2,
454  unsigned int C_inc1, unsigned int C_inc2,
455  unsigned int C_internal_size1, unsigned int C_internal_size2)
456 {
457  NumericT alpha = fac2;
458  if (options2 & (1 << 0))
459  alpha = -alpha;
460 
461  NumericT beta = fac3;
462  if (options3 & (1 << 0))
463  beta = -beta;
464 
465  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
466  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
467 
468  if (options2 & (1 << 1))
469  {
470  if (options3 & (1 << 1))
471  {
472  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
473  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
474  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
475  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
476  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
477  }
478  else
479  {
480  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
481  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
482  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
483  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
484  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
485  }
486  }
487  else
488  {
489  if (options3 & (1 << 1))
490  {
491  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
492  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
493  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
494  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
495  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
496  }
497  else
498  {
499  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
500  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
501  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
502  += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
503  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
504  }
505  }
506 }
507 
508 
509 // alpha on CPU, beta on GPU
510 template<typename NumericT>
511 __global__ void ambm_m_col_kernel(
512  NumericT * A,
513  unsigned int A_start1, unsigned int A_start2,
514  unsigned int A_inc1, unsigned int A_inc2,
515  unsigned int A_size1, unsigned int A_size2,
516  unsigned int A_internal_size1, unsigned int A_internal_size2,
517 
518  NumericT fac2,
519  unsigned int options2,
520  const NumericT * B,
521  unsigned int B_start1, unsigned int B_start2,
522  unsigned int B_inc1, unsigned int B_inc2,
523  unsigned int B_internal_size1, unsigned int B_internal_size2,
524 
525  const NumericT * fac3,
526  unsigned int options3,
527  const NumericT * C,
528  unsigned int C_start1, unsigned int C_start2,
529  unsigned int C_inc1, unsigned int C_inc2,
530  unsigned int C_internal_size1, unsigned int C_internal_size2)
531 {
532  NumericT alpha = fac2;
533  if (options2 & (1 << 0))
534  alpha = -alpha;
535 
536  NumericT beta = *fac3;
537  if (options3 & (1 << 0))
538  beta = -beta;
539 
540  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
541  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
542 
543  if (options2 & (1 << 1))
544  {
545  if (options3 & (1 << 1))
546  {
547  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
548  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
549  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
550  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
551  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
552  }
553  else
554  {
555  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
556  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
557  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
558  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
559  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
560  }
561  }
562  else
563  {
564  if (options3 & (1 << 1))
565  {
566  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
567  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
568  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
569  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
570  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
571  }
572  else
573  {
574  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
575  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
576  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
577  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
578  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
579  }
580  }
581 }
582 
583 // alpha on GPU, beta on CPU
584 template<typename NumericT>
585 __global__ void ambm_m_col_kernel(
586  NumericT * A,
587  unsigned int A_start1, unsigned int A_start2,
588  unsigned int A_inc1, unsigned int A_inc2,
589  unsigned int A_size1, unsigned int A_size2,
590  unsigned int A_internal_size1, unsigned int A_internal_size2,
591 
592  const NumericT * fac2,
593  unsigned int options2,
594  const NumericT * B,
595  unsigned int B_start1, unsigned int B_start2,
596  unsigned int B_inc1, unsigned int B_inc2,
597  unsigned int B_internal_size1, unsigned int B_internal_size2,
598 
599  NumericT fac3,
600  unsigned int options3,
601  const NumericT * C,
602  unsigned int C_start1, unsigned int C_start2,
603  unsigned int C_inc1, unsigned int C_inc2,
604  unsigned int C_internal_size1, unsigned int C_internal_size2)
605 {
606  NumericT alpha = *fac2;
607  if (options2 & (1 << 0))
608  alpha = -alpha;
609 
610  NumericT beta = fac3;
611  if (options3 & (1 << 0))
612  beta = -beta;
613 
614  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
615  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
616 
617  if (options2 & (1 << 1))
618  {
619  if (options3 & (1 << 1))
620  {
621  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
622  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
623  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
624  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
625  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
626  }
627  else
628  {
629  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
630  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
631  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
632  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
633  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
634  }
635  }
636  else
637  {
638  if (options3 & (1 << 1))
639  {
640  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
641  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
642  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
643  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
644  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
645  }
646  else
647  {
648  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
649  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
650  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
651  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
652  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
653  }
654  }
655 }
656 
657 
658 // alpha and beta on GPU
659 template<typename NumericT>
660 __global__ void ambm_m_col_kernel(
661  NumericT * A,
662  unsigned int A_start1, unsigned int A_start2,
663  unsigned int A_inc1, unsigned int A_inc2,
664  unsigned int A_size1, unsigned int A_size2,
665  unsigned int A_internal_size1, unsigned int A_internal_size2,
666 
667  const NumericT * fac2,
668  unsigned int options2,
669  const NumericT * B,
670  unsigned int B_start1, unsigned int B_start2,
671  unsigned int B_inc1, unsigned int B_inc2,
672  unsigned int B_internal_size1, unsigned int B_internal_size2,
673 
674  const NumericT * fac3,
675  unsigned int options3,
676  const NumericT * C,
677  unsigned int C_start1, unsigned int C_start2,
678  unsigned int C_inc1, unsigned int C_inc2,
679  unsigned int C_internal_size1, unsigned int C_internal_size2)
680 {
681  NumericT alpha = *fac2;
682  if (options2 & (1 << 0))
683  alpha = -alpha;
684 
685  NumericT beta = *fac3;
686  if (options3 & (1 << 0))
687  beta = -beta;
688 
689  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
690  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
691 
692  if (options2 & (1 << 1))
693  {
694  if (options3 & (1 << 1))
695  {
696  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
697  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
698  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
699  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
700  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
701  }
702  else
703  {
704  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
705  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
706  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
707  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
708  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
709  }
710  }
711  else
712  {
713  if (options3 & (1 << 1))
714  {
715  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
716  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
717  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
718  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
719  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
720  }
721  else
722  {
723  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
724  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
725  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
726  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
727  + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
728  }
729  }
730 }
731 
732 
733 
734 //
735 // assignments
736 //
737 
738 template<typename NumericT>
739 __global__ void matrix_col_assign_kernel(
740  NumericT * A,
741  unsigned int A_start1, unsigned int A_start2,
742  unsigned int A_inc1, unsigned int A_inc2,
743  unsigned int A_size1, unsigned int A_size2,
744  unsigned int A_internal_size1, unsigned int A_internal_size2,
745  NumericT alpha)
746 {
747  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
748  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
749 
750  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
751  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
752  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
753 }
754 
755 
756 template<typename NumericT>
758  NumericT * A,
759  unsigned int A_start1, unsigned int A_start2,
760  unsigned int A_inc1, unsigned int A_inc2,
761  unsigned int A_size1, unsigned int A_size2,
762  unsigned int A_internal_size1, unsigned int A_internal_size2,
763  NumericT alpha)
764 {
765  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
766 
767  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
768  A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
769 }
770 
771 //
772 // binary element-wise operations
773 //
774 
775 template<typename NumericT>
776 __global__ void element_op_col_kernel(
777  NumericT * A,
778  unsigned int A_start1, unsigned int A_start2,
779  unsigned int A_inc1, unsigned int A_inc2,
780  unsigned int A_size1, unsigned int A_size2,
781  unsigned int A_internal_size1, unsigned int A_internal_size2,
782 
783  const NumericT * B,
784  unsigned int B_start1, unsigned int B_start2,
785  unsigned int B_inc1, unsigned int B_inc2,
786  unsigned int B_internal_size1, unsigned int B_internal_size2,
787 
788  const NumericT * C,
789  unsigned int C_start1, unsigned int C_start2,
790  unsigned int C_inc1, unsigned int C_inc2,
791  unsigned int C_internal_size1, unsigned int C_internal_size2,
792 
793  unsigned int op_type) //0: product, 1: division, 2: pow
794 {
795  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
796  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
797 
798  if (op_type == 2)
799  {
800  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
801  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
802  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
803  = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
804  C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
805  }
806  else if (op_type == 1)
807  {
808  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
809  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
810  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
811  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
812  / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
813  }
814  else if (op_type == 0)
815  {
816  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
817  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
818  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
819  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
820  * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
821  }
822 }
823 
824 template<typename NumericT>
826  NumericT * A,
827  unsigned int A_start1, unsigned int A_start2,
828  unsigned int A_inc1, unsigned int A_inc2,
829  unsigned int A_size1, unsigned int A_size2,
830  unsigned int A_internal_size1, unsigned int A_internal_size2,
831 
832  const NumericT * B,
833  unsigned int B_start1, unsigned int B_start2,
834  unsigned int B_inc1, unsigned int B_inc2,
835  unsigned int B_internal_size1, unsigned int B_internal_size2,
836 
837  const NumericT * C,
838  unsigned int C_start1, unsigned int C_start2,
839  unsigned int C_inc1, unsigned int C_inc2,
840  unsigned int C_internal_size1, unsigned int C_internal_size2,
841 
842  unsigned int op_type) //0: product, 1: division, 2: pow
843 {
844  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
845  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
846 
847  if (op_type == 1)
848  {
849  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
850  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
851  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
852  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
853  / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
854  }
855  else if (op_type == 0)
856  {
857  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
858  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
859  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
860  = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
861  * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
862  }
863 }
864 
865 
866 //
867 // unary element-wise operations
868 //
869 
870 // abs
871 template<typename NumericT>
873  NumericT * A,
874  unsigned int A_start1, unsigned int A_start2,
875  unsigned int A_inc1, unsigned int A_inc2,
876  unsigned int A_size1, unsigned int A_size2,
877  unsigned int A_internal_size1, unsigned int A_internal_size2,
878 
879  const NumericT * B,
880  unsigned int B_start1, unsigned int B_start2,
881  unsigned int B_inc1, unsigned int B_inc2,
882  unsigned int B_internal_size1, unsigned int B_internal_size2)
883 {
884  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
885  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
886 
887  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
888  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
889  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
890 }
891 
892 
893 // acos
894 template<typename NumericT>
896  NumericT * A,
897  unsigned int A_start1, unsigned int A_start2,
898  unsigned int A_inc1, unsigned int A_inc2,
899  unsigned int A_size1, unsigned int A_size2,
900  unsigned int A_internal_size1, unsigned int A_internal_size2,
901 
902  const NumericT * B,
903  unsigned int B_start1, unsigned int B_start2,
904  unsigned int B_inc1, unsigned int B_inc2,
905  unsigned int B_internal_size1, unsigned int B_internal_size2)
906 {
907  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
908  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
909 
910  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
911  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
912  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
913 }
914 
915 
916 // asin
917 template<typename NumericT>
919  NumericT * A,
920  unsigned int A_start1, unsigned int A_start2,
921  unsigned int A_inc1, unsigned int A_inc2,
922  unsigned int A_size1, unsigned int A_size2,
923  unsigned int A_internal_size1, unsigned int A_internal_size2,
924 
925  const NumericT * B,
926  unsigned int B_start1, unsigned int B_start2,
927  unsigned int B_inc1, unsigned int B_inc2,
928  unsigned int B_internal_size1, unsigned int B_internal_size2)
929 {
930  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
931  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
932 
933  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
934  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
935  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
936 }
937 
938 
939 // atan
940 template<typename NumericT>
942  NumericT * A,
943  unsigned int A_start1, unsigned int A_start2,
944  unsigned int A_inc1, unsigned int A_inc2,
945  unsigned int A_size1, unsigned int A_size2,
946  unsigned int A_internal_size1, unsigned int A_internal_size2,
947 
948  const NumericT * B,
949  unsigned int B_start1, unsigned int B_start2,
950  unsigned int B_inc1, unsigned int B_inc2,
951  unsigned int B_internal_size1, unsigned int B_internal_size2)
952 {
953  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
954  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
955 
956  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
957  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
958  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
959 }
960 
961 
962 // ceil
963 template<typename NumericT>
965  NumericT * A,
966  unsigned int A_start1, unsigned int A_start2,
967  unsigned int A_inc1, unsigned int A_inc2,
968  unsigned int A_size1, unsigned int A_size2,
969  unsigned int A_internal_size1, unsigned int A_internal_size2,
970 
971  const NumericT * B,
972  unsigned int B_start1, unsigned int B_start2,
973  unsigned int B_inc1, unsigned int B_inc2,
974  unsigned int B_internal_size1, unsigned int B_internal_size2)
975 {
976  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
977  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
978 
979  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
980  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
981  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
982 }
983 
984 
985 // cos
986 template<typename NumericT>
988  NumericT * A,
989  unsigned int A_start1, unsigned int A_start2,
990  unsigned int A_inc1, unsigned int A_inc2,
991  unsigned int A_size1, unsigned int A_size2,
992  unsigned int A_internal_size1, unsigned int A_internal_size2,
993 
994  const NumericT * B,
995  unsigned int B_start1, unsigned int B_start2,
996  unsigned int B_inc1, unsigned int B_inc2,
997  unsigned int B_internal_size1, unsigned int B_internal_size2)
998 {
999  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1000  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1001 
1002  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1003  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1004  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1005 }
1006 
1007 
1008 // cosh
1009 template<typename NumericT>
1011  NumericT * A,
1012  unsigned int A_start1, unsigned int A_start2,
1013  unsigned int A_inc1, unsigned int A_inc2,
1014  unsigned int A_size1, unsigned int A_size2,
1015  unsigned int A_internal_size1, unsigned int A_internal_size2,
1016 
1017  const NumericT * B,
1018  unsigned int B_start1, unsigned int B_start2,
1019  unsigned int B_inc1, unsigned int B_inc2,
1020  unsigned int B_internal_size1, unsigned int B_internal_size2)
1021 {
1022  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1023  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1024 
1025  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1026  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1027  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1028 }
1029 
1030 
1031 // exp
1032 template<typename NumericT>
1034  NumericT * A,
1035  unsigned int A_start1, unsigned int A_start2,
1036  unsigned int A_inc1, unsigned int A_inc2,
1037  unsigned int A_size1, unsigned int A_size2,
1038  unsigned int A_internal_size1, unsigned int A_internal_size2,
1039 
1040  const NumericT * B,
1041  unsigned int B_start1, unsigned int B_start2,
1042  unsigned int B_inc1, unsigned int B_inc2,
1043  unsigned int B_internal_size1, unsigned int B_internal_size2)
1044 {
1045  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1046  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1047 
1048  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1049  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1050  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1051 }
1052 
1053 
1054 // fabs
1055 template<typename NumericT>
1057  NumericT * A,
1058  unsigned int A_start1, unsigned int A_start2,
1059  unsigned int A_inc1, unsigned int A_inc2,
1060  unsigned int A_size1, unsigned int A_size2,
1061  unsigned int A_internal_size1, unsigned int A_internal_size2,
1062 
1063  const NumericT * B,
1064  unsigned int B_start1, unsigned int B_start2,
1065  unsigned int B_inc1, unsigned int B_inc2,
1066  unsigned int B_internal_size1, unsigned int B_internal_size2)
1067 {
1068  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1069  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1070 
1071  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1072  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1073  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1074 }
1075 
1076 
1077 // floor
1078 template<typename NumericT>
1080  NumericT * A,
1081  unsigned int A_start1, unsigned int A_start2,
1082  unsigned int A_inc1, unsigned int A_inc2,
1083  unsigned int A_size1, unsigned int A_size2,
1084  unsigned int A_internal_size1, unsigned int A_internal_size2,
1085 
1086  const NumericT * B,
1087  unsigned int B_start1, unsigned int B_start2,
1088  unsigned int B_inc1, unsigned int B_inc2,
1089  unsigned int B_internal_size1, unsigned int B_internal_size2)
1090 {
1091  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1092  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1093 
1094  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1095  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1096  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1097 }
1098 
1099 
1100 // log
1101 template<typename NumericT>
1103  NumericT * A,
1104  unsigned int A_start1, unsigned int A_start2,
1105  unsigned int A_inc1, unsigned int A_inc2,
1106  unsigned int A_size1, unsigned int A_size2,
1107  unsigned int A_internal_size1, unsigned int A_internal_size2,
1108 
1109  const NumericT * B,
1110  unsigned int B_start1, unsigned int B_start2,
1111  unsigned int B_inc1, unsigned int B_inc2,
1112  unsigned int B_internal_size1, unsigned int B_internal_size2)
1113 {
1114  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1115  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1116 
1117  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1118  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1119  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1120 }
1121 
1122 
1123 // log10
1124 template<typename NumericT>
1126  NumericT * A,
1127  unsigned int A_start1, unsigned int A_start2,
1128  unsigned int A_inc1, unsigned int A_inc2,
1129  unsigned int A_size1, unsigned int A_size2,
1130  unsigned int A_internal_size1, unsigned int A_internal_size2,
1131 
1132  const NumericT * B,
1133  unsigned int B_start1, unsigned int B_start2,
1134  unsigned int B_inc1, unsigned int B_inc2,
1135  unsigned int B_internal_size1, unsigned int B_internal_size2)
1136 {
1137  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1138  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1139 
1140  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1141  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1142  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1143 }
1144 
1145 
1146 // sin
1147 template<typename NumericT>
1149  NumericT * A,
1150  unsigned int A_start1, unsigned int A_start2,
1151  unsigned int A_inc1, unsigned int A_inc2,
1152  unsigned int A_size1, unsigned int A_size2,
1153  unsigned int A_internal_size1, unsigned int A_internal_size2,
1154 
1155  const NumericT * B,
1156  unsigned int B_start1, unsigned int B_start2,
1157  unsigned int B_inc1, unsigned int B_inc2,
1158  unsigned int B_internal_size1, unsigned int B_internal_size2)
1159 {
1160  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1161  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1162 
1163  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1164  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1165  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1166 }
1167 
1168 
1169 // sinh
1170 template<typename NumericT>
1172  NumericT * A,
1173  unsigned int A_start1, unsigned int A_start2,
1174  unsigned int A_inc1, unsigned int A_inc2,
1175  unsigned int A_size1, unsigned int A_size2,
1176  unsigned int A_internal_size1, unsigned int A_internal_size2,
1177 
1178  const NumericT * B,
1179  unsigned int B_start1, unsigned int B_start2,
1180  unsigned int B_inc1, unsigned int B_inc2,
1181  unsigned int B_internal_size1, unsigned int B_internal_size2)
1182 {
1183  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1184  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1185 
1186  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1187  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1188  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1189 }
1190 
1191 
1192 // sqrt
1193 template<typename NumericT>
1195  NumericT * A,
1196  unsigned int A_start1, unsigned int A_start2,
1197  unsigned int A_inc1, unsigned int A_inc2,
1198  unsigned int A_size1, unsigned int A_size2,
1199  unsigned int A_internal_size1, unsigned int A_internal_size2,
1200 
1201  const NumericT * B,
1202  unsigned int B_start1, unsigned int B_start2,
1203  unsigned int B_inc1, unsigned int B_inc2,
1204  unsigned int B_internal_size1, unsigned int B_internal_size2)
1205 {
1206  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1207  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1208 
1209  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1210  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1211  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1212 }
1213 
1214 
1215 // tan
1216 template<typename NumericT>
1218  NumericT * A,
1219  unsigned int A_start1, unsigned int A_start2,
1220  unsigned int A_inc1, unsigned int A_inc2,
1221  unsigned int A_size1, unsigned int A_size2,
1222  unsigned int A_internal_size1, unsigned int A_internal_size2,
1223 
1224  const NumericT * B,
1225  unsigned int B_start1, unsigned int B_start2,
1226  unsigned int B_inc1, unsigned int B_inc2,
1227  unsigned int B_internal_size1, unsigned int B_internal_size2)
1228 {
1229  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1230  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1231 
1232  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1233  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1234  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1235 }
1236 
1237 
1238 // tanh
1239 template<typename NumericT>
1241  NumericT * A,
1242  unsigned int A_start1, unsigned int A_start2,
1243  unsigned int A_inc1, unsigned int A_inc2,
1244  unsigned int A_size1, unsigned int A_size2,
1245  unsigned int A_internal_size1, unsigned int A_internal_size2,
1246 
1247  const NumericT * B,
1248  unsigned int B_start1, unsigned int B_start2,
1249  unsigned int B_inc1, unsigned int B_inc2,
1250  unsigned int B_internal_size1, unsigned int B_internal_size2)
1251 {
1252  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1253  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1254 
1255  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1256  for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
1257  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1258 }
1259 
1260 
1261 
1262 //
1263 // matrix-vector product
1264 //
1265 
1266 template<typename NumericT>
1267 __global__ void vec_mul_col_kernel(
1268  const NumericT * A,
1269  unsigned int A_row_start,
1270  unsigned int A_col_start,
1271  unsigned int A_row_inc,
1272  unsigned int A_col_inc,
1273  unsigned int A_row_size,
1274  unsigned int A_col_size,
1275  unsigned int A_internal_rows,
1276  unsigned int A_internal_cols,
1277  const NumericT * v,
1278  unsigned int v_start,
1279  unsigned int v_inc,
1280  unsigned int v_size,
1281  NumericT * result,
1282  unsigned int result_start,
1283  unsigned int result_inc,
1284  unsigned int result_size)
1285 {
1286 
1287  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
1288  {
1289  NumericT dot_prod = 0;
1290  for (unsigned int col = 0; col < A_col_size; ++col)
1291  dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
1292  result[row * result_inc + result_start] = dot_prod;
1293  }
1294 }
1295 
1296 
1297 template<typename NumericT>
1299  const NumericT * A,
1300  unsigned int A_row_start,
1301  unsigned int A_col_start,
1302  unsigned int A_row_inc,
1303  unsigned int A_col_inc,
1304  unsigned int A_row_size,
1305  unsigned int A_col_size,
1306  unsigned int A_internal_rows,
1307  unsigned int A_internal_cols,
1308  const NumericT * v,
1309  unsigned int v_start,
1310  unsigned int v_inc,
1311  unsigned int v_size,
1312  NumericT * result,
1313  unsigned int result_start,
1314  unsigned int result_inc,
1315  unsigned int result_size)
1316 {
1317  __shared__ NumericT work[128];
1318 
1319  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1320  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1321  unsigned int lid = threadIdx.x;
1322 
1323  for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
1324  {
1325  NumericT dot_prod = 0;
1326  for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
1327  dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
1328  work[lid] = dot_prod;
1329 
1330  for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
1331  __syncthreads();
1332  if (lid < stride)
1333  work[lid] += work[lid+stride];
1334  }
1335 
1336  if (lid == 0)
1337  result[row * result_inc + result_start] = work[0];
1338  }
1339 }
1340 
1341 
1342 //
1343 // matrix-matrix products
1344 //
1345 
1346 
1347 
1348 
1349 //
1350 // scaled rank-1-update
1351 //
1352 
1353 // alpha on CPU
1354 template<typename NumericT>
1356  NumericT * A,
1357  unsigned int A_start1, unsigned int A_start2,
1358  unsigned int A_inc1, unsigned int A_inc2,
1359  unsigned int A_size1, unsigned int A_size2,
1360  unsigned int A_internal_size1, unsigned int A_internal_size2,
1361 
1362  NumericT val,
1363  unsigned int options2,
1364 
1365  const NumericT * vec1,
1366  unsigned int start1,
1367  unsigned int inc1,
1368  unsigned int size1,
1369 
1370  const NumericT * vec2,
1371  unsigned int start2,
1372  unsigned int inc2,
1373  unsigned int size2)
1374 {
1375  NumericT alpha = val;
1376  if (options2 & (1 << 0))
1377  alpha = -alpha;
1378  if (options2 & (1 << 1))
1379  alpha = NumericT(1) / alpha;
1380 
1381  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1382  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1383 
1384  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1385  {
1386  NumericT tmp = alpha * vec1[row * inc1 + start1];
1387  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1388  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1389  }
1390 }
1391 
1392 
1393 // alpha on GPU
1394 template<typename NumericT>
1396  NumericT * A,
1397  unsigned int A_start1, unsigned int A_start2,
1398  unsigned int A_inc1, unsigned int A_inc2,
1399  unsigned int A_size1, unsigned int A_size2,
1400  unsigned int A_internal_size1, unsigned int A_internal_size2,
1401 
1402  const NumericT * val,
1403  unsigned int options2,
1404 
1405  const NumericT * vec1,
1406  unsigned int start1,
1407  unsigned int inc1,
1408  unsigned int size1,
1409 
1410  const NumericT * vec2,
1411  unsigned int start2,
1412  unsigned int inc2,
1413  unsigned int size2)
1414 {
1415  NumericT alpha = *val;
1416  if (options2 & (1 << 0))
1417  alpha = -alpha;
1418  if (options2 & (1 << 1))
1419  alpha = NumericT(1) / alpha;
1420 
1421  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1422  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1423 
1424  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1425  {
1426  NumericT tmp = alpha * vec1[row * inc1 + start1];
1427  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1428  A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1429  }
1430 }
1431 
1432 
1433 template <typename T>
1435  T * A,
1436  T * D,
1437  T * S,
1438  unsigned int size1,
1439  unsigned int size2,
1440  unsigned int stride)
1441 {
1442  unsigned int size = min(size1, size2);
1443  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
1444  S[0] = 0;
1445 
1446  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1447  i < size;
1448  i += gridDim.x * blockDim.x)
1449  {
1450  D[i] = A[i*stride + i];
1451  S[i+1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0;
1452  }
1453 }
1454 
1455 template <typename T>
1457  T * A,
1458  T * D,
1459  T * S,
1460  unsigned int size1,
1461  unsigned int size2,
1462  unsigned int stride)
1463 {
1464  unsigned int size = min(size1, size2);
1465  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
1466  S[0] = 0;
1467 
1468  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1469  i < size;
1470  i += gridDim.x * blockDim.x)
1471  {
1472  D[i] = A[i*stride + i];
1473  S[i+1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;
1474  }
1475 }
1476 
1477 
1478 
1479 template<typename T>
1481  T * A,
1482  T * V,
1483  unsigned int row_start,
1484  unsigned int col_start,
1485  unsigned int size,
1486  unsigned int stride)
1487 {
1488  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1489  unsigned int sz = gridDim.x * blockDim.x;
1490 
1491  for(unsigned int i = row_start + x; i < size; i += sz)
1492  {
1493  V[i - row_start] = A[i * stride + col_start];
1494  }
1495 }
1496 
1497 template<typename T>
1499  T * A,
1500  T * V,
1501  unsigned int row_start,
1502  unsigned int col_start,
1503  unsigned int size,
1504  unsigned int stride)
1505 {
1506  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1507  unsigned int sz = gridDim.x * blockDim.x;
1508 
1509  for(unsigned int i = row_start + x; i < size; i += sz)
1510  {
1511  V[i - row_start] = A[i + col_start * stride];
1512  }
1513 }
1514 
1515 template<typename T>
1517  T * A,
1518  T * V,
1519  unsigned int row_start,
1520  unsigned int col_start,
1521  unsigned int size,
1522  unsigned int stride)
1523 {
1524  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1525  unsigned int sz = gridDim.x * blockDim.x;
1526 
1527  for(unsigned int i = col_start + x; i < size; i += sz)
1528  {
1529  V[i - col_start] = A[row_start * stride + i];
1530  }
1531 
1532 }
1533 
1534 template<typename T>
1536  T * A,
1537  T * V,
1538  unsigned int row_start,
1539  unsigned int col_start,
1540  unsigned int size,
1541  unsigned int stride)
1542 {
1543  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1544  unsigned int sz = gridDim.x * blockDim.x;
1545 
1546  for(unsigned int i = col_start + x; i < size; i += sz)
1547  {
1548  V[i - col_start] = A[row_start + i * stride];
1549  }
1550 
1551 }
1552 
1553 
1554 
1555 template<typename T>
1557  T * A,
1558  T * V, //householder vector
1559  unsigned int row_start,
1560  unsigned int col_start,
1561  unsigned int size1,
1562  unsigned int size2,
1563  unsigned int stride)
1564 {
1565  T ss = 0;
1566 
1567  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1568  i < size2;
1569  i += gridDim.x * blockDim.x)
1570  {
1571  ss = 0;
1572  for(unsigned int j = row_start; j < size1; j++)
1573  ss = ss +(V[j] * A[j * stride + i]);
1574 
1575  for(unsigned int j = row_start; j < size1; j++)
1576  A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
1577  }
1578 }
1579 
1580 template<typename T>
1582  T * A,
1583  T * V, //householder vector
1584  unsigned int row_start,
1585  unsigned int col_start,
1586  unsigned int size1,
1587  unsigned int size2,
1588  unsigned int stride)
1589 {
1590  T ss = 0;
1591 
1592  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1593  i < size2;
1594  i += gridDim.x * blockDim.x)
1595  {
1596  ss = 0;
1597  for(unsigned int j = row_start; j < size1; j++)
1598  ss = ss +(V[j] * A[j + i * stride]);
1599 
1600  for(unsigned int j = row_start; j < size1; j++)
1601  A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
1602  }
1603 }
1604 
1605 
1606 
1607 template<typename T>
1609  T * A,
1610  T * V, //householder vector
1611  unsigned int row_start,
1612  unsigned int col_start,
1613  unsigned int size1,
1614  unsigned int size2,
1615  unsigned int stride)
1616 {
1617  __shared__ T sums[128];
1618  T ss = 0;
1619 
1620  for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
1621  {
1622  ss = 0;
1623  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1624  ss = ss + (V[j] * A[i * stride + j]);
1625  sums[threadIdx.x] = ss;
1626 
1627  __syncthreads();
1628  col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
1629  __syncthreads();
1630 
1631  T sum_Av = sums[0];
1632 
1633  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1634  A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
1635  }
1636 }
1637 
1638 template<typename T>
1640  T * A,
1641  T * V, //householder vector
1642  unsigned int row_start,
1643  unsigned int col_start,
1644  unsigned int size1,
1645  unsigned int size2,
1646  unsigned int stride)
1647 {
1648  __shared__ T sums[128];
1649  T ss = 0;
1650 
1651  for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
1652  {
1653  ss = 0;
1654  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1655  ss = ss + (V[j] * A[i + j * stride]);
1656  sums[threadIdx.x] = ss;
1657 
1658  __syncthreads();
1659  col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
1660  __syncthreads();
1661 
1662  T sum_Av = sums[0];
1663 
1664  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1665  A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av);
1666  }
1667 }
1668 
1669 
1670 
1671 template<typename T>
1672 __device__ void col_reduce_lcl_array(
1673  T * sums,
1674  unsigned int th_Idx,
1675  unsigned int bl_Dim)
1676 {
1677  unsigned int step = bl_Dim >> 1;
1678 
1679  while(step > 0)
1680  {
1681  if(th_Idx < step)
1682  sums[th_Idx] += sums[th_Idx + step];
1683  step >>= 1;
1684  __syncthreads();
1685  }
1686 }
1687 
1688 
1689 template <typename T>
1691  T * QL,
1692  T * V,
1693  unsigned int size1,
1694  unsigned int strideQ)
1695 {
1696  __shared__ T sums[128];
1697  T ss = 0;
1698  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
1699  {
1700  ss = 0;
1701  for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1702  ss = ss + (V[j] * QL[i * strideQ + j]);
1703  sums[threadIdx.x] = ss;
1704 
1705  __syncthreads();
1706  col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
1707  __syncthreads();
1708 
1709  T sum_Qv = sums[0];
1710 
1711  for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1712  QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
1713  }
1714 }
1715 
1716 template <typename T>
1718  T * QL,
1719  T * V,
1720  unsigned int size1,
1721  unsigned int strideQ)
1722 {
1723  __shared__ T sums[128];
1724  T ss = 0;
1725  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
1726  {
1727  ss = 0;
1728  for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1729  ss = ss + (V[j] * QL[i + j * strideQ]);
1730  sums[threadIdx.x] = ss;
1731 
1732  __syncthreads();
1733  col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
1734  __syncthreads();
1735 
1736  T sum_Qv = sums[0];
1737 
1738  for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1739  QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
1740  }
1741 }
1742 
1743 
1744 template <typename T>
1746  T * matr,
1747  T * cs,
1748  T * ss,
1749  unsigned int size,
1750  unsigned int stride,
1751  unsigned int start_i,
1752  unsigned int end_i)
1753 {
1754  unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
1755  __shared__ T cs_lcl[256];
1756  __shared__ T ss_lcl[256];
1757 
1758  T x = (j < size) ? matr[(end_i + 1) + j * stride] : 0;
1759 
1760  unsigned int elems_num = end_i - start_i + 1;
1761  unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1762 
1763  for(unsigned int block_id = 0; block_id < block_num; block_id++)
1764  {
1765  unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
1766 
1767  if(threadIdx.x < to)
1768  {
1769  cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1770  ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
1771  }
1772  __syncthreads();
1773  if(j < size)
1774  {
1775  for(unsigned int ind = 0; ind < to; ind++)
1776  {
1777  unsigned int i = end_i - (ind + block_id * blockDim.x);
1778  T z = matr[i + j * stride];
1779  T cs_val = cs_lcl[ind];
1780  T ss_val = ss_lcl[ind];
1781  matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
1782  x = -x * ss_val + z * cs_val;
1783  }
1784  }
1785  __syncthreads();
1786  }
1787  if(j < size)
1788  matr[(start_i) + j * stride] = x;
1789 }
1790 
1791 template <typename T>
1793  T * matr,
1794  T * cs,
1795  T * ss,
1796  unsigned int size,
1797  unsigned int stride,
1798  unsigned int start_i,
1799  unsigned int end_i)
1800 {
1801  unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
1802  __shared__ T cs_lcl[256];
1803  __shared__ T ss_lcl[256];
1804 
1805  T x = (j < size) ? matr[(end_i + 1) *stride + j] : 0;
1806 
1807  unsigned int elems_num = end_i - start_i + 1;
1808  unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1809 
1810  for(unsigned int block_id = 0; block_id < block_num; block_id++)
1811  {
1812  unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
1813 
1814  if(threadIdx.x < to)
1815  {
1816  cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1817  ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
1818  }
1819  __syncthreads();
1820  if(j < size)
1821  {
1822  for(unsigned int ind = 0; ind < to; ind++)
1823  {
1824  unsigned int i = end_i - (ind + block_id * blockDim.x);
1825  T z = matr[i *stride + j];
1826  T cs_val = cs_lcl[ind];
1827  T ss_val = ss_lcl[ind];
1828  matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
1829  x = -x * ss_val + z * cs_val;
1830  }
1831  }
1832  __syncthreads();
1833  }
1834  if(j < size)
1835  matr[(start_i) * stride + j] = x;
1836 }
1837 
1838 
1839 
1840 
1841 
1842 } // namespace cuda
1843 } //namespace linalg
1844 } //namespace viennacl
1845 
1846 
1847 #endif
__global__ void matrix_col_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void house_update_A_right_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void convert_col_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
__global__ void house_update_A_left_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void matrix_col_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void bidiag_pack_column_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
endcode *Final step
__global__ void house_update_A_left_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
__global__ void matrix_col_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
__global__ void matrix_col_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
float NumericT
Definition: bisect.cpp:40
__global__ void house_update_QL_row_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void scaled_rank1_update_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void matrix_col_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_m_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:239
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
__global__ void matrix_col_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void matrix_col_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__device__ void col_reduce_lcl_array(T *sums, unsigned int th_Idx, unsigned int bl_Dim)
__global__ void matrix_col_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void element_op_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void house_update_A_right_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void matrix_col_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void element_op_int_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_col_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void trans_vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void house_update_QL_column_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void givens_next_column_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void bidiag_pack_row_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91