ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations_row.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2016, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28 namespace linalg
29 {
30 namespace cuda
31 {
32 
33 template<typename DestNumericT, typename SrcNumericT>
34 __global__ void convert_row_kernel(
35  DestNumericT * A,
36  unsigned int A_start1, unsigned int A_start2,
37  unsigned int A_inc1, unsigned int A_inc2,
38  unsigned int A_size1, unsigned int A_size2,
39  unsigned int A_internal_size1, unsigned int A_internal_size2,
40 
41  const SrcNumericT * B,
42  unsigned int B_start1, unsigned int B_start2,
43  unsigned int B_inc1, unsigned int B_inc2,
44  unsigned int B_internal_size1, unsigned int B_internal_size2)
45 {
46  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
47  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
48 
49  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
50  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
51  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
52 }
53 
54 //Matrix transpose kernel
55 template<typename NumericT>
56 __global__ void trans_kernel(
57  const NumericT * A,
58  unsigned int A_start1, unsigned int A_start2,
59  unsigned int A_internal_size1, unsigned int A_internal_size2,
60  unsigned int A_size1, unsigned int A_size2,
61  unsigned int A_stride1, unsigned int A_stride2,
62 
63  NumericT * B,
64  unsigned int B_start1, unsigned int B_start2,
65  unsigned int B_internal_size1, unsigned int B_internal_size2,
66  unsigned int B_stride1, unsigned int B_stride2,
67  bool data_major)
68 {
69  for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x)
70  {
71  for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
72  {
73  if(data_major)
74  B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
75  else
76  B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
77  }
78  }
79 }
80 
81 //
82 // am
83 //
84 
85 // alpha on CPU
86 template<typename NumericT>
87 __global__ void am_row_kernel(
88  NumericT * A,
89  unsigned int A_start1, unsigned int A_start2,
90  unsigned int A_inc1, unsigned int A_inc2,
91  unsigned int A_size1, unsigned int A_size2,
92  unsigned int A_internal_size1, unsigned int A_internal_size2,
93 
94  NumericT fac2,
95  unsigned int options2,
96  const NumericT * B,
97  unsigned int B_start1, unsigned int B_start2,
98  unsigned int B_inc1, unsigned int B_inc2,
99  unsigned int B_internal_size1, unsigned int B_internal_size2)
100 {
101  NumericT alpha = fac2;
102  if (options2 & (1 << 0))
103  alpha = -alpha;
104 
105  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
106  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
107 
108  if (options2 & (1 << 1))
109  {
110  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
111  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
112  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
113  }
114  else
115  {
116  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
117  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
118  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
119  }
120 }
121 
122 // alpha on GPU
123 template<typename NumericT>
124 __global__ void am_row_kernel(
125  NumericT * A,
126  unsigned int A_start1, unsigned int A_start2,
127  unsigned int A_inc1, unsigned int A_inc2,
128  unsigned int A_size1, unsigned int A_size2,
129  unsigned int A_internal_size1, unsigned int A_internal_size2,
130 
131  const NumericT * fac2,
132  unsigned int options2,
133  const NumericT * B,
134  unsigned int B_start1, unsigned int B_start2,
135  unsigned int B_inc1, unsigned int B_inc2,
136  unsigned int B_internal_size1, unsigned int B_internal_size2)
137 {
138  NumericT alpha = *fac2;
139  if (options2 & (1 << 0))
140  alpha = -alpha;
141 
142  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
143  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
144 
145  if (options2 & (1 << 1))
146  {
147  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
148  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
149  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
150  }
151  else
152  {
153  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
154  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
155  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
156  }
157 }
158 
159 
160 //
161 // ambm
162 //
163 
164 // alpha and beta on CPU
165 template<typename NumericT>
166 __global__ void ambm_row_kernel(
167  NumericT * A,
168  unsigned int A_start1, unsigned int A_start2,
169  unsigned int A_inc1, unsigned int A_inc2,
170  unsigned int A_size1, unsigned int A_size2,
171  unsigned int A_internal_size1, unsigned int A_internal_size2,
172 
173  NumericT fac2,
174  unsigned int options2,
175  const NumericT * B,
176  unsigned int B_start1, unsigned int B_start2,
177  unsigned int B_inc1, unsigned int B_inc2,
178  unsigned int B_internal_size1, unsigned int B_internal_size2,
179 
180  NumericT fac3,
181  unsigned int options3,
182  const NumericT * C,
183  unsigned int C_start1, unsigned int C_start2,
184  unsigned int C_inc1, unsigned int C_inc2,
185  unsigned int C_internal_size1, unsigned int C_internal_size2)
186 {
187  NumericT alpha = fac2;
188  if (options2 & (1 << 0))
189  alpha = -alpha;
190 
191  NumericT beta = fac3;
192  if (options3 & (1 << 0))
193  beta = -beta;
194 
195  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
196  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
197 
198  if (options2 & (1 << 1))
199  {
200  if (options3 & (1 << 1))
201  {
202  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
203  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
204  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
205  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
206  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
207  }
208  else
209  {
210  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
211  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
212  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
213  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
214  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
215  }
216  }
217  else
218  {
219  if (options3 & (1 << 1))
220  {
221  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
222  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
223  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
224  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
225  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
226  }
227  else
228  {
229  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
230  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
231  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
232  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
233  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
234  }
235  }
236 }
237 
238 
239 // alpha on CPU, beta on GPU
240 template<typename NumericT>
241 __global__ void ambm_row_kernel(
242  NumericT * A,
243  unsigned int A_start1, unsigned int A_start2,
244  unsigned int A_inc1, unsigned int A_inc2,
245  unsigned int A_size1, unsigned int A_size2,
246  unsigned int A_internal_size1, unsigned int A_internal_size2,
247 
248  NumericT fac2,
249  unsigned int options2,
250  const NumericT * B,
251  unsigned int B_start1, unsigned int B_start2,
252  unsigned int B_inc1, unsigned int B_inc2,
253  unsigned int B_internal_size1, unsigned int B_internal_size2,
254 
255  const NumericT * fac3,
256  unsigned int options3,
257  const NumericT * C,
258  unsigned int C_start1, unsigned int C_start2,
259  unsigned int C_inc1, unsigned int C_inc2,
260  unsigned int C_internal_size1, unsigned int C_internal_size2)
261 {
262  NumericT alpha = fac2;
263  if (options2 & (1 << 0))
264  alpha = -alpha;
265 
266  NumericT beta = *fac3;
267  if (options3 & (1 << 0))
268  beta = -beta;
269 
270  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
271  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
272 
273  if (options2 & (1 << 1))
274  {
275  if (options3 & (1 << 1))
276  {
277  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
278  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
279  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
280  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
281  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
282  }
283  else
284  {
285  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
286  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
287  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
288  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
289  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
290  }
291  }
292  else
293  {
294  if (options3 & (1 << 1))
295  {
296  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
297  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
298  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
299  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
300  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
301  }
302  else
303  {
304  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
305  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
306  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
307  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
308  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
309  }
310  }
311 }
312 
313 // alpha on GPU, beta on CPU
314 template<typename NumericT>
315 __global__ void ambm_row_kernel(
316  NumericT * A,
317  unsigned int A_start1, unsigned int A_start2,
318  unsigned int A_inc1, unsigned int A_inc2,
319  unsigned int A_size1, unsigned int A_size2,
320  unsigned int A_internal_size1, unsigned int A_internal_size2,
321 
322  const NumericT * fac2,
323  unsigned int options2,
324  const NumericT * B,
325  unsigned int B_start1, unsigned int B_start2,
326  unsigned int B_inc1, unsigned int B_inc2,
327  unsigned int B_internal_size1, unsigned int B_internal_size2,
328 
329  NumericT fac3,
330  unsigned int options3,
331  const NumericT * C,
332  unsigned int C_start1, unsigned int C_start2,
333  unsigned int C_inc1, unsigned int C_inc2,
334  unsigned int C_internal_size1, unsigned int C_internal_size2)
335 {
336  NumericT alpha = *fac2;
337  if (options2 & (1 << 0))
338  alpha = -alpha;
339 
340  NumericT beta = fac3;
341  if (options3 & (1 << 0))
342  beta = -beta;
343 
344  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
345  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
346 
347  if (options2 & (1 << 1))
348  {
349  if (options3 & (1 << 1))
350  {
351  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
352  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
353  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
354  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
355  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
356  }
357  else
358  {
359  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
360  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
361  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
362  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
363  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
364  }
365  }
366  else
367  {
368  if (options3 & (1 << 1))
369  {
370  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
371  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
372  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
373  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
374  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
375  }
376  else
377  {
378  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
379  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
380  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
381  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
382  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
383  }
384  }
385 }
386 
387 
388 // alpha and beta on GPU
389 template<typename NumericT>
390 __global__ void ambm_row_kernel(
391  NumericT * A,
392  unsigned int A_start1, unsigned int A_start2,
393  unsigned int A_inc1, unsigned int A_inc2,
394  unsigned int A_size1, unsigned int A_size2,
395  unsigned int A_internal_size1, unsigned int A_internal_size2,
396 
397  const NumericT * fac2,
398  unsigned int options2,
399  const NumericT * B,
400  unsigned int B_start1, unsigned int B_start2,
401  unsigned int B_inc1, unsigned int B_inc2,
402  unsigned int B_internal_size1, unsigned int B_internal_size2,
403 
404  const NumericT * fac3,
405  unsigned int options3,
406  const NumericT * C,
407  unsigned int C_start1, unsigned int C_start2,
408  unsigned int C_inc1, unsigned int C_inc2,
409  unsigned int C_internal_size1, unsigned int C_internal_size2)
410 {
411  NumericT alpha = *fac2;
412  if (options2 & (1 << 0))
413  alpha = -alpha;
414 
415  NumericT beta = *fac3;
416  if (options3 & (1 << 0))
417  beta = -beta;
418 
419  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
420  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
421 
422  if (options2 & (1 << 1))
423  {
424  if (options3 & (1 << 1))
425  {
426  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
427  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
428  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
429  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
430  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
431  }
432  else
433  {
434  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
435  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
436  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
437  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
438  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
439  }
440  }
441  else
442  {
443  if (options3 & (1 << 1))
444  {
445  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
446  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
447  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
448  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
449  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
450  }
451  else
452  {
453  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
454  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
455  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
456  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
457  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
458  }
459  }
460 }
461 
462 
463 //
464 // ambm_m
465 //
466 
467 // alpha and beta on CPU
468 template<typename NumericT>
469 __global__ void ambm_m_row_kernel(
470  NumericT * A,
471  unsigned int A_start1, unsigned int A_start2,
472  unsigned int A_inc1, unsigned int A_inc2,
473  unsigned int A_size1, unsigned int A_size2,
474  unsigned int A_internal_size1, unsigned int A_internal_size2,
475 
476  NumericT fac2,
477  unsigned int options2,
478  const NumericT * B,
479  unsigned int B_start1, unsigned int B_start2,
480  unsigned int B_inc1, unsigned int B_inc2,
481  unsigned int B_internal_size1, unsigned int B_internal_size2,
482 
483  NumericT fac3,
484  unsigned int options3,
485  const NumericT * C,
486  unsigned int C_start1, unsigned int C_start2,
487  unsigned int C_inc1, unsigned int C_inc2,
488  unsigned int C_internal_size1, unsigned int C_internal_size2)
489 {
490  NumericT alpha = fac2;
491  if (options2 & (1 << 0))
492  alpha = -alpha;
493 
494  NumericT beta = fac3;
495  if (options3 & (1 << 0))
496  beta = -beta;
497 
498  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
499  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
500 
501  if (options2 & (1 << 1))
502  {
503  if (options3 & (1 << 1))
504  {
505  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
506  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
507  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
508  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
509  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
510  }
511  else
512  {
513  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
514  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
515  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
516  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
517  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
518  }
519  }
520  else
521  {
522  if (options3 & (1 << 1))
523  {
524  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
525  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
526  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
527  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
528  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
529  }
530  else
531  {
532  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
533  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
534  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
535  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
536  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
537  }
538  }
539 }
540 
541 
542 // alpha on CPU, beta on GPU
543 template<typename NumericT>
544 __global__ void ambm_m_row_kernel(
545  NumericT * A,
546  unsigned int A_start1, unsigned int A_start2,
547  unsigned int A_inc1, unsigned int A_inc2,
548  unsigned int A_size1, unsigned int A_size2,
549  unsigned int A_internal_size1, unsigned int A_internal_size2,
550 
551  NumericT fac2,
552  unsigned int options2,
553  const NumericT * B,
554  unsigned int B_start1, unsigned int B_start2,
555  unsigned int B_inc1, unsigned int B_inc2,
556  unsigned int B_internal_size1, unsigned int B_internal_size2,
557 
558  const NumericT * fac3,
559  unsigned int options3,
560  const NumericT * C,
561  unsigned int C_start1, unsigned int C_start2,
562  unsigned int C_inc1, unsigned int C_inc2,
563  unsigned int C_internal_size1, unsigned int C_internal_size2)
564 {
565  NumericT alpha = fac2;
566  if (options2 & (1 << 0))
567  alpha = -alpha;
568 
569  NumericT beta = *fac3;
570  if (options3 & (1 << 0))
571  beta = -beta;
572 
573  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
574  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
575 
576  if (options2 & (1 << 1))
577  {
578  if (options3 & (1 << 1))
579  {
580  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
581  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
582  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
583  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
584  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
585  }
586  else
587  {
588  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
589  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
590  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
591  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
592  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
593  }
594  }
595  else
596  {
597  if (options3 & (1 << 1))
598  {
599  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
600  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
601  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
602  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
603  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
604  }
605  else
606  {
607  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
608  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
609  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
610  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
611  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
612  }
613  }
614 }
615 
616 // alpha on GPU, beta on CPU
617 template<typename NumericT>
618 __global__ void ambm_m_row_kernel(
619  NumericT * A,
620  unsigned int A_start1, unsigned int A_start2,
621  unsigned int A_inc1, unsigned int A_inc2,
622  unsigned int A_size1, unsigned int A_size2,
623  unsigned int A_internal_size1, unsigned int A_internal_size2,
624 
625  const NumericT * fac2,
626  unsigned int options2,
627  const NumericT * B,
628  unsigned int B_start1, unsigned int B_start2,
629  unsigned int B_inc1, unsigned int B_inc2,
630  unsigned int B_internal_size1, unsigned int B_internal_size2,
631 
632  NumericT fac3,
633  unsigned int options3,
634  const NumericT * C,
635  unsigned int C_start1, unsigned int C_start2,
636  unsigned int C_inc1, unsigned int C_inc2,
637  unsigned int C_internal_size1, unsigned int C_internal_size2)
638 {
639  NumericT alpha = *fac2;
640  if (options2 & (1 << 0))
641  alpha = -alpha;
642 
643  NumericT beta = fac3;
644  if (options3 & (1 << 0))
645  beta = -beta;
646 
647  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
648  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
649 
650  if (options2 & (1 << 1))
651  {
652  if (options3 & (1 << 1))
653  {
654  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
655  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
656  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
657  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
658  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
659  }
660  else
661  {
662  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
663  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
664  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
665  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
666  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
667  }
668  }
669  else
670  {
671  if (options3 & (1 << 1))
672  {
673  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
674  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
675  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
676  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
677  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
678  }
679  else
680  {
681  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
682  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
683  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
684  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
685  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
686  }
687  }
688 }
689 
690 
691 // alpha and beta on GPU
692 template<typename NumericT>
693 __global__ void ambm_m_row_kernel(
694  NumericT * A,
695  unsigned int A_start1, unsigned int A_start2,
696  unsigned int A_inc1, unsigned int A_inc2,
697  unsigned int A_size1, unsigned int A_size2,
698  unsigned int A_internal_size1, unsigned int A_internal_size2,
699 
700  const NumericT * fac2,
701  unsigned int options2,
702  const NumericT * B,
703  unsigned int B_start1, unsigned int B_start2,
704  unsigned int B_inc1, unsigned int B_inc2,
705  unsigned int B_internal_size1, unsigned int B_internal_size2,
706 
707  const NumericT * fac3,
708  unsigned int options3,
709  const NumericT * C,
710  unsigned int C_start1, unsigned int C_start2,
711  unsigned int C_inc1, unsigned int C_inc2,
712  unsigned int C_internal_size1, unsigned int C_internal_size2)
713 {
714  NumericT alpha = *fac2;
715  if (options2 & (1 << 0))
716  alpha = -alpha;
717 
718  NumericT beta = *fac3;
719  if (options3 & (1 << 0))
720  beta = -beta;
721 
722  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
723  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
724 
725  if (options2 & (1 << 1))
726  {
727  if (options3 & (1 << 1))
728  {
729  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
730  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
731  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
732  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
733  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
734  }
735  else
736  {
737  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
738  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
739  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
740  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
741  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
742  }
743  }
744  else
745  {
746  if (options3 & (1 << 1))
747  {
748  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
749  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
750  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
751  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
752  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
753  }
754  else
755  {
756  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
757  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
758  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
759  += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
760  + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
761  }
762  }
763 }
764 
765 //
766 // assignments
767 //
768 
769 template<typename NumericT>
770 __global__ void matrix_row_assign_kernel(
771  NumericT * A,
772  unsigned int A_start1, unsigned int A_start2,
773  unsigned int A_inc1, unsigned int A_inc2,
774  unsigned int A_size1, unsigned int A_size2,
775  unsigned int A_internal_size1, unsigned int A_internal_size2,
776  NumericT alpha)
777 {
778  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
779  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
780 
781  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
782  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
783  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
784 }
785 
786 
787 template<typename NumericT>
789  NumericT * A,
790  unsigned int A_start1, unsigned int A_start2,
791  unsigned int A_inc1, unsigned int A_inc2,
792  unsigned int A_size1, unsigned int A_size2,
793  unsigned int A_internal_size1, unsigned int A_internal_size2,
794  NumericT alpha)
795 {
796  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
797 
798  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
799  A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
800 }
801 
802 //
803 // binary element-wise operations
804 //
805 
806 template<typename NumericT>
807 __global__ void element_op_row_kernel(
808  NumericT * A,
809  unsigned int A_start1, unsigned int A_start2,
810  unsigned int A_inc1, unsigned int A_inc2,
811  unsigned int A_size1, unsigned int A_size2,
812  unsigned int A_internal_size1, unsigned int A_internal_size2,
813 
814  const NumericT * B,
815  unsigned int B_start1, unsigned int B_start2,
816  unsigned int B_inc1, unsigned int B_inc2,
817  unsigned int B_internal_size1, unsigned int B_internal_size2,
818 
819  const NumericT * C,
820  unsigned int C_start1, unsigned int C_start2,
821  unsigned int C_inc1, unsigned int C_inc2,
822  unsigned int C_internal_size1, unsigned int C_internal_size2,
823 
824  unsigned int op_type) //0: product, 1: division, 2: pow
825 {
826  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
827  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
828 
829  if (op_type == 2)
830  {
831  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
832  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
833  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
834  = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
835  C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
836  }
837  else if (op_type == 1)
838  {
839  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
840  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
841  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
842  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
843  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
844  }
845  else if (op_type == 0)
846  {
847  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
848  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
849  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
850  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
851  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
852  }
853 }
854 
855 template<typename NumericT>
857  NumericT * A,
858  unsigned int A_start1, unsigned int A_start2,
859  unsigned int A_inc1, unsigned int A_inc2,
860  unsigned int A_size1, unsigned int A_size2,
861  unsigned int A_internal_size1, unsigned int A_internal_size2,
862 
863  const NumericT * B,
864  unsigned int B_start1, unsigned int B_start2,
865  unsigned int B_inc1, unsigned int B_inc2,
866  unsigned int B_internal_size1, unsigned int B_internal_size2,
867 
868  const NumericT * C,
869  unsigned int C_start1, unsigned int C_start2,
870  unsigned int C_inc1, unsigned int C_inc2,
871  unsigned int C_internal_size1, unsigned int C_internal_size2,
872 
873  unsigned int op_type) //0: product, 1: division, 2: pow
874 {
875  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
876  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
877 
878  if (op_type == 1)
879  {
880  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
881  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
882  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
883  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
884  / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
885  }
886  else if (op_type == 0)
887  {
888  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
889  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
890  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
891  = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
892  * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
893  }
894 }
895 
896 //
897 // unary element-wise operations
898 //
899 
900 // abs
901 template<typename NumericT>
903  NumericT * A,
904  unsigned int A_start1, unsigned int A_start2,
905  unsigned int A_inc1, unsigned int A_inc2,
906  unsigned int A_size1, unsigned int A_size2,
907  unsigned int A_internal_size1, unsigned int A_internal_size2,
908 
909  const NumericT * B,
910  unsigned int B_start1, unsigned int B_start2,
911  unsigned int B_inc1, unsigned int B_inc2,
912  unsigned int B_internal_size1, unsigned int B_internal_size2)
913 {
914  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
915  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
916 
917  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
918  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
919  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
920 }
921 
922 
923 // acos
924 template<typename NumericT>
926  NumericT * A,
927  unsigned int A_start1, unsigned int A_start2,
928  unsigned int A_inc1, unsigned int A_inc2,
929  unsigned int A_size1, unsigned int A_size2,
930  unsigned int A_internal_size1, unsigned int A_internal_size2,
931 
932  const NumericT * B,
933  unsigned int B_start1, unsigned int B_start2,
934  unsigned int B_inc1, unsigned int B_inc2,
935  unsigned int B_internal_size1, unsigned int B_internal_size2)
936 {
937  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
938  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
939 
940  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
941  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
942  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
943 }
944 
945 
946 // asin
947 template<typename NumericT>
949  NumericT * A,
950  unsigned int A_start1, unsigned int A_start2,
951  unsigned int A_inc1, unsigned int A_inc2,
952  unsigned int A_size1, unsigned int A_size2,
953  unsigned int A_internal_size1, unsigned int A_internal_size2,
954 
955  const NumericT * B,
956  unsigned int B_start1, unsigned int B_start2,
957  unsigned int B_inc1, unsigned int B_inc2,
958  unsigned int B_internal_size1, unsigned int B_internal_size2)
959 {
960  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
961  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
962 
963  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
964  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
965  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
966 }
967 
968 
969 // atan
970 template<typename NumericT>
972  NumericT * A,
973  unsigned int A_start1, unsigned int A_start2,
974  unsigned int A_inc1, unsigned int A_inc2,
975  unsigned int A_size1, unsigned int A_size2,
976  unsigned int A_internal_size1, unsigned int A_internal_size2,
977 
978  const NumericT * B,
979  unsigned int B_start1, unsigned int B_start2,
980  unsigned int B_inc1, unsigned int B_inc2,
981  unsigned int B_internal_size1, unsigned int B_internal_size2)
982 {
983  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
984  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
985 
986  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
987  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
988  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
989 }
990 
991 
992 // ceil
993 template<typename NumericT>
995  NumericT * A,
996  unsigned int A_start1, unsigned int A_start2,
997  unsigned int A_inc1, unsigned int A_inc2,
998  unsigned int A_size1, unsigned int A_size2,
999  unsigned int A_internal_size1, unsigned int A_internal_size2,
1000 
1001  const NumericT * B,
1002  unsigned int B_start1, unsigned int B_start2,
1003  unsigned int B_inc1, unsigned int B_inc2,
1004  unsigned int B_internal_size1, unsigned int B_internal_size2)
1005 {
1006  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1007  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1008 
1009  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1010  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1011  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1012 }
1013 
1014 
1015 // cos
1016 template<typename NumericT>
1018  NumericT * A,
1019  unsigned int A_start1, unsigned int A_start2,
1020  unsigned int A_inc1, unsigned int A_inc2,
1021  unsigned int A_size1, unsigned int A_size2,
1022  unsigned int A_internal_size1, unsigned int A_internal_size2,
1023 
1024  const NumericT * B,
1025  unsigned int B_start1, unsigned int B_start2,
1026  unsigned int B_inc1, unsigned int B_inc2,
1027  unsigned int B_internal_size1, unsigned int B_internal_size2)
1028 {
1029  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1030  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1031 
1032  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1033  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1034  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1035 }
1036 
1037 
1038 // cosh
1039 template<typename NumericT>
1041  NumericT * A,
1042  unsigned int A_start1, unsigned int A_start2,
1043  unsigned int A_inc1, unsigned int A_inc2,
1044  unsigned int A_size1, unsigned int A_size2,
1045  unsigned int A_internal_size1, unsigned int A_internal_size2,
1046 
1047  const NumericT * B,
1048  unsigned int B_start1, unsigned int B_start2,
1049  unsigned int B_inc1, unsigned int B_inc2,
1050  unsigned int B_internal_size1, unsigned int B_internal_size2)
1051 {
1052  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1053  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1054 
1055  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1056  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1057  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1058 }
1059 
1060 
1061 // exp
1062 template<typename NumericT>
1064  NumericT * A,
1065  unsigned int A_start1, unsigned int A_start2,
1066  unsigned int A_inc1, unsigned int A_inc2,
1067  unsigned int A_size1, unsigned int A_size2,
1068  unsigned int A_internal_size1, unsigned int A_internal_size2,
1069 
1070  const NumericT * B,
1071  unsigned int B_start1, unsigned int B_start2,
1072  unsigned int B_inc1, unsigned int B_inc2,
1073  unsigned int B_internal_size1, unsigned int B_internal_size2)
1074 {
1075  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1076  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1077 
1078  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1079  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1080  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1081 }
1082 
1083 
1084 // fabs
1085 template<typename NumericT>
1087  NumericT * A,
1088  unsigned int A_start1, unsigned int A_start2,
1089  unsigned int A_inc1, unsigned int A_inc2,
1090  unsigned int A_size1, unsigned int A_size2,
1091  unsigned int A_internal_size1, unsigned int A_internal_size2,
1092 
1093  const NumericT * B,
1094  unsigned int B_start1, unsigned int B_start2,
1095  unsigned int B_inc1, unsigned int B_inc2,
1096  unsigned int B_internal_size1, unsigned int B_internal_size2)
1097 {
1098  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1099  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1100 
1101  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1102  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1103  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1104 }
1105 
1106 
1107 // floor
1108 template<typename NumericT>
1110  NumericT * A,
1111  unsigned int A_start1, unsigned int A_start2,
1112  unsigned int A_inc1, unsigned int A_inc2,
1113  unsigned int A_size1, unsigned int A_size2,
1114  unsigned int A_internal_size1, unsigned int A_internal_size2,
1115 
1116  const NumericT * B,
1117  unsigned int B_start1, unsigned int B_start2,
1118  unsigned int B_inc1, unsigned int B_inc2,
1119  unsigned int B_internal_size1, unsigned int B_internal_size2)
1120 {
1121  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1122  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1123 
1124  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1125  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1126  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1127 }
1128 
1129 
1130 // log
1131 template<typename NumericT>
1133  NumericT * A,
1134  unsigned int A_start1, unsigned int A_start2,
1135  unsigned int A_inc1, unsigned int A_inc2,
1136  unsigned int A_size1, unsigned int A_size2,
1137  unsigned int A_internal_size1, unsigned int A_internal_size2,
1138 
1139  const NumericT * B,
1140  unsigned int B_start1, unsigned int B_start2,
1141  unsigned int B_inc1, unsigned int B_inc2,
1142  unsigned int B_internal_size1, unsigned int B_internal_size2)
1143 {
1144  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1145  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1146 
1147  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1148  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1149  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1150 }
1151 
1152 
1153 // log10
1154 template<typename NumericT>
1156  NumericT * A,
1157  unsigned int A_start1, unsigned int A_start2,
1158  unsigned int A_inc1, unsigned int A_inc2,
1159  unsigned int A_size1, unsigned int A_size2,
1160  unsigned int A_internal_size1, unsigned int A_internal_size2,
1161 
1162  const NumericT * B,
1163  unsigned int B_start1, unsigned int B_start2,
1164  unsigned int B_inc1, unsigned int B_inc2,
1165  unsigned int B_internal_size1, unsigned int B_internal_size2)
1166 {
1167  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1168  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1169 
1170  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1171  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1172  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1173 }
1174 
1175 
1176 // sin
1177 template<typename NumericT>
1179  NumericT * A,
1180  unsigned int A_start1, unsigned int A_start2,
1181  unsigned int A_inc1, unsigned int A_inc2,
1182  unsigned int A_size1, unsigned int A_size2,
1183  unsigned int A_internal_size1, unsigned int A_internal_size2,
1184 
1185  const NumericT * B,
1186  unsigned int B_start1, unsigned int B_start2,
1187  unsigned int B_inc1, unsigned int B_inc2,
1188  unsigned int B_internal_size1, unsigned int B_internal_size2)
1189 {
1190  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1191  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1192 
1193  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1194  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1195  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1196 }
1197 
1198 
1199 // sinh
1200 template<typename NumericT>
1202  NumericT * A,
1203  unsigned int A_start1, unsigned int A_start2,
1204  unsigned int A_inc1, unsigned int A_inc2,
1205  unsigned int A_size1, unsigned int A_size2,
1206  unsigned int A_internal_size1, unsigned int A_internal_size2,
1207 
1208  const NumericT * B,
1209  unsigned int B_start1, unsigned int B_start2,
1210  unsigned int B_inc1, unsigned int B_inc2,
1211  unsigned int B_internal_size1, unsigned int B_internal_size2)
1212 {
1213  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1214  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1215 
1216  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1217  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1218  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1219 }
1220 
1221 
1222 // sqrt
1223 template<typename NumericT>
1225  NumericT * A,
1226  unsigned int A_start1, unsigned int A_start2,
1227  unsigned int A_inc1, unsigned int A_inc2,
1228  unsigned int A_size1, unsigned int A_size2,
1229  unsigned int A_internal_size1, unsigned int A_internal_size2,
1230 
1231  const NumericT * B,
1232  unsigned int B_start1, unsigned int B_start2,
1233  unsigned int B_inc1, unsigned int B_inc2,
1234  unsigned int B_internal_size1, unsigned int B_internal_size2)
1235 {
1236  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1237  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1238 
1239  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1240  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1241  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1242 }
1243 
1244 
1245 // tan
1246 template<typename NumericT>
1248  NumericT * A,
1249  unsigned int A_start1, unsigned int A_start2,
1250  unsigned int A_inc1, unsigned int A_inc2,
1251  unsigned int A_size1, unsigned int A_size2,
1252  unsigned int A_internal_size1, unsigned int A_internal_size2,
1253 
1254  const NumericT * B,
1255  unsigned int B_start1, unsigned int B_start2,
1256  unsigned int B_inc1, unsigned int B_inc2,
1257  unsigned int B_internal_size1, unsigned int B_internal_size2)
1258 {
1259  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1260  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1261 
1262  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1263  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1264  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1265 }
1266 
1267 
1268 // tanh
1269 template<typename NumericT>
1271  NumericT * A,
1272  unsigned int A_start1, unsigned int A_start2,
1273  unsigned int A_inc1, unsigned int A_inc2,
1274  unsigned int A_size1, unsigned int A_size2,
1275  unsigned int A_internal_size1, unsigned int A_internal_size2,
1276 
1277  const NumericT * B,
1278  unsigned int B_start1, unsigned int B_start2,
1279  unsigned int B_inc1, unsigned int B_inc2,
1280  unsigned int B_internal_size1, unsigned int B_internal_size2)
1281 {
1282  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1283  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1284 
1285  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1286  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1287  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
1288 }
1289 
1290 
1291 
1292 //
1293 // matrix-vector product
1294 //
1295 
1296 template<typename NumericT>
1297 __global__ void vec_mul_row_kernel(
1298  const NumericT * A,
1299  unsigned int A_row_start,
1300  unsigned int A_col_start,
1301  unsigned int A_row_inc,
1302  unsigned int A_col_inc,
1303  unsigned int A_row_size,
1304  unsigned int A_col_size,
1305  unsigned int A_internal_rows,
1306  unsigned int A_internal_cols,
1307  const NumericT * v,
1308  unsigned int v_start,
1309  unsigned int v_inc,
1310  unsigned int v_size,
1311  NumericT * result,
1312  unsigned int result_start,
1313  unsigned int result_inc,
1314  unsigned int result_size)
1315 {
1316  __shared__ NumericT work[128];
1317 
1318  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1319  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1320  unsigned int lid = threadIdx.x;
1321 
1322  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
1323  {
1324  NumericT dot_prod = 0;
1325  for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
1326  dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
1327  work[lid] = dot_prod;
1328 
1329  for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
1330  __syncthreads();
1331  if (lid < stride)
1332  work[lid] += work[lid+stride];
1333  }
1334 
1335  if (lid == 0)
1336  result[row * result_inc + result_start] = work[0];
1337  }
1338 }
1339 
1340 
1341 template<typename NumericT>
1343  const NumericT * A,
1344  unsigned int A_row_start,
1345  unsigned int A_col_start,
1346  unsigned int A_row_inc,
1347  unsigned int A_col_inc,
1348  unsigned int A_row_size,
1349  unsigned int A_col_size,
1350  unsigned int A_internal_rows,
1351  unsigned int A_internal_cols,
1352  const NumericT * v,
1353  unsigned int v_start,
1354  unsigned int v_inc,
1355  unsigned int v_size,
1356  NumericT * result,
1357  unsigned int result_start,
1358  unsigned int result_inc,
1359  unsigned int result_size)
1360 {
1361  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
1362  {
1363  NumericT dot_prod = 0;
1364  for (unsigned int col = 0; col < A_row_size; ++col)
1365  dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
1366  result[row * result_inc + result_start] = dot_prod;
1367  }
1368 }
1369 
1370 
1371 //
1372 // matrix-matrix products
1373 //
1374 
1375 
1376 
1377 
1378 //
1379 // scaled rank-1-update
1380 //
1381 
1382 // alpha on CPU
1383 template<typename NumericT>
1385  NumericT * A,
1386  unsigned int A_start1, unsigned int A_start2,
1387  unsigned int A_inc1, unsigned int A_inc2,
1388  unsigned int A_size1, unsigned int A_size2,
1389  unsigned int A_internal_size1, unsigned int A_internal_size2,
1390 
1391  NumericT val,
1392  unsigned int options2,
1393 
1394  const NumericT * vec1,
1395  unsigned int start1,
1396  unsigned int inc1,
1397  unsigned int size1,
1398 
1399  const NumericT * vec2,
1400  unsigned int start2,
1401  unsigned int inc2,
1402  unsigned int size2)
1403 {
1404  NumericT alpha = val;
1405  if (options2 & (1 << 0))
1406  alpha = -alpha;
1407  if (options2 & (1 << 1))
1408  alpha = NumericT(1) / alpha;
1409 
1410  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1411  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1412 
1413  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1414  {
1415  NumericT tmp = alpha * vec1[row * inc1 + start1];
1416  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1417  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1418  }
1419 }
1420 
1421 
1422 // alpha on GPU
1423 template<typename NumericT>
1425  NumericT * A,
1426  unsigned int A_start1, unsigned int A_start2,
1427  unsigned int A_inc1, unsigned int A_inc2,
1428  unsigned int A_size1, unsigned int A_size2,
1429  unsigned int A_internal_size1, unsigned int A_internal_size2,
1430 
1431  const NumericT * val,
1432  unsigned int options2,
1433 
1434  const NumericT * vec1,
1435  unsigned int start1,
1436  unsigned int inc1,
1437  unsigned int size1,
1438 
1439  const NumericT * vec2,
1440  unsigned int start2,
1441  unsigned int inc2,
1442  unsigned int size2)
1443 {
1444  NumericT alpha = *val;
1445  if (options2 & (1 << 0))
1446  alpha = -alpha;
1447  if (options2 & (1 << 1))
1448  alpha = NumericT(1) / alpha;
1449 
1450  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1451  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1452 
1453  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
1454  {
1455  NumericT tmp = alpha * vec1[row * inc1 + start1];
1456  for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1457  A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
1458  }
1459 }
1460 
1461 
1462 
1463 } // namespace cuda
1464 } //namespace linalg
1465 } //namespace viennacl
1466 
1467 
1468 #endif
__global__ void element_op_int_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:163
__global__ void matrix_row_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void ambm_m_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
__global__ void matrix_row_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:201
float NumericT
Definition: bisect.cpp:40
__global__ void trans_vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
__global__ void element_op_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_row_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void convert_row_kernel(DestNumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const SrcNumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_row_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__global__ void scaled_rank1_update_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void trans_kernel(const NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_internal_size1, unsigned int A_internal_size2, unsigned int A_size1, unsigned int A_size2, unsigned int A_stride1, unsigned int A_stride2, NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_internal_size1, unsigned int B_internal_size2, unsigned int B_stride1, unsigned int B_stride2, bool data_major)
__global__ void matrix_row_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_row_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void matrix_row_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void matrix_row_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_row_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)