ViennaCL - The Vienna Computing Library 1.7.1
iterative_operations.hpp
#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/opencl/iterative_operations.hpp
    @brief Implementations of specialized kernels for the iterative solvers using OpenCL.
*/

#include <cmath>

#include "viennacl/forwards.h"
#include "viennacl/detail/vector_def.hpp"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/handle.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/opencl/common.hpp"
#include "viennacl/linalg/opencl/kernels/iterative.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"

namespace viennacl
{
namespace linalg
{
namespace opencl
{

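/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
  *
  * This routine computes for vectors 'result', 'p', 'r', 'Ap':
  *   result += alpha * p;
  *   r      -= alpha * Ap;
  *   p       = r + beta * p;
  * and runs the first reduction stage for computing inner_prod(r, r).
  */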
template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result,
                                NumericT alpha,
                                vector_base<NumericT> & p,
                                vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap,
                                NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update");
  cl_uint vec_size = cl_uint(viennacl::traits::size(result));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
}

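/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
  *
  * This routine computes Ap = prod(A, p) and runs the first reduction stage
  * for the inner products inner_prod(Ap, Ap) and inner_prod(p, Ap).
  */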
template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "cg_csr_blocked_prod" : "cg_csr_prod");

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p,
                             Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p,
                             Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(1024 * sizeof(NumericT))
                            ));
  }

}

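/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm. */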
template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  Ap.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod");
  unsigned int thread_num = 256; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p,
                           Ap,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          ));
}

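/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm. */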
template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

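/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm. */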
template<typename NumericT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 256;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

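/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm. */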
template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

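/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes s = r - alpha * Ap, with alpha obtained on the device
  * from the reduction chunks in inner_prod_buffer, and runs the first
  * reduction stage for computing inner_prod(s, s).
  */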
template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s");
  cl_uint vec_size = cl_uint(viennacl::traits::size(s));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
  viennacl::ocl::enqueue(k(s, r, Ap,
                           inner_prod_buffer, chunk_size, chunk_offset, vec_size,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
}

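/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes
  *   result   += alpha * p + omega * s;
  *   residual  = s - omega * As;
  *   p         = residual + beta * (p - omega * Ap);
  * and runs the first reduction stage for computing inner_prod(residual, r0star).
  */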
template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
                                      NumericT beta, vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
{
  (void)buffer_chunk_size;

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update");
  cl_uint vec_size = cl_uint(viennacl::traits::size(result));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
                           residual, As,
                           beta, Ap,
                           r0star,
                           inner_prod_buffer,
                           vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

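/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm.
  *
  * This routine computes Ap = prod(A, p) and runs the first reduction stage
  * for the inner products inner_prod(Ap, Ap), inner_prod(p, Ap), and inner_prod(Ap, r0star).
  */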
template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod");

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p,
                             Ap,
                             r0star,
                             vec_size,
                             inner_prod_buffer, chunk_size, chunk_offset,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p,
                             Ap,
                             r0star,
                             vec_size,
                             inner_prod_buffer, chunk_size, chunk_offset,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                            ));
  }

}

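/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm. */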
template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  Ap.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod");
  unsigned int thread_num = 256; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p,
                           Ap,
                           r0star,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          ));
}

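/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm. */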
template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod");

  unsigned int thread_num = 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

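/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm. */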
template<typename NumericT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 256;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

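/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm. */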
template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod");

  unsigned int thread_num = 256;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*256);
  }

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p),
                           viennacl::traits::opencl_handle(Ap),
                           r0star,
                           vec_size,
                           inner_prod_buffer, chunk_size, chunk_offset,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
                          )
                        );
}

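/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
  *
  * This routine computes for vectors 'residual', 'v_k':
  *   the second reduction step for ||v_k||,
  *   v_k /= ||v_k||,
  *   and the first reduction step for <residual, v_k>.
  */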
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk = cl_uint(v_k.size());
  cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k));
  cl_uint R_offset = cl_uint(offset_in_R);
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
  viennacl::ocl::enqueue(k(v_k, vk_offset,
                           residual,
                           R_buffer, R_offset,
                           inner_prod_buffer, chunk_size,
                           r_dot_vk_buffer, chunk_offset,
                           size_vk,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          ));
}

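/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i = 0, ..., k-1. */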
template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k = cl_uint(param_k);
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
                           vi_in_vk_buffer, chunk_size
                          ));
}

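/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i = 0, ..., k-1, then updates v_k -= <v_i, v_k> v_i and runs the first reduction stage for ||v_k||. */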
template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k = cl_uint(param_k);
  cl_uint chunk_size = cl_uint(buffer_chunk_size);
  cl_uint ocl_krylov_dim = cl_uint(krylov_dim);
  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
                           vi_in_vk_buffer, chunk_size,
                           R_buffer, ocl_krylov_dim,
                           inner_prod_buffer,
                           viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
                          ));
}

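/** @brief Computes the result update x += eta_0 * residual + sum_{i=1}^{k-1} eta_i * v_{i-1} for the pipelined GMRES algorithm. */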
template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t param_k)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result");

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  cl_uint size_vk = cl_uint(v_k_size);
  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
  cl_uint ocl_k = cl_uint(param_k);
  viennacl::ocl::enqueue(k(result,
                           residual,
                           krylov_basis, size_vk, internal_size_vk,
                           coefficients, ocl_k
                          ));
}

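/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined GMRES algorithm. */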
template <typename T>
void pipelined_gmres_prod(compressed_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod");

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));

  k.local_work_size(0, 128);
  k.global_work_size(0, 128*128);

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
  {
    k.local_work_size(0, 256);
    k.global_work_size(0, 256*128);
  }

  if (use_nvidia_blocked)
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
                             p, start_p,
                             Ap, start_Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                            ));
  }
  else
  {
    viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
                             p, start_p,
                             Ap, start_Ap,
                             vec_size,
                             inner_prod_buffer,
                             buffer_size_per_vector,
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                             viennacl::ocl::local_mem(1024 * sizeof(T))
                            ));
  }
}

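/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined GMRES algorithm. */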
template <typename T>
void pipelined_gmres_prod(coordinate_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));

  Ap.clear();
  inner_prod_buffer.clear();

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod");
  unsigned int thread_num = 128; //k.local_work_size(0);

  k.local_work_size(0, thread_num);

  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases

  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
                           p, start_p,
                           Ap, start_Ap,
                           vec_size,
                           viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
                           viennacl::ocl::local_mem(sizeof(T)*thread_num),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          ));
}

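/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined GMRES algorithm. */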
template <typename T>
void pipelined_gmres_prod(ell_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod");

  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.maxnnz()),
                           cl_uint(A.internal_maxnnz()),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}

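/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined GMRES algorithm. */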
template <typename T>
void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod");

  vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
  unsigned int group_num = 128;

  if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
    thread_num = 256;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
                           A.handle2().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle().opencl_handle(),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           cl_uint(A.rows_per_block()),
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}

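/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined GMRES algorithm. */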
template <typename T>
void pipelined_gmres_prod(hyb_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);

  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
  cl_uint start_p = cl_uint(viennacl::traits::start(p));
  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));

  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod");

  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
  unsigned int group_num = 128;

  k.local_work_size(0, thread_num);
  k.global_work_size(0, thread_num * group_num);

  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
                           A.handle().opencl_handle(),
                           A.handle3().opencl_handle(),
                           A.handle4().opencl_handle(),
                           A.handle5().opencl_handle(),
                           cl_uint(A.internal_size1()),
                           cl_uint(A.ell_nnz()),
                           cl_uint(A.internal_ellnnz()),
                           viennacl::traits::opencl_handle(p), start_p,
                           viennacl::traits::opencl_handle(Ap), start_Ap,
                           vec_size,
                           inner_prod_buffer,
                           buffer_size_per_vector,
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
                           viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
                          )
                        );
}

} //namespace opencl
} //namespace linalg
} //namespace viennacl

#endif
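Usage note: the routines in this header are internal OpenCL backend implementations and are not meant to be called directly; they are reached through the iterative-solver frontends. Below is a minimal sketch of the typical entry point, assuming an OpenCL-enabled build of ViennaCL; the matrix dimensions, scalar type, and solver parameters are illustrative only.

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cg.hpp"

int main()
{
  // Illustrative sizes; entries would normally be transferred from
  // host data via viennacl::copy().
  viennacl::compressed_matrix<float> A(1000, 1000);
  viennacl::vector<float> b(1000);

  // solve() with a cg_tag dispatches to the pipelined CG kernels above
  // (vector update + fused matrix-vector product) on the OpenCL backend.
  viennacl::vector<float> x = viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(1e-8, 500));
  return 0;
}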