ViennaCL - The Vienna Computing Library  1.7.1
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
blas3range.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2016, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 /*
19 *
20 * Tutorial: BLAS level 3 functionality on sub-matrices (blas3range.cpp and blas3range.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23 
24 //disable debug mechanisms to have a fair comparison with ublas:
25 #ifndef NDEBUG
26  #define NDEBUG
27 #endif
28 
29 
30 //
31 // include necessary system headers
32 //
33 #include <iostream>
34 
35 //
36 // ublas includes
37 //
38 #include <boost/numeric/ublas/io.hpp>
39 #include <boost/numeric/ublas/triangular.hpp>
40 #include <boost/numeric/ublas/matrix_sparse.hpp>
41 #include <boost/numeric/ublas/matrix.hpp>
42 #include <boost/numeric/ublas/matrix_proxy.hpp>
43 #include <boost/numeric/ublas/lu.hpp>
44 #include <boost/numeric/ublas/io.hpp>
45 
46 
47 // Must be set if you want to use ViennaCL algorithms on ublas objects
48 #define VIENNACL_WITH_UBLAS 1
49 
50 //
51 // ViennaCL includes
52 //
53 #include "viennacl/scalar.hpp"
54 #include "viennacl/vector.hpp"
55 #include "viennacl/matrix.hpp"
56 #include "viennacl/linalg/prod.hpp"
59 #include "viennacl/tools/timer.hpp"
60 
61 #define BLAS3_MATRIX_SIZE 1500
62 
63 using namespace boost::numeric;
64 
65 int main()
66 {
67  typedef float ScalarType;
68 
70  double exec_time;
71 
73 
74  //
75  // Set up some ublas objects
76  //
77  ublas::matrix<ScalarType> ublas_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
78  ublas::matrix<ScalarType, ublas::column_major> ublas_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
79  ublas::matrix<ScalarType> ublas_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
80  ublas::matrix<ScalarType> ublas_C1(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
81  ublas::matrix<ScalarType> ublas_C2(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
82 
83  //
84  // One alternative: Put the matrices into a contiguous block of memory (allows to use viennacl::fast_copy(), avoiding temporary memory)
85  //
86  std::vector<ScalarType> stl_A(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
87  std::vector<ScalarType> stl_B(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
88  std::vector<ScalarType> stl_C(BLAS3_MATRIX_SIZE * BLAS3_MATRIX_SIZE);
89 
90  //
91  // Fill the matrix
92  //
93  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
94  for (unsigned int j = 0; j < ublas_A.size2(); ++j)
95  {
96  ublas_A(i,j) = randomNumber();
97  stl_A[i*ublas_A.size2() + j] = ublas_A(i,j);
98  }
99 
100  for (unsigned int i = 0; i < ublas_B.size1(); ++i)
101  for (unsigned int j = 0; j < ublas_B.size2(); ++j)
102  {
103  ublas_B(i,j) = randomNumber();
104  stl_B[i + j*ublas_B.size1()] = ublas_B(i,j);
105  }
106 
107  ublas::range ublas_r1(1, BLAS3_MATRIX_SIZE-1);
108  ublas::range ublas_r2(2, BLAS3_MATRIX_SIZE-2);
109  ublas::matrix_range< ublas::matrix<ScalarType> > ublas_A_sub(ublas_A, ublas_r1, ublas_r2);
110  ublas::matrix_range< ublas::matrix<ScalarType, ublas::column_major> > ublas_B_sub(ublas_B, ublas_r2, ublas_r1);
111  ublas::matrix_range< ublas::matrix<ScalarType> > ublas_C_sub(ublas_C, ublas_r1, ublas_r1);
112 
113  //
114  // Set up some ViennaCL objects
115  //
116  //viennacl::ocl::set_context_device_type(0, viennacl::ocl::gpu_tag()); //uncomment this is you wish to use GPUs only
117  viennacl::matrix<ScalarType> vcl_A(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
118  viennacl::matrix<ScalarType, viennacl::column_major> vcl_B(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
119  viennacl::matrix<ScalarType> vcl_C(BLAS3_MATRIX_SIZE, BLAS3_MATRIX_SIZE);
120 
121  viennacl::range vcl_r1(1, BLAS3_MATRIX_SIZE-1);
122  viennacl::range vcl_r2(2, BLAS3_MATRIX_SIZE-2);
123  viennacl::matrix_range< viennacl::matrix<ScalarType> > vcl_A_sub(vcl_A, vcl_r1, vcl_r2);
125  viennacl::matrix_range< viennacl::matrix<ScalarType> > vcl_C_sub(vcl_C, vcl_r1, vcl_r1);
126 
127  ublas_C.clear();
128  viennacl::copy(ublas_C, vcl_C);
129 
133 
134  //
135  // Compute reference product using ublas:
136  //
137  std::cout << "--- Computing matrix-matrix product using ublas ---" << std::endl;
138  timer.start();
139  ublas_C_sub = ublas::prod(ublas_A_sub, ublas_B_sub);
140  exec_time = timer.get();
141  std::cout << " - Execution time: " << exec_time << std::endl;
142 
143  //std::cout << ublas_C << std::endl;
144 
145  //
146  // Now iterate over all OpenCL devices in the context and compute the matrix-matrix product
147  //
148  std::cout << std::endl << "--- Computing matrix-matrix product on each available compute device using ViennaCL ---" << std::endl;
149  std::vector<viennacl::ocl::device> devices = viennacl::ocl::current_context().devices();
150  for (std::size_t i=0; i<devices.size(); ++i)
151  {
153  std::cout << " - Device Name: " << viennacl::ocl::current_device().name() << std::endl;
154 
155  //viennacl::copy(ublas_A, vcl_A);
156  //viennacl::copy(ublas_B, vcl_B);
157  viennacl::fast_copy(&(stl_A[0]),
158  &(stl_A[0]) + stl_A.size(),
159  vcl_A);
160  viennacl::fast_copy(&(stl_B[0]),
161  &(stl_B[0]) + stl_B.size(),
162  vcl_B);
163  vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
165  timer.start();
166  vcl_C_sub = viennacl::linalg::prod(vcl_A_sub, vcl_B_sub);
168  exec_time = timer.get();
169  std::cout << " - Execution time on device (no setup time included): " << exec_time << std::endl;
170  std::cout << " - GFLOPs: " << (vcl_A.size1() / 1000.0) * (vcl_A.size2() / 1000.0) * (vcl_B.size2() / 1000.0) / exec_time << std::endl;
171 
172  //std::cout << vcl_C << std::endl;
173 
174  //
175  // Verify the result
176  //
177  //viennacl::copy(vcl_C, ublas_C1);
178  viennacl::fast_copy(vcl_C, &(stl_C[0]));
179  for (unsigned int i = 0; i < ublas_C1.size1(); ++i)
180  for (unsigned int j = 0; j < ublas_C1.size2(); ++j)
181  ublas_C1(i,j) = stl_C[i * ublas_C1.size2() + j];
182 
183  std::cout << " - Checking result... ";
184  bool check_ok = true;
185  for (unsigned int i = 0; i < ublas_A.size1(); ++i)
186  {
187  for (unsigned int j = 0; j < ublas_A.size2(); ++j)
188  {
189  if ( fabs(ublas_C1(i,j) - ublas_C(i,j)) / ublas_C(i,j) > 1e-4 )
190  {
191  check_ok = false;
192  break;
193  }
194  }
195  if (!check_ok)
196  break;
197  }
198  if (check_ok)
199  std::cout << "[OK]" << std::endl << std::endl;
200  else
201  std::cout << "[FAILED]" << std::endl << std::endl;
202 
203  }
204 
205  //
206  // That's it.
207  //
208  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
209  return EXIT_SUCCESS;
210 }
211 
Simple timer class based on gettimeofday (POSIX) or QueryPerformanceCounter (Windows).
Definition: timer.hpp:90
void finish() const
Waits until all kernels in the queue have finished their execution.
void switch_device(vcl_size_t i)
Switches the current device to the i-th device in this context.
Definition: context.hpp:119
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
Implementation of the dense matrix class.
viennacl::ocl::context & current_context()
Convenience function for returning the current context.
Definition: backend.hpp:213
A dense matrix class.
Definition: forwards.h:375
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351
basic_range range
Definition: forwards.h:424
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:102
Random number generator for returning uniformly distributed values in the closed interval [0...
Definition: random.hpp:44
viennacl::ocl::command_queue & get_queue()
Convenience function for getting the default queue for the currently active device in the active cont...
Definition: backend.hpp:320
size_type size2() const
Returns the number of columns.
Definition: matrix_def.hpp:226
std::string name() const
Device name string.
Definition: device.hpp:566
size_type size1() const
Returns the number of rows.
Definition: matrix_def.hpp:224
#define BLAS3_MATRIX_SIZE
Definition: blas3range.cpp:61
A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling.
Proxy classes for matrices.
void prod(std::vector< std::map< IndexT, NumericT > > const &stl_A, std::vector< std::map< IndexT, NumericT > > const &stl_B, std::vector< std::map< IndexT, NumericT > > &stl_C)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void copy(std::vector< NumericT > &cpu_vec, circulant_matrix< NumericT, AlignmentV > &gpu_mat)
Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU) ...
A small collection of sequential random number generators.
A range class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
Definition: forwards.h:424
float ScalarType
Definition: fft_1d.cpp:42
int main()
Definition: blas3range.cpp:65
double get() const
Definition: timer.hpp:104
Class for representing non-strided submatrices of a bigger matrix A.
Definition: forwards.h:440
std::vector< viennacl::ocl::device > const & devices() const
Returns a vector with all devices in this context.
Definition: context.hpp:106
Implementation of the ViennaCL scalar class.
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)