doc/dense__blas_8cpp_source.html

 /* =========================================================================

    Copyright (c) 2010-2016, Institute for Microelectronics,

                             Institute for Analysis and Scientific Computing,

                             TU Wien.

    Portions of this software are copyright by UChicago Argonne, LLC.


                             -----------------

                   ViennaCL - The Vienna Computing Library

                             -----------------


    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at


    (A list of authors and contributors can be found in the PDF manual)


    License:         MIT (X11), see file LICENSE in the base directory

 ============================================================================= */


 #include "viennacl/matrix.hpp"

 #include "viennacl/matrix_proxy.hpp"

 #include "viennacl/vector.hpp"

 #include "viennacl/vector_proxy.hpp"


 #include "viennacl/linalg/inner_prod.hpp"

 #include "viennacl/linalg/prod.hpp"

 #include "viennacl/linalg/lu.hpp"

 #include "viennacl/tools/timer.hpp"


 #include <iomanip>

 #include <stdlib.h>


 /**
  * Fills a dense ViennaCL matrix with pseudo-random values.
  *
  * A host-side buffer covering the matrix's full internal (padded) storage is
  * allocated. Each logical entry (row, col) is set to rand()/RAND_MAX, i.e. a
  * value in [0, 1], at the position given by the layout tag F::mem_index.
  * Buffer slots that do not belong to a logical entry keep the value the
  * std::vector constructor gave them. The buffer is then handed to
  * viennacl::fast_copy to populate M.
  *
  * @tparam T  Scalar type of the matrix entries.
  * @tparam F  Layout tag providing mem_index(i, j, internal_rows, internal_cols).
  * @param  M  Matrix to be overwritten.
  */
 template<class T, class F>
 void init_random(viennacl::matrix<T, F> & M)
 {
   std::size_t const rows        = M.size1();
   std::size_t const cols        = M.size2();
   std::size_t const padded_rows = M.internal_size1();
   std::size_t const padded_cols = M.internal_size2();

   std::vector<T> host_buffer(M.internal_size());

   // Row-by-row traversal keeps the sequence of rand() calls per entry fixed.
   for (std::size_t row = 0; row < rows; ++row)
   {
     for (std::size_t col = 0; col < cols; ++col)
     {
       T const value = T(rand()) / T(RAND_MAX);
       host_buffer[F::mem_index(row, col, padded_rows, padded_cols)] = value;
     }
   }

   T * const first = &host_buffer[0];
   viennacl::fast_copy(first, first + host_buffer.size(), M);
 }


 /**
  * Fills a ViennaCL vector with pseudo-random values.
  *
  * A host-side buffer of x.internal_size() elements is created and every
  * element of it (not just the first x.size() ones) is set to rand()/RAND_MAX,
  * i.e. a value in [0, 1]. The whole buffer is then passed to
  * viennacl::fast_copy, starting at x.begin().
  *
  * @tparam T  Scalar type of the vector entries.
  * @param  x  Vector to be overwritten.
  */
 template<class T>
 void init_random(viennacl::vector<T> & x)
 {
   typedef typename std::vector<T>::iterator host_iterator;

   std::vector<T> host_buffer(x.internal_size());

   for (host_iterator it = host_buffer.begin(); it != host_buffer.end(); ++it)
   {
     *it = T(rand()) / T(RAND_MAX);
   }

   T * const first = &host_buffer[0];
   viennacl::fast_copy(first, first + host_buffer.size(), x.begin());
 }


 template<class T>

 void bench(size_t BLAS1_N, size_t BLAS2_M, size_t BLAS2_N, size_t BLAS3_M, size_t BLAS3_N, size_t BLAS3_K, std::string const & prefix)

 {

   using viennacl::linalg::inner_prod;

   using viennacl::linalg::prod;

   using viennacl::linalg::lu_factorize;

   using viennacl::trans;


   viennacl::tools::timer timer;

   double time_previous, time_spent;

   size_t Nruns;

   double time_per_benchmark = 1;


 #define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX) \

   OPERATION; \

   viennacl::backend::finish();\

   timer.start(); \

   Nruns = 0; \

   time_spent = 0; \

   while (time_spent < time_per_benchmark) \

   { \

     time_previous = timer.get(); \

     OPERATION; \

     viennacl::backend::finish(); \

     time_spent += timer.get() - time_previous; \

     Nruns+=1; \

   } \

   time_spent/=(double)Nruns; \

   std::cout << prefix << NAME " : " << PERF << " " INDEX << std::endl; \


   //BLAS1

   {

     viennacl::scalar<T> s(0);

     T alpha = (T)2.4;

     viennacl::vector<T> x(BLAS1_N);

     viennacl::vector<T> y(BLAS1_N);

     viennacl::vector<T> z(BLAS1_N);


     init_random(x);

     init_random(y);

     init_random(z);


     BENCHMARK_OP(x = y,                "COPY", std::setprecision(3) << double(2*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")

     BENCHMARK_OP(x = y + alpha*x,      "AXPY", std::setprecision(3) << double(3*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")

     BENCHMARK_OP(s = inner_prod(x, y), "DOT",  std::setprecision(3) << double(2*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")

   }


   //BLAS2

   {

     viennacl::matrix<T,viennacl::column_major> A(BLAS2_M, BLAS2_N);

     viennacl::vector<T> x(BLAS2_N);

     viennacl::vector<T> y(BLAS2_M);

     init_random(A);

     init_random(x);

     init_random(y);


     BENCHMARK_OP(y = prod(A, x),        "GEMV-N", std::setprecision(3) << double((BLAS2_M + BLAS2_N + BLAS2_M*BLAS2_N)*sizeof(T))/time_spent * 1e-9, "GB/s")

     BENCHMARK_OP(x = prod(trans(A), y), "GEMV-T", std::setprecision(3) << double((BLAS2_M + BLAS2_N + BLAS2_M*BLAS2_N)*sizeof(T))/time_spent * 1e-9, "GB/s")

   }


   //BLAS3

   {

     viennacl::matrix<T,viennacl::column_major> C(BLAS3_M, BLAS3_N);

     viennacl::matrix<T,viennacl::column_major> A(BLAS3_M, BLAS3_K);

     viennacl::matrix<T,viennacl::column_major> B(BLAS3_K, BLAS3_N);

     viennacl::matrix<T,viennacl::column_major> AT = trans(A);

     viennacl::matrix<T,viennacl::column_major> BT = trans(B);

     init_random(A);

     init_random(B);


     BENCHMARK_OP(C = prod(A, B),                 "GEMM-NN",      double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");

     BENCHMARK_OP(C = prod(A, trans(BT)),         "GEMM-NT",      double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");

     BENCHMARK_OP(C = prod(trans(AT), B),         "GEMM-TN",      double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");

     BENCHMARK_OP(C = prod(trans(AT), trans(BT)), "GEMM-TT",      double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");

     //BENCHMARK_OP(lu_factorize(A),                "LU-FACTORIZE", double(2*BLAS3_M*BLAS3_K*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");

   }


 }


 /**
  * Benchmark driver: optionally prints information about the active OpenCL
  * device, then runs the dense BLAS benchmarks in single precision and,
  * where available, in double precision.
  */
 int main()
 {
 #ifdef VIENNACL_WITH_OPENCL
   // Banner followed by the description of the currently active device.
   std::cout << std::endl
             << "----------------------------------------------" << std::endl
             << "               Device Info" << std::endl
             << "----------------------------------------------" << std::endl
             << std::endl;
   std::cout << viennacl::ocl::current_device().info() << std::endl;
   std::cout << std::endl;
 #endif

   // Problem sizes handed to bench().
   std::size_t const vector_length = 10000000;

   std::size_t const gemv_rows = 3840;
   std::size_t const gemv_cols = 3840;

   std::size_t const gemm_m = 1976;
   std::size_t const gemm_n = 1976;
   std::size_t const gemm_k = 1976;

   std::cout << "Benchmark : BLAS" << std::endl
             << "----------------" << std::endl;
   bench<float>(vector_length, gemv_rows, gemv_cols, gemm_m, gemm_n, gemm_k, "s");
   std::cout << "----" << std::endl;

   // Without OpenCL the double precision run is unconditional; with OpenCL it
   // is performed only if the current device reports double support.
   bool run_double_precision = true;
 #ifdef VIENNACL_WITH_OPENCL
   run_double_precision = viennacl::ocl::current_device().double_support();
 #endif
   if (run_double_precision)
   {
     bench<double>(vector_length, gemv_rows, gemv_cols, gemm_m, gemm_n, gemm_k, "d");
   }

   return 0;
 }

init_random
void init_random(viennacl::matrix< T, F > &M)
Definition: dense_blas.cpp:32

viennacl::tools::timer
Simple timer class based on gettimeofday (POSIX) or QueryPerformanceCounter (Windows).
Definition: timer.hpp:90

viennacl::scalar
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:227

viennacl::trans
viennacl::enable_if< viennacl::is_any_sparse_matrix< M1 >::value, matrix_expression< const M1, const M1, op_trans > >::type trans(const M1 &mat)
Returns an expression template class representing a transposed matrix.
Definition: sparse_matrix_operations.hpp:376

bench
void bench(size_t BLAS1_N, size_t BLAS2_M, size_t BLAS2_N, size_t BLAS3_M, size_t BLAS3_N, size_t BLAS3_K, std::string const &prefix)
Definition: dense_blas.cpp:51

viennacl::matrix_base< NumericT >::internal_size
size_type internal_size() const
Returns the total amount of allocated memory in multiples of sizeof(NumericT)
Definition: matrix_def.hpp:242

trans
std::vector< std::vector< NumericT > > trans(std::vector< std::vector< NumericT > > const &A)
Definition: blas3_solve.cpp:195

prod.hpp
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...

matrix.hpp
Implementation of the dense matrix class.

viennacl::matrix
A dense matrix class.
Definition: forwards.h:375

viennacl::linalg::inner_prod
viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value, typename VectorT1::value_type >::type inner_prod(VectorT1 const &v1, VectorT2 const &v2)
Definition: inner_prod.hpp:100

viennacl::ocl::current_device
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351

inner_prod.hpp
Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.

viennacl::ocl::device::info
std::string info(vcl_size_t indent=0, char indent_char= ' ') const
Returns an info string with a few properties of the device. Use full_info() to get all details...
Definition: device.hpp:995

BENCHMARK_OP
#define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX)

viennacl::linalg::prod
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:102

viennacl::vector_base< NumericT >::begin
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)

viennacl::ocl::device::double_support
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Definition: device.hpp:956

viennacl::matrix_base< NumericT >::size2
size_type size2() const
Returns the number of columns.
Definition: matrix_def.hpp:226

viennacl::vector
Definition: forwards.h:266

lu.hpp
Implementations of LU factorization for row-major and column-major dense matrices.

viennacl::matrix_base< NumericT >::size1
size_type size1() const
Returns the number of rows.
Definition: matrix_def.hpp:224

vector_proxy.hpp
Proxy classes for vectors.

timer.hpp
A simple, yet (mostly) sufficiently accurate timer for benchmarking and profiling.

matrix_proxy.hpp
Proxy classes for matrices.

prod
void prod(std::vector< std::map< IndexT, NumericT > > const &stl_A, std::vector< std::map< IndexT, NumericT > > const &stl_B, std::vector< std::map< IndexT, NumericT > > &stl_C)
Definition: sparse_prod.cpp:114

vector.hpp
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...

viennacl::matrix_base< NumericT >::internal_size2
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:240

main
int main()
Definition: dense_blas.cpp:131

viennacl::matrix_base< NumericT >::internal_size1
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:238

viennacl::vector_base< NumericT >::internal_size
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padd...
Definition: vector_def.hpp:120

viennacl::linalg::lu_factorize
void lu_factorize(matrix< NumericT, viennacl::row_major > &A)
LU factorization of a row-major dense matrix.
Definition: lu.hpp:42

viennacl::fast_copy
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)