1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
44 namespace device_specific
89 return result_type(
new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
92 template<
class NumericT>
93 result_type operator()(
NumericT const & )
const
95 return result_type(
new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
99 template<
class NumericT>
100 result_type operator()(scalar<NumericT>
const & scal)
const
102 return result_type(
new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&
viennacl::traits::handle(scal))));
106 template<
class NumericT>
107 result_type operator()(vector_base<NumericT>
const & vec)
const
109 return result_type(
new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&
viennacl::traits::handle(vec))));
113 template<
class NumericT>
114 result_type operator()(implicit_vector_base<NumericT>
const & )
const
116 return result_type(
new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
120 template<
class NumericT>
121 result_type operator()(matrix_base<NumericT>
const & mat)
const
123 return result_type(
new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&
viennacl::traits::handle(mat)),
128 template<
class NumericT>
129 result_type operator()(implicit_matrix_base<NumericT>
const & )
const
131 return result_type(
new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
136 mapping_type::key_type key(root_idx, leaf_t);
137 scheduler::statement_node
const & root_node = statement.array()[root_idx];
140 mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *
this)));
142 mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.rhs, *
this)));
146 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
148 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
150 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
152 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
154 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
156 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
158 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
160 mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
165 symbolic_binder & binder_;
170 class prototype_generation_traversal :
public tree_parsing::traversal_functor
173 std::set<std::string> & already_generated_;
176 std::map<std::string, unsigned int>
const & widths_;
178 prototype_generation_traversal(std::set<std::string> & already_generated, std::string & str,
mapping_type const & mapping, std::map<std::string, unsigned int>
const & widths) :
179 already_generated_(already_generated), str_(str), mapping_(mapping), widths_(widths){ }
181 void operator()(scheduler::statement
const & statement,
vcl_size_t root_idx, leaf_t leaf)
const
183 scheduler::statement_node
const & root_node = statement.array()[root_idx];
187 mapped_object * obj =
at(mapping_, std::make_pair(root_idx,leaf)).get();
188 if(widths_.find(obj->name())!=widths_.end())
189 obj->append_kernel_arguments(already_generated_, str_,
at(widths_, obj->name()));
191 obj->append_kernel_arguments(already_generated_, str_, 1);
199 class set_arguments_functor :
public tree_parsing::traversal_functor
202 typedef void result_type;
204 set_arguments_functor(symbolic_binder & binder,
unsigned int & current_arg,
viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ }
206 template<
class NumericT>
207 result_type operator()(
NumericT const & scal)
const {
209 kernel_.arg(current_arg_++, cl_scalartype(scal));
213 template<
class NumericT>
214 result_type operator()(scalar<NumericT>
const & scal)
const {
216 kernel_.arg(current_arg_++, scal.handle().opencl_handle());
220 template<
class NumericT>
221 result_type operator()(vector_base<NumericT>
const & vec)
const {
224 kernel_.arg(current_arg_++, vec.handle().opencl_handle());
231 template<
class NumericT>
232 result_type operator()(implicit_vector_base<NumericT>
const & vec)
const
235 kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
237 kernel_.arg(current_arg_++, cl_uint(vec.index()));
241 template<
class NumericT>
242 result_type operator()(matrix_base<NumericT>
const & mat)
const
246 kernel_.arg(current_arg_++, mat.handle().opencl_handle());
266 template<
class NumericT>
267 result_type operator()(implicit_matrix_base<NumericT>
const & mat)
const
273 void operator()(scheduler::statement
const & statement,
vcl_size_t root_idx, leaf_t leaf_t)
const
275 scheduler::statement_node
const & root_node = statement.array()[root_idx];
277 utils::call_on_element(root_node.lhs, *
this);
279 utils::call_on_element(root_node.rhs, *
this);
283 symbolic_binder & binder_;
284 unsigned int & current_arg_;
291 std::map<std::string, unsigned int>
const & widths)
293 statements_container::data_type::const_iterator sit;
294 std::vector<mapping_type>::const_iterator mit;
295 std::set<std::string> already_generated;
297 std::string arguments = first_arguments;
298 for (mit = mappings.begin(), sit = statements.
data().begin(); sit != statements.
data().end(); ++sit, ++mit)
299 tree_parsing::traverse(*sit, sit->root(), prototype_generation_traversal(already_generated, arguments, *mit, widths),
true);
300 arguments.erase(arguments.size()-1);
301 stream <<
"__kernel " <<
"void " << name <<
"(" << arguments <<
")" << std::endl;
306 generate_prototype(stream, name, first_arguments, mappings, statements, std::map<std::string, unsigned int>());
312 for (statements_container::data_type::const_iterator itt = statements.
data().begin(); itt != statements.
data().end(); ++itt)
321 message_(
"ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message +
"\n"
322 "If you are using a builtin template, please report on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible\n"
323 "If you are using your own template, please try using other parameters") {}
324 virtual const char*
what()
const throw() {
return message_.c_str(); }
327 std::string message_;
340 std::string chunk_size =
"chunk_size";
341 std::string chunk_start =
"chunk_start";
342 std::string chunk_end =
"chunk_end";
344 stream <<
"unsigned int " << chunk_size <<
" = (" << bound <<
"+" << domain_size <<
"-1)/" << domain_size <<
";" << std::endl;
345 stream <<
"unsigned int " << chunk_start <<
" =" << domain_id <<
"*" << chunk_size <<
";" << std::endl;
346 stream <<
"unsigned int " << chunk_end <<
" = min(" << chunk_start <<
"+" << chunk_size <<
", " << bound <<
");" << std::endl;
348 upper_bound = chunk_end;
366 node = &array[(node->*ptr).node_index];
376 static char suffixes[] = {
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
377 'a',
'b',
'c',
'd',
'e',
'f'};
390 for (statements_container::data_type::const_iterator it = statements.
data().begin(); it != statements.
data().end(); ++it)
393 std::vector<scheduler::lhs_rhs_element> vectors;
395 for (std::vector<scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
400 std::vector<scheduler::lhs_rhs_element> matrices;
402 for (std::vector<scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
406 std::vector<vcl_size_t> striding_operators;
408 if(striding_operators.size() > 0)
416 using namespace scheduler;
417 using namespace utils;
420 vcl_size_t size1 = up_to_internal_size?call_on_matrix(node.
lhs, internal_size1_fun()):call_on_matrix(node.
lhs, size1_fun());
421 vcl_size_t size2 = up_to_internal_size?call_on_matrix(node.
lhs, internal_size2_fun()):call_on_matrix(node.
lhs, size2_fun());
425 return up_to_internal_size?call_on_matrix(node.
lhs, internal_size2_fun()):call_on_matrix(node.
lhs, size2_fun());
427 return up_to_internal_size?call_on_matrix(node.
lhs, internal_size1_fun()):call_on_matrix(node.
lhs, size1_fun());
429 return up_to_internal_size?call_on_vector(node.
lhs, internal_size_fun()):call_on_vector(node.
lhs, size_fun());
440 fetching_policy_type fetch,
unsigned int simd_width, std::string
const & i, std::string
const & bound, std::string
const & domain_id, std::string
const & domain_size)
443 std::string boundround = bound +
"/" + strwidth;
445 std::string init, upper_bound, inc;
446 fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size);
447 stream <<
"for(unsigned int " << i <<
" = " << init <<
"; " << i <<
" < " << upper_bound <<
"; " << i <<
" += " << inc <<
")" << std::endl;
448 stream <<
"{" << std::endl;
450 loop_body(stream, simd_width);
452 stream <<
"}" << std::endl;
456 stream <<
"for(unsigned int " << i <<
" = " << boundround <<
"*" << strwidth <<
" + " << domain_id <<
"; " << i <<
" < " << bound <<
"; " << i <<
" += " + domain_size +
")" << std::endl;
457 stream <<
"{" << std::endl;
459 loop_body(stream, 1);
461 stream <<
"}" << std::endl;
465 static std::string
vstore(
unsigned int simd_width, std::string
const & value, std::string
const & offset, std::string
const & ptr)
468 return "(" + ptr +
")[" + offset +
"] = " + value;
470 return utils::append_width(
"vstore", simd_width) +
"(" + value +
", " + offset +
", " + ptr +
")";
473 static std::string
vload(
unsigned int simd_width, std::string
const & offset, std::string
const & ptr)
476 return "(" + ptr +
")[" + offset +
"]";
483 virtual std::vector<std::string> generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mapping)
const = 0;
492 statements_container::data_type::const_iterator sit;
493 std::vector<mapping_type>::iterator mit;
499 std::vector<mapping_type> mappings(statements.
data().size());
501 for (mit = mappings.begin(), sit = statements.
data().begin(); sit != statements.
data().end(); ++sit, ++mit)
504 return generate_impl(kernel_prefix, statements, mappings);
510 virtual void enqueue(std::string
const & kernel_prefix, std::vector<lazy_program_compiler> & programs,
statements_container const & statements) = 0;
518 template<
class TemplateType,
class ParametersType>
524 virtual unsigned int n_lmem_elements()
const {
return 0; }
545 using namespace viennacl::tools;
548 unsigned int scalartype_size =
utils::size_of(
lhs_most(statement.array(), statement.root()).lhs.numeric_type);
552 vcl_size_t lmem_usage = scalartype_size*n_lmem_elements();
553 if (lmem_usage>lmem_available)
554 return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
559 if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
560 return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
561 if (p_.local_size_0 > max_work_item_sizes[0])
562 return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
564 if (p_.local_size_1 > max_work_item_sizes[1])
565 return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
568 unsigned int warp_size = 8;
569 if (device.
type()==CL_DEVICE_TYPE_GPU)
577 if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
578 return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
581 if (p_.simd_width!=1 && p_.simd_width!=2 &&
582 p_.simd_width!=4 && p_.simd_width!=8 &&
584 return TEMPLATE_INVALID_SIMD_WIDTH;
586 return check_invalid_impl(device);
unsigned int local_size_0
tools::shared_ptr< template_base > clone() const
virtual int check_invalid(statements_container const &statements, viennacl::ocl::device const &device) const =0
returns whether or not the profile has undefined behavior on particular device
virtual ~loop_body_base()
Represents an OpenCL device within ViennaCL.
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Exception for the case the generator is unable to deal with the operation.
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int ¤t_arg)
Represents an OpenCL kernel within ViennaCL.
invalid_template_exception()
static std::string vstore(unsigned int simd_width, std::string const &value, std::string const &offset, std::string const &ptr)
int check_invalid(statements_container const &statements, viennacl::ocl::device const &device) const
returns whether or not the profile has undefined behavior on particular device
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Some helper routines for reading/writing/printing scheduler expressions.
void traverse(scheduler::statement const &statement, vcl_size_t root_idx, Fun const &fun, bool inspect)
Recursively execute a functor on a statement.
static bool is_node_trans(scheduler::statement::container_type const &array, vcl_size_t root_idx, leaf_t leaf_type)
parameters_type const & parameters() const
static bool is_striding_operator(scheduler::statement_node const &node)
A class representing a compute device (e.g. a GPU)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
cl_ulong local_mem_size() const
Size of local memory arena in bytes. The minimum value is 32 KB.
cl_device_type type() const
The OpenCL device type.
A class representing the 'data' for the LHS or RHS operand of the respective node.
container_type const & array() const
cl_uint vendor_id() const
A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID...
statement_node_numeric_type
Encodes the type of a node in the statement tree.
std::list< scheduler::statement > const & data() const
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
static std::string append_simd_suffix(std::string const &str, unsigned int i)
virtual tools::shared_ptr< template_base > clone() const =0
scheduler::statement_node const & lhs_most(scheduler::statement::container_type const &array, vcl_size_t root)
bool is_scalar_reduction(scheduler::statement_node const &node)
std::vector< value_type > container_type
result_of::size_type< T >::type start2(T const &obj)
vcl_size_t ld(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
unsigned int local_size_1
Map ViennaCL objects to generator wrappers.
static bool has_strided_access(statements_container const &statements)
static void fetching_loop_info(fetching_policy_type policy, std::string const &bound, utils::kernel_generation_stream &stream, std::string &init, std::string &upper_bound, std::string &inc, std::string const &domain_id, std::string const &domain_size)
Helper for compiling a program lazily.
statement_node_numeric_type numeric_type
result_of::size_type< T >::type start(T const &obj)
Various utility implementations for dispatching with respect to the different devices available on th...
invalid_template_exception(std::string message)
virtual ~invalid_template_exception()
base functor class for traversing a statement
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static vcl_size_t vector_size(scheduler::statement_node const &node, bool up_to_internal_size)
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
virtual const char * what() const
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements)
std::map< mapping_key, tools::shared_ptr< mapped_object > > mapping_type
virtual void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)=0
Code for parsing the expression trees.
std::vector< std::string > generate(std::string const &kernel_prefix, statements_container const &statements, viennacl::ocl::device const &device)
Representation of an OpenCL kernel in ViennaCL.
template_base_impl(parameters_type const ¶meters, binding_policy_t binding_policy)
The constructor.
bool row_major(T const &)
static void element_wise_loop_1D(utils::kernel_generation_stream &stream, loop_body_base const &loop_body, fetching_policy_type fetch, unsigned int simd_width, std::string const &i, std::string const &bound, std::string const &domain_id, std::string const &domain_size)
static std::string vload(unsigned int simd_width, std::string const &offset, std::string const &ptr)
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
bool is_vector_reduction(scheduler::statement_node const &node)
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
template_base(binding_policy_t binding_policy)
Main datastructure for an node in the statement tree.
size_t max_work_group_size() const
Maximum number of work-items in a work-group executing a kernel using the data parallel execution mod...
unsigned int size_of(scheduler::statement_node_numeric_type type)
tools::shared_ptr< symbolic_binder > make_binder(binding_policy_t policy)
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
std::vector< size_t > max_work_item_sizes() const
Maximum number of work-items that can be specified in each dimension of the work-group.
std::string append_width(std::string const &str, unsigned int width)
ParametersType parameters_type