//
// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//

#include "core/kernel.hpp"
#include "core/resource.hpp"
#include "util/factor.hpp"
#include "util/u_math.h"
#include "pipe/p_context.h"

using namespace clover;

kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<module::argument> &margs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   for (auto &marg : margs) {
      if (marg.semantic == module::argument::general)
         _args.emplace_back(argument::create(marg));
   }
}

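///
/// Pad vector \a v out to the number of grid dimensions supported by
/// the device, filling the missing entries with \a x.  E.g. on a
/// device with three grid dimensions, pad_vector(q, { 16, 16 }, 1)
/// yields { 16, 16, 1 }.
///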
template<typename V>
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.device().max_block_size().size(), x);
   return w;
}

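///
/// Launch the kernel on the given queue: bind the compute state along
/// with the samplers, sampler views, compute resources and global
/// buffers collected from the arguments, fire launch_grid(), and then
/// unbind everything again in reverse order.
///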
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().binary(q.device());
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);

   // The handles are created during exec_context::bind(), so we need to
   // make sure to call exec_context::bind() before retrieving them.
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   q.pipe->launch_grid(q.pipe,
                       pad_vector(q, block_size, 1).data(),
                       pad_vector(q, reduced_grid_size, 1).data(),
                       find(name_equals(_name), m.syms).offset,
                       exec.input.data());

   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);
   exec.unbind();
}

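///
/// Total amount of __local memory used by this kernel: the sum of the
/// storage sizes that have been set for its local arguments.
///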
size_t
kernel::mem_local() const {
   size_t sz = 0;

   for (auto &arg : args()) {
      if (dynamic_cast<local_argument *>(&arg))
         sz += arg.storage();
   }

   return sz;
}

size_t
kernel::mem_private() const {
   return 0;
}

const std::string &
kernel::name() const {
   return _name;
}

std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}

std::vector<size_t>
kernel::required_block_size() const {
   return { 0, 0, 0 };
}

kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}

kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}

const module &
kernel::module(const command_queue &q) const {
   return program().binary(q.device());
}

kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}

kernel::exec_context::~exec_context() {
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}

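///
/// Bind the kernel arguments to this execution context and return a
/// compute state object ready for bind_compute_state().  The state is
/// cached and only recreated when the queue, the required amount of
/// local memory or the input buffer size has changed.
///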
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().binary(q->device());
   auto margs = find(name_equals(kern.name()), m.syms).args;
   auto msec = find(type_equals(module::section::text), m.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         for (cl_uint x : pad_vector(*q, grid_offset, 1)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
   }

   return st;
}

void
kernel::exec_context::unbind() {
   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}

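///
/// Helpers used below to marshal argument values into the kernel
/// input buffer: conversion to raw bytes, byte-order fix-up,
/// alignment padding and sign/zero extension.
///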
namespace {
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}

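///
/// Factory that instantiates the argument subclass matching the
/// argument type reported by the module, throwing if the type is
/// unknown.
///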
std::unique_ptr<kernel::argument>
kernel::argument::create(const module::argument &marg) {
   switch (marg.type) {
   case module::argument::scalar:
      return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));

   case module::argument::global:
      return std::unique_ptr<kernel::argument>(new global_argument);

   case module::argument::local:
      return std::unique_ptr<kernel::argument>(new local_argument);

   case module::argument::constant:
      return std::unique_ptr<kernel::argument>(new constant_argument);

   case module::argument::image2d_rd:
   case module::argument::image3d_rd:
      return std::unique_ptr<kernel::argument>(new image_rd_argument);

   case module::argument::image2d_wr:
   case module::argument::image3d_wr:
      return std::unique_ptr<kernel::argument>(new image_wr_argument);

   case module::argument::sampler:
      return std::unique_ptr<kernel::argument>(new sampler_argument);

   }
   throw error(CL_INVALID_KERNEL_DEFINITION);
}

kernel::argument::argument() : _set(false) {
}

bool
kernel::argument::set() const {
   return _set;
}

size_t
kernel::argument::storage() const {
   return 0;
}

kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}

void
kernel::scalar_argument::set(size_t size, const void *value) {
   if (size != this->size)
      throw error(CL_INVALID_ARG_SIZE);

   v = { (uint8_t *)value, (uint8_t *)value + size };
   _set = true;
}

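///
/// Append the scalar value to the input buffer, extended to the size
/// and byte order expected by the target device.
///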
void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   auto w = v;

   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}

void
kernel::scalar_argument::unbind(exec_context &ctx) {
}

void
kernel::global_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}

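///
/// __global pointers are passed as an offset into their pipe
/// resource: the offset is written into the input buffer and its
/// position recorded in g_handles, so that launch() can hand the
/// driver a patch location through set_global_binding().
///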
void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How do we handle multi-dimensional offsets?
      // We don't need to.  Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::global_argument::unbind(exec_context &ctx) {
}

size_t
kernel::local_argument::storage() const {
   return _storage;
}

void
kernel::local_argument::set(size_t size, const void *value) {
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   _storage = size;
   _set = true;
}

void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}

void
kernel::local_argument::unbind(exec_context &ctx) {
}

void
kernel::constant_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}

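///
/// __constant buffers are bound as compute resources; the argument
/// value encodes the resource index in the top byte and the offset
/// within the resource in the remaining bits.
///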
void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::constant_argument::unbind(exec_context &ctx) {
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::image_rd_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

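///
/// Read-only images are passed as an index into the sampler view
/// list; the view itself is bound through set_sampler_views() at
/// launch time.
///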
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}

void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}

void
kernel::image_wr_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

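///
/// Write-only images are passed as an index into the compute resource
/// list, bound as a writable surface.
///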
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}

void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

void
kernel::sampler_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_sampler))
      throw error(CL_INVALID_ARG_SIZE);

   s = &obj(*(cl_sampler *)value);
   _set = true;
}

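///
/// Samplers take up no space in the input buffer; the bound sampler
/// state is simply appended to the list passed to
/// bind_sampler_states() at launch time.
///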
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}

void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}