/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

enum {
        SCHEDULED = -1,
        DELAYED = -2,
};

/*
 * Instruction Scheduling:
 *
 * Using the depth sorted list from the depth pass, attempt to recursively
 * schedule the deepest unscheduled path.  The first instruction that cannot
 * be scheduled returns the number of delay slots it still needs, at which
 * point we return back up to the top and attempt to schedule by the next
 * highest depth.  After a sufficient number of instructions have been
 * scheduled, return back to the beginning of the list and start again.  If
 * we reach the end of the depth sorted list without being able to insert
 * any instruction, insert nops.  Repeat until there are no more unscheduled
 * instructions.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Uses of the address
 * register (a0.x) or predicate register (p0.x) must be serialized, ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
        struct ir3_instruction *scheduled; /* last scheduled instr */
        struct ir3_instruction *addr;      /* current a0.x user, if any */
        struct ir3_instruction *pred;      /* current p0.x user, if any */
        unsigned cnt;
        bool error;
};

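/* return the deepest (largest ->depth) instruction from the src array,
 * clearing its slot so that repeated calls walk the sources from
 * deepest to shallowest:
 */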
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
        struct ir3_instruction *d = NULL;
        unsigned i = 0, id = 0;

        while ((i < nsrcs) && !(d = srcs[id = i]))
                i++;

        if (!d)
                return NULL;

        for (; i < nsrcs; i++)
                if (srcs[i] && (srcs[i]->depth > d->depth))
                        d = srcs[id = i];

        srcs[id] = NULL;

        return d;
}

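/* count how many alu/flow instruction slots have been issued since 'instr'
 * was scheduled (walking back through the scheduled list), capped at maxd:
 */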
static unsigned distance(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr, unsigned maxd)
{
        struct ir3_instruction *n = ctx->scheduled;
        unsigned d = 0;
        while (n && (n != instr) && (d < maxd)) {
                if (is_alu(n) || is_flow(n))
                        d++;
                n = n->next;
        }
        return d;
}

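/* find the instruction preceding 'instr' in the depth list (requires a
 * linear scan, since the list is singly linked):
 */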
/* TODO maybe we want double linked list? */
static struct ir3_instruction * prev(struct ir3_instruction *instr)
{
        struct ir3_instruction *p = instr->block->head;
        while (p && (p->next != instr))
                p = p->next;
        return p;
}

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
        return is_sfu(instr) || is_mem(instr);
}

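/* commit 'instr' to the schedule: optionally unlink it from the depth
 * list, note any a0.x/p0.x writes, mark it, and push it onto
 * ctx->scheduled (which is built in reverse order and fixed up at the
 * end of block_sched()):
 */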
static void schedule(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr, bool remove)
{
        struct ir3_block *block = instr->block;

        /* maybe there is a better way to handle this than just stuffing
         * a nop.. ideally we'd know about this constraint in the
         * scheduling and depth calculation..
         */
        if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
                schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);

        /* remove from depth list:
         */
        if (remove) {
                struct ir3_instruction *p = prev(instr);

                /* NOTE: this can happen for inputs which are not
                 * read.. in that case there is no need to schedule
                 * the input, so just bail:
                 */
                if (instr != (p ? p->next : block->head))
                        return;

                if (p)
                        p->next = instr->next;
                else
                        block->head = instr->next;
        }

        if (writes_addr(instr)) {
                assert(ctx->addr == NULL);
                ctx->addr = instr;
        }

        if (writes_pred(instr)) {
                assert(ctx->pred == NULL);
                ctx->pred = instr;
        }

        instr->flags |= IR3_INSTR_MARK;

        instr->next = ctx->scheduled;
        ctx->scheduled = instr;

        ctx->cnt++;
}

/*
 * Delay-slot calculation.  Follows fanin/fanout.
 */

/* calculate delay for specified src: */
static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *assigner,
                struct ir3_instruction *consumer, unsigned srcn)
{
        unsigned delay = 0;

        if (is_meta(assigner)) {
                struct ir3_instruction *src;
                foreach_ssa_src(src, assigner) {
                        unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
                        delay = MAX2(delay, d);
                }
        } else {
                delay = ir3_delayslots(assigner, consumer, srcn);
                delay -= distance(ctx, assigner, delay);
        }

        return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned delay_calc(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr)
{
        unsigned delay = 0;
        struct ir3_instruction *src;

        foreach_ssa_src_n(src, i, instr) {
                unsigned d = delay_calc_srcn(ctx, src, instr, i);
                delay = MAX2(delay, d);
        }

        return delay;
}

/* A negative return value signals that an instruction has been newly
 * SCHEDULED (or DELAYED due to the address or predicate register already
 * being in use), in which case we return back up to the top of the stack
 * (to block_sched()).  A positive return value is the number of delay
 * slots still needed before the instruction can be scheduled.
 */
static int trysched(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr)
{
        struct ir3_instruction *srcs[64];
        struct ir3_instruction *src;
        unsigned delay, nsrcs = 0;

        /* if already scheduled: */
        if (instr->flags & IR3_INSTR_MARK)
                return 0;

        /* figure out our src's, copy 'em out into an array for sorting: */
        foreach_ssa_src(src, instr) {
                debug_assert(nsrcs < ARRAY_SIZE(srcs));
                srcs[nsrcs++] = src;
        }

        /* for each src register in sorted order:
         */
        delay = 0;
        while ((src = deepest(srcs, nsrcs))) {
                delay = trysched(ctx, src);
                if (delay)
                        return delay;
        }

        /* all our dependencies are scheduled, figure out if
         * we have enough delay slots to schedule ourself:
         */
        delay = delay_calc(ctx, instr);
        if (delay)
                return delay;

        /* if the instruction is a kill, we need to ensure *every*
         * bary.f is scheduled.  The hw seems unhappy if the thread
         * gets killed before the end-input (ei) flag is hit.
         *
         * We could do this by adding each bary.f instruction as a
         * virtual ssa src for the kill instruction.  But we have
         * fixed length instr->regs[].
         *
         * TODO this wouldn't be quite right if we had multiple
         * basic blocks, if any block was conditional.  We'd need
         * to schedule the bary.f's outside of any block which
         * was conditional that contained a kill.. I think..
         */
        if (is_kill(instr)) {
                struct ir3 *ir = instr->block->shader;
                unsigned i;

                for (i = 0; i < ir->baryfs_count; i++) {
                        struct ir3_instruction *baryf = ir->baryfs[i];
                        if (baryf->depth == DEPTH_UNUSED)
                                continue;
                        delay = trysched(ctx, baryf);
                        if (delay)
                                return delay;
                }
        }

        /* if this is a write to address/predicate register, and that
         * register is currently in use, we need to defer until it is
         * free:
         */
        if (writes_addr(instr) && ctx->addr) {
                assert(ctx->addr != instr);
                return DELAYED;
        }
        if (writes_pred(instr) && ctx->pred) {
                assert(ctx->pred != instr);
                return DELAYED;
        }

        schedule(ctx, instr, true);
        return SCHEDULED;
}

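/* the scheduled list is built newest-first; reverse it back into
 * program order:
 */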
static struct ir3_instruction * reverse(struct ir3_instruction *instr)
{
        struct ir3_instruction *reversed = NULL;
        while (instr) {
                struct ir3_instruction *next = instr->next;
                instr->next = reversed;
                reversed = instr;
                instr = next;
        }
        return reversed;
}

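/* does 'instr' read indirectly via the address register value that is
 * currently live (ie. written by ctx->addr)?
 */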
static bool uses_current_addr(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr)
{
        return instr->address && (ctx->addr == instr->address);
}

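/* does 'instr' consume the predicate register value that is currently
 * live (ie. written by ctx->pred)?
 */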
static bool uses_current_pred(struct ir3_sched_ctx *ctx,
                struct ir3_instruction *instr)
{
        struct ir3_instruction *src;
        foreach_ssa_src(src, instr)
                if (ctx->pred == src)
                        return true;
        return false;
}

/* when we encounter an instruction that writes to the address (or
 * predicate) register while it is still in use, we delay that
 * instruction and instead try to schedule all remaining instructions
 * that use the current value:
 */
static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
                struct ir3_block *block)
{
        struct ir3_instruction *instr = block->head;
        bool addr_in_use = false;
        bool pred_in_use = false;
        bool all_delayed = true;
        unsigned cnt = ~0, attempted = 0;

        while (instr) {
                struct ir3_instruction *next = instr->next;
                bool addr = uses_current_addr(ctx, instr);
                bool pred = uses_current_pred(ctx, instr);

                if (addr || pred) {
                        int ret = trysched(ctx, instr);

                        if (ret != DELAYED)
                                all_delayed = false;

                        if (ret == SCHEDULED)
                                cnt = 0;
                        else if (ret > 0)
                                cnt = MIN2(cnt, ret);
                        if (addr)
                                addr_in_use = true;
                        if (pred)
                                pred_in_use = true;

                        attempted++;
                }

                instr = next;
        }

        if (!addr_in_use)
                ctx->addr = NULL;

        if (!pred_in_use)
                ctx->pred = NULL;

        /* detect if we've gotten ourselves into an impossible situation
         * and bail if needed
         */
        if (all_delayed && (attempted > 0)) {
                if (pred_in_use) {
                        /* TODO we probably need to keep a list of instructions
                         * that reference predicate, similar to indirects
                         */
                        ctx->error = true;
                        return DELAYED;
                }
                if (addr_in_use) {
                        struct ir3 *ir = ctx->addr->block->shader;
                        struct ir3_instruction *new_addr =
                                        ir3_instr_clone(ctx->addr);
                        unsigned i;

                        /* original addr is scheduled, but new one isn't: */
                        new_addr->flags &= ~IR3_INSTR_MARK;

                        for (i = 0; i < ir->indirects_count; i++) {
                                struct ir3_instruction *indirect = ir->indirects[i];

                                /* skip instructions already scheduled: */
                                if (indirect->flags & IR3_INSTR_MARK)
                                        continue;

                                /* remap remaining instructions using current addr
                                 * to new addr:
                                 */
                                if (indirect->address == ctx->addr)
                                        indirect->address = new_addr;
                        }

                        /* all remaining indirects remapped to new addr: */
                        ctx->addr = NULL;

                        /* not really, but this will trigger us to go back to
                         * main trysched() loop now that we've resolved the
                         * conflict by duplicating the instr that writes to
                         * the address register.
                         */
                        return SCHEDULED;
                }
        }

        return cnt;
}

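/* schedule a single block: shader inputs are scheduled first, then we
 * repeatedly try to schedule the deepest ready instruction, falling back
 * to shallower instructions and finally nops to fill any required delay
 * slots:
 */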
static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
        struct ir3_instruction *instr;

        /* schedule all the shader inputs (meta-instr) first so that
         * the RA step sees that the input registers contain a value
         * from the start of the shader:
         */
        if (!block->parent) {
                unsigned i;
                for (i = 0; i < block->ninputs; i++) {
                        struct ir3_instruction *in = block->inputs[i];
                        if (in)
                                schedule(ctx, in, true);
                }
        }

        while ((instr = block->head) && !ctx->error) {
                /* NOTE: always grab next *before* trysched(), in case the
                 * instruction is actually scheduled (and therefore moved
                 * from depth list into scheduled list)
                 */
                struct ir3_instruction *next = instr->next;
                int cnt = trysched(ctx, instr);

                if (cnt == DELAYED)
                        cnt = block_sched_undelayed(ctx, block);

                /* -1 is signal to return up stack, but to us means same as 0: */
                cnt = MAX2(0, cnt);
                cnt += ctx->cnt;
                instr = next;

                /* if the deepest remaining instruction cannot be scheduled, try
                 * the increasingly more shallow instructions until the needed
                 * number of delay slots is filled:
                 */
                while (instr && (cnt > ctx->cnt)) {
                        next = instr->next;
                        trysched(ctx, instr);
                        instr = next;
                }

                /* and if we run out of instructions that can be scheduled,
                 * then it is time for nops:
                 */
                while (cnt > ctx->cnt)
                        schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
        }

        /* at this point, scheduled list is in reverse order, so fix that: */
        block->head = reverse(ctx->scheduled);
}

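/* schedule the block, returning 0 on success or -1 if scheduling failed
 * (currently only possible when a predicate register conflict cannot be
 * resolved):
 */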
int ir3_block_sched(struct ir3_block *block)
{
        struct ir3_sched_ctx ctx = {0};
        ir3_clear_mark(block->shader);
        block_sched(&ctx, block);
        if (ctx.error)
                return -1;
        return 0;
}