Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2012 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. /** @file brw_eu_compact.c
  25.  *
  26.  * Instruction compaction is a feature of gm45 and newer hardware that allows
  27.  * for a smaller instruction encoding.
  28.  *
  29.  * The instruction cache is on the order of 32KB, and many programs generate
  30.  * far more instructions than that.  The instruction cache is built to barely
  31.  * keep up with instruction dispatch ability in cache hit cases -- L1
  32.  * instruction cache misses that still hit in the next level could limit
  33.  * throughput by around 50%.
  34.  *
  35.  * The idea of instruction compaction is that most instructions use a tiny
  36.  * subset of the GPU functionality, so we can encode what would be a 16 byte
  37.  * instruction in 8 bytes using some lookup tables for various fields.
  38.  */
  39.  
  40. #include "brw_context.h"
  41. #include "brw_eu.h"
  42.  
/* Gen6 control-index lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 6).
 *
 * set_control_index() searches the active control table for a key built from
 * instruction dword 0: bits 8..23 land in key bits 0..15 and bit 31 lands in
 * key bit 16.  The position of the matching entry (0..31) is what gets stored
 * in the compacted instruction; set_uncompacted_control() performs the
 * reverse mapping.
 *
 * Note: the 0b binary literals are a compiler extension prior to C23.
 */
static const uint32_t gen6_control_index_table[32] = {
   0b00000000000000000,
   0b01000000000000000,
   0b00110000000000000,
   0b00000000100000000,
   0b00010000000000000,
   0b00001000100000000,
   0b00000000100000010,
   0b00000000000000010,
   0b01000000100000000,
   0b01010000000000000,
   0b10110000000000000,
   0b00100000000000000,
   0b11010000000000000,
   0b11000000000000000,
   0b01001000100000000,
   0b01000000000001000,
   0b01000000000000100,
   0b00000000000001000,
   0b00000000000000100,
   0b00111000100000000,
   0b00001000100000010,
   0b00110000100000000,
   0b00110000000000001,
   0b00100000000000001,
   0b00110000000000010,
   0b00110000000000101,
   0b00110000000001001,
   0b00110000000010000,
   0b00110000000000011,
   0b00110000000000100,
   0b00110000100001000,
   0b00100000000001001
};
  77.  
/* Gen6 datatype lookup table (selected by brw_init_compaction_tables() when
 * brw->gen == 6).
 *
 * set_datatype_index() searches the active datatype table for a key built
 * from instruction dword 1 (bits1.ud): its low 15 bits form key bits 0..14
 * and its top three bits (29..31) form key bits 15..17.  The position of the
 * matching entry (0..31) is stored in the compacted instruction.
 */
static const uint32_t gen6_datatype_table[32] = {
   0b001001110000000000,
   0b001000110000100000,
   0b001001110000000001,
   0b001000000001100000,
   0b001010110100101001,
   0b001000000110101101,
   0b001100011000101100,
   0b001011110110101101,
   0b001000000111101100,
   0b001000000001100001,
   0b001000110010100101,
   0b001000000001000001,
   0b001000001000110001,
   0b001000001000101001,
   0b001000000000100000,
   0b001000001000110010,
   0b001010010100101001,
   0b001011010010100101,
   0b001000000110100101,
   0b001100011000101001,
   0b001011011000101100,
   0b001011010110100101,
   0b001011110110100101,
   0b001111011110111101,
   0b001111011110111100,
   0b001111011110111101,
   0b001111011110011101,
   0b001111011110111110,
   0b001000000000100001,
   0b001000000000100010,
   0b001001111111011101,
   0b001000001110111110,
};
  112.  
/* Gen6 sub-register lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 6).
 *
 * set_subreg_index() searches the active subreg table for a key that packs
 * dest_subreg_nr at bit 0, src0_subreg_nr at bit 5 and src1_subreg_nr at
 * bit 10.  The position of the matching entry (0..31) is stored in the
 * compacted instruction.
 */
static const uint32_t gen6_subreg_table[32] = {
   0b000000000000000,
   0b000000000000100,
   0b000000110000000,
   0b111000000000000,
   0b011110000001000,
   0b000010000000000,
   0b000000000010000,
   0b000110000001100,
   0b001000000000000,
   0b000001000000000,
   0b000001010010100,
   0b000000001010110,
   0b010000000000000,
   0b110000000000000,
   0b000100000000000,
   0b000000010000000,
   0b000000000001000,
   0b100000000000000,
   0b000001010000000,
   0b001010000000000,
   0b001100000000000,
   0b000000001010100,
   0b101101010010100,
   0b010100000000000,
   0b000000010001111,
   0b011000000000000,
   0b111110000000000,
   0b101000000000000,
   0b000000000001111,
   0b000100010001111,
   0b001000010001111,
   0b000110000000000,
};
  147.  
/* Gen6 source-operand lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 6).
 *
 * get_src_index() searches the active source table for a 12-bit key that
 * set_src0_index() / set_src1_index() extract from bits 13..24 of bits2.ud
 * (src0) or bits3.ud (src1).  The position of the matching entry (0..31) is
 * stored in the compacted instruction.
 */
static const uint32_t gen6_src_index_table[32] = {
   0b000000000000,
   0b010110001000,
   0b010001101000,
   0b001000101000,
   0b011010010000,
   0b000100100000,
   0b010001101100,
   0b010101110000,
   0b011001111000,
   0b001100101000,
   0b010110001100,
   0b001000100000,
   0b010110001010,
   0b000000000010,
   0b010101010000,
   0b010101101000,
   0b111101001100,
   0b111100101100,
   0b011001110000,
   0b010110001001,
   0b010101011000,
   0b001101001000,
   0b010000101100,
   0b010000000000,
   0b001101110000,
   0b001100010000,
   0b001100000000,
   0b010001101010,
   0b001101111000,
   0b000001110000,
   0b001100100000,
   0b001101010000,
};
  182.  
/* Gen7 control-index lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 7).
 *
 * The key layout matches what set_control_index() builds: dword 0 bits 8..23
 * in key bits 0..15, dword 0 bit 31 in key bit 16, and -- for gen >= 7 only --
 * dword 2 bits 25..26 (the flag register number, per the comment in
 * set_control_index()) in key bits 17..18.
 */
static const uint32_t gen7_control_index_table[32] = {
   0b0000000000000000010,
   0b0000100000000000000,
   0b0000100000000000001,
   0b0000100000000000010,
   0b0000100000000000011,
   0b0000100000000000100,
   0b0000100000000000101,
   0b0000100000000000111,
   0b0000100000000001000,
   0b0000100000000001001,
   0b0000100000000001101,
   0b0000110000000000000,
   0b0000110000000000001,
   0b0000110000000000010,
   0b0000110000000000011,
   0b0000110000000000100,
   0b0000110000000000101,
   0b0000110000000000111,
   0b0000110000000001001,
   0b0000110000000001101,
   0b0000110000000010000,
   0b0000110000100000000,
   0b0001000000000000000,
   0b0001000000000000010,
   0b0001000000000000100,
   0b0001000000100000000,
   0b0010110000000000000,
   0b0010110000000010000,
   0b0011000000000000000,
   0b0011000000100000000,
   0b0101000000000000000,
   0b0101000000100000000
};
  217.  
/* Gen7 datatype lookup table (selected by brw_init_compaction_tables() when
 * brw->gen == 7).
 *
 * Keys follow the layout built by set_datatype_index(): the low 15 bits of
 * bits1.ud in key bits 0..14 and bits 29..31 of bits1.ud in key bits 15..17.
 */
static const uint32_t gen7_datatype_table[32] = {
   0b001000000000000001,
   0b001000000000100000,
   0b001000000000100001,
   0b001000000001100001,
   0b001000000010111101,
   0b001000001011111101,
   0b001000001110100001,
   0b001000001110100101,
   0b001000001110111101,
   0b001000010000100001,
   0b001000110000100000,
   0b001000110000100001,
   0b001001010010100101,
   0b001001110010100100,
   0b001001110010100101,
   0b001111001110111101,
   0b001111011110011101,
   0b001111011110111100,
   0b001111011110111101,
   0b001111111110111100,
   0b000000001000001100,
   0b001000000000111101,
   0b001000000010100101,
   0b001000010000100000,
   0b001001010010100100,
   0b001001110010000100,
   0b001010010100001001,
   0b001101111110111101,
   0b001111111110111101,
   0b001011110110101100,
   0b001010010100101000,
   0b001010110100101000
};
  252.  
/* Gen7 sub-register lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 7).
 *
 * Keys follow the layout built by set_subreg_index(): dest_subreg_nr at
 * bit 0, src0_subreg_nr at bit 5 and src1_subreg_nr at bit 10.
 */
static const uint32_t gen7_subreg_table[32] = {
   0b000000000000000,
   0b000000000000001,
   0b000000000001000,
   0b000000000001111,
   0b000000000010000,
   0b000000010000000,
   0b000000100000000,
   0b000000110000000,
   0b000001000000000,
   0b000001000010000,
   0b000010100000000,
   0b001000000000000,
   0b001000000000001,
   0b001000010000001,
   0b001000010000010,
   0b001000010000011,
   0b001000010000100,
   0b001000010000111,
   0b001000010001000,
   0b001000010001110,
   0b001000010001111,
   0b001000110000000,
   0b001000111101000,
   0b010000000000000,
   0b010000110000000,
   0b011000000000000,
   0b011110010000111,
   0b100000000000000,
   0b101000000000000,
   0b110000000000000,
   0b111000000000000,
   0b111000000011100
};
  287.  
/* Gen7 source-operand lookup table (selected by brw_init_compaction_tables()
 * when brw->gen == 7).
 *
 * Keys are the 12-bit values that set_src0_index() / set_src1_index() extract
 * from bits 13..24 of bits2.ud (src0) or bits3.ud (src1) and pass to
 * get_src_index().
 */
static const uint32_t gen7_src_index_table[32] = {
   0b000000000000,
   0b000000000010,
   0b000000010000,
   0b000000010010,
   0b000000011000,
   0b000000100000,
   0b000000101000,
   0b000001001000,
   0b000001010000,
   0b000001110000,
   0b000001111000,
   0b001100000000,
   0b001100000010,
   0b001100001000,
   0b001100010000,
   0b001100010010,
   0b001100100000,
   0b001100101000,
   0b001100111000,
   0b001101000000,
   0b001101000010,
   0b001101001000,
   0b001101010000,
   0b001101100000,
   0b001101101000,
   0b001101110000,
   0b001101110001,
   0b001101111000,
   0b010001101000,
   0b010001101001,
   0b010001101010,
   0b010110001000
};
  322.  
/* Tables currently in use by the compaction/uncompaction helpers below.
 * brw_init_compaction_tables() points them at the gen6 or gen7 tables
 * according to brw->gen; for any other generation it leaves them untouched.
 */
static const uint32_t *control_index_table;
static const uint32_t *datatype_table;
static const uint32_t *subreg_table;
static const uint32_t *src_index_table;
  327.  
  328. static bool
  329. set_control_index(struct brw_context *brw,
  330.                   struct brw_compact_instruction *dst,
  331.                   struct brw_instruction *src)
  332. {
  333.    uint32_t *src_u32 = (uint32_t *)src;
  334.    uint32_t uncompacted = 0;
  335.  
  336.    uncompacted |= ((src_u32[0] >> 8) & 0xffff) << 0;
  337.    uncompacted |= ((src_u32[0] >> 31) & 0x1) << 16;
  338.    /* On gen7, the flag register number gets integrated into the control
  339.     * index.
  340.     */
  341.    if (brw->gen >= 7)
  342.       uncompacted |= ((src_u32[2] >> 25) & 0x3) << 17;
  343.  
  344.    for (int i = 0; i < 32; i++) {
  345.       if (control_index_table[i] == uncompacted) {
  346.          dst->dw0.control_index = i;
  347.          return true;
  348.       }
  349.    }
  350.  
  351.    return false;
  352. }
  353.  
  354. static bool
  355. set_datatype_index(struct brw_compact_instruction *dst,
  356.                    struct brw_instruction *src)
  357. {
  358.    uint32_t uncompacted = 0;
  359.  
  360.    uncompacted |= src->bits1.ud & 0x7fff;
  361.    uncompacted |= (src->bits1.ud >> 29) << 15;
  362.  
  363.    for (int i = 0; i < 32; i++) {
  364.       if (datatype_table[i] == uncompacted) {
  365.          dst->dw0.data_type_index = i;
  366.          return true;
  367.       }
  368.    }
  369.  
  370.    return false;
  371. }
  372.  
  373. static bool
  374. set_subreg_index(struct brw_compact_instruction *dst,
  375.                  struct brw_instruction *src)
  376. {
  377.    uint32_t uncompacted = 0;
  378.  
  379.    uncompacted |= src->bits1.da1.dest_subreg_nr << 0;
  380.    uncompacted |= src->bits2.da1.src0_subreg_nr << 5;
  381.    uncompacted |= src->bits3.da1.src1_subreg_nr << 10;
  382.  
  383.    for (int i = 0; i < 32; i++) {
  384.       if (subreg_table[i] == uncompacted) {
  385.          dst->dw0.sub_reg_index = i;
  386.          return true;
  387.       }
  388.    }
  389.  
  390.    return false;
  391. }
  392.  
  393. static bool
  394. get_src_index(uint32_t uncompacted,
  395.               uint32_t *compacted)
  396. {
  397.    for (int i = 0; i < 32; i++) {
  398.       if (src_index_table[i] == uncompacted) {
  399.          *compacted = i;
  400.          return true;
  401.       }
  402.    }
  403.  
  404.    return false;
  405. }
  406.  
  407. static bool
  408. set_src0_index(struct brw_compact_instruction *dst,
  409.                struct brw_instruction *src)
  410. {
  411.    uint32_t compacted, uncompacted = 0;
  412.  
  413.    uncompacted |= (src->bits2.ud >> 13) & 0xfff;
  414.  
  415.    if (!get_src_index(uncompacted, &compacted))
  416.       return false;
  417.  
  418.    dst->dw0.src0_index = compacted & 0x3;
  419.    dst->dw1.src0_index = compacted >> 2;
  420.  
  421.    return true;
  422. }
  423.  
  424. static bool
  425. set_src1_index(struct brw_compact_instruction *dst,
  426.                struct brw_instruction *src)
  427. {
  428.    uint32_t compacted, uncompacted = 0;
  429.  
  430.    uncompacted |= (src->bits3.ud >> 13) & 0xfff;
  431.  
  432.    if (!get_src_index(uncompacted, &compacted))
  433.       return false;
  434.  
  435.    dst->dw1.src1_index = compacted;
  436.  
  437.    return true;
  438. }
  439.  
  440. /**
  441.  * Tries to compact instruction src into dst.
  442.  *
  443.  * It doesn't modify dst unless src is compactable, which is relied on by
  444.  * brw_compact_instructions().
  445.  */
  446. bool
  447. brw_try_compact_instruction(struct brw_compile *p,
  448.                             struct brw_compact_instruction *dst,
  449.                             struct brw_instruction *src)
  450. {
  451.    struct brw_context *brw = p->brw;
  452.    struct brw_compact_instruction temp;
  453.  
  454.    if (src->header.opcode == BRW_OPCODE_IF ||
  455.        src->header.opcode == BRW_OPCODE_ELSE ||
  456.        src->header.opcode == BRW_OPCODE_ENDIF ||
  457.        src->header.opcode == BRW_OPCODE_HALT ||
  458.        src->header.opcode == BRW_OPCODE_DO ||
  459.        src->header.opcode == BRW_OPCODE_WHILE) {
  460.       /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
  461.        * to be able to handle compacted flow control instructions..
  462.        */
  463.       return false;
  464.    }
  465.  
  466.    /* FINISHME: immediates */
  467.    if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
  468.        src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
  469.       return false;
  470.  
  471.    memset(&temp, 0, sizeof(temp));
  472.  
  473.    temp.dw0.opcode = src->header.opcode;
  474.    temp.dw0.debug_control = src->header.debug_control;
  475.    if (!set_control_index(brw, &temp, src))
  476.       return false;
  477.    if (!set_datatype_index(&temp, src))
  478.       return false;
  479.    if (!set_subreg_index(&temp, src))
  480.       return false;
  481.    temp.dw0.acc_wr_control = src->header.acc_wr_control;
  482.    temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
  483.    if (brw->gen <= 6)
  484.       temp.dw0.flag_subreg_nr = src->bits2.da1.flag_subreg_nr;
  485.    temp.dw0.cmpt_ctrl = 1;
  486.    if (!set_src0_index(&temp, src))
  487.       return false;
  488.    if (!set_src1_index(&temp, src))
  489.       return false;
  490.    temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
  491.    temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
  492.    temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
  493.  
  494.    *dst = temp;
  495.  
  496.    return true;
  497. }
  498.  
  499. static void
  500. set_uncompacted_control(struct brw_context *brw,
  501.                         struct brw_instruction *dst,
  502.                         struct brw_compact_instruction *src)
  503. {
  504.    uint32_t *dst_u32 = (uint32_t *)dst;
  505.    uint32_t uncompacted = control_index_table[src->dw0.control_index];
  506.  
  507.    dst_u32[0] |= ((uncompacted >> 0) & 0xffff) << 8;
  508.    dst_u32[0] |= ((uncompacted >> 16) & 0x1) << 31;
  509.  
  510.    if (brw->gen >= 7)
  511.       dst_u32[2] |= ((uncompacted >> 17) & 0x3) << 25;
  512. }
  513.  
  514. static void
  515. set_uncompacted_datatype(struct brw_instruction *dst,
  516.                          struct brw_compact_instruction *src)
  517. {
  518.    uint32_t uncompacted = datatype_table[src->dw0.data_type_index];
  519.  
  520.    dst->bits1.ud &= ~(0x7 << 29);
  521.    dst->bits1.ud |= ((uncompacted >> 15) & 0x7) << 29;
  522.    dst->bits1.ud &= ~0x7fff;
  523.    dst->bits1.ud |= uncompacted & 0x7fff;
  524. }
  525.  
  526. static void
  527. set_uncompacted_subreg(struct brw_instruction *dst,
  528.                        struct brw_compact_instruction *src)
  529. {
  530.    uint32_t uncompacted = subreg_table[src->dw0.sub_reg_index];
  531.  
  532.    dst->bits1.da1.dest_subreg_nr = (uncompacted >> 0)  & 0x1f;
  533.    dst->bits2.da1.src0_subreg_nr = (uncompacted >> 5)  & 0x1f;
  534.    dst->bits3.da1.src1_subreg_nr = (uncompacted >> 10) & 0x1f;
  535. }
  536.  
  537. static void
  538. set_uncompacted_src0(struct brw_instruction *dst,
  539.                      struct brw_compact_instruction *src)
  540. {
  541.    uint32_t compacted = src->dw0.src0_index | src->dw1.src0_index << 2;
  542.    uint32_t uncompacted = src_index_table[compacted];
  543.  
  544.    dst->bits2.ud |= uncompacted << 13;
  545. }
  546.  
  547. static void
  548. set_uncompacted_src1(struct brw_instruction *dst,
  549.                      struct brw_compact_instruction *src)
  550. {
  551.    uint32_t uncompacted = src_index_table[src->dw1.src1_index];
  552.  
  553.    dst->bits3.ud |= uncompacted << 13;
  554. }
  555.  
  556. void
  557. brw_uncompact_instruction(struct brw_context *brw,
  558.                           struct brw_instruction *dst,
  559.                           struct brw_compact_instruction *src)
  560. {
  561.    memset(dst, 0, sizeof(*dst));
  562.  
  563.    dst->header.opcode = src->dw0.opcode;
  564.    dst->header.debug_control = src->dw0.debug_control;
  565.  
  566.    set_uncompacted_control(brw, dst, src);
  567.    set_uncompacted_datatype(dst, src);
  568.    set_uncompacted_subreg(dst, src);
  569.    dst->header.acc_wr_control = src->dw0.acc_wr_control;
  570.    dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
  571.    if (brw->gen <= 6)
  572.       dst->bits2.da1.flag_subreg_nr = src->dw0.flag_subreg_nr;
  573.    set_uncompacted_src0(dst, src);
  574.    set_uncompacted_src1(dst, src);
  575.    dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
  576.    dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
  577.    dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
  578. }
  579.  
  580. void brw_debug_compact_uncompact(struct brw_context *brw,
  581.                                  struct brw_instruction *orig,
  582.                                  struct brw_instruction *uncompacted)
  583. {
  584.    fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
  585.            brw->gen);
  586.  
  587.    fprintf(stderr, "  before: ");
  588.    brw_disasm(stderr, orig, brw->gen);
  589.  
  590.    fprintf(stderr, "  after:  ");
  591.    brw_disasm(stderr, uncompacted, brw->gen);
  592.  
  593.    uint32_t *before_bits = (uint32_t *)orig;
  594.    uint32_t *after_bits = (uint32_t *)uncompacted;
  595.    printf("  changed bits:\n");
  596.    for (int i = 0; i < 128; i++) {
  597.       uint32_t before = before_bits[i / 32] & (1 << (i & 31));
  598.       uint32_t after = after_bits[i / 32] & (1 << (i & 31));
  599.  
  600.       if (before != after) {
  601.          printf("  bit %d, %s to %s\n", i,
  602.                 before ? "set" : "unset",
  603.                 after ? "set" : "unset");
  604.       }
  605.    }
  606. }
  607.  
  608. static int
  609. compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
  610. {
  611.    int this_compacted_count = compacted_counts[old_ip];
  612.    int target_compacted_count = compacted_counts[old_target_ip];
  613.    return target_compacted_count - this_compacted_count;
  614. }
  615.  
  616. static void
  617. update_uip_jip(struct brw_instruction *insn, int this_old_ip,
  618.                int *compacted_counts)
  619. {
  620.    int target_old_ip;
  621.  
  622.    target_old_ip = this_old_ip + insn->bits3.break_cont.jip;
  623.    insn->bits3.break_cont.jip -= compacted_between(this_old_ip,
  624.                                                    target_old_ip,
  625.                                                    compacted_counts);
  626.  
  627.    target_old_ip = this_old_ip + insn->bits3.break_cont.uip;
  628.    insn->bits3.break_cont.uip -= compacted_between(this_old_ip,
  629.                                                    target_old_ip,
  630.                                                    compacted_counts);
  631. }
  632.  
  633. void
  634. brw_init_compaction_tables(struct brw_context *brw)
  635. {
  636.    assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
  637.    assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
  638.    assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
  639.    assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
  640.    assert(gen7_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
  641.    assert(gen7_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
  642.    assert(gen7_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
  643.    assert(gen7_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
  644.  
  645.    switch (brw->gen) {
  646.    case 7:
  647.       control_index_table = gen7_control_index_table;
  648.       datatype_table = gen7_datatype_table;
  649.       subreg_table = gen7_subreg_table;
  650.       src_index_table = gen7_src_index_table;
  651.       break;
  652.    case 6:
  653.       control_index_table = gen6_control_index_table;
  654.       datatype_table = gen6_datatype_table;
  655.       subreg_table = gen6_subreg_table;
  656.       src_index_table = gen6_src_index_table;
  657.       break;
  658.    default:
  659.       return;
  660.    }
  661. }
  662.  
/**
 * Compacts the instructions held in p->store in place.
 *
 * Does nothing for gen < 6.  Otherwise the function makes three passes:
 *
 *  1. Walk the instruction stream, replacing each instruction that
 *     brw_try_compact_instruction() accepts by its 8-byte form and sliding
 *     the remaining instructions down to close the gaps.
 *  2. Walk the rewritten stream and fix up the jump offsets of control flow
 *     instructions using the bookkeeping gathered in pass 1.
 *  3. Pad the program with a compacted NOP if its size is not a multiple of
 *     16 bytes, then update p->next_insn_offset and p->nr_insn.
 */
void
brw_compact_instructions(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   void *store = p->store;
   /* For an instruction at byte offset 8*i before compaction, this is the number
    * of compacted instructions that preceded it.
    */
   /* NOTE(review): both arrays are variable-length and are sized before the
    * generation check below -- confirm p->next_insn_offset is always nonzero
    * here, since a zero-length VLA is undefined.
    */
   int compacted_counts[p->next_insn_offset / 8];
   /* For an instruction at byte offset 8*i after compaction, this is the
    * 8-byte offset it was at before compaction.
    */
   int old_ip[p->next_insn_offset / 8];

   if (brw->gen < 6)
      return;

   /* src_offset is the read position and offset the write position within
    * store, both in bytes.  (Arithmetic on the void pointer relies on a
    * compiler extension.)
    */
   /* NOTE(review): the loop bound is derived from p->nr_insn while the arrays
    * above are sized from p->next_insn_offset -- presumably the two always
    * agree on entry; verify against callers.
    */
   int src_offset;
   int offset = 0;
   int compacted_count = 0;
   for (src_offset = 0; src_offset < p->nr_insn * 16;) {
      struct brw_instruction *src = store + src_offset;
      void *dst = store + offset;

      old_ip[offset / 8] = src_offset / 8;
      compacted_counts[src_offset / 8] = compacted_count;

      /* Keep a copy of the source: dst points into the same buffer and may
       * overlap src, so a successful compaction can overwrite it.
       */
      struct brw_instruction saved = *src;

      /* Instructions that already have cmpt_control set are never passed to
       * brw_try_compact_instruction() again.
       */
      if (!src->header.cmpt_control &&
          brw_try_compact_instruction(p, dst, src)) {
         compacted_count++;

         /* Debug aid: uncompact the result and report any difference from
          * the saved original.
          */
         if (INTEL_DEBUG) {
            struct brw_instruction uncompacted;
            brw_uncompact_instruction(brw, &uncompacted, dst);
            if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
               brw_debug_compact_uncompact(brw, &saved, &uncompacted);
            }
         }

         offset += 8;
         src_offset += 16;
      } else {
         /* Instructions with cmpt_control already set occupy 8 bytes,
          * all others 16.
          */
         int size = src->header.cmpt_control ? 8 : 16;

         /* It appears that the end of thread SEND instruction needs to be
          * aligned, or the GPU hangs.
          */
         if ((src->header.opcode == BRW_OPCODE_SEND ||
              src->header.opcode == BRW_OPCODE_SENDC) &&
             src->bits3.generic.end_of_thread &&
             (offset & 8) != 0) {
            /* Insert a compacted NOP so the SEND lands on a 16-byte
             * boundary; the slot after the NOP is mapped to the same old ip.
             */
            struct brw_compact_instruction *align = store + offset;
            memset(align, 0, sizeof(*align));
            align->dw0.opcode = BRW_OPCODE_NOP;
            align->dw0.cmpt_ctrl = 1;
            offset += 8;
            old_ip[offset / 8] = src_offset / 8;
            dst = store + offset;
         }

         /* If we didn't compact this instruction, we need to move it down into
          * place.
          */
         if (offset != src_offset) {
            memmove(dst, src, size);
         }
         offset += size;
         src_offset += size;
      }
   }

   /* Fix up control flow offsets. */
   p->next_insn_offset = offset;
   for (offset = 0; offset < p->next_insn_offset;) {
      struct brw_instruction *insn = store + offset;
      int this_old_ip = old_ip[offset / 8];
      int this_compacted_count = compacted_counts[this_old_ip];
      int target_old_ip, target_compacted_count;

      /* NOTE(review): the target indices computed below are used to index
       * compacted_counts without a range check -- confirm a jump can never
       * target the position one past the last instruction.
       */
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
      case BRW_OPCODE_CONTINUE:
      case BRW_OPCODE_HALT:
         update_uip_jip(insn, this_old_ip, compacted_counts);
         break;

      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         /* On gen6 these opcodes carry a single jump_count field; otherwise
          * they are adjusted through JIP/UIP like the opcodes above.
          */
         if (brw->gen == 6) {
            target_old_ip = this_old_ip + insn->bits1.branch_gen6.jump_count;
            target_compacted_count = compacted_counts[target_old_ip];
            insn->bits1.branch_gen6.jump_count -= (target_compacted_count -
                                                   this_compacted_count);
         } else {
            update_uip_jip(insn, this_old_ip, compacted_counts);
         }
         break;
      }

      /* Advance by the size of the instruction just visited. */
      if (insn->header.cmpt_control) {
         offset += 8;
      } else {
         offset += 16;
      }
   }

   /* p->nr_insn is counting the number of uncompacted instructions still, so
    * divide.  We do want to be sure there's a valid instruction in any
    * alignment padding, so that the next compression pass (for the FS 8/16
    * compile passes) parses correctly.
    */
   if (p->next_insn_offset & 8) {
      struct brw_compact_instruction *align = store + offset;
      memset(align, 0, sizeof(*align));
      align->dw0.opcode = BRW_OPCODE_NOP;
      align->dw0.cmpt_ctrl = 1;
      p->next_insn_offset += 8;
   }
   p->nr_insn = p->next_insn_offset / 16;

   /* Disabled debugging aid: dumps the compacted program and prints how many
    * bytes compaction saved.
    */
   if (0) {
      fprintf(stdout, "dumping compacted program\n");
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);

      int cmp = 0;
      for (offset = 0; offset < p->next_insn_offset;) {
         struct brw_instruction *insn = store + offset;

         if (insn->header.cmpt_control) {
            offset += 8;
            cmp++;
         } else {
            offset += 16;
         }
      }
      fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
              cmp * 8 * 100 / (offset + cmp * 8));
   }
}
  806.