Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright 2013 Ilia Mirkin
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice shall be included in
  12.  * all copies or substantial portions of the Software.
  13.  *
  14.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18.  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19.  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20.  * OTHER DEALINGS IN THE SOFTWARE.
  21.  */
  22.  
  23. #include "nv84_video.h"
  24.  
  25. #include "util/u_sse.h"
  26.  
  /*
   * First H.264 parameter block handed to the VP firmware.
   *
   * nv84_decoder_vp_h264() fills one of these and copies it to the start of
   * the vp_params buffer; a STATIC_ASSERT there pins the size to 0x218 bytes.
   * The trailing comment on each member is its byte offset in the block.
   */
  struct h264_iparm1 {
     uint8_t scaling_lists_4x4[6][16];      /* 0x000 */
     uint8_t scaling_lists_8x8[2][64];      /* 0x060 */
     uint32_t width;                        /* 0x0e0 */
     uint32_t height;                       /* 0x0e4 */
     uint64_t ref1_addrs[16];               /* 0x0e8 */
     uint64_t ref2_addrs[16];               /* 0x168 */
     uint32_t unk1e8;                       /* 0x1e8 */
     uint32_t unk1ec;                       /* 0x1ec */
     uint32_t w1;                           /* 0x1f0 */
     uint32_t w2;                           /* 0x1f4 */
     uint32_t w3;                           /* 0x1f8 */
     uint32_t h1;                           /* 0x1fc */
     uint32_t h2;                           /* 0x200 */
     uint32_t h3;                           /* 0x204 */
     uint32_t mb_adaptive_frame_field_flag; /* 0x208 */
     uint32_t field_pic_flag;               /* 0x20c */
     uint32_t format;                       /* 0x210 */
     uint32_t unk214;                       /* 0x214 */
  };
  47.  
  /*
   * Second H.264 parameter block handed to the VP firmware.
   *
   * nv84_decoder_vp_h264() copies it to byte offset 0x400 of the vp_params
   * buffer; a STATIC_ASSERT there pins the size to 0x38 bytes. The trailing
   * comment on each member is its byte offset in the block.
   */
  struct h264_iparm2 {
     uint32_t width;                        /* 0x00 */
     uint32_t height;                       /* 0x04 */
     uint32_t mbs;                          /* 0x08 */
     uint32_t w1;                           /* 0x0c */
     uint32_t w2;                           /* 0x10 */
     uint32_t w3;                           /* 0x14 */
     uint32_t h1;                           /* 0x18 */
     uint32_t h2;                           /* 0x1c */
     uint32_t h3;                           /* 0x20 */
     uint32_t unk24;                        /* 0x24 */
     uint32_t mb_adaptive_frame_field_flag; /* 0x28 */
     uint32_t top;                          /* 0x2c */
     uint32_t bottom;                       /* 0x30 */
     uint32_t is_reference;                 /* 0x34 */
  };
  64.  
  65. void
  66. nv84_decoder_vp_h264(struct nv84_decoder *dec,
  67.                      struct pipe_h264_picture_desc *desc,
  68.                      struct nv84_video_buffer *dest)
  69. {
  70.    struct h264_iparm1 param1;
  71.    struct h264_iparm2 param2;
  72.    int i, width = align(dest->base.width, 16),
  73.       height = align(dest->base.height, 16);
  74.  
  75.    struct nouveau_pushbuf *push = dec->vp_pushbuf;
  76.    struct nouveau_pushbuf_refn bo_refs[] = {
  77.       { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  78.       { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  79.       { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  80.       { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  81.       { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
  82.       { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  83.    };
  84.    int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
  85.    bool is_ref = desc->is_reference;
  86.  
  87.    STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
  88.    STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);
  89.  
  90.    memset(&param1, 0, sizeof(param1));
  91.    memset(&param2, 0, sizeof(param2));
  92.  
  93.    memcpy(&param1.scaling_lists_4x4, desc->scaling_lists_4x4,
  94.           sizeof(param1.scaling_lists_4x4));
  95.    memcpy(&param1.scaling_lists_8x8, desc->scaling_lists_8x8,
  96.           sizeof(param1.scaling_lists_8x8));
  97.  
  98.    param1.width = width;
  99.    param1.w1 = param1.w2 = param1.w3 = align(width, 64);
  100.    param1.height = param1.h2 = height;
  101.    param1.h1 = param1.h3 = align(height, 32);
  102.    param1.format = 0x3231564e; /* 'NV12' */
  103.    param1.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
  104.    param1.field_pic_flag = desc->field_pic_flag;
  105.  
  106.    param2.width = width;
  107.    param2.w1 = param2.w2 = param2.w3 = param1.w1;
  108.    if (desc->field_pic_flag)
  109.       param2.height = align(height, 32) / 2;
  110.    else
  111.       param2.height = height;
  112.    param2.h1 = param2.h2 = align(height, 32);
  113.    param2.h3 = height;
  114.    param2.mbs = width * height >> 8;
  115.    if (desc->field_pic_flag) {
  116.       param2.top = desc->bottom_field_flag ? 2 : 1;
  117.       param2.bottom = desc->bottom_field_flag;
  118.    }
  119.    param2.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
  120.    param2.is_reference = desc->is_reference;
  121.  
  122.    PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
  123.  
  124.    struct nouveau_bo *ref2_default = dest->full;
  125.  
  126.    for (i = 0; i < 16; i++) {
  127.       struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
  128.       struct nouveau_bo *bo1, *bo2;
  129.       if (buf) {
  130.          bo1 = buf->interlaced;
  131.          bo2 = buf->full;
  132.          if (i == 0)
  133.             ref2_default = buf->full;
  134.       } else {
  135.          bo1 = dest->interlaced;
  136.          bo2 = ref2_default;
  137.       }
  138.       param1.ref1_addrs[i] = bo1->offset;
  139.       param1.ref2_addrs[i] = bo2->offset;
  140.       struct nouveau_pushbuf_refn bo_refs[] = {
  141.          { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  142.          { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  143.       };
  144.       nouveau_pushbuf_refn(push, bo_refs, sizeof(bo_refs)/sizeof(bo_refs[0]));
  145.    }
  146.  
  147.    memcpy(dec->vp_params->map, &param1, sizeof(param1));
  148.    memcpy(dec->vp_params->map + 0x400, &param2, sizeof(param2));
  149.  
  150.    nouveau_pushbuf_refn(push, bo_refs, num_refs);
  151.  
  152.    /* Wait for BSP to have completed */
  153.    BEGIN_NV04(push, SUBC_VP(0x10), 4);
  154.    PUSH_DATAh(push, dec->fence->offset);
  155.    PUSH_DATA (push, dec->fence->offset);
  156.    PUSH_DATA (push, 2);
  157.    PUSH_DATA (push, 1); /* wait for sem == 2 */
  158.  
  159.    /* VP step 1 */
  160.    BEGIN_NV04(push, SUBC_VP(0x400), 15);
  161.    PUSH_DATA (push, 1);
  162.    PUSH_DATA (push, param2.mbs);
  163.    PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
  164.    PUSH_DATA (push, 0x55001); /* constant */
  165.    PUSH_DATA (push, dec->vp_params->offset >> 8);
  166.    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
  167.    PUSH_DATA (push, dec->vpring_ctrl);
  168.    PUSH_DATA (push, dec->vpring->offset >> 8);
  169.    PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
  170.    PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
  171.    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
  172.                      dec->vpring_residual + dec->vpring_deblock) >> 8);
  173.    PUSH_DATA (push, 0);
  174.    PUSH_DATA (push, 0x100008);
  175.    PUSH_DATA (push, dest->interlaced->offset >> 8);
  176.    PUSH_DATA (push, 0);
  177.  
  178.    BEGIN_NV04(push, SUBC_VP(0x620), 2);
  179.    PUSH_DATA (push, 0);
  180.    PUSH_DATA (push, 0);
  181.  
  182.    BEGIN_NV04(push, SUBC_VP(0x300), 1);
  183.    PUSH_DATA (push, 0);
  184.  
  185.    /* VP step 2 */
  186.    BEGIN_NV04(push, SUBC_VP(0x400), 5);
  187.    PUSH_DATA (push, 0x54530201);
  188.    PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
  189.    PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
  190.                      dec->vpring_residual) >> 8);
  191.    PUSH_DATA (push, dest->interlaced->offset >> 8);
  192.    PUSH_DATA (push, dest->interlaced->offset >> 8);
  193.  
  194.    if (is_ref) {
  195.       BEGIN_NV04(push, SUBC_VP(0x414), 1);
  196.       PUSH_DATA (push, dest->full->offset >> 8);
  197.    }
  198.  
  199.    BEGIN_NV04(push, SUBC_VP(0x620), 2);
  200.    PUSH_DATAh(push, dec->vp_fw2_offset);
  201.    PUSH_DATA (push, dec->vp_fw2_offset);
  202.  
  203.    BEGIN_NV04(push, SUBC_VP(0x300), 1);
  204.    PUSH_DATA (push, 0);
  205.  
  206.    /* Set the semaphore back to 1 */
  207.    BEGIN_NV04(push, SUBC_VP(0x610), 3);
  208.    PUSH_DATAh(push, dec->fence->offset);
  209.    PUSH_DATA (push, dec->fence->offset);
  210.    PUSH_DATA (push, 1);
  211.  
  212.    /* Write to the semaphore location, intr */
  213.    BEGIN_NV04(push, SUBC_VP(0x304), 1);
  214.    PUSH_DATA (push, 0x101);
  215.  
  216.    for (i = 0; i < 2; i++) {
  217.       struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
  218.       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
  219.    }
  220.  
  221.    PUSH_KICK (push);
  222. }
  223.  
  /*
   * Inverse-quantize one DCT coefficient.
   *
   * val:   quantized coefficient
   * quant: quantizer matrix entry for this coefficient
   * mpeg1: non-zero to apply MPEG-1 "oddification" (non-zero even results are
   *        moved one step towards zero so that every non-zero result is odd)
   *
   * Returns val * quant / 16, optionally oddified, saturated to [-2048, 2047].
   *
   * The arithmetic is done in int and only narrowed after saturation, so an
   * out-of-range product can never wrap before it is clamped. Odd results are
   * left untouched for both signs; the earlier "(ret + 1) | 1" form turned
   * odd negative values into ret + 2 (for example -1 became +1).
   */
  static inline int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
     int ret = val * quant / 16;

     if (mpeg1 && ret != 0 && (ret & 1) == 0)
        ret += (ret > 0) ? -1 : 1;

     if (ret < -2048)
        ret = -2048;
     else if (ret > 2047)
        ret = 2047;
     return (int16_t)ret;
  }
  238.  
  /*
   * Per-macroblock record appended to the mpeg12_mb_info stream by
   * nv84_decoder_vp_mpeg12_mb(); a STATIC_ASSERT there pins the size to
   * 32 bytes. The trailing comment on each member is its byte offset.
   */
  struct mpeg12_mb_info {
     uint32_t index;               /* 0x00 */
     uint8_t unk4;                 /* 0x04 */
     uint8_t unk5;                 /* 0x05 */
     uint16_t coded_block_pattern; /* 0x06 */
     uint8_t block_counts[6];      /* 0x08 */
     uint16_t PMV[8];              /* 0x0e */
     uint16_t skipped;             /* 0x1e */
  };
  248.  
  249. void
  250. nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
  251.                           struct pipe_mpeg12_picture_desc *desc,
  252.                           const struct pipe_mpeg12_macroblock *macrob)
  253. {
  254.    STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);
  255.  
  256.    struct mpeg12_mb_info info = {0};
  257.    int i, sum = 0, mask, block_index, count;
  258.    const int16_t *blocks;
  259.    int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
  260.    int motion = macrob->macroblock_type &
  261.       (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
  262.    const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
  263.       dec->mpeg12_non_intra_matrix;
  264.    int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;
  265.  
  266.    info.index = macrob->y * mb(dec->base.width) + macrob->x;
  267.    info.unk4 = motion;
  268.    if (intra)
  269.       info.unk4 |= 1;
  270.    if (macrob->macroblock_modes.bits.dct_type)
  271.       info.unk4 |= 0x20;
  272.    info.unk5 = (macrob->motion_vertical_field_select << 4) |
  273.       (macrob->macroblock_modes.value & 0xf);
  274.    info.coded_block_pattern = macrob->coded_block_pattern;
  275.    if (motion) {
  276.       memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
  277.    }
  278.    blocks = macrob->blocks;
  279.    for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
  280.       if ((macrob->coded_block_pattern & mask) == 0)
  281.          continue;
  282.  
  283.       count = 0;
  284.  
  285.       /*
  286.        * The observation here is that there are a lot of 0's, and things go
  287.        * a lot faster if one skips over them.
  288.        */
  289.  
  290. #if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
  291. /* Note that the SSE implementation is much more tuned to X86_64. As it's not
  292.  * benchmarked on X86_32, disable it there. I suspect that the code needs to
  293.  * be reorganized in terms of 32-bit wide data in order to be more
  294.  * efficient. NV84+ were released well into the 64-bit CPU era, so it should
  295.  * be a minority case.
  296.  */
  297.  
  298. /* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending
  299.  * on whether the corresponding (16-bit) word in blocks is zero or non-zero. */
  300. #define wordmask(blocks, zero) \
  301.       (uint64_t)(_mm_movemask_epi8( \
  302.                        _mm_cmpeq_epi16( \
  303.                              zero, _mm_load_si128((__m128i *)(blocks)))))
  304.  
  305.       __m128i zero = _mm_setzero_si128();
  306.  
  307.       /* TODO: Look into doing the inverse quantization in terms of SSE
  308.        * operations unconditionally, when necessary. */
  309.       uint64_t bmask0 = wordmask(blocks, zero);
  310.       bmask0 |= wordmask(blocks + 8, zero) << 16;
  311.       bmask0 |= wordmask(blocks + 16, zero) << 32;
  312.       bmask0 |= wordmask(blocks + 24, zero) << 48;
  313.       uint64_t bmask1 = wordmask(blocks + 32, zero);
  314.       bmask1 |= wordmask(blocks + 40, zero) << 16;
  315.       bmask1 |= wordmask(blocks + 48, zero) << 32;
  316.       bmask1 |= wordmask(blocks + 56, zero) << 48;
  317.  
  318.       /* The wordmask macro returns the inverse of what we want, since it
  319.        * returns a 1 for equal-to-zero. Invert. */
  320.       bmask0 = ~bmask0;
  321.       bmask1 = ~bmask1;
  322.  
  323.       /* Note that the bitmask is actually sequences of 2 bits for each block
  324.        * index. This is because there is no movemask_epi16. That means that
  325.        * (a) ffs will never return 64, since the prev bit will always be set
  326.        * in that case, and (b) we need to do an extra bit shift. Or'ing the
  327.        * bitmasks together is faster than having a loop that computes them one
  328.        * at a time and processes them, on a Core i7-920. Trying to put bmask
  329.        * into an array and then looping also slows things down.
  330.        */
  331.  
  332.       /* shift needs to be the same width as i, and unsigned so that / 2
  333.        * becomes a rshift operation */
  334.       uint32_t shift;
  335.       i = 0;
  336.  
  337.       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
  338.          int16_t tmp;
  339.          while ((shift = __builtin_ffsll(bmask0))) {
  340.             i += (shift - 1) / 2;
  341.             bmask0 >>= shift - 1;
  342.             *dec->mpeg12_data++ = dec->zscan[i] * 2;
  343.             tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
  344.             *dec->mpeg12_data++ = tmp;
  345.             sum += tmp;
  346.             count++;
  347.             i++;
  348.             bmask0 >>= 2;
  349.          }
  350.          i = 32;
  351.          while ((shift = __builtin_ffsll(bmask1))) {
  352.             i += (shift - 1) / 2;
  353.             bmask1 >>= shift - 1;
  354.             *dec->mpeg12_data++ = dec->zscan[i] * 2;
  355.             tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
  356.             *dec->mpeg12_data++ = tmp;
  357.             sum += tmp;
  358.             count++;
  359.             i++;
  360.             bmask1 >>= 2;
  361.          }
  362.       } else {
  363.          while ((shift = __builtin_ffsll(bmask0))) {
  364.             i += (shift - 1) / 2;
  365.             bmask0 >>= shift - 1;
  366.             *dec->mpeg12_data++ = i * 2;
  367.             *dec->mpeg12_data++ = blocks[i];
  368.             count++;
  369.             i++;
  370.             bmask0 >>= 2;
  371.          }
  372.          i = 32;
  373.          while ((shift = __builtin_ffsll(bmask1))) {
  374.             i += (shift - 1) / 2;
  375.             bmask1 >>= shift - 1;
  376.             *dec->mpeg12_data++ = i * 2;
  377.             *dec->mpeg12_data++ = blocks[i];
  378.             count++;
  379.             i++;
  380.             bmask1 >>= 2;
  381.          }
  382.       }
  383. #undef wordmask
  384. #else
  385.  
  386.       /*
  387.        * This loop looks ridiculously written... and it is. I tried a lot of
  388.        * different ways of achieving this scan, and this was the fastest, at
  389.        * least on a Core i7-920. Note that it's not necessary to skip the 0's,
  390.        * the firmware will deal with those just fine. But it's faster to skip
  391.        * them. Note to people trying benchmarks: make sure to use realistic
  392.        * mpeg data, which can often be a single data point first followed by
  393.        * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
  394.        */
  395.       i = 0;
  396.       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
  397.          while (true) {
  398.             int16_t tmp;
  399.             while (likely(i < 64 && !(tmp = blocks[i]))) i++;
  400.             if (i >= 64) break;
  401.             *dec->mpeg12_data++ = dec->zscan[i] * 2;
  402.             tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
  403.             *dec->mpeg12_data++ = tmp;
  404.             sum += tmp;
  405.             count++;
  406.             i++;
  407.          }
  408.       } else {
  409.          while (true) {
  410.             int16_t tmp;
  411.             while (likely(i < 64 && !(tmp = blocks[i]))) i++;
  412.             if (i >= 64) break;
  413.             *dec->mpeg12_data++ = i * 2;
  414.             *dec->mpeg12_data++ = tmp;
  415.             count++;
  416.             i++;
  417.          }
  418.       }
  419.  
  420. #endif
  421.  
  422.       if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
  423.          if (!mpeg1 && (sum & 1) == 0) {
  424.             if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
  425.                uint16_t *val = dec->mpeg12_data - 1;
  426.                if (*val & 1) *val -= 1;
  427.                else *val += 1;
  428.             } else {
  429.                *dec->mpeg12_data++ = 63 * 2;
  430.                *dec->mpeg12_data++ = 1;
  431.                count++;
  432.             }
  433.          }
  434.       }
  435.  
  436.       if (count) {
  437.          *(dec->mpeg12_data - 2) |= 1;
  438.       } else {
  439.          *dec->mpeg12_data++ = 1;
  440.          *dec->mpeg12_data++ = 0;
  441.          count = 1;
  442.       }
  443.       info.block_counts[block_index] = count;
  444.       blocks += 64;
  445.    }
  446.  
  447.    memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
  448.    dec->mpeg12_mb_info += sizeof(info);
  449.  
  450.    if (macrob->num_skipped_macroblocks) {
  451.       info.index++;
  452.       info.coded_block_pattern = 0;
  453.       info.skipped = macrob->num_skipped_macroblocks - 1;
  454.       memset(info.block_counts, 0, sizeof(info.block_counts));
  455.       memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
  456.       dec->mpeg12_mb_info += sizeof(info);
  457.    }
  458. }
  459.  
  /*
   * Header written by nv84_decoder_vp_mpeg12() at the start of the mpeg12_bo
   * buffer; a STATIC_ASSERT there pins the size to 0x100 bytes. The trailing
   * comment on each member is its byte offset in the header.
   */
  struct mpeg12_header {
     uint32_t luma_top_size;     /* 0x00 */
     uint32_t luma_bottom_size;  /* 0x04 */
     uint32_t chroma_top_size;   /* 0x08 */
     uint32_t mbs;               /* 0x0c */
     uint32_t mb_info_size;      /* 0x10 */
     uint32_t mb_width_minus1;   /* 0x14 */
     uint32_t mb_height_minus1;  /* 0x18 */
     uint32_t width;             /* 0x1c */
     uint32_t height;            /* 0x20 */
     uint8_t progressive;        /* 0x24 */
     uint8_t mocomp_only;        /* 0x25 */
     uint8_t frames;             /* 0x26 */
     uint8_t picture_structure;  /* 0x27 */
     uint32_t unk28;             /* 0x28 -- 0x50100 */
     uint32_t unk2c;             /* 0x2c */
     uint32_t pad[4 * 13];       /* 0x30, fills the header up to 0x100 */
  };
  478.  
  479. void
  480. nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
  481.                        struct pipe_mpeg12_picture_desc *desc,
  482.                        struct nv84_video_buffer *dest)
  483. {
  484.    struct nouveau_pushbuf *push = dec->vp_pushbuf;
  485.    struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
  486.    struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
  487.    struct nouveau_pushbuf_refn bo_refs[] = {
  488.       { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  489.       { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  490.       { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
  491.       { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
  492.    };
  493.    int i, num_refs = sizeof(bo_refs) / sizeof(*bo_refs);
  494.    struct mpeg12_header header = {0};
  495.    struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
  496.    struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);
  497.  
  498.    STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);
  499.  
  500.    if (ref1 == NULL)
  501.       ref1 = dest;
  502.    if (ref2 == NULL)
  503.       ref2 = dest;
  504.    bo_refs[1].bo = ref1->interlaced;
  505.    bo_refs[2].bo = ref2->interlaced;
  506.  
  507.    header.luma_top_size = y->layer_stride;
  508.    header.luma_bottom_size = y->layer_stride;
  509.    header.chroma_top_size = uv->layer_stride;
  510.    header.mbs = mb(dec->base.width) * mb(dec->base.height);
  511.    header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
  512.    header.mb_width_minus1 = mb(dec->base.width) - 1;
  513.    header.mb_height_minus1 = mb(dec->base.height) - 1;
  514.    header.width = align(dec->base.width, 16);
  515.    header.height = align(dec->base.height, 16);
  516.    header.progressive = desc->frame_pred_frame_dct;
  517.    header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
  518.    header.picture_structure = desc->picture_structure;
  519.    header.unk28 = 0x50100;
  520.  
  521.    memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
  522.  
  523.    PUSH_SPACE(push, 10 + 3 + 2);
  524.  
  525.    nouveau_pushbuf_refn(push, bo_refs, num_refs);
  526.  
  527.    BEGIN_NV04(push, SUBC_VP(0x400), 9);
  528.    PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
  529.    PUSH_DATA (push, 0x555001); /* constant */
  530.    PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
  531.    PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
  532.    PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
  533.                      align(0x20 * mb(dec->base.width) *
  534.                            mb(dec->base.height), 0x100)) >> 8);
  535.    PUSH_DATA (push, dest->interlaced->offset >> 8);
  536.    PUSH_DATA (push, ref1->interlaced->offset >> 8);
  537.    PUSH_DATA (push, ref2->interlaced->offset >> 8);
  538.    PUSH_DATA (push, 6 * 64 * 8 * header.mbs);
  539.  
  540.    BEGIN_NV04(push, SUBC_VP(0x620), 2);
  541.    PUSH_DATA (push, 0);
  542.    PUSH_DATA (push, 0);
  543.  
  544.    BEGIN_NV04(push, SUBC_VP(0x300), 1);
  545.    PUSH_DATA (push, 0);
  546.  
  547.    for (i = 0; i < 2; i++) {
  548.       struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
  549.       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
  550.    }
  551.    PUSH_KICK (push);
  552. }
  553.