/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "idctdsp_mips.h"

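/*
 * 8x8 inverse DCT on a block of 16-bit coefficients, computed in place.
 * This is the MSA counterpart of FFmpeg's fixed-point "simple_idct": a row
 * pass with rounding shift 11 followed by a column pass with rounding
 * shift 20. The weights vector holds the scaled cosine terms
 *     W[k] = round(sqrt(2) * cos(k * pi / 16) * (1 << 14)),  k = 1..7,
 * with W4 stored as 16383 rather than the exact 16384, as in the scalar
 * version; element 0 is unused padding.
 */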
static void simple_idct_msa(int16_t *block)
{
    int32_t const_val;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

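    /* Load the 8x8 coefficient block and build the rounding biases:
     * const_val0 = 1 << 10 rounds the row pass (shift 11), and const_val1
     * holds 16383 * ((1 << 19) / 16383), i.e. roughly 1 << 19 rounded down
     * to a multiple of W4, which rounds the column pass (shift 20). */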
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
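    /* After the transpose each vector holds one coefficient index across
     * all eight rows, so ORing in1..in7 and testing against zero
     * (clti_u_h(x, 1) is an unsigned x < 1 compare) leaves an all-ones
     * lane for every row that is DC only. Those rows take the scalar
     * simple_idct shortcut: the row-pass output is just dc << 3. */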
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
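    /* Even part of the row pass: widen W2, W4 and W6 to 32 bits and fold
     * coefficients 0, 2, 4 and 6 into a0..a3 (the _r/_l suffixes are the
     * low/high four lanes of each eight-lane vector). */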
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
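    /* Odd part of the row pass: interleave coefficient pairs 1/3 and 5/7
     * and accumulate b0..b3 as dot products against the signed
     * W1/W3/W5/W7 constant pairs. */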
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
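    /* Combine the even and odd halves, apply the row-pass rounding shift
     * of 11, pack back to 16 bits, and substitute the dc << 3 shortcut
     * wherever select_vec flagged a DC-only row. */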
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

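    /* Column pass: the same even/odd computation on the transposed
     * intermediate, using the const_val1 bias and a final rounding shift
     * of 20 before the results are stored back to block. */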
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp1_r, temp1_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
         temp2_r, temp2_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
         a1_r, a1_l, a2_r, a2_l);
    ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
         a3_r, a3_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
           block, 8);
}

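/*
 * Same two-pass IDCT as simple_idct_msa, but instead of writing the
 * coefficients back to block, the result is clipped to [0, 255], packed
 * to bytes and stored as eight 8-pixel rows at dst.
 */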
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    temp = in0 << 3;
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
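    /* Clip each output row to [0, 255], pack the halfwords down to bytes
     * and store the upper and lower halves of the block as four 8-byte
     * rows each. */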
    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
                temp2_r, temp2_r, temp3_r, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    dst += 4 * dst_stride;
    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
}

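/*
 * Same two-pass IDCT once more, but the clipped result is added to the
 * pixels already at dst instead of replacing them.
 */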
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
                                int16_t *block)
{
    int32_t const_val;
    uint64_t tmp0, tmp1, tmp2, tmp3;
    v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 w1, w3, w5, w7;
    v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
    v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
    v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
    v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
    v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
    v4i32 w2, w4, w6;
    v8i16 select_vec, temp;
    v8i16 zero = { 0 };
    v4i32 const_val0 = __msa_ldi_w(1);
    v4i32 const_val1 = __msa_ldi_w(1);

    const_val0 <<= 10;
    const_val = 16383 * ((1 << 19) / 16383);
    const_val1 = __msa_insert_w(const_val0, 0, const_val);
    const_val1 = __msa_splati_w(const_val1, 0);
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
    select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    UNPCK_SH_SW(in4, temp4_r, temp4_l);
    UNPCK_SH_SW(in6, temp7_r, temp7_l);
    ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
    temp = in0 << 3;
    SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
    ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
               const0, const1, const2, const3);
    ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
    const5 = __msa_ilvod_h(-w1, -w5);
    const7 = __msa_ilvod_h(w3, -w1);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    w2 = (v4i32) __msa_splati_h(weights, 2);
    w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
    w4 = (v4i32) __msa_splati_h(weights, 4);
    w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
    w6 = (v4i32) __msa_splati_h(weights, 6);
    w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
    MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
    MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
    ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
    SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
    SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
    ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
    ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
    SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
    ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
    SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
                temp2_l, temp2_r, temp3_l, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
                               (v16u8) select_vec);
    in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
                               (v16u8) select_vec);
    in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
                               (v16u8) select_vec);
    in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
                               (v16u8) select_vec);
    SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
    in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
    in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
    in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);

    UNPCK_SH_SW(in0, a0_r, a0_l);
    UNPCK_SH_SW(in2, temp3_r, temp3_l);
    MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
    ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
    BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
                temp2_l, temp2_r, temp1_l, temp1_r,
                a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
    UNPCK_SH_SW(in4, temp0_r, temp0_l);
    UNPCK_SH_SW(in6, temp3_r, temp3_l);
    MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
    MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
    MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
    ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
    SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
    SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
    ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
    ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
    SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
    ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
    SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    ILVRL_H2_SW(in1, in3, b3_r, b3_l);
    ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
    DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
                b0_r, b1_r, b2_r, b3_r);
    DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
                b0_l, b1_l, b2_l, b3_l);
    DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
                 const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
    DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
                 const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
                 b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
                 temp0_r, temp0_l, temp1_r, temp1_l,
                 temp2_r, temp2_l, temp3_r, temp3_l,
                 a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
    SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
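    /* Load four destination rows, zero-extend them to halfwords, add the
     * IDCT output, clip to [0, 255] and store; the lower four rows follow
     * the same pattern below. */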
    LD_SH4(dst, dst_stride, in0, in1, in2, in3);
    PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
                temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
    ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
               temp0_l, temp1_l, temp2_l, temp3_l);
    temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
    temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
    temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
    temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
    temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
    temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
    temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
    temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
    PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
                temp2_r, temp2_r, temp3_r, temp3_r,
                temp0_r, temp1_r, temp2_r, temp3_r);
    tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);

    SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
    SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
    LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
    PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
                a0_r, a1_r, a2_r, a3_r);
    ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
               a3_l, a2_l, a1_l, a0_l);
    a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
    a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
    a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
    a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
    a3_r = (v4i32) CLIP_SH_0_255(a3_r);
    a2_r = (v4i32) CLIP_SH_0_255(a2_r);
    a1_r = (v4i32) CLIP_SH_0_255(a1_r);
    a0_r = (v4i32) CLIP_SH_0_255(a0_r);
    PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
                a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
    tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
    tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
    tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
    tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
    SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
}

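/* Exported entry points, declared in idctdsp_mips.h; these are the
 * routines the MIPS idctdsp setup is expected to install as idct,
 * idct_put and idct_add. */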
void ff_simple_idct_msa(int16_t *block)
{
    simple_idct_msa(block);
}

void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
{
    simple_idct_put_msa(dst, dst_stride, block);
}

void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
{
    simple_idct_add_msa(dst, dst_stride, block);
}