/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "me_cmp_mips.h"

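/* SAD of an 8-pixel-wide block: four rows are processed per iteration,
 * with pairs of 8-byte rows packed into single 16-byte MSA vectors. */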
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
                               uint8_t *ref, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

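/* SAD of a 16-pixel-wide block, four rows per iteration. */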
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *ref, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);

        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);
        LD_UB2(ref, ref_stride, ref0, ref1);
        ref += (2 * ref_stride);
        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
    }

    return (HADD_UH_U32(sad));
}

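/* SAD of an 8-wide block against the horizontal half-pel reference:
 * each reference pixel is the rounded average of a pixel and its right
 * neighbour. */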
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
        SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
        SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
        PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
        AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

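/* SAD of a 16-wide block against the horizontal half-pel reference
 * (rounded average of ref and ref + 1). */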
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
        LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
        ref += (4 * ref_stride);

        AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

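/* SAD of an 8-wide block against the vertical half-pel reference
 * (rounded average of each reference row with the row below it). */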
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *ref,
                                                    int32_t ref_stride,
                                                    int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
        ref += (4 * ref_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
        PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
        PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
        AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

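/* SAD of a 16-wide block against the vertical half-pel reference
 * (rounded average of each reference row with the row below it). */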
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp0, comp1;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (5 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);

        ref4 = ref3;

        LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
        ref += (3 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
        sad += SAD_UB2_UH(src0, src1, comp0, comp1);
        AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
        sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    }

    return (HADD_UH_U32(sad));
}

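/* SAD of an 8-wide block against the half-pel reference in both directions:
 * each interpolated pixel is the rounded average of a 2x2 neighbourhood. */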
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *ref,
                                                  int32_t ref_stride,
                                                  int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, temp0, temp1, diff;
    v16u8 ref0, ref1, ref2, ref3, ref4;
    v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
        ref += (4 * ref_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);

        VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp0 += comp1;
        comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
        comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
        comp2 = __msa_hadd_u_h(temp0, temp0);
        comp1 += comp2;
        comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
        comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
        comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
        diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
        sad += __msa_hadd_u_h(diff, diff);

        temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
        comp3 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp3;
        comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
        comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);

        temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp3 += comp0;
        comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
        comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
        comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
        diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

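/* SAD of a 16-wide block against the half-pel reference in both directions. */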
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *ref,
                                                   int32_t ref_stride,
                                                   int32_t height)
{
    int32_t ht_cnt;
    v16u8 src0, src1, src2, src3, comp, diff;
    v16u8 temp0, temp1, temp2, temp3;
    v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
    v8u16 comp0, comp1, comp2, comp3;
    v8u16 sad = { 0 };

    for (ht_cnt = (height >> 3); ht_cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
        LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
        ref += (5 * ref_stride);

        ILVRL_B2_UB(ref14, ref04, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);

        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
        LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
        ref += (3 * ref_stride);

        ILVRL_B2_UB(ref10, ref00, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src0, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref11, ref01, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src1, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref12, ref02, temp2, temp3);
        comp2 = __msa_hadd_u_h(temp2, temp2);
        comp3 = __msa_hadd_u_h(temp3, temp3);
        comp0 += comp2;
        comp1 += comp3;
        SRARI_H2_UH(comp0, comp1, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
        diff = __msa_asub_u_b(src2, comp);
        sad += __msa_hadd_u_h(diff, diff);

        ILVRL_B2_UB(ref13, ref03, temp0, temp1);
        comp0 = __msa_hadd_u_h(temp0, temp0);
        comp1 = __msa_hadd_u_h(temp1, temp1);
        comp2 += comp0;
        comp3 += comp1;
        SRARI_H2_UH(comp2, comp3, 2);
        comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
        diff = __msa_asub_u_b(src3, comp);
        sad += __msa_hadd_u_h(diff, diff);
    }

    return (HADD_UH_U32(sad));
}

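/* Accumulate the sum of squared byte differences of two vectors into var. */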
#define CALC_MSE_B(src, ref, var)                                    \
{                                                                    \
    v16u8 src_l0_m, src_l1_m;                                        \
    v8i16 res_l0_m, res_l1_m;                                        \
                                                                     \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
}

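/* Sum of squared errors for a 4-pixel-wide block: rows are loaded as 32-bit
 * words and gathered into one vector per operand. */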
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LW4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        INSERT_W4_UB(src0, src1, src2, src3, src);
        INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

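/* Sum of squared errors for an 8-wide block, four rows per iteration. */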
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
                               uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
        src_ptr += (4 * src_stride);
        LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
        ref_ptr += (4 * ref_stride);

        PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                    src0, src1, ref0, ref1);
        CALC_MSE_B(src0, ref0, var);
        CALC_MSE_B(src1, ref1, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

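/* Sum of squared errors for a 16-wide block, four rows per iteration. */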
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
{
    int32_t ht_cnt;
    uint32_t sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for (ht_cnt = (height >> 2); ht_cnt--;) {
        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);

        src = LD_UB(src_ptr);
        src_ptr += src_stride;
        ref = LD_UB(ref_ptr);
        ref_ptr += ref_stride;
        CALC_MSE_B(src, ref, var);
    }

    sse = HADD_SW_S32(var);

    return sse;
}

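/* Sum of absolute Hadamard-transformed differences (SATD) of an 8x8 block:
 * the src - ref difference is transformed with butterflies and transposes,
 * then the absolute values of all coefficients are summed. */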
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *ref, int32_t ref_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v8i16 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, zero);
    sum += __msa_add_a_h((v8i16) diff1, zero);
    sum += __msa_add_a_h((v8i16) diff2, zero);
    sum += __msa_add_a_h((v8i16) diff3, zero);

    return (HADD_UH_U32(sum));
}

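/* Hadamard transform of the 8x8 source block itself (intra case); the DC
 * coefficient is subtracted from the sum of absolute coefficients. */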
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
{
    int32_t sum_res = 0;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v16i8 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                       src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
               zero, src4, zero, src5, zero, src6, zero, src7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
    sum_res = (HADD_UH_U32(sum));
    sum_res -= abs(temp0[0] + temp4[0]);

    return sum_res;
}

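/* Exported wrappers mapping the me_cmp comparison signatures onto the MSA
 * implementations above. */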
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    return sad_16width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    return sad_8width_msa(src, stride, ref, stride, height);
}

int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    return sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    return sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);
}

int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    return sse_16width_msa(src, stride, ref, stride, height);
}

int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_8width_msa(src, stride, ref, stride, height);
}

int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    return sse_4width_msa(src, stride, ref, stride, height);
}

int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    return hadamard_diff_8x8_msa(src, stride, dst, stride);
}

int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    return hadamard_intra_8x8_msa(src, stride, dst, stride);
}

/* Hadamard Transform functions */
#define WRAPPER8_16_SQ(name8, name16)                      \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
           ptrdiff_t stride, int h)                        \
{                                                          \
    int score = 0;                                         \
    score += name8(s, dst, src, stride, 8);                \
    score += name8(s, dst + 8, src + 8, stride, 8);        \
    if (h == 16) {                                         \
        dst += 8 * stride;                                 \
        src += 8 * stride;                                 \
        score += name8(s, dst, src, stride, 8);            \
        score += name8(s, dst + 8, src + 8, stride, 8);    \
    }                                                      \
    return score;                                          \
}

WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);