/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"

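/* The vec_st() stores below require a 16-byte-aligned dst. */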
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));

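/*
 * load_alignment() fills srcM2..srcP3 with the six overlapping 16-byte
 * vectors starting at s-2 through s+3, where ali is the 16-byte
 * misalignment of s-2.  Big-endian AltiVec only has aligned loads, so two
 * aligned vec_ld() blocks (three for alignments 12..15, where s+3 spills
 * into the next block) are combined with vec_perm().  On little-endian
 * POWER, vec_vsx_ld() performs the unaligned loads directly.
 */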
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
 }
#endif /* HAVE_BIGENDIAN */

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
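    /* vec_splat_s16() only accepts a 5-bit immediate (-16..15), so the
     * constants 20 and 16 below are built as 5 << 2 and 1 << 4. */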
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

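    /* Each row applies the H.264 six-tap half-pel filter
     * clip(((P0+P1)*20 - (M1+P2)*5 + (M2+P3) + 16) >> 5), computed with
     * saturating 16-bit arithmetic in two 8-element halves (A and B). */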
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

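        /* OP_U8_ALTIVEC is supplied by the file that includes this
         * template: a plain assignment for the put_* variants, vec_avg()
         * against the current dst for the avg_* variants. */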
        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

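    /* Rows M2..P2 are already loaded; each iteration fetches only the new
     * P3 row, applies the same six-tap filter down the columns, and then
     * slides the five-row window down so every source row is read once. */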
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

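    /* mperm re-interleaves the separately packed even and odd results of
     * the second pass back into pixel order. */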
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

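    /* First pass: run the horizontal six-tap filter over 21 rows (the 16
     * output rows plus the 5 extra rows the vertical filter needs) and
     * store the unrounded 16-bit intermediates in tmp. */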
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

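    /* Second pass: filter the 16-bit intermediates vertically.  The
     * products no longer fit in 16 bits, so the even and odd lanes are
     * widened to 32 bits with vec_mule()/vec_mulo(), rounded with +512
     * and shifted right by 10 before being packed back down. */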
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

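        /* Widen sum3 (tap weight 1) to 32 bits: the odd lanes via a
         * multiply by 1, the even lanes by reinterpreting the vector as
         * s32 and arithmetic-shifting right by 16; on little-endian the
         * halves sit in swapped slots, hence the word swap below. */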
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A, vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B, vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif