Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright (c) 2002 Brian Foley
  3.  * Copyright (c) 2002 Dieter Shirley
  4.  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  5.  *
  6.  * This file is part of FFmpeg.
  7.  *
  8.  * FFmpeg is free software; you can redistribute it and/or
  9.  * modify it under the terms of the GNU Lesser General Public
  10.  * License as published by the Free Software Foundation; either
  11.  * version 2.1 of the License, or (at your option) any later version.
  12.  *
  13.  * FFmpeg is distributed in the hope that it will be useful,
  14.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16.  * Lesser General Public License for more details.
  17.  *
  18.  * You should have received a copy of the GNU Lesser General Public
  19.  * License along with FFmpeg; if not, write to the Free Software
  20.  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21.  */
  22.  
  23. #include "config.h"
  24.  
  25. #if HAVE_ALTIVEC_H
  26. #include <altivec.h>
  27. #endif
  28.  
  29. #include "libavutil/attributes.h"
  30. #include "libavutil/cpu.h"
  31. #include "libavutil/ppc/cpu.h"
  32. #include "libavutil/ppc/types_altivec.h"
  33. #include "libavutil/ppc/util_altivec.h"
  34. #include "libavcodec/hpeldsp.h"
  35. #include "hpeldsp_altivec.h"
  36.  
  37. #if HAVE_ALTIVEC
  38. /* next one assumes that ((line_size % 16) == 0) */
  39. void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  40. {
  41.     register vector unsigned char pixelsv1;
  42.     register vector unsigned char pixelsv1B;
  43.     register vector unsigned char pixelsv1C;
  44.     register vector unsigned char pixelsv1D;
  45.  
  46.     int i;
  47.     register ptrdiff_t line_size_2 = line_size << 1;
  48.     register ptrdiff_t line_size_3 = line_size + line_size_2;
  49.     register ptrdiff_t line_size_4 = line_size << 2;
  50.  
  51. // hand-unrolling the loop by 4 gains about 15%
  52. // mininum execution time goes from 74 to 60 cycles
  53. // it's faster than -funroll-loops, but using
  54. // -funroll-loops w/ this is bad - 74 cycles again.
  55. // all this is on a 7450, tuning for the 7450
  56.     for (i = 0; i < h; i += 4) {
  57.         pixelsv1  = unaligned_load( 0, pixels);
  58.         pixelsv1B = unaligned_load(line_size, pixels);
  59.         pixelsv1C = unaligned_load(line_size_2, pixels);
  60.         pixelsv1D = unaligned_load(line_size_3, pixels);
  61.         VEC_ST(pixelsv1, 0, (unsigned char*)block);
  62.         VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
  63.         VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
  64.         VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
  65.         pixels+=line_size_4;
  66.         block +=line_size_4;
  67.     }
  68. }
  69.  
  70. /* next one assumes that ((line_size % 16) == 0) */
  71. #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
  72. void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  73. {
  74.     register vector unsigned char pixelsv, blockv;
  75.  
  76.     int i;
  77.     for (i = 0; i < h; i++) {
  78.         blockv = vec_ld(0, block);
  79.         pixelsv = VEC_LD( 0, pixels);
  80.         blockv = vec_avg(blockv,pixelsv);
  81.         vec_st(blockv, 0, (unsigned char*)block);
  82.         pixels+=line_size;
  83.         block +=line_size;
  84.     }
  85. }
  86.  
  87. /* next one assumes that ((line_size % 8) == 0) */
  88. static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
  89. {
  90.     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  91.     int i;
  92.  
  93.    for (i = 0; i < h; i++) {
  94.        /* block is 8 bytes-aligned, so we're either in the
  95.           left block (16 bytes-aligned) or in the right block (not) */
  96.        int rightside = ((unsigned long)block & 0x0000000F);
  97.  
  98.        blockv = vec_ld(0, block);
  99.        pixelsv = VEC_LD( 0, pixels);
  100.  
  101.        if (rightside) {
  102.            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
  103.        } else {
  104.            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
  105.        }
  106.  
  107.        blockv = vec_avg(blockv, pixelsv);
  108.  
  109.        vec_st(blockv, 0, block);
  110.  
  111.        pixels += line_size;
  112.        block += line_size;
  113.    }
  114. }
  115.  
  116. /* next one assumes that ((line_size % 8) == 0) */
  117. static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  118. {
  119.     register int i;
  120.     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
  121.     register vector unsigned char blockv;
  122.     register vector unsigned short pixelssum1, pixelssum2, temp3;
  123.     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
  124.     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
  125.  
  126.     pixelsv1 = VEC_LD(0, pixels);
  127.     pixelsv2 = VEC_LD(1, pixels);
  128.     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  129.     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  130.  
  131.     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  132.                          (vector unsigned short)pixelsv2);
  133.     pixelssum1 = vec_add(pixelssum1, vctwo);
  134.  
  135.     for (i = 0; i < h ; i++) {
  136.         int rightside = ((unsigned long)block & 0x0000000F);
  137.         blockv = vec_ld(0, block);
  138.  
  139.         pixelsv1 = unaligned_load(line_size, pixels);
  140.         pixelsv2 = unaligned_load(line_size+1, pixels);
  141.         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  142.         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  143.         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  144.                              (vector unsigned short)pixelsv2);
  145.         temp3 = vec_add(pixelssum1, pixelssum2);
  146.         temp3 = vec_sra(temp3, vctwo);
  147.         pixelssum1 = vec_add(pixelssum2, vctwo);
  148.         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  149.  
  150.         if (rightside) {
  151.             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  152.         } else {
  153.             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  154.         }
  155.  
  156.         vec_st(blockv, 0, block);
  157.  
  158.         block += line_size;
  159.         pixels += line_size;
  160.     }
  161. }
  162.  
  163. /* next one assumes that ((line_size % 8) == 0) */
  164. static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  165. {
  166.     register int i;
  167.     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
  168.     register vector unsigned char blockv;
  169.     register vector unsigned short pixelssum1, pixelssum2, temp3;
  170.     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
  171.     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
  172.     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
  173.  
  174.     pixelsv1 = VEC_LD(0, pixels);
  175.     pixelsv2 = VEC_LD(1, pixels);
  176.     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  177.     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  178.     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  179.                          (vector unsigned short)pixelsv2);
  180.     pixelssum1 = vec_add(pixelssum1, vcone);
  181.  
  182.     for (i = 0; i < h ; i++) {
  183.         int rightside = ((unsigned long)block & 0x0000000F);
  184.         blockv = vec_ld(0, block);
  185.  
  186.         pixelsv1 = unaligned_load(line_size, pixels);
  187.         pixelsv2 = unaligned_load(line_size+1, pixels);
  188.         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  189.         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  190.         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  191.                              (vector unsigned short)pixelsv2);
  192.         temp3 = vec_add(pixelssum1, pixelssum2);
  193.         temp3 = vec_sra(temp3, vctwo);
  194.         pixelssum1 = vec_add(pixelssum2, vcone);
  195.         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  196.  
  197.         if (rightside) {
  198.             blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  199.         } else {
  200.             blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  201.         }
  202.  
  203.         vec_st(blockv, 0, block);
  204.  
  205.         block += line_size;
  206.         pixels += line_size;
  207.     }
  208. }
  209.  
  210. /* next one assumes that ((line_size % 16) == 0) */
  211. static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
  212. {
  213.     register int i;
  214.     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  215.     register vector unsigned char blockv;
  216.     register vector unsigned short temp3, temp4,
  217.         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
  218.     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
  219.     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
  220.  
  221.     pixelsv1 = VEC_LD(0, pixels);
  222.     pixelsv2 = VEC_LD(1, pixels);
  223.     pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
  224.     pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
  225.     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  226.     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  227.     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  228.                          (vector unsigned short)pixelsv4);
  229.     pixelssum3 = vec_add(pixelssum3, vctwo);
  230.     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  231.                          (vector unsigned short)pixelsv2);
  232.     pixelssum1 = vec_add(pixelssum1, vctwo);
  233.  
  234.     for (i = 0; i < h ; i++) {
  235.         blockv = vec_ld(0, block);
  236.  
  237.         pixelsv1 = unaligned_load(line_size, pixels);
  238.         pixelsv2 = unaligned_load(line_size+1, pixels);
  239.  
  240.         pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
  241.         pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
  242.         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  243.         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  244.         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  245.                              (vector unsigned short)pixelsv4);
  246.         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  247.                              (vector unsigned short)pixelsv2);
  248.         temp4 = vec_add(pixelssum3, pixelssum4);
  249.         temp4 = vec_sra(temp4, vctwo);
  250.         temp3 = vec_add(pixelssum1, pixelssum2);
  251.         temp3 = vec_sra(temp3, vctwo);
  252.  
  253.         pixelssum3 = vec_add(pixelssum4, vctwo);
  254.         pixelssum1 = vec_add(pixelssum2, vctwo);
  255.  
  256.         blockv = vec_packsu(temp3, temp4);
  257.  
  258.         vec_st(blockv, 0, block);
  259.  
  260.         block += line_size;
  261.         pixels += line_size;
  262.     }
  263. }
  264.  
  265. /* next one assumes that ((line_size % 16) == 0) */
  266. static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
  267. {
  268.     register int i;
  269.     register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  270.     register vector unsigned char blockv;
  271.     register vector unsigned short temp3, temp4,
  272.         pixelssum1, pixelssum2, pixelssum3, pixelssum4;
  273.     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
  274.     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
  275.     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
  276.  
  277.     pixelsv1 = VEC_LD(0, pixels);
  278.     pixelsv2 = VEC_LD(1, pixels);
  279.     pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
  280.     pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
  281.     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  282.     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  283.     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  284.                          (vector unsigned short)pixelsv4);
  285.     pixelssum3 = vec_add(pixelssum3, vcone);
  286.     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  287.                          (vector unsigned short)pixelsv2);
  288.     pixelssum1 = vec_add(pixelssum1, vcone);
  289.  
  290.     for (i = 0; i < h ; i++) {
  291.         pixelsv1 = unaligned_load(line_size, pixels);
  292.         pixelsv2 = unaligned_load(line_size+1, pixels);
  293.  
  294.         pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
  295.         pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
  296.         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  297.         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  298.         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  299.                              (vector unsigned short)pixelsv4);
  300.         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  301.                              (vector unsigned short)pixelsv2);
  302.         temp4 = vec_add(pixelssum3, pixelssum4);
  303.         temp4 = vec_sra(temp4, vctwo);
  304.         temp3 = vec_add(pixelssum1, pixelssum2);
  305.         temp3 = vec_sra(temp3, vctwo);
  306.  
  307.         pixelssum3 = vec_add(pixelssum4, vcone);
  308.         pixelssum1 = vec_add(pixelssum2, vcone);
  309.  
  310.         blockv = vec_packsu(temp3, temp4);
  311.  
  312.         VEC_ST(blockv, 0, block);
  313.  
  314.         block += line_size;
  315.         pixels += line_size;
  316.     }
  317. }
  318.  
  319. /* next one assumes that ((line_size % 8) == 0) */
  320. static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  321. {
  322.     register int i;
  323.     register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
  324.     register vector unsigned char blockv, blocktemp;
  325.     register vector unsigned short pixelssum1, pixelssum2, temp3;
  326.  
  327.     register const vector unsigned char vczero = (const vector unsigned char)
  328.                                         vec_splat_u8(0);
  329.     register const vector unsigned short vctwo = (const vector unsigned short)
  330.                                         vec_splat_u16(2);
  331.  
  332.     pixelsv1 = VEC_LD(0, pixels);
  333.     pixelsv2 = VEC_LD(1, pixels);
  334.     pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  335.     pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  336.     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  337.                          (vector unsigned short)pixelsv2);
  338.     pixelssum1 = vec_add(pixelssum1, vctwo);
  339.  
  340.     for (i = 0; i < h ; i++) {
  341.         int rightside = ((unsigned long)block & 0x0000000F);
  342.         blockv = vec_ld(0, block);
  343.  
  344.         pixelsv1 = unaligned_load(line_size, pixels);
  345.         pixelsv2 = unaligned_load(line_size+1, pixels);
  346.  
  347.         pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
  348.         pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
  349.         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  350.                              (vector unsigned short)pixelsv2);
  351.         temp3 = vec_add(pixelssum1, pixelssum2);
  352.         temp3 = vec_sra(temp3, vctwo);
  353.         pixelssum1 = vec_add(pixelssum2, vctwo);
  354.         pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  355.  
  356.         if (rightside) {
  357.             blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  358.         } else {
  359.             blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  360.         }
  361.  
  362.         blockv = vec_avg(blocktemp, blockv);
  363.         vec_st(blockv, 0, block);
  364.  
  365.         block += line_size;
  366.         pixels += line_size;
  367.     }
  368. }
  369. #endif /* HAVE_ALTIVEC */
  370.  
  371. av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
  372. {
  373. #if HAVE_ALTIVEC
  374.     if (!PPC_ALTIVEC(av_get_cpu_flags()))
  375.         return;
  376.  
  377.     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
  378.     c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
  379.     c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;
  380.  
  381.     c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
  382.     c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
  383.     c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
  384.  
  385.     c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
  386.     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
  387.     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
  388. #endif /* HAVE_ALTIVEC */
  389. }
  390.