/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * miscellaneous integer operations
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"

#include "dsputil_altivec.h"

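/*
 * Sum of squared differences between an int8 and an int16 buffer. The scalar
 * form of what the vector loop accumulates is the same computation as the
 * tail loop at the end of the function, applied to every element; a minimal
 * sketch, not the actual C fallback:
 *
 *     int score = 0;
 *     for (i = 0; i < size; i++)
 *         score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
 *     return score;
 */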
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);

// XXX lazy way, fix it later

#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, b), vec_ld(15, b), vec_lvsl(0, b))
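/*
 * vec_ld() can only fetch from 16-byte-aligned addresses (the low address
 * bits are ignored), so the macro loads the two aligned quadwords that
 * straddle b and lets vec_perm(), steered by the vec_lvsl() shift vector,
 * extract the 16 bytes that actually start at b.
 */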

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack the int8 pixels into int16 halves
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    // handle the remaining (size % 16) elements in scalar code
    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    }
    return u.score[3];
}

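/*
 * Scalar product of two int16 vectors, 8 elements per iteration (the loop
 * assumes order is a multiple of 8). The scalar equivalent is, roughly
 * (a sketch, not the actual C fallback):
 *
 *     int32_t res = 0;
 *     for (i = 0; i < order; i++)
 *         res += v1[i] * v2[i];
 *     return res;
 *
 * Note that v2 is read with vec_ld(), i.e. from 16-byte-aligned addresses,
 * while v1 goes through vec_unaligned_load().
 */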
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        vec1 = vec_unaligned_load(v1);
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}

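/*
 * Scalar product of v1 and v2 combined with a multiply-add of v3 back into
 * v1. Roughly, the intended semantics (a sketch, assuming order is a
 * multiple of 16 and v1 is 16-byte aligned, as the direct vec_s16 loads and
 * stores below require):
 *
 *     int32_t res = 0;
 *     for (i = 0; i < order; i++) {
 *         res   += v1[i] * v2[i];
 *         v1[i] += v3[i] * mul;
 *     }
 *     return res;
 */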
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
                                                    const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *) v1;
    register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
    register vec_s16 t0, t1, i0, i1, i4;
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    // one shift vector is used for both v2 and v3, so they are assumed to
    // share the same misalignment
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;
    do {
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        v2  += 16;
        v3  += 16;
    } while (--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}

av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16            = ssd_int8_vs_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}
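/*
 * ff_int_init_altivec() is expected to be called from the PPC DSP
 * initialization once AltiVec support has been detected; decoders then reach
 * these routines only through the DSPContext function pointers, e.g. with a
 * hypothetical context c and buffers v1/v2/v3:
 *
 *     int32_t score = c->scalarproduct_and_madd_int16(v1, v2, v3, order, mul);
 */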