Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Copyright (C) 2009 David Conrad |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2.1 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with FFmpeg; if not, write to the Free Software |
||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | */ |
||
20 | |||
21 | #include <string.h> |
||
22 | |||
23 | #include "config.h" |
||
24 | #include "libavutil/attributes.h" |
||
25 | #include "libavutil/cpu.h" |
||
26 | #include "libavutil/ppc/types_altivec.h" |
||
27 | #include "libavutil/ppc/util_altivec.h" |
||
28 | #include "libavcodec/vp3dsp.h" |
||
29 | #include "dsputil_altivec.h" |
||
30 | |||
31 | #if HAVE_ALTIVEC |
||
32 | |||
33 | static const vec_s16 constants = |
||
34 | {0, 64277, 60547, 54491, 46341, 36410, 25080, 12785}; |
||
35 | static const vec_u8 interleave_high = |
||
36 | {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; |
||
37 | |||
/*
 * Prologue shared by the IDCT entry points.  Expands to declarations only:
 *   - the stage temporaries that IDCT_1D assigns,
 *   - the rounding term `eight` and the shift count `four`,
 *   - C1..C7, each a splat of the matching lane of `constants`,
 *   - b0..b7, the eight 16-byte rows loaded from `block`.
 *
 * The expanding scope must provide an identifier named `block`.
 * NOTE(review): vec_ld ignores the low four address bits, so `block` is
 * presumably 16-byte aligned - confirm against callers.
 */
#define IDCT_START \
    /* stage temporaries written by IDCT_1D */ \
    vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; \
    vec_s16 Ed, Gd, Add, Bdd, Fd, Hd; \
    \
    /* rounding term and shift count used by ADD8 / SHIFT4 */ \
    vec_s16 eight = vec_splat_s16(8); \
    vec_u16 four  = vec_splat_u16(4); \
    \
    /* one multiplier broadcast to all lanes per constant */ \
    vec_s16 C1 = vec_splat(constants, 1); \
    vec_s16 C2 = vec_splat(constants, 2); \
    vec_s16 C3 = vec_splat(constants, 3); \
    vec_s16 C4 = vec_splat(constants, 4); \
    vec_s16 C5 = vec_splat(constants, 5); \
    vec_s16 C6 = vec_splat(constants, 6); \
    vec_s16 C7 = vec_splat(constants, 7); \
    \
    /* eight rows of eight coefficients, 16 bytes apart */ \
    vec_s16 b0 = vec_ld(0x00, block); \
    vec_s16 b1 = vec_ld(0x10, block); \
    vec_s16 b2 = vec_ld(0x20, block); \
    vec_s16 b3 = vec_ld(0x30, block); \
    vec_s16 b4 = vec_ld(0x40, block); \
    vec_s16 b5 = vec_ld(0x50, block); \
    vec_s16 b6 = vec_ld(0x60, block); \
    vec_s16 b7 = vec_ld(0x70, block);
||
60 | |||
61 | // these functions do (a*C)>>16 |
||
62 | // things are tricky because a is signed, but C unsigned. |
||
63 | // M15 is used if C fits in 15 bit unsigned (C6,C7) |
||
64 | // M16 is used if C requires 16 bits unsigned |
||
65 | static inline vec_s16 M15(vec_s16 a, vec_s16 C) |
||
66 | { |
||
67 | return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high); |
||
68 | } |
||
69 | static inline vec_s16 M16(vec_s16 a, vec_s16 C) |
||
70 | { |
||
71 | return vec_add(a, M15(a, C)); |
||
72 | } |
||
73 | |||
/*
 * One 8-point 1-D inverse DCT pass, applied lane-wise to the rows b0..b7
 * declared by IDCT_START; the results are written back to b0..b7.
 *
 *   ADD   - wrapper applied to the two terms built from b0 and b4 (E and F);
 *           used to inject the rounding constant.
 *   SHIFT - wrapper applied to each of the eight outputs.
 *
 * All reads of b0..b7 happen before any of them is overwritten.
 */
#define IDCT_1D(ADD, SHIFT) \
    /* even inputs: b0, b4 and b2, b6 */ \
    E = ADD(M16(vec_add(b0, b4), C4)); \
    F = ADD(M16(vec_sub(b0, b4), C4)); \
    G = vec_add(M16(b2, C2), M15(b6, C6)); \
    H = vec_sub(M15(b2, C6), M16(b6, C2)); \
    \
    /* odd inputs: b1, b7 and b3, b5 */ \
    A = vec_add(M16(b1, C1), M15(b7, C7)); \
    B = vec_sub(M15(b1, C7), M16(b7, C1)); \
    C = vec_add(M16(b3, C3), M16(b5, C5)); \
    D = vec_sub(M16(b5, C3), M16(b3, C5)); \
    \
    /* second stage of the odd half */ \
    Ad = M16(vec_sub(A, C), C4); \
    Bd = M16(vec_sub(B, D), C4); \
    Cd = vec_add(A, C); \
    Dd = vec_add(B, D); \
    \
    /* second stage of the even half */ \
    Ed = vec_sub(E, G); \
    Gd = vec_add(E, G); \
    \
    /* cross terms mixing both halves */ \
    Add = vec_add(F, Ad); \
    Fd  = vec_sub(F, Ad); \
    Bdd = vec_sub(Bd, H); \
    Hd  = vec_add(Bd, H); \
    \
    /* final butterflies; note the output order 0/7, 1/2, 3/4, 5/6 */ \
    b0 = SHIFT(vec_add(Gd, Cd)); \
    b7 = SHIFT(vec_sub(Gd, Cd)); \
    \
    b1 = SHIFT(vec_add(Add, Hd)); \
    b2 = SHIFT(vec_sub(Add, Hd)); \
    \
    b3 = SHIFT(vec_add(Ed, Dd)); \
    b4 = SHIFT(vec_sub(Ed, Dd)); \
    \
    b5 = SHIFT(vec_add(Fd, Bdd)); \
    b6 = SHIFT(vec_sub(Fd, Bdd));
||
112 | |||
/*
 * Wrappers passed to IDCT_1D as its ADD / SHIFT arguments.
 *   NOP    - leaves the value untouched.
 *   ADD8   - adds the local vector `eight` (the rounding term).
 *   SHIFT4 - arithmetic right shift by the local vector `four`.
 * ADD8 and SHIFT4 rely on `eight` and `four` being declared in the expanding
 * scope, which IDCT_START does.
 */
#define NOP(a)    a
#define ADD8(a)   vec_add(a, eight)
#define SHIFT4(a) vec_sra(a, four)
||
116 | |||
117 | static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64]) |
||
118 | { |
||
119 | vec_u8 t; |
||
120 | IDCT_START |
||
121 | |||
122 | // pixels are signed; so add 128*16 in addition to the normal 8 |
||
123 | vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11)); |
||
124 | eight = vec_add(eight, v2048); |
||
125 | |||
126 | IDCT_1D(NOP, NOP) |
||
127 | TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); |
||
128 | IDCT_1D(ADD8, SHIFT4) |
||
129 | |||
130 | #define PUT(a)\ |
||
131 | t = vec_packsu(a, a);\ |
||
132 | vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ |
||
133 | vec_ste((vec_u32)t, 4, (unsigned int *)dst); |
||
134 | |||
135 | PUT(b0) dst += stride; |
||
136 | PUT(b1) dst += stride; |
||
137 | PUT(b2) dst += stride; |
||
138 | PUT(b3) dst += stride; |
||
139 | PUT(b4) dst += stride; |
||
140 | PUT(b5) dst += stride; |
||
141 | PUT(b6) dst += stride; |
||
142 | PUT(b7) |
||
143 | memset(block, 0, sizeof(*block) * 64); |
||
144 | } |
||
145 | |||
146 | static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64]) |
||
147 | { |
||
148 | LOAD_ZERO; |
||
149 | vec_u8 t, vdst; |
||
150 | vec_s16 vdst_16; |
||
151 | vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst)); |
||
152 | |||
153 | IDCT_START |
||
154 | |||
155 | IDCT_1D(NOP, NOP) |
||
156 | TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7); |
||
157 | IDCT_1D(ADD8, SHIFT4) |
||
158 | |||
159 | #define ADD(a)\ |
||
160 | vdst = vec_ld(0, dst);\ |
||
161 | vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\ |
||
162 | vdst_16 = vec_adds(a, vdst_16);\ |
||
163 | t = vec_packsu(vdst_16, vdst_16);\ |
||
164 | vec_ste((vec_u32)t, 0, (unsigned int *)dst);\ |
||
165 | vec_ste((vec_u32)t, 4, (unsigned int *)dst); |
||
166 | |||
167 | ADD(b0) dst += stride; |
||
168 | ADD(b1) dst += stride; |
||
169 | ADD(b2) dst += stride; |
||
170 | ADD(b3) dst += stride; |
||
171 | ADD(b4) dst += stride; |
||
172 | ADD(b5) dst += stride; |
||
173 | ADD(b6) dst += stride; |
||
174 | ADD(b7) |
||
175 | memset(block, 0, sizeof(*block) * 64); |
||
176 | } |
||
177 | |||
178 | #endif /* HAVE_ALTIVEC */ |
||
179 | |||
180 | av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags) |
||
181 | { |
||
182 | #if HAVE_ALTIVEC |
||
183 | if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)) |
||
184 | return; |
||
185 | |||
186 | c->idct_put = vp3_idct_put_altivec; |
||
187 | c->idct_add = vp3_idct_add_altivec; |
||
188 | #endif |
||
189 | } |