Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * Copyright (c) 2001 Michel Lespinasse |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2.1 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with FFmpeg; if not, write to the Free Software |
||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | */ |
||
20 | |||
21 | /* |
||
22 | * NOTE: This code is based on GPL code from the libmpeg2 project. The |
||
23 | * author, Michel Lespinasse, has given explicit permission to release |
||
24 | * under LGPL as part of FFmpeg. |
||
25 | */ |
||
26 | |||
27 | /* |
||
28 | * FFmpeg integration by Dieter Shirley |
||
29 | * |
||
30 | * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 |
||
31 | * project. I've deleted all of the libmpeg2-specific code, renamed the |
||
32 | * functions and reordered the function parameters. The only change to the |
||
33 | * IDCT function itself was to factor out the partial transposition, and to |
||
34 | * perform a full transpose at the end of the function. |
||
35 | */ |
||
36 | |||
37 | |||
38 | #include |
||
39 | #include |
||
40 | #include "config.h" |
||
41 | #if HAVE_ALTIVEC_H |
||
42 | #include <altivec.h> |
||
43 | #endif |
||
44 | #include "libavutil/ppc/types_altivec.h" |
||
45 | #include "dsputil_altivec.h" |
||
46 | |||
47 | #define IDCT_HALF /* one 1-D 8-point IDCT pass, lane-wise: reads vx0..vx7, writes vy0..vy7, clobbers t0..t8; expects a0, a1, a2, ma2, c4, mc4 and zero in scope */ \ |
||
48 | /* 1st stage: rotate the odd input pairs (vx1,vx7) and (vx3,vx5). vec_mradds(a,b,c) is the AltiVec multiply-round-add: ((a*b + 2^14) >> 15) + c, saturated */ \ |
||
49 | t1 = vec_mradds (a1, vx7, vx1 ); /* a1*vx7 + vx1 */ \ |
||
50 | t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); /* a1*vx1 - vx7 */ \ |
||
51 | t7 = vec_mradds (a2, vx5, vx3); /* a2*vx5 + vx3 */ \ |
||
52 | t3 = vec_mradds (ma2, vx3, vx5); /* ma2*vx3 + vx5 */ \ |
||
53 | \ |
||
54 | /* 2nd stage: sum/difference of vx0 and vx4, rotation of (vx2,vx6), then butterflies on the stage-1 results. t3, t8 and t1 are reused, so the statement order matters */ \ |
||
55 | t5 = vec_adds (vx0, vx4); \ |
||
56 | t0 = vec_subs (vx0, vx4); \ |
||
57 | t2 = vec_mradds (a0, vx6, vx2); /* a0*vx6 + vx2 */ \ |
||
58 | t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); /* a0*vx2 - vx6 */ \ |
||
59 | t6 = vec_adds (t8, t3); \ |
||
60 | t3 = vec_subs (t8, t3); /* overwrites t3 only after t6 has read it */ \ |
||
61 | t8 = vec_subs (t1, t7); /* overwrites t8 only after t6 and t3 have read it */ \ |
||
62 | t1 = vec_adds (t1, t7); \ |
||
63 | \ |
||
64 | /* 3rd stage: butterflies on (t5,t2), (t0,t4) and (t8,t3); each pair is replaced in place by its sum and difference */ \ |
||
65 | t7 = vec_adds (t5, t2); \ |
||
66 | t2 = vec_subs (t5, t2); \ |
||
67 | t5 = vec_adds (t0, t4); \ |
||
68 | t0 = vec_subs (t0, t4); \ |
||
69 | t4 = vec_subs (t8, t3); /* must read t3 before the next line overwrites it */ \ |
||
70 | t3 = vec_adds (t8, t3); \ |
||
71 | \ |
||
72 | /* 4th stage: produce the outputs; vy1/vy6 add t3 scaled by c4 / mc4 to t5, vy2/vy5 add t4 scaled by c4 / mc4 to t0 */ \ |
||
73 | vy0 = vec_adds (t7, t1); \ |
||
74 | vy7 = vec_subs (t7, t1); \ |
||
75 | vy1 = vec_mradds (c4, t3, t5); \ |
||
76 | vy6 = vec_mradds (mc4, t3, t5); \ |
||
77 | vy2 = vec_mradds (c4, t4, t0); \ |
||
78 | vy5 = vec_mradds (mc4, t4, t0); \ |
||
79 | vy3 = vec_adds (t2, t6); \ |
||
80 | vy4 = vec_subs (t2, t6); |
||
81 | |||
82 | |||
83 | #define IDCT /* 2-D 8x8 IDCT: two IDCT_HALF passes with a transpose in between. Declares all of its locals, reads block[0..7] and constants[0..4] from the enclosing scope, leaves the result rows in vx0..vx7 */ \ |
||
84 | vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ |
||
85 | vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ |
||
86 | vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \ |
||
87 | vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ |
||
88 | vec_u16 shift; \ |
||
89 | /* broadcast the scalar multipliers stored in lanes 0..5 of constants[0] */ \ |
||
90 | c4 = vec_splat (constants[0], 0); \ |
||
91 | a0 = vec_splat (constants[0], 1); \ |
||
92 | a1 = vec_splat (constants[0], 2); \ |
||
93 | a2 = vec_splat (constants[0], 3); \ |
||
94 | mc4 = vec_splat (constants[0], 4); \ |
||
95 | ma2 = vec_splat (constants[0], 5); \ |
||
96 | bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); /* splat 32-bit word 3 of constants[0], i.e. its last two 16-bit lanes (32, 31), across the vector */ \ |
||
97 | \ |
||
98 | zero = vec_splat_s16 (0); \ |
||
99 | shift = vec_splat_u16 (4); \ |
||
100 | /* pre-scale: each input row is shifted left by 4 and multiplied (vec_mradds, rounded >> 15) by its factor row from constants[1..4] */ \ |
||
101 | vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ |
||
102 | vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ |
||
103 | vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ |
||
104 | vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ |
||
105 | vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ |
||
106 | vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ |
||
107 | vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ |
||
108 | vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ |
||
109 | \ |
||
110 | IDCT_HALF /* first 1-D pass: vx0..vx7 -> vy0..vy7 */ \ |
||
111 | /* 8x8 transpose of the 16-bit elements, done as three rounds of merge-high / merge-low; round 1: vy -> vx */ \ |
||
112 | vx0 = vec_mergeh (vy0, vy4); \ |
||
113 | vx1 = vec_mergel (vy0, vy4); \ |
||
114 | vx2 = vec_mergeh (vy1, vy5); \ |
||
115 | vx3 = vec_mergel (vy1, vy5); \ |
||
116 | vx4 = vec_mergeh (vy2, vy6); \ |
||
117 | vx5 = vec_mergel (vy2, vy6); \ |
||
118 | vx6 = vec_mergeh (vy3, vy7); \ |
||
119 | vx7 = vec_mergel (vy3, vy7); \ |
||
120 | /* round 2: vx -> vy */ \ |
||
121 | vy0 = vec_mergeh (vx0, vx4); \ |
||
122 | vy1 = vec_mergel (vx0, vx4); \ |
||
123 | vy2 = vec_mergeh (vx1, vx5); \ |
||
124 | vy3 = vec_mergel (vx1, vx5); \ |
||
125 | vy4 = vec_mergeh (vx2, vx6); \ |
||
126 | vy5 = vec_mergel (vx2, vx6); \ |
||
127 | vy6 = vec_mergeh (vx3, vx7); \ |
||
128 | vy7 = vec_mergel (vx3, vx7); \ |
||
129 | /* round 3: vy -> vx; bias is added to vx0 only, presumably as the rounding term for the final >> 6 */ \ |
||
130 | vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ |
||
131 | vx1 = vec_mergel (vy0, vy4); \ |
||
132 | vx2 = vec_mergeh (vy1, vy5); \ |
||
133 | vx3 = vec_mergel (vy1, vy5); \ |
||
134 | vx4 = vec_mergeh (vy2, vy6); \ |
||
135 | vx5 = vec_mergel (vy2, vy6); \ |
||
136 | vx6 = vec_mergeh (vy3, vy7); \ |
||
137 | vx7 = vec_mergel (vy3, vy7); \ |
||
138 | \ |
||
139 | IDCT_HALF /* second 1-D pass on the transposed data: vx0..vx7 -> vy0..vy7 */ \ |
||
140 | /* final scaling: arithmetic shift right by 6, results back into vx0..vx7 */ \ |
||
141 | shift = vec_splat_u16 (6); \ |
||
142 | vx0 = vec_sra (vy0, shift); \ |
||
143 | vx1 = vec_sra (vy1, shift); \ |
||
144 | vx2 = vec_sra (vy2, shift); \ |
||
145 | vx3 = vec_sra (vy3, shift); \ |
||
146 | vx4 = vec_sra (vy4, shift); \ |
||
147 | vx5 = vec_sra (vy5, shift); \ |
||
148 | vx6 = vec_sra (vy6, shift); \ |
||
149 | vx7 = vec_sra (vy7, shift); |
||
150 | |||
151 | |||
152 | static const vec_s16 constants[5] = { /* fixed-point multipliers read by the IDCT macro */ |
||
153 | {23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, /* lanes 0-5 are splatted into c4, a0, a1, a2, mc4, ma2; lanes 6-7 form the 32-bit word splatted into bias */ |
||
154 | {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, /* pre-scale factors applied to block[0] and block[4] */ |
||
155 | {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, /* pre-scale factors applied to block[1] and block[7] */ |
||
156 | {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692}, /* pre-scale factors applied to block[2] and block[6] */ |
||
157 | {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} /* pre-scale factors applied to block[3] and block[5] */ |
||
158 | }; |
||
159 | |||
160 | /* Runs the IDCT macro on the 8x8 coefficient block blk and stores the result at dest as eight rows of 8 bytes (unsigned-saturated), with consecutive rows stride bytes apart. */ void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk) |
||
161 | { |
||
162 | vec_s16 *block = (vec_s16*)blk; /* the IDCT macro reads block[0..7]; NOTE(review): blk is accessed as vectors, so it presumably has to be 16-byte aligned - confirm with callers */ |
||
163 | vec_u8 tmp; /* scratch vector used by COPY */ |
||
164 | |||
165 | IDCT /* declares its own locals; the transformed rows end up in vx0..vx7 */ |
||
166 | |||
167 | #define COPY(dest,src) /* pack one result row to unsigned bytes and store its 8 pixels at dest; multi-statement macro with no trailing semicolon needed at the call site */ \ |
||
168 | tmp = vec_packsu (src, src); /* src packed with itself: both 8-byte halves of tmp hold the same pixels */ \ |
||
169 | vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); /* 32-bit element store at byte offset 0 */ \ |
||
170 | vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); /* 32-bit element store at byte offset 4 */ |
||
171 | |||
172 | COPY (dest, vx0) dest += stride; /* one output row per line; dest advances by stride bytes between rows */ |
||
173 | COPY (dest, vx1) dest += stride; |
||
174 | COPY (dest, vx2) dest += stride; |
||
175 | COPY (dest, vx3) dest += stride; |
||
176 | COPY (dest, vx4) dest += stride; |
||
177 | COPY (dest, vx5) dest += stride; |
||
178 | COPY (dest, vx6) dest += stride; |
||
179 | COPY (dest, vx7) |
||
180 | } |
||
181 | |||
182 | /* Runs the IDCT macro on the 8x8 coefficient block blk and adds the result to the 8x8 bytes already at dest (rows stride bytes apart), storing the sums back with unsigned saturation. */ void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk) |
||
183 | { |
||
184 | vec_s16 *block = (vec_s16*)blk; /* the IDCT macro reads block[0..7]; NOTE(review): blk is accessed as vectors, so it presumably has to be 16-byte aligned - confirm with callers */ |
||
185 | vec_u8 tmp; /* loaded destination bytes, later the packed result */ |
||
186 | vec_s16 tmp2, tmp3; /* tmp2: destination pixels widened to 16 bits; tmp3: pixels plus IDCT row */ |
||
187 | vec_u8 perm0; /* permute vector used for rows 0, 2, 4, 6 */ |
||
188 | vec_u8 perm1; /* permute vector used for rows 1, 3, 5, 7 */ |
||
189 | vec_u8 p0, p1, p; |
||
190 | |||
191 | IDCT /* declares its own locals (including zero); the transformed rows end up in vx0..vx7 */ |
||
192 | |||
193 | p0 = vec_lvsl (0, dest); /* alignment shuffle indices for the address of row 0 */ |
||
194 | p1 = vec_lvsl (stride, dest); /* alignment shuffle indices for the address of row 1 */ |
||
195 | p = vec_splat_u8 (-1); /* every byte 0xFF */ |
||
196 | perm0 = vec_mergeh (p, p0); /* interleave 0xFF with the row's byte indices; in the vec_perm inside ADD the 0xFF entries select from the zero vector, so each pixel byte is widened to a 16-bit lane */ |
||
197 | perm1 = vec_mergeh (p, p1); /* NOTE(review): only two permute vectors are built, so rows 2,4,6 reuse row 0's alignment and rows 3,5,7 reuse row 1's - presumably 2*stride is a multiple of 16; confirm */ |
||
198 | |||
199 | #define ADD(dest,src,perm) /* load 8 destination pixels, add one IDCT result row with saturation, store the 8 bytes back; multi-statement macro with no trailing semicolon needed at the call site */ \ |
||
200 | /* scalar sketch of the load below: *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ |
||
201 | tmp = vec_ld (0, dest); /* 16-byte vector load covering dest */ \ |
||
202 | tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); /* pick the row's 8 pixels and zero-extend them to 16 bits */ \ |
||
203 | tmp3 = vec_adds (tmp2, src); /* saturating 16-bit add of pixels and IDCT row */ \ |
||
204 | tmp = vec_packsu (tmp3, tmp3); /* pack to unsigned bytes with saturation; both 8-byte halves hold the same pixels */ \ |
||
205 | vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); /* 32-bit element store at byte offset 0 */ \ |
||
206 | vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest); /* 32-bit element store at byte offset 4 */ |
||
207 | |||
208 | ADD (dest, vx0, perm0) dest += stride; /* even rows use perm0, odd rows perm1; dest advances by stride bytes between rows */ |
||
209 | ADD (dest, vx1, perm1) dest += stride; |
||
210 | ADD (dest, vx2, perm0) dest += stride; |
||
211 | ADD (dest, vx3, perm1) dest += stride; |
||
212 | ADD (dest, vx4, perm0) dest += stride; |
||
213 | ADD (dest, vx5, perm1) dest += stride; |
||
214 | ADD (dest, vx6, perm0) dest += stride; |
||
215 | ADD (dest, vx7, perm1) |
||
216 | } |
216 | } |