/*
 * Copyright (c) 2004 Romain Dolbeau
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"

#ifdef DEBUG
#include <assert.h>
/* abort in DEBUG builds if ptr is not 16-byte aligned, as vec_st requires */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
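
/*
 * H.264 quarter-pel luma interpolation, AltiVec version.  All three
 * kernels below apply the 6-tap half-pel filter with coefficients
 * (1, -5, 20, 20, -5, 1); in scalar form, one horizontally filtered
 * pixel is
 *
 *     dst[x] = clip8(( (src[x-2] + src[x+3])
 *                    + 20 * (src[x]   + src[x+1])
 *                    -  5 * (src[x-1] + src[x+2]) + 16) >> 5)
 *
 * where clip8() clamps to 0..255 (done here by vec_packsu).  Each
 * 16-pixel row is processed as two vec_s16 halves: the "A" vectors hold
 * pixels 0-7, the "B" vectors pixels 8-15.
 *
 * This file is a template: the including file is expected to define the
 * PREFIX_* names, the vec_u8/vec_s16/vec_u16/vec_s32/vec_u32 typedefs,
 * the LOAD_ZERO/zero_u8v/zero_s16v helpers, and OP_U8_ALTIVEC, which
 * combines the filtered vector with the current dst contents
 * (presumably a plain assignment for put_ and an average for avg_
 * variants).
 */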

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    /* permutation vectors that extract the six shifted source rows
       (src-2 .. src+3) from aligned 16-byte loads */
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2)); /* 5 << 2 = 20 */
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4)); /* 1 << 4 = 16, rounding bias */

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

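        /* The lvsl-based extraction wraps modulo 32 bytes, so once
           align == (src - 2) % 16 reaches 11, the later taps spill past
           srcR1/srcR2: the tap whose address is exactly 16-byte aligned
           is srcR2 itself, and taps after it need a third load (srcR3)
           permuted from (srcR2, srcR3).  Alignments 0-10 are fully
           covered by (srcR1, srcR2) in the default case. */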
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

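        /* unpack the unsigned bytes to signed 16-bit lanes by merging
           with the zero vector (the zero byte lands in the high half on
           big-endian): "A" vectors hold pixels 0-7, "B" pixels 8-15 */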
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        /* pair the taps that share a filter coefficient */
        sum1A = vec_adds(srcP0A, srcP1A); /* x 20 */
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A); /* x -5 */
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A); /* x 1 */
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss); /* 20*(P0+P1) + 16 */
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5*(M1+P2) */
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us); /* >> 5 */
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB); /* saturating pack back to u8 */

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

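/* The vertical kernel loads one new row per iteration and keeps the
   previous five unpacked rows in registers, rotating them down a
   sliding window instead of reloading them. */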
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    /* preload the five rows above the first output row; each row may be
       unaligned, hence two loads plus a vec_perm per row */
    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

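/* The hv kernel applies the same 6-tap filter in both directions: the
   first pass filters horizontally and stores unrounded 16-bit
   intermediates in tmp; the second pass filters tmp vertically at
   32-bit precision and rounds the combined result with (x + 512) >> 10. */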
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);                           /* shift count for the final >> 10 */
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9)); /* 1 << 9 = 512, rounding bias */
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));  /* 1 << 4 = 16, shift count */

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    /* interleaves the even- and odd-lane results back into pixel order */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    /* first pass: horizontal filter over 21 rows (16 output rows plus
       the 5 extra rows the vertical taps need) */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        /* same alignment handling as in the horizontal kernel above */
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A); /* 20*(P0+P1) + (M2+P3) */
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5*(M1+P2) */
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        /* no rounding or shift yet; the raw 16-bit intermediate goes to tmp */
        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

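    /* second pass: vertical filter over the tmp buffer.  20*(P0+P1) can
       exceed 16 bits here, so the products are widened to 32 bits with
       vec_mule/vec_mulo, which handle even- and odd-indexed lanes
       separately. */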
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window down one row for the next iteration */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

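        /* widen the taps to 32 bits.  For the x1 term, the even lanes are
           extracted by reinterpreting the vector as vec_s32 and shifting
           arithmetically right by 16 (on big-endian the even element sits
           in the high half of each 32-bit lane); the odd lanes use a
           multiply by 1. */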
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si); /* add the rounding bias */
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui); /* (sum + 512) >> 10 */
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe); /* even-lane results: pixels 0,2,4,... */
        ssumo = vec_packs(ssumAo, ssumBo); /* odd-lane results:  pixels 1,3,5,... */

        sumv = vec_packsu(ssume, ssumo);   /* saturate to u8 */
        sum = vec_perm(sumv, sumv, mperm); /* restore pixel order */

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif