Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Alpha optimized DSP utils |
||
3 | * Copyright (c) 2002 Falk Hueffner |
||
4 | * |
||
5 | * This file is part of FFmpeg. |
||
6 | * |
||
7 | * FFmpeg is free software; you can redistribute it and/or |
||
8 | * modify it under the terms of the GNU Lesser General Public |
||
9 | * License as published by the Free Software Foundation; either |
||
10 | * version 2.1 of the License, or (at your option) any later version. |
||
11 | * |
||
12 | * FFmpeg is distributed in the hope that it will be useful, |
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | * Lesser General Public License for more details. |
||
16 | * |
||
17 | * You should have received a copy of the GNU Lesser General Public |
||
18 | * License along with FFmpeg; if not, write to the Free Software |
||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | */ |
||
21 | |||
22 | #include "dsputil_alpha.h" |
||
23 | #include "asm.h" |
||
24 | |||
/*
 * Read eight rows of pixels and write them to block as 16-bit values.
 *
 * block      destination; advanced by 8 int16_t entries per row
 * pixels     source; one 64-bit load per row, advanced by line_size bytes
 * line_size  byte distance between consecutive source rows
 *
 * NOTE(review): ldq/stq/unpkbw come from outside this block; unpkbw is
 * presumably the Alpha MVI "unpack bytes to words" operation -- confirm
 * against asm.h.
 */
void get_pixels_mvi(int16_t *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int rows_left;

    for (rows_left = 8; rows_left != 0; rows_left--) {
        uint64_t row = ldq(pixels);

        /* Low half of the row goes to block[0..3], high half to block[4..7]. */
        stq(unpkbw(row), block);
        stq(unpkbw(row >> 32), block + 4);

        block  += 8;
        pixels += line_size;
    }
}
||
41 | |||
/*
 * Store the per-pixel difference s1 - s2 of eight rows into block as
 * 16-bit values.
 *
 * block   destination; advanced by 8 int16_t entries per row
 * s1, s2  sources; one 64-bit load each per row, advanced by stride bytes
 * stride  byte distance between consecutive rows of both sources
 *
 * NOTE(review): cmpbge/zap/unpkbw are defined outside this block.  The
 * comments below assume the Alpha meanings (cmpbge: per-byte ">=" mask,
 * zap: clear the bytes selected by the mask) -- confirm against asm.h.
 */
void diff_pixels_mvi(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                     int stride)
{
    /* 0x40 in every byte; multiplied by 4 below so s4addq can be used. */
    const uint64_t fixup = 0x4040404040404040ULL;
    int rows_left;

    for (rows_left = 8; rows_left != 0; rows_left--) {
        uint64_t lhs    = ldq(s1);
        uint64_t rhs    = ldq(s2);
        uint64_t ge     = cmpbge(lhs, rhs);
        uint64_t delta  = lhs - rhs;
        uint64_t signs  = zap(-1, ge);

        /* Presumably undoes the borrow taken from the next byte wherever
           a byte of lhs was smaller than the matching byte of rhs. */
        delta += 4 * zap(fixup, ge);

        /* signs supplies the upper byte of each 16-bit result. */
        stq(unpkbw(delta) | (unpkbw(signs) << 8), block);
        stq(unpkbw(delta >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
||
69 | |||
/*
 * Combine a and b as (a | b) minus half of (a ^ b); the XOR is masked with
 * BYTE_VEC(0xfe) before the shift.
 *
 * NOTE(review): assuming BYTE_VEC replicates its argument into all eight
 * bytes, this is a byte-wise average rounded up -- confirm against asm.h.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    uint64_t half_xor = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return (a | b) - half_xor;
}
||
74 | |||
/*
 * Combine four packed values: the bits outside BYTE_VEC(0x03) are shifted
 * right by two and summed, the bits inside it are summed together with
 * BYTE_VEC(0x02), shifted right by two and masked, and both parts are added.
 *
 * NOTE(review): assuming BYTE_VEC replicates its argument into all eight
 * bytes, this is a byte-wise (l1 + l2 + l3 + l4 + 2) >> 2 -- confirm
 * against asm.h.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    const uint64_t low_bits = BYTE_VEC(0x03);
    uint64_t high_part;
    uint64_t low_part;

    high_part  = (l1 & ~low_bits) >> 2;
    high_part += (l2 & ~low_bits) >> 2;
    high_part += (l3 & ~low_bits) >> 2;
    high_part += (l4 & ~low_bits) >> 2;

    low_part  = l1 & low_bits;
    low_part += l2 & low_bits;
    low_part += l3 & low_bits;
    low_part += l4 & low_bits;
    low_part += BYTE_VEC(0x02);
    low_part  = (low_part >> 2) & low_bits;

    return high_part + low_part;
}
||
88 | |||
/*
 * Accumulate perr() of one 64-bit load from pix1 and one from pix2 over
 * h rows and return the total.
 *
 * v          unused
 * pix1       read with ldq
 * pix2       read with uldq if its initial address is not a multiple of 8,
 *            otherwise with ldq; the choice is made once, before the loop
 * line_size  byte distance between consecutive rows of both inputs
 * h          number of rows; the loop body runs before h is tested, so h
 *            must be at least 1
 *
 * NOTE(review): perr is presumably the Alpha MVI pixel-error operation (sum
 * of absolute byte differences) -- confirm against asm.h.
 */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* uldq works only when pix2 is actually unaligned. */
    const int pix2_unaligned = ((size_t) pix2 & 0x7) != 0;
    int sum = 0;

    for (;;) { /* 8 pixels per iteration */
        uint64_t a = ldq(pix1);
        uint64_t b = pix2_unaligned ? uldq(pix2) : ldq(pix2);

        sum += perr(a, b);

        pix1 += line_size;
        pix2 += line_size;

        if (--h == 0) {
            break;
        }
    }

    return sum;
}
||
120 | |||
#if 0 /* now done in assembly */
/*
 * Compiled-out C version: accumulates perr() over 16 rows of 16 pixels.
 * pix1 is read with ldq; pix2 is read with ldq when its initial address is
 * a multiple of 8, otherwise it is assembled from ldq_u/extql/extqh.
 */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int rows_left = 16;

    if (((size_t) pix2 & 0x7) == 0) {
        do {
            uint64_t a_lo = ldq(pix1);
            uint64_t a_hi = ldq(pix1 + 8);
            uint64_t b_lo = ldq(pix2);
            uint64_t b_hi = ldq(pix2 + 8);

            pix1 += line_size;
            pix2 += line_size;

            sum += perr(a_lo, b_lo)
                 + perr(a_hi, b_hi);
        } while (--rows_left);

        return sum;
    }

    /* works only when pix2 is actually unaligned */
    do { /* 16 pixels per iteration */
        uint64_t a_lo = ldq(pix1);
        uint64_t a_hi = ldq(pix1 + 8);
        uint64_t mid  = ldq_u(pix2 + 8);
        uint64_t b_lo = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        uint64_t b_hi = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        pix1 += line_size;
        pix2 += line_size;

        sum += perr(a_lo, b_lo)
             + perr(a_hi, b_hi);
    } while (--rows_left);

    return sum;
}
#endif
||
163 | |||
/*
 * Accumulate perr() between 16 bytes of pix1 and avg2() of 16 bytes of pix2
 * with the same bytes taken one position further on (pix2[i], pix2[i + 1]),
 * over h rows.  Returns the total.
 *
 * v          unused
 * pix1       read with ldq
 * pix2       17 bytes per row are used; the access pattern is selected once
 *            from the low three bits of the initial pointer
 * line_size  byte distance between consecutive rows of both inputs
 * h          number of rows; the loop body runs before h is tested, so h
 *            must be at least 1
 *
 * NOTE(review): ldq/ldq_u/extql/extqh/perr are defined outside this block;
 * presumably the Alpha aligned load, unaligned-quadword load, extract
 * low/high and pixel-error operations -- confirm against asm.h.
 */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint64_t disalign = (size_t) pix2 & 0x7;
    int result = 0;

    if (disalign == 0) {
        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t l    = ldq(pix2);
            uint64_t r    = ldq(pix2 + 8);
            /* Shift each half down one byte and pull the following byte
               into the top position. */
            uint64_t l_next = (l >> 8) | ((uint64_t) r << 56);
            uint64_t r_next = (r >> 8) | ((uint64_t) pix2[16] << 56);
            uint64_t p2_l = avg2(l, l_next);
            uint64_t p2_r = avg2(r, r_next);

            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else if (disalign == 7) {
        /* |.......l|lllllllr|rrrrrrr*|
           Handled separately because disalign + 1 would be 8, which
           gets treated as 0 by extqh.  The one-byte-on data is simply
           the next raw quadword, which is also a bit faster. */
        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t l    = ldq_u(pix2);
            uint64_t m    = ldq_u(pix2 + 8);
            uint64_t r    = ldq_u(pix2 + 16);
            uint64_t p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
            uint64_t p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);

            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        /* disalign is 1..6 here; the offset for the one-byte-on data does
           not change between rows. */
        const uint64_t disalign1 = disalign + 1;

        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t l    = ldq_u(pix2);
            uint64_t m    = ldq_u(pix2 + 8);
            uint64_t r    = ldq_u(pix2 + 16);
            uint64_t p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
                                 extql(l, disalign1) | extqh(m, disalign1));
            uint64_t p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
                                 extql(m, disalign1) | extqh(r, disalign1));

            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
||
236 | |||
/*
 * Accumulate perr() between 16 bytes of pix1 and avg2() of two consecutive
 * pix2 rows (row i and row i + 1), over h rows.  Returns the total.
 *
 * v          unused
 * pix1       read with ldq, h rows
 * pix2       h + 1 rows are read; ldq is used when the initial address is a
 *            multiple of 8, otherwise ldq_u/extql/extqh
 * line_size  byte distance between consecutive rows of both inputs
 * h          number of rows; the loop body runs before h is tested, so h
 *            must be at least 1
 *
 * NOTE(review): ldq/ldq_u/extql/extqh/perr are defined outside this block;
 * presumably the Alpha load, extract and pixel-error operations -- confirm
 * against asm.h.
 */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t prev_l, prev_r;
    uint64_t mid;

    if (((size_t) pix2 & 0x7) == 0) {
        prev_l = ldq(pix2);
        prev_r = ldq(pix2 + 8);

        do {
            uint64_t p1_l = ldq(pix1);
            uint64_t p1_r = ldq(pix1 + 8);
            uint64_t next_l, next_r;

            pix2  += line_size;
            next_l = ldq(pix2);
            next_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(prev_l, next_l))
                    + perr(p1_r, avg2(prev_r, next_r));

            pix1  += line_size;
            /* The row just loaded is the first row of the next pair. */
            prev_l = next_l;
            prev_r = next_r;
        } while (--h);

        return result;
    }

    /* Unaligned pix2: build each 16-byte row from three ldq_u loads. */
    mid    = ldq_u(pix2 + 8);
    prev_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
    prev_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

    do {
        uint64_t p1_l = ldq(pix1);
        uint64_t p1_r = ldq(pix1 + 8);
        uint64_t next_l, next_r;

        pix2  += line_size;
        mid    = ldq_u(pix2 + 8);
        next_l = extql(ldq_u(pix2), pix2) | extqh(mid, pix2);
        next_r = extql(mid, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        result += perr(p1_l, avg2(prev_l, next_l))
                + perr(p1_r, avg2(prev_r, next_r));

        pix1  += line_size;
        prev_l = next_l;
        prev_r = next_r;
    } while (--h);

    return result;
}
||
289 | |||
/*
 * Accumulate perr() between 16 bytes of pix1 and avg4() of the four pix2
 * values at (row i, byte j), (row i, byte j + 1), (row i + 1, byte j) and
 * (row i + 1, byte j + 1), over h rows.  Returns the total.
 *
 * v          unused
 * pix1       read with ldq; exactly h rows of 16 bytes are read
 * pix2       h + 1 rows of 17 bytes are read; each row uses uldq when its
 *            address is not a multiple of 8 and ldq otherwise
 * line_size  byte distance between consecutive rows of both inputs
 * h          number of rows; the loop body runs before h is tested, so h
 *            must be at least 1
 *
 * Compared with the previous version, no memory is read that does not
 * contribute to the result:
 *  - the pix1 row is loaded at the start of the iteration that uses it, so
 *    row h of pix1 is no longer fetched;
 *  - the 17th pix2 byte is always fetched as pix2[16] (as the unaligned
 *    path already did) instead of a full quadword load at pix2 + 16.
 *
 * NOTE(review): ldq/uldq/perr are defined outside this block; presumably
 * the Alpha aligned load, unaligned load and pixel-error operations --
 * confirm against asm.h.
 */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t p2_l, p2_r, p2_x;

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
    }
    /* Byte 16 of the row, already placed in the top byte position. */
    p2_x = (uint64_t) pix2[16] << 56;

    do {
        uint64_t p1_l, p1_r;
        uint64_t np2_l, np2_r, np2_x;

        /* Current pix1 row; loaded here so nothing past row h - 1 is read. */
        p1_l = ldq(pix1);
        p1_r = ldq(pix1 + 8);

        pix1 += line_size;
        pix2 += line_size;

        /* Next pix2 row. */
        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
        }
        np2_x = (uint64_t) pix2[16] << 56;

        /* (x >> 8) | (following byte << 56) moves each row on by one byte. */
        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        /* The row just loaded is the first row of the next pair. */
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}