Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * VP8 NEON optimisations |
||
3 | * |
||
4 | * Copyright (c) 2010 Rob Clark |
||
5 | * Copyright (c) 2011 Mans Rullgard |
||
6 | * |
||
7 | * This file is part of FFmpeg. |
||
8 | * |
||
9 | * FFmpeg is free software; you can redistribute it and/or |
||
10 | * modify it under the terms of the GNU Lesser General Public |
||
11 | * License as published by the Free Software Foundation; either |
||
12 | * version 2.1 of the License, or (at your option) any later version. |
||
13 | * |
||
14 | * FFmpeg is distributed in the hope that it will be useful, |
||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
17 | * Lesser General Public License for more details. |
||
18 | * |
||
19 | * You should have received a copy of the GNU Lesser General Public |
||
20 | * License along with FFmpeg; if not, write to the Free Software |
||
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
22 | */ |
||
23 | |||
24 | #include "libavutil/arm/asm.S" |
||
25 | #include "neon.S" |
||
26 | |||
@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16])
@ Inverse Walsh-Hadamard transform of the 4x4 luma DC coefficients.
@ In:  r0 = block array (DC of each 4x4 sub-block written at stride 32 bytes)
@      r1 = 16 int16 DC coefficients (cleared to zero on return)
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]     @ load all 16 coefficients
        vmov.i16        q15, #0

        @ first (horizontal) butterfly pass; zero the input buffer
        @ in the gaps between arithmetic ops
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3                 @ rounding bias for the final >>3

        @ transpose so the second pass works on columns
        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16           @ add bias to row 0 only

        @ second (vertical) butterfly pass
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3            @ (x + 3) >> 3
        vshr.s16        q1,  q1,  #3

        @ scatter one DC value into each of the 16 luma blocks
        @ (blocks are 16 int16 = 32 bytes apart)
        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
||
79 | |||
@ void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ 4x4 inverse DCT and add to destination.
@ In:  r0 = dst, r1 = coefficients (cleared on return), r2 = stride
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091             @ sqrt(2)*cos(pi/8) in Q16, minus 1<<16
        movt            r3,  #35468/2           @ sqrt(2)*sin(pi/8) in Q16, halved for vqdmulh
        vdup.32         d4,  r3                 @ d4[0] = 20091, d4[1] = 35468/2

        @ first pass (columns): rows of the transform on d0..d3
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]         @ doubling multiply restores the /2
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1            @ + input compensates the -1<<16 in 20091
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2            @ t0
        vsub.s16        d17, d0,  d2            @ t1
        vadd.s16        d18, d21, d23           @ t2
        vsub.s16        d19, d20, d22           @ t3
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ second pass (rows); clear the coefficient buffer meanwhile
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2  @ start loading dst rows early
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3            @ rounding >>3 finishes the IDCT
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to the first row

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add residual to the prediction and saturate back to u8
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2  @ rows 2/3 swapped by the vtrn above
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
||
152 | |||
@ void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ DC-only inverse transform: add (dc + 4) >> 3 to each pixel of a 4x4 block.
@ In:  r0 = dst, r1 = coefficients (dc at [r1], zeroed on return), r2 = stride
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]               @ dc coefficient
        strh            r3,  [r1]               @ clear it for the next block
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ (dc + 4) >> 3, replicated
        vld1.32         {d0[]},   [r0,:32], r2  @ rows 0/2 -> d0, rows 1/3 -> d1
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ widen, add dc
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d0,  q2                 @ saturate back to u8
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
||
174 | |||
@ void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
@ DC-only IDCT+add for four 4x4 chroma blocks arranged 2x2 (an 8x8 area).
@ In:  r0 = dst, r1 = coefficient blocks (32 bytes apart, dc zeroed on return),
@      r2 = stride
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ distance between coefficient blocks
        vld1.16         {d16[]},  [r1,:16]      @ dc of blocks 0..3 -> q8/q9
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0                 @ r3 = store pointer, r0 keeps loading
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        @ top half (rows 0-3): blocks 0/1 share each 8-pixel row -> q8
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        @ bottom half (rows 4-7): blocks 2/3 -> q9
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10                @ saturate and store, interleaved
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc
||
224 | |||
@ void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
@ DC-only IDCT+add for four horizontally adjacent 4x4 luma blocks (a 16x4 area).
@ In:  r0 = dst, r1 = coefficient blocks (32 bytes apart, dc zeroed on return),
@      r2 = stride
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ distance between coefficient blocks
        vld1.16         {d16[]},  [r1,:16]      @ dc of blocks 0..3 -> q8/q9
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        @ each 16-pixel row spans all four blocks: left half uses q8, right q9
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d20, q10                @ saturate back to u8
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
||
266 | |||
@ Core VP8 loop filter, operating on 16 pixels at a time (two 8-pixel
@ halves for chroma). Expects the 8 pixel rows/columns already loaded:
@
@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
@ \inner  -> inner-block edge filter (4-tap only, no P2/Q2 update)
@ \simple -> "simple" profile filter (edge limit only, P0/Q0 updated)
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3            @ 8*w; 9*w built as 8*w + w
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
||
445 | |||
@ Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
@ Emits ff_vp8_v_loop_filter16\name\()_neon(uint8_t *dst, ptrdiff_t stride,
@                                           int flim_E [, int flim_I, int hev_thresh])
@ r0 points at the first row BELOW the edge on entry.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple  @ back up 2 (simple) or 4 rows

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (stacked arg, after vpush)
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
||
494 | |||
@ Vertical (horizontal-edge) loop filter over the two 8-pixel-wide chroma
@ planes at once (U rows in the even d-registers, V rows in the odd ones).
@ Emits ff_vp8_v_loop_filter8uv\name\()_neon(uint8_t *u, uint8_t *v,
@                 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2   @ back up 4 rows in u and v
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I (stacked arg, after vpush)

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
||
553 | |||
@ Horizontal (vertical-edge) loop filter over a 16-pixel-tall edge.
@ Loads a 16x8 block, transposes it so the edge becomes horizontal,
@ runs the common filter, transposes back and stores.
@ Emits ff_vp8_h_loop_filter16\name\()_neon(uint8_t *dst, ptrdiff_t stride,
@                                           int flim_E [, int flim_I, int hev_thresh])
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (stacked arg, after vpush)
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ backup 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
||
619 | |||
@ Horizontal (vertical-edge) loop filter over the two 8-pixel-tall chroma
@ planes at once. U rows land in the even d-registers, V in the odd ones;
@ the 8x8+8x8 block is transposed, filtered, transposed back and stored.
@ Emits ff_vp8_h_loop_filter8uv\name\()_neon(uint8_t *u, uint8_t *v,
@                 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I (stacked arg, after vpush)

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ backup u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ backup v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
||
683 | |||
@ void ff_put_vp8_pixels16_neon(uint8_t *dst, ptrdiff_t dst_stride,
@                               uint8_t *src, ptrdiff_t src_stride, int h, ...)
@ Plain 16-pixel-wide copy, 4 rows per iteration. h is assumed to be a
@ multiple of 4. dst is assumed 16-byte aligned (src may be unaligned).
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc
||
699 | |||
@ void ff_put_vp8_pixels8_neon(uint8_t *dst, ptrdiff_t dst_stride,
@                              uint8_t *src, ptrdiff_t src_stride, int h, ...)
@ Plain 8-pixel-wide copy, 4 rows per iteration. h is assumed to be a
@ multiple of 4. dst is assumed 8-byte aligned (src may be unaligned).
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc
||
715 | |||
716 | /* 4/6-tap 8th-pel MC */ |
||
717 | |||
@ 6-tap horizontal filter of 8 pixels.
@ \d = u8 result, \a:\b = 16 contiguous source bytes (taps read from
@ offsets 0..5). Filter coefficients are preloaded in d0/d1
@ (d0[0],d0[1],d0[2],d0[3],d1[0],d1[1] = taps 0..5; taps 1 and 4 are
@ subtracted, matching their negative sign in the VP8 filter tables).
@ Clobbers q8-q13, d27-d31.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ tap 2 (center-left)
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]         @ tap 3 (center-right)
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmla.u16        q10, q8,  d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vqadd.s16       q11, q10, q11           @ combine halves with saturation
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, saturate to u8
.endm
||
739 | |||
@ 6-tap horizontal filter of 16 pixels.
@ \d0:\d1 = u8 result; \q0:\q1 = 24 contiguous source bytes, with \s0/\s1
@ naming the first two d-halves of \q0:\q1 (tap 0 for each 8-pixel half)
@ and \s2 unused padding. Coefficients preloaded in d0/d1 as in
@ vp8_epel8_h6. Clobbers q1-q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]         @ tap 3
        vmul.u16        q10, q10, d0[2]         @ tap 2
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11           @ combine halves with saturation
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7            @ round, >>7, saturate to u8
        vqrshrun.s16    \d1, q14, #7
.endm
||
775 | |||
@ 6-tap vertical filter producing one 8-pixel row.
@ \d0 = u8 result; \s0..\s5 = the six source rows (u8). Coefficients
@ preloaded in d0/d1 as in vp8_epel8_h6. Clobbers q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]         @ tap 2 (center-left)
        vmul.u16        q11, q11, d0[3]         @ tap 3 (center-right)
        vmls.u16        q10, q9,  d0[1]         @ - tap 1
        vmls.u16        q11, q12, d1[0]         @ - tap 4
        vmla.u16        q10, q8,  d0[0]         @ + tap 0
        vmla.u16        q11, q13, d1[1]         @ + tap 5
        vqadd.s16       q11, q10, q11           @ combine halves with saturation
        vqrshrun.s16    \d0, q11, #7            @ round, >>7, saturate to u8
.endm
||
792 | |||
@ 6-tap vertical filter producing two consecutive 8-pixel rows at once:
@ \d0 from rows \s0..\s5 and \d1 from rows \s1..\s6 (the two windows
@ share five rows, so each source is widened only once).
@ Coefficients preloaded in d0/d1 as in vp8_epel8_h6. Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4,  s5,  s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]         @ row 0: + tap 0
        vmul.u16        q15, q11, d0[3]         @ row 0: tap 3 (s3)
        vmul.u16        q11, q11, d0[2]         @ row 1: tap 2 (same s3)
        vmul.u16        q14, q14, d1[1]         @ row 1: + tap 5
        vmls.u16        q10, q9,  d0[1]         @ row 0: - tap 1
        vmls.u16        q15, q12, d1[0]         @ row 0: - tap 4
        vmls.u16        q11, q8,  d0[1]         @ row 1: - tap 1
        vmls.u16        q14, q13, d1[0]         @ row 1: - tap 4
        vmla.u16        q10, q8,  d0[2]         @ row 0: tap 2
        vmla.u16        q15, q13, d1[1]         @ row 0: + tap 5
        vmla.u16        q11, q9,  d0[0]         @ row 1: + tap 0
        vmla.u16        q14, q12, d0[3]         @ row 1: tap 3
        vqadd.s16       q15, q10, q15           @ combine halves with saturation
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7            @ round, >>7, saturate to u8
        vqrshrun.s16    \d1, q14, #7
.endm
||
818 | |||
@ 4-tap horizontal filter of 8 pixels.
@ \d = u8 result, \a:\b = source bytes (taps read from offsets 0..3).
@ Uses coefficient lanes d0[1]..d1[0] (the middle four of the 6-tap
@ layout; taps at d0[1] and d1[0] are subtracted). Clobbers q9-q12, d28-d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ center-left tap
        vmul.u16        q11, q11, d0[3]         @ center-right tap
        vmls.u16        q10, q9,  d0[1]         @ - outer-left tap
        vmls.u16        q11, q12, d1[0]         @ - outer-right tap
        vqadd.s16       q11, q10, q11           @ combine halves with saturation
        vqrshrun.s16    \d,  q11, #7            @ round, >>7, saturate to u8
.endm
||
834 | |||
@ 4-tap vertical filter producing two consecutive 8-pixel rows at once:
@ \d0 from rows \s0..\s3 and \d1 from rows \s1..\s4 (shared rows are
@ widened only once). Coefficient layout as in vp8_epel8_h4.
@ Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]         @ row 0: center-left tap
        vmul.u16        q14, q11, d0[3]         @ row 0: center-right tap
        vmul.u16        q11, q11, d0[2]         @ row 1: center-left (same s2)
        vmul.u16        q15, q12, d0[3]         @ row 1: center-right
        vmls.u16        q8,  q9,  d0[1]         @ row 0: - outer-left
        vmls.u16        q14, q12, d1[0]         @ row 0: - outer-right
        vmls.u16        q11, q10, d0[1]         @ row 1: - outer-left
        vmls.u16        q15, q13, d1[0]         @ row 1: - outer-right
        vqadd.s16       q8,  q8,  q14           @ combine halves with saturation
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7            @ round, >>7, saturate to u8
        vqrshrun.s16    \d1, q11, #7
.endm
||
854 | |||
@ void ff_put_vp8_epel16_v6_neon(uint8_t *dst, ptrdiff_t dst_stride,
@               uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my)
@ 16-wide 6-tap vertical interpolation; two output rows per iteration,
@ so h is assumed even. Filter row selected by my from subpel_filters
@ (16 bytes per entry; -16 because my is 1-based here).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2 * stride (filter context)
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        @ load the 7 source rows needed for two output rows
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind: next pair overlaps 5 rows

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
||
886 | |||
@ void ff_put_vp8_epel16_h6_neon(uint8_t *dst, ptrdiff_t dst_stride,
@               uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my)
@ 16-wide 6-tap horizontal interpolation, one row per iteration.
@ Filter row selected by mx from subpel_filters.
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ src -= 2 (filter context)
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        vld1.8          {d2-d4},  [r2], r3      @ 24 source bytes for 16 outputs

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||
907 | |||
@ void ff_put_vp8_epel16_h6v6_neon(uint8_t *dst, ptrdiff_t dst_stride,
@               uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my)
@ 16-wide 6-tap horizontal then 6-tap vertical interpolation, using a
@ 16-byte-aligned on-stack buffer of h+5 intermediate rows
@ (336 = 21 rows * 16 bytes covers the max h=16 case; +16 for alignment).
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ src -= 2*stride + 2 (filter context)
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]     @ q0 = horizontal coefficients
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ lr = 16-byte-aligned buffer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]     @ q0 = vertical coefficients
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 6 intermediate rows
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ rewind 3 rows: windows overlap by 5

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
||
958 | |||
@ void ff_put_vp8_epel8_v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide block, 6-tap vertical-only filter selected by my; two rows per loop.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above for the 6-tap margin
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ vertical filter coefficients
1:
        vld1.8          {d2},  [r2], r3         @ 7 rows in: 2 outputs need rows n-2..n+4
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2   @ rewind so the window advances by 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},  [r0,:64], r1
        vst1.8          {d3},  [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||
988 | |||
@ void ff_put_vp8_epel8_h6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide block, 6-tap horizontal-only filter selected by mx.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ 2 pixels of left margin for the 6-tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]     @ horizontal filter coefficients
1:
        vld1.8          {d2,d3}, [r2], r3       @ 16 src bytes: 8 output + 5 margin

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||
1009 | |||
@ void ff_put_vp8_epel8_h6v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 8-wide block, 6-tap horizontal then 6-tap vertical, via stack scratch buffer.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for vertical margin
        sub             r2,  r2,  #2            @ 2 columns left for horizontal margin
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch: 8 bytes x (8+5+..) rows, aligned
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ produce h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-aligned scratch pointer
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 scratch rows (8 bytes each)
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ net advance of 2 rows (48 loaded - 32)

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||
1058 | |||
@ void ff_put_vp8_epel8_v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide block, 4-tap vertical-only filter selected by my; two rows per loop.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ start 1 row above for the 4-tap margin
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2}, [r2], r3          @ 5 rows in: 2 outputs need rows n-1..n+2
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind so the window advances by 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||
1085 | |||
@ void ff_put_vp8_epel8_h4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide block, 4-tap horizontal-only filter selected by mx.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ 1 pixel of left margin for the 4-tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||
1106 | |||
@ void ff_put_vp8_epel8_h4v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 8-wide block, 4-tap horizontal then 4-tap vertical, via stack scratch buffer.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for vertical margin
        sub             r2,  r2,  #1            @ 1 column left for horizontal margin
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ aligned scratch for intermediate rows
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ produce h+3 intermediate rows (4-tap)
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!     @ 5 scratch rows (8 bytes each)
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ net advance of 2 rows (32 loaded - 16)

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||
1154 | |||
@ void ff_put_vp8_epel8_h6v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 8-wide block, 6-tap horizontal then 4-tap vertical, via stack scratch buffer.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above (4-tap vertical margin)
        sub             r2,  r2,  #2            @ 2 columns left (6-tap horizontal margin)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows for the 4-tap pass
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6},    [lr,:64]
        sub             lr,  lr,  #16           @ net advance of 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||
1202 | |||
@ void ff_put_vp8_epel8_h4v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 8-wide block, 4-tap horizontal then 6-tap vertical, via stack scratch buffer.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above (6-tap vertical margin)
        sub             r2,  r2,  #1            @ 1 column left (4-tap horizontal margin)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows for the 6-tap pass
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 scratch rows for two output rows
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ net advance of 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||
1251 | |||
1252 | .ltorg |
||
1253 | |||
@ void ff_put_vp8_epel4_v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide block, 6-tap vertical filter; packs two 4-pixel row groups into the
@ two 32-bit lanes of each d register so the 8-wide macro yields 4 rows/loop.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above for the 6-tap margin
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3      @ lane 0: rows for outputs 0 and 1
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind: window advances 2 rows
        vld1.32         {d2[1]},  [r2], r3      @ lane 1: rows for outputs 2 and 3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||
1292 | |||
@ void ff_put_vp8_epel4_h6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide block, 6-tap horizontal filter; reuses the 8-wide macro and stores
@ only the low 4 bytes of the result.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2            @ 2 pixels of left margin for the 6-tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||
1311 | |||
@ void ff_put_vp8_epel4_h6v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 4-wide block, 6-tap horizontal then 6-tap vertical, via stack scratch buffer
@ of 4-byte intermediate rows; second pass does 4 output rows per iteration by
@ interleaving two row windows into d-register lanes.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for vertical margin
        sub             r2,  r2,  #2            @ 2 columns left for horizontal margin
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch: 4 bytes x (h+5) rows, aligned
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!      @ 4-byte intermediate row to scratch
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ Load two overlapping 7-row windows (offset by 2 rows) and interleave
        @ them so each d register holds one row pair in its two 32-bit lanes.
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!         @ second window, 8 bytes in: unaligned
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16           @ net advance of 4 rows (16 bytes)
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||
1362 | |||
@ void ff_put_vp8_epel4_h4v6_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 4-wide block, 4-tap horizontal then 6-tap vertical; same scratch-buffer and
@ lane-interleaving scheme as the h6v6 variant above.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ 2 rows above for the 6-tap vertical
        sub             r2,  r2,  #1            @ 1 column left for the 4-tap horizontal
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},  [r2], r3         @ 8 src bytes suffice for 4-tap x 4 out
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ Two overlapping 7-row windows interleaved into lanes -> 4 rows/loop.
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16           @ net advance of 4 rows
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||
1413 | |||
@ void ff_put_vp8_epel4_h6v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 4-wide block, 6-tap horizontal then 4-tap vertical, via 4-byte-row scratch.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for the 4-tap vertical
        sub             r2,  r2,  #2            @ 2 columns left for the 6-tap horizontal
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch: 4 bytes x (h+3) rows, aligned
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows for the 4-tap
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},  [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ Two overlapping 5-row windows (offset 2 rows) interleaved into lanes.
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!          @ second window: unaligned load
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8            @ net advance of 4 rows
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||
1461 | |||
@ void ff_put_vp8_epel4_h4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide block, 4-tap horizontal-only filter selected by mx.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1            @ 1 pixel of left margin for the 4-tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},  [r2], r3         @ 8 bytes cover 4 outputs + 3 taps margin
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||
1480 | |||
@ void ff_put_vp8_epel4_v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide block, 4-tap vertical filter; two row windows packed into d-register
@ lanes give 4 output rows per loop iteration.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3            @ start 1 row above for the 4-tap margin
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},  [r2], r3       @ lane 0: rows for outputs 0 and 1
        vld1.32         {d3[]},  [r2], r3
        vld1.32         {d4[]},  [r2], r3
        vld1.32         {d5[]},  [r2], r3
        vld1.32         {d6[]},  [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind 2 rows: second window overlaps
        vld1.32         {d2[1]}, [r2], r3       @ lane 1: rows for outputs 2 and 3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||
1515 | |||
@ void ff_put_vp8_epel4_h4v4_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 4-wide block, 4-tap horizontal then 4-tap vertical, via 4-byte-row scratch.
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ 1 row above for vertical margin
        sub             r2,  r2,  #1            @ 1 column left for horizontal margin
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch: 4 bytes x (h+3) rows, aligned
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},  [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        @ Two overlapping 5-row windows interleaved into lanes -> 4 rows/loop.
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]},  [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5}, [lr]!          @ second window: unaligned load
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr,  lr,  #8            @ net advance of 4 rows
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||
1563 | |||
@ VP8 six-tap subpel filter coefficient table, one 16-byte row per mx/my value
@ 1..7 (value 0 means full-pel and is handled elsewhere; hence the
@ "subpel_filters-16" base used by the callers above).
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const   subpel_filters, align=4
        .short      0,   6, 123,  12,   1,   0,   0,   0
        .short      2,  11, 108,  36,   8,   1,   0,   0
        .short      0,   9,  93,  50,   6,   0,   0,   0
        .short      3,  16,  77,  77,  16,   3,   0,   0
        .short      0,   6,  50,  93,   9,   0,   0,   0
        .short      1,   8,  36, 108,  11,   2,   0,   0
        .short      0,   1,  12, 123,   6,   0,   0,   0
endconst
||
1575 | |||
1576 | /* Bilinear MC */ |
||
1577 | |||
@ void ff_put_vp8_bilin16_h_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                int h, int mx, int my)
@ 16-wide horizontal bilinear: dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3,
@ rounding done by vrshrn #3. Two rows per loop.
function ff_put_vp8_bilin16_h_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8            @ r12 = 8-mx (weight of left pixel)
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             r12, [sp, #4]           @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r3      @ row 0: 17+ bytes needed for x+1
        vext.8          q2,  q1,  q2,  #1       @ q2 = row 0 shifted left by one pixel
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3      @ row 1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1       @ row 1 shifted by one pixel
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ round and narrow: (sum+4)>>3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        pop             {pc}
endfunc
||
1609 | |||
@ void ff_put_vp8_bilin16_v_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                int h, int mx, int my)
@ 16-wide vertical bilinear: dst = (row[y]*(8-my) + row[y+1]*my + 4) >> 3.
@ Two rows per loop; q1 carries the last source row across iterations.
function ff_put_vp8_bilin16_v_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8            @ r12 = 8-my (weight of upper row)
        vdup.8          d0,  lr                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8-my
        ldr             r12, [sp, #4]           @ h
        vld1.8          {q1},     [r2], r3      @ prime q1 with the first row
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1            @ out row 0 = q1*(8-my) + q2*my
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3      @ next row; also becomes carry for 1b
        vmull.u8        q9,  d4,  d1            @ out row 1 = q2*(8-my) + q1*my
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3            @ (sum+4)>>3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        pop             {pc}
endfunc
||
1640 | |||
@ void ff_put_vp8_bilin16_hv_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                 uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                 int h, int mx, int my)
@ 16-wide 2-D bilinear: horizontal blend by mx, then vertical blend by my on
@ the horizontally-filtered rows. q2 carries the previous filtered row.
function ff_put_vp8_bilin16_hv_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8
        vdup.8          d2,  lr                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8-my
        ldr             r12, [sp, #4]           @ h

        @ prime q2 with the horizontally-filtered first row
        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r3      @ horizontal pass, next row -> q3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3      @ horizontal pass, row after -> q2
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3            @ vertical blend: prev*(8-my)+cur*my
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ q2 = filtered row for next iteration
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        pop             {pc}
endfunc
||
1697 | |||
@ void ff_put_vp8_bilin8_h_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide horizontal bilinear, two rows per loop.
function ff_put_vp8_bilin8_h_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             r12, [sp, #4]           @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},  [r2], r3         @ 16 bytes: 8 outputs + x+1 neighbour
        vext.8          d3,  d2,  d3,  #1       @ d3 = row shifted by one pixel
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},  [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum+4)>>3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},  [r0,:64], r1
        vst1.8          {d16}, [r0,:64], r1
        bgt             1b

        pop             {pc}
endfunc
||
1723 | |||
@ void ff_put_vp8_bilin8_v_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 8-wide vertical bilinear, two rows per loop; d2 carries the last source row.
function ff_put_vp8_bilin8_v_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8-my
        ldr             r12, [sp, #4]           @ h
        vld1.8          {d2},  [r2], r3         @ prime with the first row
1:
        subs            r12, r12, #2
        vld1.8          {d3},  [r2], r3
        vmull.u8        q2,  d2,  d1            @ out0 = prev*(8-my) + cur*my
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},  [r2], r3         @ next row; carried into next iteration
        vmull.u8        q3,  d3,  d1            @ out1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},  [r0,:64], r1
        vst1.8          {d6},  [r0,:64], r1
        bgt             1b

        pop             {pc}
endfunc
||
1748 | |||
@ void ff_put_vp8_bilin8_hv_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                int h, int mx, int my)
@ 8-wide 2-D bilinear: horizontal blend by mx, then vertical blend by my.
@ d22 carries the previous horizontally-filtered row between iterations.
function ff_put_vp8_bilin8_hv_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8
        vdup.8          d2,  lr                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8-my
        ldr             r12, [sp, #4]           @ h

        @ prime d22 with the horizontally-filtered first row
        vld1.8          {q2},  [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},  [r2], r3         @ horizontal pass, row n -> d16
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},  [r2], r3         @ horizontal pass, row n+1 -> d22
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3            @ out0 = prev*(8-my) + cur*my
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3            @ d22 = filtered row n+1 (next prev)
        vmull.u8        q12, d16, d3            @ out1
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20}, [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23}, [r0,:64], r1
        bgt             1b

        pop             {pc}
endfunc
||
1790 | |||
@ void ff_put_vp8_bilin4_h_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide horizontal bilinear; two rows packed into one d register per loop.
function ff_put_vp8_bilin4_h_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             r12, [sp, #4]           @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},  [r2], r3
        vext.8          d3,  d2,  d3,  #1       @ row 0 shifted by one pixel
        vld1.8          {d6},  [r2], r3
        vext.8          d7,  d6,  d7,  #1       @ row 1 shifted by one pixel
        vtrn.32         q1,  q3                 @ pack both rows into d2/d3 lanes
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        bgt             1b

        pop             {pc}
endfunc
||
1814 | |||
@ void ff_put_vp8_bilin4_v_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                               uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                               int h, int mx, int my)
@ 4-wide vertical bilinear, two rows per loop. d2 holds rows {n, n+1} in its
@ two lanes and d3 holds {n+1, n+2}; the vtrn keeps the carry row in d2 for
@ the next iteration.
function ff_put_vp8_bilin4_v_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = my
        vdup.8          d1,  r12                @ d1 = 8-my
        ldr             r12, [sp, #4]           @ h
        vld1.32         {d2[]},  [r2], r3       @ prime lane 0 with the first row
1:
        vld1.32         {d3[]},  [r2]           @ d3 lane 0 = row n+1 (no advance)
        vld1.32         {d2[1]}, [r2], r3       @ d2 lane 1 = row n+1
        vld1.32         {d3[1]}, [r2], r3       @ d3 lane 1 = row n+2
        vmull.u8        q2,  d2,  d1            @ both outputs in one multiply pair
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2                 @ move row n+2 into d2 lane 0 (carry)
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        pop             {pc}
endfunc
||
1838 | |||
@ void ff_put_vp8_bilin4_hv_neon(uint8_t *dst /*r0*/, ptrdiff_t dststride /*r1*/,
@                                uint8_t *src /*r2*/, ptrdiff_t srcstride /*r3*/,
@                                int h, int mx, int my)
@ 4-wide 2-D bilinear: horizontal blend by mx, vertical blend by my. Two rows
@ per loop, packed into d-register lanes; d22 carries the previous filtered
@ row between iterations (the vtrn/vrev pair shuffles the carry into place).
function ff_put_vp8_bilin4_hv_neon, export=1
        push            {lr}
        ldr             lr,  [sp, #8]           @ mx
        rsb             r12, lr,  #8
        vdup.8          d0,  lr                 @ d0 = mx
        vdup.8          d1,  r12                @ d1 = 8-mx
        ldr             lr,  [sp, #12]          @ my
        rsb             r12, lr,  #8
        vdup.8          d2,  lr                 @ d2 = my
        vdup.8          d3,  r12                @ d3 = 8-my
        ldr             r12, [sp, #4]           @ h

        @ prime d22 with the horizontally-filtered first row
        vld1.8          {d4},  [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},  [r2], r3         @ rows n and n+1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},  [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ pack both rows into d6/d7 lanes
        vmull.u8        q8,  d6,  d1            @ horizontal blend for both rows
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = filtered rows {n, n+1}
        vmull.u8        q10, d16, d2            @ vertical: cur*my ...
        vtrn.32         d22, d16                @ align prev/cur rows lane-wise
        vmlal.u8        q10, d22, d3            @ ... + prev*(8-my)
        vrev64.32       d22, d16                @ carry filtered row n+1 for next loop
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        pop             {pc}
endfunc