Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6147 | serge | 1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard |
||
3 | * Copyright (c) 2013 Janne Grunau |
||
4 | * |
||
5 | * This file is part of FFmpeg. |
||
6 | * |
||
7 | * FFmpeg is free software; you can redistribute it and/or |
||
8 | * modify it under the terms of the GNU Lesser General Public |
||
9 | * License as published by the Free Software Foundation; either |
||
10 | * version 2.1 of the License, or (at your option) any later version. |
||
11 | * |
||
12 | * FFmpeg is distributed in the hope that it will be useful, |
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | * Lesser General Public License for more details. |
||
16 | * |
||
17 | * You should have received a copy of the GNU Lesser General Public |
||
18 | * License along with FFmpeg; if not, write to the Free Software |
||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | */ |
||
21 | |||
22 | #include "libavutil/aarch64/asm.S" |
||
23 | #include "neon.S" |
||
24 | |||
25 | /* H.264 qpel MC */ |
||
26 | |||
// Place the two filter multipliers in v6: v6.h[0] = 5 and v6.h[1] = 20.
// r is a 32-bit general purpose scratch register and is overwritten.
.macro  lowpass_const   r
        movz            \r,  #20, lsl #16       // bits 31:16 = 20
        movk            \r,  #5                 // bits 15:0  = 5
        mov             v6.s[0], \r
.endm
||
32 | |||
// Six-tap filter applied to two independent rows of eight pixels.
//   r0:r1  first row, 16 consecutive bytes with the low eight in r0 -> d0
//   r2:r3  second row, same layout                                   -> d1
// For output lane i the 16-bit sum is
//   in[i] + in[i+5] + v6.h[1]*(in[i+2] + in[i+3]) - v6.h[0]*(in[i+1] + in[i+4])
// (v6.h[1] = 20 and v6.h[0] = 5 once lowpass_const has run).
// narrow=1 (default): sums are rounded, shifted right by 5 and saturated to
// unsigned bytes.  narrow=0: the 16-bit sums are left in d0 / d1.
// d0 may alias r0 and d1 may alias r1 or r2; the instruction order below
// reads each input before the aliasing destination is written.
// Clobbers v0-v5.
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8b,      \r0\().8b,  \r1\().8b,  #2
        ext             v3.8b,      \r0\().8b,  \r1\().8b,  #3
        uaddl           v2.8h,      v2.8b,      v3.8b           // row 0: in[2] + in[3]
        ext             v4.8b,      \r0\().8b,  \r1\().8b,  #1
        ext             v5.8b,      \r0\().8b,  \r1\().8b,  #4
        uaddl           v4.8h,      v4.8b,      v5.8b           // row 0: in[1] + in[4]
        ext             v1.8b,      \r0\().8b,  \r1\().8b,  #5
        uaddl           \d0\().8h,  \r0\().8b,  v1.8b           // row 0: in[0] + in[5]
        ext             v0.8b,      \r2\().8b,  \r3\().8b,  #2
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        ext             v1.8b,      \r2\().8b,  \r3\().8b,  #3
        uaddl           v0.8h,      v0.8b,      v1.8b           // row 1: in[2] + in[3]
        ext             v1.8b,      \r2\().8b,  \r3\().8b,  #1
        mls             \d0\().8h,  v4.8h,      v6.h[0]
        ext             v3.8b,      \r2\().8b,  \r3\().8b,  #4
        uaddl           v1.8h,      v1.8b,      v3.8b           // row 1: in[1] + in[4]
        ext             v2.8b,      \r2\().8b,  \r3\().8b,  #5
        uaddl           \d1\().8h,  \r2\().8b,  v2.8b           // row 1: in[0] + in[5]
        mla             \d1\().8h,  v0.8h,      v6.h[1]
        mls             \d1\().8h,  v1.8h,      v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h,  #5
        sqrshrun        \d1\().8b,  \d1\().8h,  #5
  .endif
.endm
||
60 | |||
// Six-tap filter of two 16-byte rows, keeping the result at 16-bit precision.
// r0 and r1 each hold one complete row.  The shifted taps are produced by
// rotating the register onto itself with ext; only the low eight bytes of
// every rotation take part in the sums.
// The eight unrounded 16-bit sums overwrite r0 and r1 in place.
// Uses the multipliers in v6.h[0] / v6.h[1] (5 and 20 after lowpass_const).
// Clobbers v0-v5, v7, v30, v31.
.macro  lowpass_8H      r0,  r1
        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
        uaddl           v0.8h,      v0.8b,      v1.8b           // r0: in[2] + in[3]
        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
        uaddl           v2.8h,      v2.8b,      v3.8b           // r0: in[1] + in[4]
        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
        uaddl           \r0\().8h,  \r0\().8b,  v30.8b          // r0: in[0] + in[5]
        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
        mla             \r0\().8h,  v0.8h,      v6.h[1]
        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
        uaddl           v4.8h,      v4.8b,      v5.8b           // r1: in[2] + in[3]
        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
        mls             \r0\().8h,  v2.8h,      v6.h[0]
        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
        uaddl           v7.8h,      v7.8b,      v0.8b           // r1: in[1] + in[4]
        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
        uaddl           \r1\().8h,  \r1\().8b,  v31.8b          // r1: in[0] + in[5]
        mla             \r1\().8h,  v4.8h,      v6.h[1]
        mls             \r1\().8h,  v7.8h,      v6.h[0]
.endm
||
84 | |||
// Single-row form of lowpass_8.
// r0:r1 holds 16 consecutive bytes (low eight in r0); the eight filtered
// values are written to d0.  narrow=1 (default) rounds, shifts right by 5
// and saturates to unsigned bytes; narrow=0 leaves the 16-bit sums in d0.
// Uses the multipliers in v6.h[0] / v6.h[1] (5 and 20 after lowpass_const).
// Clobbers v2-v5 and v30.
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8b,      \r0\().8b,  \r1\().8b,  #2
        ext             v3.8b,      \r0\().8b,  \r1\().8b,  #3
        uaddl           v2.8h,      v2.8b,      v3.8b           // in[2] + in[3]
        ext             v4.8b,      \r0\().8b,  \r1\().8b,  #1
        ext             v5.8b,      \r0\().8b,  \r1\().8b,  #4
        uaddl           v4.8h,      v4.8b,      v5.8b           // in[1] + in[4]
        ext             v30.8b,     \r0\().8b,  \r1\().8b,  #5
        uaddl           \d0\().8h,  \r0\().8b,  v30.8b          // in[0] + in[5]
        mla             \d0\().8h,  v2.8h,      v6.h[1]
        mls             \d0\().8h,  v4.8h,      v6.h[0]
  .if \narrow
        sqrshrun        \d0\().8b,  \d0\().8h,  #5
  .endif
.endm
||
101 | |||
// Six-tap filter over 16-bit intermediates, producing eight output bytes.
// r0:r1 is read as 16 consecutive signed halfwords (r0 first).  Output lane
// i is accumulated in 32 bits as
//   in[i] + in[i+5] + 20*(in[i+2] + in[i+3]) - 5*(in[i+1] + in[i+4])
// where 20*x is built as (x << 4) + (x << 2) and 5*x as x + (x << 2).
// The sum is rounded and shifted right by 10, then saturated to unsigned
// bytes and written to r2.
// Clobbers v0-v7 and r1.  v6 is overwritten here, so the multipliers that
// lowpass_const stored there are lost after this macro.
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16b,     \r0\().16b, \r1\().16b, #4
        ext             v0.16b,     \r0\().16b, \r1\().16b, #6
        saddl           v5.4s,      v1.4h,      v0.4h           // lanes 0-3: in[2] + in[3]
        ext             v2.16b,     \r0\().16b, \r1\().16b, #2
        saddl2          v1.4s,      v1.8h,      v0.8h           // lanes 4-7: in[2] + in[3]
        ext             v3.16b,     \r0\().16b, \r1\().16b, #8
        saddl           v6.4s,      v2.4h,      v3.4h           // lanes 0-3: in[1] + in[4]
        ext             \r1\().16b, \r0\().16b, \r1\().16b, #10
        saddl2          v2.4s,      v2.8h,      v3.8h           // lanes 4-7: in[1] + in[4]
        saddl           v0.4s,      \r0\().4h,  \r1\().4h       // lanes 0-3: in[0] + in[5]
        saddl2          v4.4s,      \r0\().8h,  \r1\().8h       // lanes 4-7: in[0] + in[5]

        // lanes 0-3: v5 = 20 * (in[2] + in[3]), v6 = 5 * (in[1] + in[4])
        shl             v3.4s,      v5.4s,      #4
        shl             v5.4s,      v5.4s,      #2
        shl             v7.4s,      v6.4s,      #2
        add             v5.4s,      v5.4s,      v3.4s
        add             v6.4s,      v6.4s,      v7.4s

        // lanes 4-7: v1 = 20 * (in[2] + in[3]), v2 = 5 * (in[1] + in[4])
        shl             v3.4s,      v1.4s,      #4
        shl             v1.4s,      v1.4s,      #2
        shl             v7.4s,      v2.4s,      #2
        add             v1.4s,      v1.4s,      v3.4s
        add             v2.4s,      v2.4s,      v7.4s

        add             v5.4s,      v5.4s,      v0.4s
        sub             v5.4s,      v5.4s,      v6.4s

        add             v1.4s,      v1.4s,      v4.4s
        sub             v1.4s,      v1.4s,      v2.4s

        rshrn           v5.4h,      v5.4s,      #10
        rshrn2          v5.8h,      v1.4s,      #10

        sqxtun          \r2\().8b,  v5.8h
.endm
||
139 | |||
// 16x16 horizontal filter (put) into a packed buffer.
// In:  x0 = destination, x1 = source, x2 = source stride.
// The 8-wide horizontal filter is run twice, each time for 16 rows (x12)
// with an 8-byte destination stride (x3).  Before the second run the source
// is rewound by 16 rows and moved 8 bytes to the right; x0 is not rewound,
// so the second 16x8 block is stored right after the first one.
// x4 holds the return address across the first call; the second run is a
// tail call.  Overwrites x3, x4, x12.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2,  lsl #4
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc
||
151 | |||
// Horizontal six-tap filter, 16- and 8-pixel wide variants.
// type = put stores the filtered rows, type = avg first averages them
// (urhadd) with what is already at the destination.
// Register use in both functions:
//   x0 = destination, x3 = destination stride
//   x1 = source,      x2 = source stride
//   x12 = number of rows (consumed two at a time by the 8-wide loop)
// v6 must already hold the filter multipliers (see lowpass_const).
.macro  h264_qpel_h_lowpass type
// 16-wide entry: filters the left 8 columns through a call, moves both
// pointers back 16 rows and 8 bytes right, then has no ret of its own and
// continues into the 8-wide function emitted directly below for the right
// half (this relies on endfunc not emitting instructions; it is defined in
// asm.S, which is not shown here).  x13 holds the return address meanwhile.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3,  lsl #4
        sub             x1,  x1,  x2,  lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide loop: two rows per iteration, 16 source bytes loaded per row.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8b, v29.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8b},    [x0], x3
        urhadd          v28.8b,     v28.8b,     v2.8b
        ld1             {v3.8b},    [x0]
        urhadd          v16.8b,     v16.8b,     v3.8b
        sub             x0,  x0,  x3                    // back to the first of the two rows
  .endif
        st1             {v28.8b},   [x0], x3
        st1             {v16.8b},   [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
||
186 | |||
// Horizontal six-tap filter whose result is averaged with a second source.
// type = put stores that average, type = avg additionally averages it with
// what is already at the destination.
// Register use in both functions:
//   x0 = destination, x1 = filter source, x3 = second source
//   x2 = stride, applied to all three pointers
//   x12 = number of rows (consumed two at a time by the 8-wide loop)
// v6 must already hold the filter multipliers (see lowpass_const).
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide entry: left 8 columns through a call, then all three pointers go
// back 16 rows and 8 bytes right.  There is no ret here: execution continues
// into the 8-wide function emitted directly below (this relies on endfunc
// not emitting instructions; it is defined in asm.S, not shown here).
// x13 holds the return address meanwhile.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2,  lsl #4
        sub             x1,  x1,  x2,  lsl #4
        sub             x3,  x3,  x2,  lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide loop: two rows per iteration.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8b, v27.8b}, [x1], x2
        ld1             {v16.8b, v17.8b}, [x1], x2
        ld1             {v28.8b},   [x3], x2
        ld1             {v29.8b},   [x3], x2
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8b,     v26.8b,     v28.8b
        urhadd          v27.8b,     v27.8b,     v29.8b
  .ifc \type,avg
        ld1             {v2.8b},    [x0], x2
        urhadd          v26.8b,     v26.8b,     v2.8b
        ld1             {v3.8b},    [x0]
        urhadd          v27.8b,     v27.8b,     v3.8b
        sub             x0,  x0,  x2                    // back to the first of the two rows
  .endif
        st1             {v26.8b},   [x0], x2
        st1             {v27.8b},   [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
||
227 | |||
// 16x16 vertical filter (put) into a packed buffer.
// In:  x0 = destination, x1 = source, x3 = source stride.
// The 8x8 vertical filter is called four times with an 8-byte destination
// stride (x2).  Each call leaves x1 twelve rows further down, so x1 is moved
// back four rows to reach the next 8-row band, and after the left column
// back twenty rows plus 8 bytes right.  x0 is never rewound, so the four 8x8
// results are stored one after another.
// x4 holds the return address; the last call is a tail call.
// Overwrites x2 and x4.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc
||
242 | |||
// Vertical six-tap filter, 16x16 and 8x8 variants.
// type = put stores the result, type = avg averages it (urhadd) with what
// is already at the destination.
// Register use in both functions:
//   x0 = destination, x2 = destination stride
//   x1 = source,      x3 = source stride
// v6 must already hold the filter multipliers (see lowpass_const).
.macro  h264_qpel_v_lowpass type
// 16x16: three calls to the 8x8 function, then no ret of its own: after the
// return address is restored execution continues into the 8x8 function
// emitted directly below for the last quadrant (this relies on endfunc not
// emitting instructions; it is defined in asm.S, not shown here).
// Each 8x8 call advances x1 by twelve rows and x0 by eight rows, which the
// pointer adjustments below compensate for.  x4 holds the return address.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2,  lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        mov             x30, x4
endfunc

// 8x8: loads 13 rows of 8 bytes, transposes so that the row filter macro
// lowpass_8 can be applied along what were columns, and transposes back.
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8b},   [x1], x3
        ld1             {v18.8b},   [x1], x3
        ld1             {v20.8b},   [x1], x3
        ld1             {v22.8b},   [x1], x3
        ld1             {v24.8b},   [x1], x3
        ld1             {v26.8b},   [x1], x3
        ld1             {v28.8b},   [x1], x3
        ld1             {v30.8b},   [x1], x3
        ld1             {v17.8b},   [x1], x3
        ld1             {v19.8b},   [x1], x3
        ld1             {v21.8b},   [x1], x3
        ld1             {v23.8b},   [x1], x3
        ld1             {v25.8b},   [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        // v27, v29 and v31 are not loaded by this function.
        // NOTE(review): presumably their lanes land beyond the taps lowpass_8
        // reads; confirm against transpose_8x8B in neon.S.
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        ld1             {v24.8b},   [x0], x2
        urhadd          v16.8b,     v16.8b,     v24.8b
        ld1             {v25.8b},   [x0], x2
        urhadd          v17.8b,     v17.8b,     v25.8b
        ld1             {v26.8b},   [x0], x2
        urhadd          v18.8b,     v18.8b,     v26.8b
        ld1             {v27.8b},   [x0], x2
        urhadd          v19.8b,     v19.8b,     v27.8b
        ld1             {v28.8b},   [x0], x2
        urhadd          v20.8b,     v20.8b,     v28.8b
        ld1             {v29.8b},   [x0], x2
        urhadd          v21.8b,     v21.8b,     v29.8b
        ld1             {v30.8b},   [x0], x2
        urhadd          v22.8b,     v22.8b,     v30.8b
        ld1             {v31.8b},   [x0], x2
        urhadd          v23.8b,     v23.8b,     v31.8b
        sub             x0,  x0,  x2,  lsl #3           // back to the first row
  .endif

        st1             {v16.8b},   [x0], x2
        st1             {v17.8b},   [x0], x2
        st1             {v18.8b},   [x0], x2
        st1             {v19.8b},   [x0], x2
        st1             {v20.8b},   [x0], x2
        st1             {v21.8b},   [x0], x2
        st1             {v22.8b},   [x0], x2
        st1             {v23.8b},   [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
||
317 | |||
// Vertical six-tap filter whose result is averaged with a second source.
// type = put stores that average, type = avg additionally averages it with
// what is already at the destination.
// Register use in both functions:
//   x0  = destination,   x3 = stride for destination and filter source
//   x1  = filter source
//   x12 = second source, x2 = stride of the second source
// v6 must already hold the filter multipliers (see lowpass_const).
.macro  h264_qpel_v_lowpass_l2 type
// 16x16: three calls to the 8x8 function, then no ret of its own: after the
// return address is restored execution continues into the 8x8 function
// emitted directly below for the last quadrant (this relies on endfunc not
// emitting instructions; it is defined in asm.S, not shown here).
// Each 8x8 call advances x1 by twelve rows and x0 / x12 by eight rows.
// x4 holds the return address.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3,  lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3,  lsl #4
        sub             x12, x12, x2,  lsl #4
        add             x0,  x0,  #8
        add             x12, x12, #8
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3,  lsl #2
        mov             x30, x4
endfunc

// 8x8: loads 13 rows of 8 bytes, transposes so that the row filter macro
// lowpass_8 can be applied along what were columns, and transposes back.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8b},   [x1], x3
        ld1             {v18.8b},   [x1], x3
        ld1             {v20.8b},   [x1], x3
        ld1             {v22.8b},   [x1], x3
        ld1             {v24.8b},   [x1], x3
        ld1             {v26.8b},   [x1], x3
        ld1             {v28.8b},   [x1], x3
        ld1             {v30.8b},   [x1], x3
        ld1             {v17.8b},   [x1], x3
        ld1             {v19.8b},   [x1], x3
        ld1             {v21.8b},   [x1], x3
        ld1             {v23.8b},   [x1], x3
        ld1             {v25.8b},   [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        // v27, v29 and v31 are not loaded by this function.
        // NOTE(review): presumably their lanes land beyond the taps lowpass_8
        // reads; confirm against transpose_8x8B in neon.S.
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        // average with the eight rows of the second source
        ld1             {v24.8b},   [x12], x2
        ld1             {v25.8b},   [x12], x2
        ld1             {v26.8b},   [x12], x2
        ld1             {v27.8b},   [x12], x2
        ld1             {v28.8b},   [x12], x2
        urhadd          v16.8b,     v24.8b,     v16.8b
        urhadd          v17.8b,     v25.8b,     v17.8b
        ld1             {v29.8b},   [x12], x2
        urhadd          v18.8b,     v26.8b,     v18.8b
        urhadd          v19.8b,     v27.8b,     v19.8b
        ld1             {v30.8b},   [x12], x2
        urhadd          v20.8b,     v28.8b,     v20.8b
        urhadd          v21.8b,     v29.8b,     v21.8b
        ld1             {v31.8b},   [x12], x2
        urhadd          v22.8b,     v30.8b,     v22.8b
        urhadd          v23.8b,     v31.8b,     v23.8b

  .ifc \type,avg
        ld1             {v24.8b},   [x0], x3
        urhadd          v16.8b,     v16.8b,     v24.8b
        ld1             {v25.8b},   [x0], x3
        urhadd          v17.8b,     v17.8b,     v25.8b
        ld1             {v26.8b},   [x0], x3
        urhadd          v18.8b,     v18.8b,     v26.8b
        ld1             {v27.8b},   [x0], x3
        urhadd          v19.8b,     v19.8b,     v27.8b
        ld1             {v28.8b},   [x0], x3
        urhadd          v20.8b,     v20.8b,     v28.8b
        ld1             {v29.8b},   [x0], x3
        urhadd          v21.8b,     v21.8b,     v29.8b
        ld1             {v30.8b},   [x0], x3
        urhadd          v22.8b,     v22.8b,     v30.8b
        ld1             {v31.8b},   [x0], x3
        urhadd          v23.8b,     v23.8b,     v31.8b
        sub             x0,  x0,  x3,  lsl #3           // back to the first row
  .endif

        st1             {v16.8b},   [x0], x3
        st1             {v17.8b},   [x0], x3
        st1             {v18.8b},   [x0], x3
        st1             {v19.8b},   [x0], x3
        st1             {v20.8b},   [x0], x3
        st1             {v21.8b},   [x0], x3
        st1             {v22.8b},   [x0], x3
        st1             {v23.8b},   [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
||
411 | |||
// Shared core of the 8x8 two-dimensional (horizontal then vertical) filter.
// In:  x1 = source, x3 = source stride.
// Out: v16-v23 hold the eight result rows as bytes; nothing is stored.
//      x1 is left twelve rows past its entry value.
// The filter multipliers are loaded here (through w12) because lowpass_8.16
// overwrites v6.  Overwrites x12, v0-v7 and v16-v31.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        // 13 rows of 16 bytes each
        ld1             {v16.8h},   [x1], x3
        ld1             {v17.8h},   [x1], x3
        ld1             {v18.8h},   [x1], x3
        ld1             {v19.8h},   [x1], x3
        ld1             {v20.8h},   [x1], x3
        ld1             {v21.8h},   [x1], x3
        ld1             {v22.8h},   [x1], x3
        ld1             {v23.8h},   [x1], x3
        ld1             {v24.8h},   [x1], x3
        ld1             {v25.8h},   [x1], x3
        ld1             {v26.8h},   [x1], x3
        ld1             {v27.8h},   [x1], x3
        ld1             {v28.8h},   [x1]
        // first pass along each row, kept as 16-bit sums
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 is not loaded above, so its filtered value is not meaningful.
        // NOTE(review): presumably it ends up in lanes the second pass never
        // reads; confirm against transpose_8x8H in neon.S.
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        // second pass over the transposed 16-bit data, narrowing to bytes
        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc
||
454 | |||
// 8x8 two-dimensional filter with store.
// In:  x0 = destination, x2 = destination stride,
//      x1 = source,      x3 = source stride.
// The filtering is done by put_h264_qpel8_hv_lowpass_neon_top, which leaves
// the rows in v16-v23.  type = put stores them, type = avg first averages
// them (urhadd) with what is already at the destination.
// x10 holds the return address across the call.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8b},    [x0], x2
        urhadd          v16.8b,     v16.8b,     v0.8b
        ld1             {v1.8b},    [x0], x2
        urhadd          v17.8b,     v17.8b,     v1.8b
        ld1             {v2.8b},    [x0], x2
        urhadd          v18.8b,     v18.8b,     v2.8b
        ld1             {v3.8b},    [x0], x2
        urhadd          v19.8b,     v19.8b,     v3.8b
        ld1             {v4.8b},    [x0], x2
        urhadd          v20.8b,     v20.8b,     v4.8b
        ld1             {v5.8b},    [x0], x2
        urhadd          v21.8b,     v21.8b,     v5.8b
        ld1             {v6.8b},    [x0], x2
        urhadd          v22.8b,     v22.8b,     v6.8b
        ld1             {v7.8b},    [x0], x2
        urhadd          v23.8b,     v23.8b,     v7.8b
        sub             x0,  x0,  x2,  lsl #3           // back to the first row
  .endif

        st1             {v16.8b},   [x0], x2
        st1             {v17.8b},   [x0], x2
        st1             {v18.8b},   [x0], x2
        st1             {v19.8b},   [x0], x2
        st1             {v20.8b},   [x0], x2
        st1             {v21.8b},   [x0], x2
        st1             {v22.8b},   [x0], x2
        st1             {v23.8b},   [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
||
494 | |||
// 8x8 two-dimensional filter averaged with a packed second source.
// In:  x0 = destination, x3 = stride for destination and filter source,
//      x1 = filter source,
//      x2 = second source: 64 contiguous bytes, read as eight rows of eight
//           bytes and advanced by 64.
// The filtering is done by put_h264_qpel8_hv_lowpass_neon_top, which leaves
// the rows in v16-v23.  type = put stores the average of those rows and the
// second source, type = avg additionally averages with the destination.
// x10 holds the return address across the call.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8b,  v1.8b},  [x2], #16
        ld1             {v2.8b,  v3.8b},  [x2], #16
        urhadd          v0.8b,      v0.8b,      v16.8b
        urhadd          v1.8b,      v1.8b,      v17.8b
        ld1             {v4.8b,  v5.8b},  [x2], #16
        urhadd          v2.8b,      v2.8b,      v18.8b
        urhadd          v3.8b,      v3.8b,      v19.8b
        ld1             {v6.8b,  v7.8b},  [x2], #16
        urhadd          v4.8b,      v4.8b,      v20.8b
        urhadd          v5.8b,      v5.8b,      v21.8b
        urhadd          v6.8b,      v6.8b,      v22.8b
        urhadd          v7.8b,      v7.8b,      v23.8b
  .ifc \type,avg
        ld1             {v16.8b},   [x0], x3
        urhadd          v0.8b,      v0.8b,      v16.8b
        ld1             {v17.8b},   [x0], x3
        urhadd          v1.8b,      v1.8b,      v17.8b
        ld1             {v18.8b},   [x0], x3
        urhadd          v2.8b,      v2.8b,      v18.8b
        ld1             {v19.8b},   [x0], x3
        urhadd          v3.8b,      v3.8b,      v19.8b
        ld1             {v20.8b},   [x0], x3
        urhadd          v4.8b,      v4.8b,      v20.8b
        ld1             {v21.8b},   [x0], x3
        urhadd          v5.8b,      v5.8b,      v21.8b
        ld1             {v22.8b},   [x0], x3
        urhadd          v6.8b,      v6.8b,      v22.8b
        ld1             {v23.8b},   [x0], x3
        urhadd          v7.8b,      v7.8b,      v23.8b
        sub             x0,  x0,  x3,  lsl #3           // back to the first row
  .endif
        st1             {v0.8b},    [x0], x3
        st1             {v1.8b},    [x0], x3
        st1             {v2.8b},    [x0], x3
        st1             {v3.8b},    [x0], x3
        st1             {v4.8b},    [x0], x3
        st1             {v5.8b},    [x0], x3
        st1             {v6.8b},    [x0], x3
        st1             {v7.8b},    [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
||
546 | |||
// 16x16 two-dimensional filter, built from four 8x8 calls in the order
// top-left, bottom-left, top-right, bottom-right.  Each 8x8 call leaves x1
// twelve rows and x0 eight rows further down, which the pointer adjustments
// between the calls compensate for.  x13 holds the return address and the
// fourth call is a tail call.
.macro  h264_qpel16_hv type
// In:  x0 = destination, x2 = destination stride,
//      x1 = source,      x3 = source stride.
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2,  lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3,  lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// In:  x0 = destination, x3 = stride for destination and filter source,
//      x1 = filter source,
//      x4 = address 256 bytes past the start of the packed second source.
// x2 is set to x4 - 256 and then advances by 64 bytes per 8x8 call, so the
// four calls consume the packed buffer sequentially.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3,  lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3,  lsl #4
        sub             x1,  x1,  x3,  lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3,  lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3,  lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
||
584 | |||
// Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
// On entry x0, x1 and x2 are used as destination pointer, source pointer
// and line stride (the C prototype is not part of this file).
// NOTE(review): X / Y look like the quarter-sample offsets in x / y; this
// matches the pointer arithmetic below but is not stated in the file.
// Conventions shared by the entry points:
//   x14      return address wherever a helper is called with bl
//   x8, x9   copies of the destination / source pointers
//   x11      stack pointer on entry, restored before returning
//   x12      row count for horizontal helpers, or second-source pointer for
//            the vertical l2 helper
// Several entry points only set up registers and branch to the local label
// of another one (for example mc31 and mc13 both continue at mc11).
.macro  h264_qpel8 type
// horizontal filter averaged with the unshifted source
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// horizontal filter only; destination stride = source stride
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// horizontal filter averaged with the source one byte to the right
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// vertical filter averaged with the source (x12); mc03 enters at the label
// with x12 one row further down
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2,  lsl #1
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// horizontal filter of x1 into a 64-byte stack buffer, then vertical filter
// of x9 averaged with that buffer
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2,  lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// horizontal filter of x1 into the stack buffer, then two-dimensional filter
// of x9 averaged with that buffer
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes just written
        mov             x0,  x8
        sub             x1,  x9,  x2,  lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass (x9) taken one byte to the right
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// vertical filter only
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2,  lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// vertical filter of x1 into the stack buffer, then two-dimensional filter
// of x9 averaged with that buffer
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2,  lsl #1
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes just written
        mov             x0,  x8
        sub             x1,  x9,  x3,  lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// two-dimensional filter only; the helper loads the multipliers itself
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2,  lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc12, with the vertical pass (x1) taken one byte to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// as mc01, averaged with the source one row further down
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// as mc11, with the horizontal pass (x1) taken one row further down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// as mc21, with the horizontal pass (x1) taken one row further down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// as mc11, with the vertical pass (x9) one byte to the right and the
// horizontal pass (x1) one row further down
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
||
763 | |||
// Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon.
// On entry x0, x1 and x2 are used as destination pointer, source pointer
// and line stride (the C prototype is not part of this file).
// NOTE(review): X / Y look like the quarter-sample offsets in x / y; this
// matches the pointer arithmetic below but is not stated in the file.
// Conventions shared by the entry points:
//   x14      return address wherever a helper is called with bl
//   x8, x9   copies of the destination / source pointers
//   x11      stack pointer on entry, restored before returning
//   x12      second-source pointer for the vertical l2 helper
// Several entry points only set up registers and branch to the local label
// of another one (for example mc31 and mc13 both continue at mc11).
.macro  h264_qpel16 type
// horizontal filter averaged with the unshifted source
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// horizontal filter only; destination stride = source stride
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// horizontal filter averaged with the source one byte to the right
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// vertical filter averaged with the source (x12); mc03 enters at the label
// with x12 one row further down
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2,  lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// horizontal filter of x1 into a 256-byte stack buffer (stride 16), then
// vertical filter of x9 averaged with that buffer
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2,  lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// packed horizontal filter of x1 into the stack buffer, then two-dimensional
// filter of x9 averaged with that buffer
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                         // end of the packed data; the helper subtracts 256
        mov             x0,  x8
        sub             x1,  x9,  x2,  lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass (x9) taken one byte to the right
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// vertical filter only
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2,  lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// packed vertical filter of x1 into the stack buffer, then two-dimensional
// filter of x9 averaged with that buffer
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2,  lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                         // end of the packed data; the helper subtracts 256
        mov             x0,  x8
        sub             x1,  x9,  x3,  lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// two-dimensional filter only
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2,  lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                        // restore stack
        ret             x14
endfunc

// as mc12, with the vertical pass (x1) taken one byte to the right
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// as mc01, averaged with the source one row further down
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// as mc11, with the horizontal pass (x1) taken one row further down
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// as mc21, with the horizontal pass (x1) taken one row further down
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// as mc11, with the vertical pass (x9) one byte to the right and the
// horizontal pass (x1) one row further down
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg