Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
/*
 * Copyright (c) 2008 Mans Rullgard
 * Copyright (c) 2013 Janne Grunau
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
 
22
#include "libavutil/aarch64/asm.S"
23
#include "neon.S"
24
 
25
        /* H.264 qpel MC */
26
 
27
// Load the 6-tap filter multipliers used by the lowpass macros into v6.
//   r: 32-bit general purpose scratch register (w form); overwritten.
// On exit v6.H[0] = 5 and v6.H[1] = 20. Only lane S[0] of v6 is written.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16        // upper halfword = 20
        movk            \r, #5                  // lower halfword = 5
        mov             v6.S[0], \r             // H[0] = 5, H[1] = 20
.endm

// Apply the 6-tap filter (1, -5, 20, 20, -5, 1) along two rows of bytes.
//
//   r0:r1  first source row, 8 bytes in each register, r1 following r0
//   r2:r3  second source row, same layout
//   d0     result for the first row, d1 result for the second row
//   narrow when non-zero (the default) each 16-bit sum is rounded, shifted
//          right by 5 and saturated to an unsigned byte; otherwise the raw
//          16-bit sums are left in d0/d1 as .8H
//
// For output lane i the sum is s[i] + s[i+5] + 20*(s[i+2] + s[i+3])
// - 5*(s[i+1] + s[i+4]), where s is the 16-byte row r0:r1 (or r2:r3).
// Needs v6.H[0] = 5 and v6.H[1] = 20 (see lowpass_const).
// d0 is written before r2/r3 are read, so d0 must not be r2 or r3.
//trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,      v2.8B,     v3.8B            // row 0: s[i+2] + s[i+3]
        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,      v4.8B,     v5.8B            // row 0: s[i+1] + s[i+4]
        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H,  \r0\().8B, v1.8B            // row 0: s[i] + s[i+5]
        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
        mla             \d0\().8H,  v2.8H,     v6.H[1]          // row 0: += 20 * centre pair
        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
        uaddl           v0.8H,      v0.8B,     v1.8B            // row 1: s[i+2] + s[i+3]
        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
        mls             \d0\().8H,  v4.8H,     v6.H[0]          // row 0: -= 5 * inner pair
        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
        uaddl           v1.8H,      v1.8B,     v3.8B            // row 1: s[i+1] + s[i+4]
        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
        uaddl           \d1\().8H,  \r2\().8B, v2.8B            // row 1: s[i] + s[i+5]
        mla             \d1\().8H,  v0.8H,     v6.H[1]
        mls             \d1\().8H,  v1.8H,     v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm

// Apply the 6-tap filter (1, -5, 20, 20, -5, 1) along two 16-byte rows and
// keep the unrounded 16-bit sums.
//
//   r0, r1  on entry: one row of 16 source bytes each
//           on exit:  eight 16-bit sums (.8H) for lanes 0-7 of that row
//
// Each ext uses the same register for both operands, i.e. it rotates the row,
// and only the low 8 bytes of each rotated copy feed the sums.
// Needs v6.H[0] = 5 and v6.H[1] = 20 (see lowpass_const).
//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H,      v0.8B,      v1.8B           // r0: s[i+2] + s[i+3]
        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H,      v2.8B,      v3.8B           // r0: s[i+1] + s[i+4]
        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H,  \r0\().8B,  v30.8B          // r0: s[i] + s[i+5]
        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H,  v0.8H,      v6.H[1]         // r0: += 20 * centre pair
        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H,      v4.8B,      v5.8B           // r1: s[i+2] + s[i+3]
        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H,  v2.8H,      v6.H[0]         // r0: -= 5 * inner pair
        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H,      v7.8B,      v0.8B           // r1: s[i+1] + s[i+4]
        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H,  \r1\().8B,  v31.8B          // r1: s[i] + s[i+5]
        mla             \r1\().8H,  v4.8H,      v6.H[1]
        mls             \r1\().8H,  v7.8H,      v6.H[0]
.endm

// Single-row form of lowpass_8: applies the 6-tap filter
// (1, -5, 20, 20, -5, 1) along one row of bytes.
//
//   r0:r1  source row, 8 bytes in each register, r1 following r0
//   d0     result
//   narrow when non-zero (the default) each 16-bit sum is rounded, shifted
//          right by 5 and saturated to an unsigned byte; otherwise the raw
//          16-bit sums are left in d0 as .8H
//
// Needs v6.H[0] = 5 and v6.H[1] = 20 (see lowpass_const).
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,     v2.8B,     v3.8B             // s[i+2] + s[i+3]
        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,     v4.8B,     v5.8B             // s[i+1] + s[i+4]
        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H, \r0\().8B, v30.8B            // s[i] + s[i+5]
        mla             \d0\().8H, v2.8H,     v6.H[1]           // += 20 * centre pair
        mls             \d0\().8H, v4.8H,     v6.H[0]           // -= 5 * inner pair
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
.endm

// Second filter pass on 16-bit intermediates, with 32-bit accumulation.
//
//   r0:r1  sixteen signed 16-bit inputs, r1 following r0
//   r2     receives eight output bytes (.8B)
//
// For output lane i the 32-bit sum is
//   p[i] + p[i+5] + 20*(p[i+2] + p[i+3]) - 5*(p[i+1] + p[i+4])
// where the products are built from shifts: 20*x = (x << 4) + (x << 2) and
// 5*x = x + (x << 2). The sum is rounded and shifted right by 10, narrowed
// to 16 bits and then saturated to an unsigned byte.
//
// r1 is overwritten, and so is v6: the constants set up by lowpass_const are
// gone after this macro and must be reloaded before the next use of the
// mla/mls based lowpass macros.
// trashed v0-v7
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
        saddl           v5.4S,      v1.4H,      v0.4H           // low half:  p[i+2] + p[i+3]
        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
        saddl2          v1.4S,      v1.8H,      v0.8H           // high half: p[i+2] + p[i+3]
        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
        saddl           v6.4S,      v2.4H,      v3.4H           // low half:  p[i+1] + p[i+4]
        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
        saddl2          v2.4S,      v2.8H,      v3.8H           // high half: p[i+1] + p[i+4]
        saddl           v0.4S,      \r0\().4H,  \r1\().4H       // low half:  p[i] + p[i+5]
        saddl2          v4.4S,      \r0\().8H,  \r1\().8H       // high half: p[i] + p[i+5]

        // low half: v5 = 20 * centre pair, v6 = 5 * inner pair
        shl             v3.4S,  v5.4S,  #4
        shl             v5.4S,  v5.4S,  #2
        shl             v7.4S,  v6.4S,  #2
        add             v5.4S,  v5.4S,  v3.4S
        add             v6.4S,  v6.4S,  v7.4S

        // high half: v1 = 20 * centre pair, v2 = 5 * inner pair
        shl             v3.4S,  v1.4S,  #4
        shl             v1.4S,  v1.4S,  #2
        shl             v7.4S,  v2.4S,  #2
        add             v1.4S,  v1.4S,  v3.4S
        add             v2.4S,  v2.4S,  v7.4S

        add             v5.4S,  v5.4S,  v0.4S
        sub             v5.4S,  v5.4S,  v6.4S

        add             v1.4S,  v1.4S,  v4.4S
        sub             v1.4S,  v1.4S,  v2.4S

        rshrn           v5.4H,  v5.4S,  #10
        rshrn2          v5.8H,  v1.4S,  #10

        sqxtun          \r2\().8B,  v5.8H
.endm

// Horizontal lowpass of a 16-wide, 16-row area into a packed buffer.
//
// In:  x0 = dst buffer, x1 = src, x2 = src stride, v6 = filter constants
// The 8-wide routine is run twice with a dst stride of 8 and x0 is not
// rewound in between, so the left 8x16 half is stored first and the right
// 8x16 half directly after it.
// Clobbers x3, x4 (return address), x12, plus whatever the 8-wide routine
// clobbers. The second run is a tail call and returns to the caller.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4            // back up 16 source rows
        add             x1,  x1,  #8                    // right half
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

// Horizontal 6-tap lowpass, instantiated below for type = put and type = avg.
//
// Register interface, as used by the code:
//   x0  = dst, advanced by x3 for every stored row
//   x1  = src, advanced by x2 for every loaded row (16 bytes are read per row)
//   x2  = src stride, x3 = dst stride
//   x12 = row count for the 8-wide routine (decremented by 2 per iteration,
//         the loop ends when it reaches zero); the 16-wide routine sets it
//   v6  = filter constants from lowpass_const
// Clobbers v0-v5, v16, v17, v28, v29, x12 and the flags; the 16-wide routine
// also clobbers x13.
.macro  h264_qpel_h_lowpass type
// 16-wide, 16 rows: filters the left half with a call, rewinds both pointers
// by 16 rows, steps 8 bytes to the right and then runs off the end of this
// function into the 8-wide routine below for the right half.
// NOTE(review): the fall-through relies on endfunc/function emitting nothing
// executable in between - confirm against the asm.S macros.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide, x12 rows, two rows per loop iteration.
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        // average with the two rows already in dst; x0 ends up where it started
        ld1             {v2.8B},    [x0], x3
        urhadd          v28.8B, v28.8B,  v2.8B
        ld1             {v3.8B},    [x0]
        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3
  .endif
        st1             {v28.8B},    [x0], x3
        st1             {v16.8B},    [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

// Horizontal 6-tap lowpass averaged with a second source, instantiated below
// for type = put and type = avg.
//
// Register interface, as used by the code:
//   x0  = dst
//   x1  = src for the filter (16 bytes are read per row)
//   x3  = second source, 8 bytes per row, averaged with the filter output
//   x2  = stride applied to x0, x1 and x3 alike
//   x12 = row count for the 8-wide routine (decremented by 2 per iteration,
//         the loop ends when it reaches zero); the 16-wide routine sets it
//   v6  = filter constants from lowpass_const
// Clobbers v0-v5, v16, v17, v26-v29, x12 and the flags; the 16-wide routine
// also clobbers x13.
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide, 16 rows: left half via a call, then all three pointers are moved
// back 16 rows and 8 bytes right, and execution runs off the end of this
// function into the 8-wide routine below for the right half.
// NOTE(review): the fall-through relies on endfunc/function emitting nothing
// executable in between - confirm against the asm.S macros.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide, x12 rows, two rows per loop iteration.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        ld1             {v28.8B},     [x3], x2
        ld1             {v29.8B},     [x3], x2
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B          // average with the second source
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        // average with the two rows already in dst; x0 ends up where it started
        ld1             {v2.8B},      [x0], x2
        urhadd          v26.8B, v26.8B, v2.8B
        ld1             {v3.8B},      [x0]
        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v26.8B},     [x0], x2
        st1             {v27.8B},     [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

// Vertical lowpass of a 16x16 area into a packed buffer.
//
// In:  x0 = dst buffer, x1 = src, x3 = src stride, v6 = filter constants
// The 8x8 routine is run four times with a dst stride of 8 and x0 is never
// rewound, so the blocks are stored back to back in the order: left half rows
// 0-7, left half rows 8-15, right half rows 0-7, right half rows 8-15.
// Each 8x8 call leaves x1 advanced by 12 rows; the subtractions below turn
// that into a net step of 8 rows, or a return to row 0 of the right half.
// Clobbers x2, x4 (return address), plus whatever the 8x8 routine clobbers.
// The last block is a tail call and returns to the caller.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2            // 12 - 4 = 8 rows down
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2            // 8 + 12 - 20 = row 0 again
        add             x1,  x1,  #8                    // right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

// Vertical 6-tap lowpass, instantiated below for type = put and type = avg.
//
// Register interface, as used by the code:
//   x0 = dst, advanced by x2 for every stored row
//   x1 = src, advanced by x3 for every loaded row
//   x2 = dst stride, x3 = src stride
//   v6 = filter constants from lowpass_const
// The 8x8 routine reads 13 source rows and leaves x1 advanced by 12 rows.
// Clobbers v0-v5 and v16-v31; the 16x16 routine also clobbers x4.
.macro  h264_qpel_v_lowpass type
// 16x16: three 8x8 blocks via calls (left rows 0-7, left rows 8-15, right
// rows 0-7), then execution runs off the end of this function into the 8x8
// routine below for the last block.
// NOTE(review): the fall-through relies on endfunc/function emitting nothing
// executable in between - confirm against the asm.S macros.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2            // 12 - 4 = 8 rows down
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4            // dst back to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2            // src back to row 0 ...
        add             x1,  x1,  #8                    // ... of the right half
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: the rows are transposed so that the row filter lowpass_8 runs along
// what were columns, and the result is transposed back before the store.
// NOTE(review): transpose_8x8B is not defined in this file (presumably it
// comes from neon.S); the description assumes a plain 8x8 byte transpose.
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        // v27, v29 and v31 take part in the second transpose without having
        // been loaded. NOTE(review): their bytes appear to land only in lanes
        // 5-7 of the odd registers, which lowpass_8 does not read - confirm.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        // average with the 8 rows already in dst, then rewind x0 by 8 rows
        ld1             {v24.8B},  [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

// Vertical 6-tap lowpass averaged with a second source, instantiated below
// for type = put and type = avg.
//
// Register interface, as used by the code:
//   x0  = dst, advanced by x3 for every stored row
//   x1  = src for the filter, advanced by x3 for every loaded row
//   x12 = second source, 8 bytes per row, advanced by x2 per row
//   x2  = stride of the second source, x3 = stride of src and dst
//   v6  = filter constants from lowpass_const
// The 8x8 routine reads 13 source rows and leaves x1 advanced by 12 rows.
// Clobbers v0-v5 and v16-v31; the 16x16 routine also clobbers x4.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16: three 8x8 blocks via calls (left rows 0-7, left rows 8-15, right
// rows 0-7), then execution runs off the end of this function into the 8x8
// routine below for the last block.
// NOTE(review): the fall-through relies on endfunc/function emitting nothing
// executable in between - confirm against the asm.S macros.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2            // 12 - 4 = 8 rows down
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4            // dst and second source go
        sub             x12, x12, x2, lsl #4            // back to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2            // src back to row 0 ...
        add             x1,  x1,  #8                    // ... of the right half
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: the rows are transposed so that the row filter lowpass_8 runs along
// what were columns, and the result is transposed back before averaging.
// NOTE(review): transpose_8x8B is not defined in this file (presumably it
// comes from neon.S); the description assumes a plain 8x8 byte transpose.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        // v27, v29 and v31 take part in the second transpose without having
        // been loaded. NOTE(review): their bytes appear to land only in lanes
        // 5-7 of the odd registers, which lowpass_8 does not read - confirm.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        // average the filtered rows with the second source
        ld1             {v24.8B},  [x12], x2
        ld1             {v25.8B},  [x12], x2
        ld1             {v26.8B},  [x12], x2
        ld1             {v27.8B},  [x12], x2
        ld1             {v28.8B},  [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B},  [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        // average with the 8 rows already in dst, then rewind x0 by 8 rows
        ld1             {v24.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

// Shared core of the 8x8 horizontal + vertical lowpass.
//
// In:  x1 = src, x3 = src stride (13 rows of 16 bytes are read)
// Out: v16-v23 = the 8 result rows, 8 bytes each; nothing is stored
//      x1 is left advanced by 12 rows
// Steps: lowpass_8H filters every row horizontally into 16-bit sums, the sums
// are transposed, lowpass_8.16 filters along the former columns and narrows
// to bytes, and a byte transpose restores row order.
// The filter constants are loaded here (w12 is the scratch register) because
// lowpass_8.16 overwrites v6.
// Clobbers w12, v0-v7 and v16-v31.
// NOTE(review): transpose_8x8H / transpose_8x8B are not defined in this file
// (presumably they come from neon.S); plain 8x8 transposes are assumed.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 is filtered as the partner of v28 although it was never loaded.
        // NOTE(review): v29-v31 appear to supply only lanes 5-7 of v24-v31
        // after the transpose, which lowpass_8.16 does not read - confirm.
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc

// 8x8 horizontal + vertical lowpass with store, instantiated below for
// type = put and type = avg.
//
// In:  x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// Out: x0 advanced by 8 rows, x1 advanced by 12 rows
// The filtering is done by put_h264_qpel8_hv_lowpass_neon_top, which returns
// the 8 rows in v16-v23. x10 keeps the return address across that call.
// Clobbers x10, w12, v0-v7 and v16-v31.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        // average with the 8 rows already in dst, then rewind x0 by 8 rows
        ld1             {v0.8B},      [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v1.8B},      [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v2.8B},      [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v3.8B},      [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v4.8B},      [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v5.8B},      [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        ld1             {v6.8B},      [x0], x2
        urhadd          v22.8B, v22.8B, v6.8B
        ld1             {v7.8B},      [x0], x2
        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2,  lsl #3
  .endif

        st1             {v16.8B},     [x0], x2
        st1             {v17.8B},     [x0], x2
        st1             {v18.8B},     [x0], x2
        st1             {v19.8B},     [x0], x2
        st1             {v20.8B},     [x0], x2
        st1             {v21.8B},     [x0], x2
        st1             {v22.8B},     [x0], x2
        st1             {v23.8B},     [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

// 8x8 horizontal + vertical lowpass averaged with a packed second source,
// instantiated below for type = put and type = avg.
//
// In:  x0 = dst, x3 = stride of dst and src, x1 = src
//      x2 = second source: 64 contiguous bytes, i.e. 8 rows of 8 bytes
// Out: x0 advanced by 8 rows, x1 advanced by 12 rows, x2 advanced by 64
// The filtering is done by put_h264_qpel8_hv_lowpass_neon_top, which returns
// the 8 rows in v16-v23. x10 keeps the return address across that call.
// Clobbers x10, w12, v0-v7 and v16-v31.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        // average the filtered rows with the second source, two rows per load
        ld1             {v0.8B, v1.8B},  [x2], #16
        ld1             {v2.8B, v3.8B},  [x2], #16
        urhadd          v0.8B,  v0.8B,  v16.8B
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v4.8B, v5.8B},  [x2], #16
        urhadd          v2.8B,  v2.8B,  v18.8B
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v6.8B, v7.8B},  [x2], #16
        urhadd          v4.8B,  v4.8B,  v20.8B
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
        // average with the 8 rows already in dst, then rewind x0 by 8 rows
        ld1             {v16.8B},     [x0], x3
        urhadd          v0.8B,  v0.8B,  v16.8B
        ld1             {v17.8B},     [x0], x3
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v18.8B},     [x0], x3
        urhadd          v2.8B,  v2.8B,  v18.8B
        ld1             {v19.8B},     [x0], x3
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v20.8B},     [x0], x3
        urhadd          v4.8B,  v4.8B,  v20.8B
        ld1             {v21.8B},     [x0], x3
        urhadd          v5.8B,  v5.8B,  v21.8B
        ld1             {v22.8B},     [x0], x3
        urhadd          v6.8B,  v6.8B,  v22.8B
        ld1             {v23.8B},     [x0], x3
        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3,  lsl #3
  .endif
        st1             {v0.8B},      [x0], x3
        st1             {v1.8B},      [x0], x3
        st1             {v2.8B},      [x0], x3
        st1             {v3.8B},      [x0], x3
        st1             {v4.8B},      [x0], x3
        st1             {v5.8B},      [x0], x3
        st1             {v6.8B},      [x0], x3
        st1             {v7.8B},      [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

// 16x16 horizontal + vertical lowpass built from four 8x8 calls, instantiated
// below for type = put and type = avg.
//
// Block order in both functions: left rows 0-7, left rows 8-15, right rows
// 0-7, right rows 8-15. Each 8x8 call leaves x1 advanced by 12 rows and x0 by
// 8 rows; the subtractions turn that into the next block position.
// x13 keeps the return address; the last block is a tail call.
.macro  h264_qpel16_hv  type
// In: x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2            // 12 - 4 = 8 rows down
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2            // src back to row 0 ...
        add             x1,  x1,  #8                    // ... of the right half
        sub             x0,  x0,  x2, lsl #4            // dst back to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// In: x0 = dst, x1 = src, x3 = stride of dst and src
//     x4 = address 256 bytes past the start of the packed second source,
//          which the four 8x8 calls consume 64 bytes at a time through x2
// NOTE(review): x4 presumably is the end pointer left by one of the packed
// 16-wide lowpass routines - confirm against the callers.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2            // 12 - 4 = 8 rows down
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2            // src back to row 0 ...
        add             x1,  x1,  #8                    // ... of the right half
        sub             x0,  x0,  x3, lsl #4            // dst back to row 0 ...
        add             x0,  x0,  #8                    // ... of the right half
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

// Exported 8x8 entry points, instantiated below for type = put and type = avg.
//
// Every entry point uses x0 as dst, x1 as src and x2 as the stride of both.
// Conventions shared by the bodies:
//   - lowpass_const uses w3 as scratch, so x3 only receives its real value
//     after the constants are loaded
//   - x14 keeps the return address wherever helpers are reached with bl
//   - x8 / x9 keep dst and a source pointer across the first helper call
//   - x11 keeps the caller sp while a temporary buffer lives on the stack
//   - the labels without the ff_ prefix are shared bodies that other entry
//     points branch to after setting up x14, x8, x9 (or x12) and x1
.macro  h264_qpel8      type
// Horizontal lowpass averaged with the source block itself (x3 = src).
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2                    // filter taps start 2 bytes left
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// Horizontal lowpass only.
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// Horizontal lowpass averaged with the source block one byte to the right.
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// Vertical lowpass averaged with the source block itself (x12 = src).
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1            // filter taps start 2 rows up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// Horizontal lowpass of the block at x1 into a 64 byte stack buffer, then
// vertical lowpass of the block at x9 averaged with that buffer.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8                         // buffer rows are 8 bytes apart
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp                         // second source = buffer
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8                         // stride of the buffer
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Horizontal lowpass of the block at x1 into a stack buffer, then the
// horizontal + vertical lowpass of the block at x9 averaged with that buffer.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes just written
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Shared mc11 body with the vertical source (x9) one byte to the right and
// the horizontal source (x1) at src.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// Vertical lowpass only.
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// Vertical lowpass of the block at x1 into a stack buffer, then the
// horizontal + vertical lowpass of the block at x9 averaged with that buffer.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8                         // buffer rows are 8 bytes apart
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                         // end of the 64 bytes just written
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64                   // start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Horizontal + vertical lowpass only. No stack buffer is allocated here, so
// the save and restore of sp through x11 leaves sp unchanged.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// Shared mc12 body with the vertical source (x1) one byte to the right and
// the horizontal + vertical source (x9) at src.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// Shared mc01 body with the second source (x12) one row below src.
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// Shared mc11 body with the horizontal source (x1) one row below src.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// Shared mc21 body with the horizontal source (x1) one row below src.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// Shared mc11 body with the vertical source (x9) one byte to the right and
// the horizontal source (x1) one row below src.
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

.macro  h264_qpel16     type
765
// 16x16: horizontal lowpass averaged with the source block itself.
// In: x0 = dst, x1 = src, x2 = stride of both.
// w3 is only scratch for lowpass_const; x3 then becomes the second source.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2                    // filter taps start 2 bytes left
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// 16x16: horizontal lowpass only.
// In: x0 = dst, x1 = src, x2 = stride of both.
// w3 is only scratch for lowpass_const; x3 then becomes the dst stride.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2                    // filter taps start 2 bytes left
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// qpel16 mc30 entry point: same helper as mc10, but x3 is one byte past
// the incoming x1 instead of equal to it.
// State handed to the helper:
//   v6.S[0] = (20 << 16) | 5   (loaded by lowpass_const)
//   x1      = incoming x1 - 2
//   x3      = incoming x1 + 1
// x0 and x2 are passed through untouched.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        sub             x1,  x1,  #2
        add             x3,  x1,  #3            // (x1_in - 2) + 3 = x1_in + 1
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
785
 
786
// qpel16 mc01 entry point, plus the shared <type>_h264_qpel16_mc01 label
// that mc03 also jumps to.
// Contract of the shared label: x14 holds the return address and x12 is
// already set by whoever branches here (mc01 uses x1, mc03 uses x1 + x2).
// The shared part loads the filter constants into v6, sets x3 = x2,
// rewinds x1 by 2 * x2 and calls the "l2" vertical low-pass helper, then
// returns through x14.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x12, x1
        mov             x14, x30                // x30 is overwritten by the bl below
\type\()_h264_qpel16_mc01:
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc
796
 
797
// qpel16 mc11 entry point, plus the shared <type>_h264_qpel16_mc11 label
// that mc31, mc13 and mc33 also jump to.
// Contract of the shared label: x14 = return address, x8 = destination
// to hand to the second pass, x9 = base pointer for the second pass,
// x1 = base pointer for the first pass, x2 passed through from the caller.
//
// Pass 1: put_h264_qpel16_h_lowpass_neon is called with x0 = a 256-byte
//         scratch area carved from the stack, x1 - 2 and x3 = 16.
// Pass 2: the "l2" vertical low-pass helper is called with x0 = x8,
//         x1 = x9 - 2 * x2, x3 = original x2, x12 = scratch area and
//         x2 = 16.
// sp is snapshotted in x11 and restored before returning via x14.
// NOTE(review): the 256 bytes presumably hold a 16x16 byte block written
// with pitch 16 by pass 1 -- confirm against the helper.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x9,  x1
        mov             x8,  x0
        mov             x14, x30                // x30 is overwritten by the bl calls
\type\()_h264_qpel16_mc11:
        mov             x11, sp                 // remember sp for the epilogue
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        sub             x1,  x1,  #2
        mov             x3,  #16
        sub             sp,  sp,  #256
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x12, sp                 // scratch area feeds the second pass
        sub             x1,  x9,  x2, lsl #1
        mov             x3,  x2                 // must read x2 before it becomes 16
        mov             x2,  #16
        mov             x0,  x8
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc
818
 
819
// qpel16 mc21 entry point, plus the shared <type>_h264_qpel16_mc21 label
// that mc23 also jumps to.
// Contract of the shared label: x14 = return address, x8 = destination
// for the second pass, x9 = base pointer for the second pass,
// x1 = base pointer for the first pass, x2 passed through from the caller.
//
// Pass 1: put_h264_qpel16_h_lowpass_neon_packed is called with x0 = a
//         (16*16 + 16*12)-byte scratch area on the stack and x1 - 2.
//         Whatever it leaves in x0 is forwarded to pass 2 in x4.
// Pass 2: the "l2" hv low-pass helper is called with x0 = x8,
//         x1 = x9 - 2 * x2 - 2, x3 = x2 and x4 as above.
// sp is snapshotted in x11 and restored before returning via x14.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x9,  x1
        mov             x8,  x0
        mov             x14, x30                // x30 is overwritten by the bl calls
\type\()_h264_qpel16_mc21:
        mov             x11, sp                 // remember sp for the epilogue
        lowpass_const   w3                      // scratches w3
        sub             x1,  x1,  #2
        sub             sp,  sp,  #(16*16+16*12)
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // grab x0 before it is reloaded
        mov             x3,  x2
        sub             x1,  x9,  x2, lsl #1
        mov             x0,  x8
        sub             x1,  x1,  #2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc
839
 
840
// qpel16 mc31 entry point: continues at the shared mc11 label with the
// saved second-pass pointer shifted one byte.
// Register state at the branch:
//   x14 = copy of x30
//   x8  = x0 as passed in
//   x9  = incoming x1 + 1
//   x1  = incoming x1 (the +1 / -1 pair cancels, so x1 is left alone)
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x9,  x1,  #1            // second-pass base: one byte right
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel16_mc11
endfunc
848
 
849
// qpel16 mc02 entry point: single call to the plain vertical low-pass
// helper.
// State handed to the helper:
//   v6.S[0] = (20 << 16) | 5   (loaded by lowpass_const)
//   x1      = incoming x1 - 2 * x2
//   x3      = x2
// x0 and x2 are passed through untouched.  Returns through x14 because
// the bl overwrites x30.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        mov             x3,  x2
        mov             x14, x30
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc
857
 
858
// qpel16 mc12 entry point, plus the shared <type>_h264_qpel16_mc12 label
// that mc32 also jumps to.
// Contract of the shared label: x14 = return address, x8 = destination
// for the second pass, x9 = base pointer for the second pass,
// x1 = base pointer for the first pass, x2 passed through from the caller.
//
// Pass 1: put_h264_qpel16_v_lowpass_neon_packed is called with x0 = a
//         (16*16 + 16*12)-byte scratch area on the stack,
//         x1 - 2 * x2 and x3 = x2.  Whatever it leaves in x0 is
//         forwarded to pass 2 in x4.
// Pass 2: the "l2" hv low-pass helper is called with x0 = x8,
//         x1 = x9 - 2 * x3 - 2, x2 = x3 and x4 as above.
// After pass 1 only x3 is read, never x2.
// NOTE(review): this relies on the pass-1 helper leaving x3 intact --
// confirm against the helper.
// sp is snapshotted in x11 and restored before returning via x14.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x9,  x1
        mov             x8,  x0
        mov             x14, x30                // x30 is overwritten by the bl calls
\type\()_h264_qpel16_mc12:
        mov             x11, sp                 // remember sp for the epilogue
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        sub             sp,  sp,  #(16*16+16*12)
        mov             x0,  sp
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // grab x0 before it is reloaded
        mov             x2,  x3
        sub             x1,  x9,  x3, lsl #1
        mov             x0,  x8
        sub             x1,  x1,  #2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc
879
 
880
// qpel16 mc22 entry point: single call to the plain hv low-pass helper.
// State handed to the helper:
//   v6.S[0] = (20 << 16) | 5   (loaded by lowpass_const)
//   x1      = incoming x1 - 2 * x2 - 2
//   x3      = x2
// x0 and x2 are passed through untouched.
// sp is copied to x11 up front and written back after the call; this
// block never moves sp itself.
// NOTE(review): the restore only matters if the helper returns with sp
// changed -- confirm against the helper.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x11, sp
        mov             x14, x30                // x30 is overwritten by the bl below
        lowpass_const   w3                      // scratches w3; x3 is set afterwards
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                // restore stack
        ret             x14
endfunc
891
 
892
// qpel16 mc32 entry point: continues at the shared mc12 label with the
// first-pass pointer shifted one byte.
// Register state at the branch:
//   x14 = copy of x30
//   x8  = x0 as passed in
//   x9  = x1 as passed in   (base for the second pass in mc12)
//   x1  = x9 + 1            (base for the first pass in mc12)
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x9,  x1                 // keep the unshifted x1
        mov             x8,  x0
        mov             x14, x30
        add             x1,  x9,  #1
        b               \type\()_h264_qpel16_mc12
endfunc
899
 
900
// qpel16 mc03 entry point: same path as mc01, but enters at the shared
// mc01 label with x12 = x1 + x2 instead of x12 = x1.
// Register state at the branch: x14 = copy of x30, x12 = x1 + x2;
// x0, x1 and x2 are passed through untouched.
// NOTE(review): x2 is presumably the line stride, which would make x12
// point one row below x1 -- confirm against the caller prototype.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        add             x12, x1,  x2
        mov             x14, x30
        b               \type\()_h264_qpel16_mc01
endfunc
905
 
906
// qpel16 mc13 entry point: continues at the shared mc11 label with the
// first-pass pointer moved forward by x2.
// Register state at the branch:
//   x14 = copy of x30
//   x8  = x0 as passed in
//   x9  = x1 as passed in   (base for the second pass in mc11)
//   x1  = x9 + x2           (base for the first pass in mc11)
// NOTE(review): x2 is presumably the line stride -- confirm.
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x9,  x1                 // keep the unshifted x1
        add             x1,  x9,  x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel16_mc11
endfunc
913
 
914
// qpel16 mc23 entry point: continues at the shared mc21 label with the
// first-pass pointer moved forward by x2.
// Register state at the branch:
//   x14 = copy of x30
//   x8  = x0 as passed in
//   x9  = x1 as passed in   (base for the second pass in mc21)
//   x1  = x9 + x2           (base for the first pass in mc21)
// NOTE(review): x2 is presumably the line stride -- confirm.
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x8,  x0
        mov             x9,  x1                 // keep the unshifted x1
        mov             x14, x30
        add             x1,  x9,  x2
        b               \type\()_h264_qpel16_mc21
endfunc
921
 
922
// qpel16 mc33 entry point: continues at the shared mc11 label with both
// pass pointers offset.
// Register state at the branch:
//   x14 = copy of x30
//   x8  = x0 as passed in
//   x9  = incoming x1 + 1    (base for the second pass in mc11)
//   x1  = incoming x1 + x2   (base for the first pass in mc11)
// (The +1 / -1 pair applied to x1 cancels, so x1 is formed directly.)
// NOTE(review): x2 is presumably the line stride -- confirm.
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x9,  x1,  #1            // saved pointer: one byte right
        add             x1,  x1,  x2            // live pointer: advanced by x2
        mov             x8,  x0
        mov             x14, x30
        b               \type\()_h264_qpel16_mc11
endfunc
931
.endm
932
 
933
        // Expand the h264_qpel16 macro twice, once per value of its "type"
        // argument.  Every function in the macro body is named
        // ff_<type>_h264_qpel16_mcXY_neon, so this emits a "put" family and
        // an "avg" family of entry points.  Several bodies call put_* helpers
        // by fixed name regardless of <type>.
        h264_qpel16 put
934
        h264_qpel16 avg