/*
 * Copyright (c) 2011 Janne Grunau
 * Copyright (c) 2011 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

.macro  qpel_lowpass    r0,  r1,  rc1, rc2, shift
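        @ 6-tap RV40 lowpass on one row of 8 pixels.  \r0:\r1 hold 16
        @ consecutive source bytes starting at src[-2]; the result
        @   src[-2] - 5*src[-1] + \rc1*src[0] + \rc2*src[1] - 5*src[2] + src[3]
        @ is rounded, narrowed by \shift with unsigned saturation and
        @ written back to \r0.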
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d27, \r0, \r1, #2       @ src[ 0]
        vshl.s16        q12, q9,  #2
        vsub.s16        q8,  q8,  q9
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vsub.s16        q8,  q8,  q12
        vmlal.u8        q8,  d27, \rc1
        vmlal.u8        q8,  d28, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
.endm

.macro  qpel_lowpass_x2 r0,  r1,  r2,  r3,  rc1, rc2, shift
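        @ Same filter as qpel_lowpass, applied to two independent rows
        @ (\r0:\r1 and \r2:\r3) with the operations interleaved; the two
        @ filtered rows are returned in \r0 and \r2.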
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d29, \r0, \r1, #2       @ src[ 0]
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ src[ 3]
        vaddl.u8        q13, \r1, d22
        vaddl.u8        q12, \r2, \r0
        vsub.s16        q8,  q8,  q10
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1
        vmlal.u8        q8,  d28, \rc2
        vsub.s16        q12, q12, q9
        vext.8          d26, \r2, \r3, #2       @ src[ 0]
        vext.8          d27, \r2, \r3, #3       @ src[ 1]
        vmlal.u8        q12, d26, \rc1
        vmlal.u8        q12, d27, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
        vqrshrun.s16    \r2, q12, #\shift
.endm

.macro  rv40_qpel8_h    shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
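        @ Horizontal pass into a packed scratch buffer:
        @ r1 = src, r2 = src stride, r3 = row count, r12 = scratch buffer
        @ (8 bytes per row, written post-incremented).  r3+1 rows are
        @ filtered so the following vertical pass has the extra lines it needs.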
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        vst1.8          {d6},     [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        vld1.8          {q2},     [r1]
        qpel_lowpass    d4,  d5,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        bx              lr
endfunc
.endm

.macro  rv40_qpel8_v    shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
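        @ Vertical pass over the packed scratch buffer: 13 rows of 8 bytes
        @ are loaded from r1, transposed so that columns become rows, run
        @ through the same lowpass macros, transposed back and stored as an
        @ 8x8 block to r0 with stride r2 (the "avg" variant averages with
        @ the existing destination first).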
        vld1.64         {d2},     [r1,:64]!
        vld1.64         {d3},     [r1,:64]!
        vld1.64         {d4},     [r1,:64]!
        vld1.64         {d5},     [r1,:64]!
        vld1.64         {d6},     [r1,:64]!
        vld1.64         {d7},     [r1,:64]!
        vld1.64         {d8},     [r1,:64]!
        vld1.64         {d9},     [r1,:64]!
        vld1.64         {d10},    [r1,:64]!
        vld1.64         {d11},    [r1,:64]!
        vld1.64         {d12},    [r1,:64]!
        vld1.64         {d13},    [r1,:64]!
        vld1.64         {d14},    [r1,:64]!
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
.endm

        rv40_qpel8_h    5
        rv40_qpel8_h    6

.macro  rv40_qpel       type
function \type\()_rv40_qpel8_h_lowpass_neon
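        @ Plain horizontal lowpass straight to the destination:
        @ r0 = dst, r1 = src (both with stride r2), r3 = height; the middle
        @ filter taps are expected in d0/d1 and the shift is 6.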
  .ifc \type,avg
        mov             r12, r0
  .endif
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  6
  .ifc \type,avg
        vld1.8          {d3},     [r12,:64], r2
        vld1.8          {d16},    [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3
        vrhadd.u8       d6,  d6,  d16
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d6},     [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},     [r1], r2
        vld1.64         {d3},     [r1], r2
        vld1.64         {d4},     [r1], r2
        vld1.64         {d5},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vld1.64         {d7},     [r1], r2
        vld1.64         {d8},     [r1], r2
        vld1.64         {d9},     [r1], r2
        vld1.64         {d10},    [r1], r2
        vld1.64         {d11},    [r1], r2
        vld1.64         {d12},    [r1], r2
        vld1.64         {d13},    [r1], r2
        vld1.64         {d14},    [r1]
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc

        rv40_qpel8_v    5, \type
        rv40_qpel8_v    6, \type
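
@ ff_{put,avg}_rv40_qpel8_mcXY: X and Y are the quarter-pel position of the
@ motion vector.  d0/d1 pick the middle filter taps (52/20, 20/52 or 20/20)
@ and the _s5/_s6 suffix the normalisation shift; the 20/20 pair with shift 5
@ is the half-pel case.  2D positions run the horizontal pass into an
@ 8-byte-aligned stack buffer and feed it to the packed vertical pass.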

function ff_\type\()_rv40_qpel8_mc10_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc30_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc01_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc11_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc21_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc31_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc12_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc22_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc32_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc03_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc33_neon, export=1
        mov             r3,  #8
        b               X(ff_\type\()_pixels8_xy2_neon)
endfunc

function ff_\type\()_rv40_qpel8_mc13_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc23_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc10_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_h:
        push            {r1, lr}
        sub             r1,  r1,  #2
        mov             r3,  #16
        bl              \type\()_rv40_qpel8_h_lowpass_neon
        pop             {r1, lr}
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #6
        mov             r3,  #16
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel16_mc30_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_h
endfunc

function ff_\type\()_rv40_qpel16_mc01_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_v:
        sub             r1,  r1,  r2,  lsl #1
        push            {r1, lr}
        vpush           {d8-d15}
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        ldr             r1,  [sp, #64]
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc11_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc21_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d0,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc31_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc12_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d0,  #20
.L\type\()_rv40_qpel16_v_s5:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc22_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc32_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d1,  #20
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc03_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v
endfunc

function ff_\type\()_rv40_qpel16_mc13_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc23_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc33_neon, export=1
        mov             r3,  #16
        b               X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm

        rv40_qpel       put
        rv40_qpel       avg

.macro  rv40_weight
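        @ RV40 weighted prediction:
        @   dst = ((src1*w2 >> 9) + (src2*w1 >> 9) + 16) >> 5
        @ with the weights packed into d0 by the callers (d0[0] = w1,
        @ d0[2] = w2), src1 pixels in q1, src2 pixels in q2 and the result
        @ returned in d2/d3.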
        vmovl.u8        q8,  d2
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5
        vrshrn.i16      d3,  q3,  #5
.endm

/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                    int w1, int w2, int stride) */
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #16
1:
        vld1.8          {q1},     [r1,:128], r12
        vld1.8          {q2},     [r2,:128], r12
        rv40_weight
        vst1.8          {q1},     [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                   int w1, int w2, int stride) */
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #8
1:
        vld1.8          {d2},     [r1,:64], r12
        vld1.8          {d3},     [r1,:64], r12
        vld1.8          {d4},     [r2,:64], r12
        vld1.8          {d5},     [r2,:64], r12
        rv40_weight
        vst1.8          {d2},     [r0,:64], r12
        vst1.8          {d3},     [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc

function ff_rv40_h_loop_filter_strength_neon, export=1
        pkhbt           r2,  r3,  r2,  lsl #18

        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1
        teq             r3,  r12
        beq             1f

        sub             r0,  r0,  r1,  lsl #1

        vld1.32         {d4[]},   [r0,:32], r1  @ -3
        vld1.32         {d0[]},   [r0,:32], r1  @ -2
        vld1.32         {d4[1]},  [r0,:32], r1  @ -1
        vld1.32         {d5[]},   [r0,:32], r1  @  0
        vld1.32         {d1[]},   [r0,:32], r1  @  1
        vld1.32         {d5[0]},  [r0,:32], r1  @  2

        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2,  1,  1,  1,  1
        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1,  2,  2,  0,  0
        vdup.32         d30, r2                 @ beta2, beta << 2
        vpadd.u16       d16, d16, d17           @ -2, -2,  1,  1
        vpadd.u16       d18, d18, d19           @ -3, -1,  2,  0
        vabd.u16        d16, d18, d16
        vclt.u16        d16, d16, d30

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15
        ldr             r0,  [sp]
        vst1.32         {d24[1]}, [r2,:32]
        vst1.32         {d25[1]}, [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d18, d16, d17
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]
        bx              lr
1:
        ldrd            r2,  r3,  [sp, #4]
        mov             r0,  #0
        str             r0,  [r2]
        str             r0,  [r3]
        bx              lr
endfunc

function ff_rv40_v_loop_filter_strength_neon, export=1
        sub             r0,  r0,  #3
        pkhbt           r2,  r3,  r2,  lsl #18

        vld1.8          {d0},     [r0], r1
        vld1.8          {d1},     [r0], r1
        vld1.8          {d2},     [r0], r1
        vld1.8          {d3},     [r0], r1

        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1            @ -3, -2, -1,  0,  1,  2
        vext.16         q1,  q0,  q0,  #1       @ -2, -1,  0,  1,  2
        vabd.u16        q0,  q1,  q0
        vclt.u16        q0,  q0,  q15

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15
        ldr             r0,  [sp]
        vst1.32         {d2[1]},  [r2,:32]
        vst1.32         {d3[1]},  [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d0,  d0,  d1
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]
        bx              lr
endfunc

.macro  rv40_weak_loop_filter
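        @ RV40 weak in-loop deblocking on a 4-pixel-wide edge.  The caller
        @ packs the six pixel rows/columns -3..2 into d4/d5 and d0/d1,
        @ passes the p1/q1 filter flags in r2/r3 and the alpha/beta/lim
        @ thresholds on the stack; the filtered pixels -2..1 come back in
        @ d5 (-2, 1) and d4 (-1, 0).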
        vdup.16         d30, r2                 @ filter_p1
        vdup.16         d31, r3                 @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                 @ alpha
        vdup.16         d29, r3                 @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]
        vsubl.u8        q9,  d5,  d4            @ x, t
        vabdl.u8        q8,  d5,  d4            @ x, abs(t)
        vneg.s16        q15, q15
        vceq.i16        d16, d19, #0            @ !t
        vshl.s16        d19, d19, #2            @ t << 2
        vmul.u16        d18, d17, d28           @ alpha * abs(t)
        vand            d24, d30, d31           @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4            @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5            @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7
        vadd.i16        d22, d22, d24           @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1            @ src[-2] - src[1]
        vcle.u16        d18, d18, d22
        vand            d20, d20, d24
        vneg.s16        d23, d25                @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16           @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                 @ -3,  2, -1,  0
        vrshr.s16       d19, d19, #3
        vmov            d28, d29                @ beta
        vswp            d3,  d6                 @ q1q2, p1p0
        vmin.s16        d19, d19, d25
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3            @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23           @ diff
        vabs.s16        q1,  q1                 @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16           @ diff
        vcle.u16        q1,  q1,  q14
        vneg.s16        d19, d18                @ -diff
        vdup.16         d26, r3                 @ lim_p1
        vaddw.u8        q2,  q9,  d5            @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9
        vand            q1,  q1,  q15
        vqmovun.s16     d4,  q2                 @ -1,  0
        vand            q9,  q11, q1
        vdup.16         d27, r2                 @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13
        vmin.s16        q9,  q9,  q13
        vtrn.32         d0,  d1                 @ -2,  1,  -2,  1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0
        vqmovun.s16     d5,  q3                 @ -2,  1
.endm

function ff_rv40_h_weak_loop_filter_neon, export=1
        sub             r0,  r0,  r1,  lsl #1
        sub             r0,  r0,  r1

        vld1.32         {d4[]},   [r0,:32], r1
        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d4[1]},  [r0,:32], r1
        vld1.32         {d5[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d5[0]},  [r0,:32]

        sub             r0,  r0,  r1,  lsl #2

        rv40_weak_loop_filter

        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1

        bx              lr
endfunc

function ff_rv40_v_weak_loop_filter_neon, export=1
        sub             r12, r0,  #3
        sub             r0,  r0,  #2

        vld1.8          {d4},     [r12], r1
        vld1.8          {d5},     [r12], r1
        vld1.8          {d2},     [r12], r1
        vld1.8          {d3},     [r12], r1

        vtrn.16         q2,  q1
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3

        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]
        vdup.32         d1,  d2[0]

        rv40_weak_loop_filter

        vtrn.32         q2,  q3
        vswp            d4,  d5

        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1

        bx              lr
endfunc