/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

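@ NEON versions of the AAC parametric stereo (PS) DSP routines.  The argument
@ and register layouts noted below are assumed from the C prototypes in
@ libavcodec/aacpsdsp.h, not stated in this file.

@ ff_ps_add_squares(dst, src, n):
@   dst[i] += src[i][0]^2 + src[i][1]^2   for i in [0, n)
@ Assumed mapping: r0 = dst, r1 = src (complex pairs), r2 = n.
@ Four complex samples are accumulated per loop iteration.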
function ff_ps_add_squares_neon, export=1
        mov             r3,  r0
        sub             r2,  r2,  #4
        vld1.32         {q0},     [r1,:128]!
        vmul.f32        q0,  q0,  q0
        vld1.32         {q2},     [r1,:128]!
        vmul.f32        q2,  q2,  q2
        vld1.32         {q1},     [r0,:128]!
1:
        vpadd.f32       d6,  d0,  d1
        vld1.32         {q0},     [r1,:128]!
        vpadd.f32       d7,  d4,  d5
        vmul.f32        q0,  q0,  q0
        vld1.32         {q2},     [r1,:128]!
        vadd.f32        q3,  q1,  q3
        vld1.32         {q1},     [r0,:128]!
        vmul.f32        q2,  q2,  q2
        vst1.32         {q3},     [r3,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        vpadd.f32       d6,  d0,  d1
        vpadd.f32       d7,  d4,  d5
        vadd.f32        q1,  q1,  q3
        vst1.32         {q1},     [r3,:128]!
        bx              lr
endfunc

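@ ff_ps_mul_pair_single(dst, src0, src1, n):
@   dst[i][0] = src0[i][0] * src1[i];  dst[i][1] = src0[i][1] * src1[i]
@ Assumed mapping: r0 = dst, r1 = src0 (complex pairs), r2 = src1 (reals),
@ r3 = n.  The "tst r1, #8" selects the second code path when src0 is only
@ 8-byte aligned.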
function ff_ps_mul_pair_single_neon, export=1
        sub             r3,  r3,  #4
        tst             r1,  #8
        bne             2f
        vld1.32         {q0},     [r1,:128]!
1:
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {q1},     [r1,:128]!
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d3,  d7[1]
        vld1.32         {q0},     [r1,:128]!
        vst1.32         {q2,q3},  [r0,:128]!
        subs            r3,  r3,  #4
        bgt             1b
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {q1},     [r1,:128]!
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d3,  d7[1]
        vst1.32         {q2,q3},  [r0,:128]!
        bx              lr
2:
        vld1.32         {d0},     [r1,:64]!
        vld1.32         {d1,d2},  [r1,:128]!
1:
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {d0,d1},  [r1,:128]!
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d0,  d7[1]
        vmov            d0,  d1
        vld1.32         {d1,d2},  [r1,:128]!
        vst1.32         {q2,q3},  [r0,:128]!
        subs            r3,  r3,  #4
        bgt             1b
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        d4,  d0,  d6[0]
        vmul.f32        d5,  d1,  d6[1]
        vld1.32         {d0},     [r1,:64]!
        vmul.f32        d6,  d2,  d7[0]
        vmul.f32        d7,  d0,  d7[1]
        vst1.32         {q2,q3},  [r0,:128]!
        bx              lr
endfunc

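@ ff_ps_hybrid_synthesis_deint(out, in, i, len):
@   out[0][n][j] = in[j][n][0];  out[1][n][j] = in[j][n][1]
@   for j in [i, 64), n in [0, len)
@ i.e. de-interleave the complex hybrid samples into the separate real and
@ imaginary planes of out[2][38][64].
@ Assumed mapping: r0 = out, r1 = in, r2 = i, r3 = len.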
function ff_ps_hybrid_synthesis_deint_neon, export=1
        push            {r4-r8,lr}
        add             r0,  r0,  r2,  lsl #2
        add             r1,  r1,  r2,  lsl #5+1+2
        rsb             r2,  r2,  #64
        mov             r5,  #64*4
        mov             lr,  r0
        add             r4,  r0,  #38*64*4
        mov             r12, r3
2:
        vld1.32         {d0,d1},  [r1,:128]!
        vst1.32         {d0[0]},  [lr,:32], r5
        vst1.32         {d0[1]},  [r4,:32], r5
        vst1.32         {d1[0]},  [lr,:32], r5
        vst1.32         {d1[1]},  [r4,:32], r5
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #4
        sub             r2,  r2,  #1
        tst             r2,  #2
        bne             6f
1:
        mov             lr,  r0
        add             r4,  r0,  #38*64*4
        add             r6,  r1,  #  32*2*4
        add             r7,  r1,  #2*32*2*4
        add             r8,  r1,  #3*32*2*4
        mov             r12, r3
2:
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r6,:128]!
        vld1.32         {d4,d5},  [r7,:128]!
        vld1.32         {d6,d7},  [r8,:128]!
        vst4.32         {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
        vst4.32         {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
        vst4.32         {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
        vst4.32         {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #16
        add             r1,  r1,  #3*32*2*4
        subs            r2,  r2,  #4
        bgt             1b
        pop             {r4-r8,pc}
6:
        mov             lr,  r0
        add             r4,  r0,  #38*64*4
        add             r6,  r1,  #32*2*4
        mov             r12, r3
2:
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r6,:128]!
        vst2.32         {d0[0],d2[0]}, [lr,:64], r5
        vst2.32         {d0[1],d2[1]}, [r4,:64], r5
        vst2.32         {d1[0],d3[0]}, [lr,:64], r5
        vst2.32         {d1[1],d3[1]}, [r4,:64], r5
        subs            r12, r12, #2
        bgt             2b
        add             r0,  r0,  #8
        add             r1,  r1,  #32*2*4
        sub             r2,  r2,  #2
        b               1b
endfunc

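@ ff_ps_hybrid_analysis(out, in, filter, stride, n): applies n 13-tap complex
@ analysis filters to the 13 complex samples at in.  Symmetric taps are
@ folded up front (the vadd/vsub block), so each filter effectively uses its
@ first 7 complex coefficients; filter i writes its complex result to
@ out[i * stride].
@ Assumed mapping: r0 = out, r1 = in, r2 = filter, r3 = stride (in complex
@ samples), [sp] = n.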
function ff_ps_hybrid_analysis_neon, export=1
        vldm            r1,  {d19-d31}
        ldr             r12, [sp]
        lsl             r3,  r3,  #3
        vadd.f32        d16, d19, d31
        vadd.f32        d17, d20, d30
        vsub.f32        d18, d19, d31
        vsub.f32        d19, d20, d30
        vsub.f32        d0,  d21, d29
        vsub.f32        d1,  d22, d28
        vadd.f32        d2,  d21, d29
        vadd.f32        d3,  d22, d28
        vadd.f32        d20, d23, d27
        vadd.f32        d21, d24, d26
        vsub.f32        d22, d23, d27
        vsub.f32        d23, d24, d26
        vmov.i32        d6,  #1<<31
        vmov.i32        d7,  #0
        vmov.f32        q14, #0.0
        vmov.f32        q15, #0.0
        vtrn.32         d6,  d7
        vrev64.32       q9,  q9
        vrev64.32       q0,  q0
        vrev64.32       q11, q11
        veor            q9,  q9,  q3
        veor            q0,  q0,  q3
        veor            q11, q11, q3
        vld1.32         {q13},    [r2,:128]!
        vtrn.32         q8,  q9
        vtrn.32         q1,  q0
        vtrn.32         q10, q11
        sub             r12, r12, #1
        vmla.f32        q14, q8,  q13
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q9,  q13
1:
        vmla.f32        q14, q1,  q2
        vld1.32         {q13},    [r2,:128]!
        vmla.f32        q15, q0,  q2
        vmla.f32        q14, q10, q13
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q11, q13
        vld1.32         {q13},    [r2,:128]!
        vadd.f32        d6,  d28, d29
        vadd.f32        d7,  d30, d31
        vmov.f32        q14, #0.0
        vmov.f32        q15, #0.0
        vmla.f32        q14, q8,  q13
        vpadd.f32       d6,  d6,  d7
        vmla.f32        q15, q9,  q13
        vmla.f32        d6,  d25, d4[0]
        vld1.32         {q2},     [r2,:128]!
        vst1.32         {d6},     [r0,:64], r3
        subs            r12, r12, #1
        bgt             1b
        vmla.f32        q14, q1,  q2
        vld1.32         {q13},    [r2,:128]!
        vmla.f32        q15, q0,  q2
        vmla.f32        q14, q10, q13
        vld1.32         {q2},     [r2,:128]!
        vmla.f32        q15, q11, q13
        vadd.f32        d6,  d28, d29
        vadd.f32        d7,  d30, d31
        vpadd.f32       d6,  d6,  d7
        vmla.f32        d6,  d25, d4[0]
        vst1.32         {d6},     [r0,:64], r3
        bx              lr
endfunc

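@ ff_ps_stereo_interpolate(l, r, h, h_step, len): in-place stereo upmix with
@ a linearly interpolated real 2x2 mixing matrix.  For each complex sample
@ pair, the four coefficients are first advanced by h_step, then
@   l[n] = h[0]*l[n] + h[2]*r[n]
@   r[n] = h[1]*l[n] + h[3]*r[n]
@ Assumed mapping: r0 = l, r1 = r, r2 = h, r3 = h_step, [sp] = len.
@ Two samples are processed per loop iteration; an odd trailing sample is
@ handled after label 2.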
function ff_ps_stereo_interpolate_neon, export=1
        vld1.32         {q0},     [r2]
        vld1.32         {q14},    [r3]
        vadd.f32        q15, q14, q14
        mov             r2,  r0
        mov             r3,  r1
        ldr             r12, [sp]
        vadd.f32        q1,  q0,  q14
        vadd.f32        q0,  q0,  q15
        vld1.32         {q2},     [r0,:64]!
        vld1.32         {q3},     [r1,:64]!
        subs            r12, r12, #1
        beq             2f
1:
        vmul.f32        d16, d4,  d2[0]
        vmul.f32        d17, d5,  d0[0]
        vmul.f32        d18, d4,  d2[1]
        vmul.f32        d19, d5,  d0[1]
        vmla.f32        d16, d6,  d3[0]
        vmla.f32        d17, d7,  d1[0]
        vmla.f32        d18, d6,  d3[1]
        vmla.f32        d19, d7,  d1[1]
        vadd.f32        q1,  q1,  q15
        vadd.f32        q0,  q0,  q15
        vld1.32         {q2},     [r0,:64]!
        vld1.32         {q3},     [r1,:64]!
        vst1.32         {q8},     [r2,:64]!
        vst1.32         {q9},     [r3,:64]!
        subs            r12, r12, #2
        bgt             1b
        it              lt
        bxlt            lr
2:
        vmul.f32        d16, d4,  d2[0]
        vmul.f32        d18, d4,  d2[1]
        vmla.f32        d16, d6,  d3[0]
        vmla.f32        d18, d6,  d3[1]
        vst1.32         {d16},    [r2,:64]!
        vst1.32         {d18},    [r3,:64]!
        bx              lr
endfunc