/* Source: Kolibri OS Subversion repository, revision 6148 (author: serge). */

/*
 * Copyright (c) 2011 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * bflies d0, d1, r0, r1
 *
 * Butterfly step shared by transform01 and transform2.
 *
 *   d0, d1 : operands, both overwritten.  With t being the vector that
 *            the first five instructions assemble in r0 from d1:
 *                d1 <- (d0 - t) halved   (uses the incoming d0)
 *                d0 <- (d0 + t) halved
 *   r0, r1 : scratch registers, clobbered.
 *
 * All additions and subtractions are the halving forms (vhadd/vhsub) on
 * signed 16-bit lanes.  The operand width (64 or 128 bit) is whatever
 * register size the caller passes; every call site in this file passes
 * q registers.  The trailing comments track lane contents in the
 * original t1..t6 notation.
 */
.macro  bflies          d0,  d1,  r0,  r1
        vrev64.32       \r0, \d1                @ t5, t6, t1, t2
        vhsub.s16       \r1, \d1, \r0           @ t1-t5, t2-t6, t5-t1, t6-t2
        vhadd.s16       \r0, \d1, \r0           @ t1+t5, t2+t6, t5+t1, t6+t2
        vext.16         \r1, \r1, \r1, #1       @ t2-t6, t5-t1, t6-t2, t1-t5
        vtrn.32         \r0, \r1                @ t1+t5, t2+t6, t2-t6, t5-t1
                                                @ t5,    t6,    t4,    t3
        vhsub.s16       \d1, \d0, \r0           @ difference first: it still needs the old d0
        vhadd.s16       \d0, \d0, \r0           @ then the sum replaces d0
.endm

/*
 * transform01 q0, q1, d3, c0, c1, r0, w0, w1
 *
 * Applies one pair of coefficient vectors to d3, then runs the bflies
 * butterfly on q0/q1.
 *
 *   d3     : signed 16-bit lanes, replaced by
 *                (d3 * c0 + swap(d3) * c1) >> 15
 *            where swap() exchanges the two halfwords of every 32-bit
 *            element (vrev32.16).  Products are accumulated in 32 bits
 *            and narrowed with a truncating shift (vshrn).
 *   c0, c1 : 64-bit coefficient vectors, read only.
 *   r0     : 64-bit scratch for the swapped copy of d3.
 *   w0, w1 : 128-bit scratch.  w0 holds the 32-bit products; both are
 *            then handed to bflies as its scratch registers.
 *   q0, q1 : butterfly operands, updated in place by bflies.
 *
 * At every call site in this file d3 is the upper half of q1, so the
 * scaled value feeds straight into the butterfly.
 */
.macro  transform01     q0,  q1,  d3,  c0,  c1,  r0,  w0,  w1
        vrev32.16       \r0, \d3                @ halfword-swapped copy of the input
        vmull.s16       \w0, \d3, \c0           @ 32-bit products with c0
        vmlal.s16       \w0, \r0, \c1           @ accumulate swapped input times c1
        vshrn.s32       \d3, \w0, #15           @ narrow back to 16 bit, shift right 15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, r0, r1, w0, w1
 *
 * Two-vector variant of transform01: scales d1 and d3 with independent
 * coefficient pairs, then runs the bflies butterfly on q0/q1.
 *
 *   d1     : replaced by (d1 * c0 + swap(d1) * c1) >> 15
 *   d3     : replaced by (d3 * c2 + swap(d3) * c3) >> 15
 *            swap() exchanges the halfwords of every 32-bit element.
 *   d0, d2 : accepted for symmetry with the call sites but not
 *            referenced by the macro body.
 *   c0..c3 : 64-bit coefficient vectors, read only.
 *   r0, r1 : 64-bit scratch for the swapped copies.
 *   w0, w1 : 128-bit accumulators, afterwards reused as bflies scratch.
 *   q0, q1 : butterfly operands, updated in place by bflies.
 *
 * At every call site in this file d1 and d3 are the two halves of q1.
 */
.macro  transform2      d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  c2,  c3, \
                        r0,  r1,  w0,  w1
        vrev32.16       \r0, \d1                @ halfword-swapped copies of both inputs
        vrev32.16       \r1, \d3
        vmull.s16       \w0, \d1, \c0           @ first input: products with c0 ...
        vmlal.s16       \w0, \r0, \c1           @ ... plus swapped input times c1
        vmull.s16       \w1, \d3, \c2           @ second input: products with c2 ...
        vmlal.s16       \w1, \r1, \c3           @ ... plus swapped input times c3
        vshrn.s32       \d1, \w0, #15           @ narrow both results back to 16 bit
        vshrn.s32       \d3, \w1, #15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * fft4 d0, d1, r0, r1
 *
 * Four-point transform kernel on signed 16-bit lanes.
 *
 *   d0, d1 : input data, replaced by the result.  The lane layout on
 *            exit is the one given by the comments on the last two
 *            instructions (r0,i0,r1,i3 in d0 and r2,i2,r3,i1 in d1).
 *   r0, r1 : scratch registers, clobbered.
 *
 * d1 is also used as a temporary for the bit mask 0x0000ffff00000000,
 * which selects halfword lane 2 of every 64-bit element; vbit copies
 * that lane from r1 (d1 - d0) into r0 (d0 - d1), i.e. it flips the
 * sign of that one difference.  All adds/subs are the halving forms.
 * Used with d registers by fft4_neon and with q registers by
 * fft16_neon.
 */
.macro  fft4            d0,  d1,  r0,  r1
        vhsub.s16       \r0, \d0, \d1           @ t3, t4, t8, t7
        vhsub.s16       \r1, \d1, \d0
        vhadd.s16       \d0, \d0, \d1           @ t1, t2, t6, t5
        vmov.i64        \d1, #0xffff00000000
        vbit            \r0, \r1, \d1
        vrev64.16       \r1, \r0                @ t7, t8, t4, t3
        vtrn.32         \r0, \r1                @ t3, t4, t7, t8
        vtrn.32         \d0, \r0                @ t1, t2, t3, t4, t6, t5, t8, t7
        vhsub.s16       \d1, \d0, \r0           @ r2, i2, r3, i1
        vhadd.s16       \d0, \d0, \r0           @ r0, i0, r1, i3
.endm

/*
 * fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
 *
 * Eight-point kernel: an fft4 on d0/d1, a halving add/sub on d2/d3,
 * then transform01 to combine the two halves.
 *
 *   d0..d3 : the data, updated in place.
 *   q0, q1 : the same data viewed as q registers; both call sites pass
 *            q0 = {d0,d1} and q1 = {d2,d3}.
 *   c0, c1 : coefficient vectors forwarded to transform01.
 *   r0, r1 : 64-bit scratch (r1 is only used inside fft4).
 *   w0, w1 : 128-bit scratch forwarded to transform01.
 */
.macro  fft8            d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  r0,  r1, w0, w1
        fft4            \d0, \d1, \r0, \r1
        vtrn.32         \d0, \d1                @ z0, z2, z1, z3
        vhadd.s16       \r0, \d2, \d3           @ t1, t2, t3, t4
        vhsub.s16       \d3, \d2, \d3           @ z5, z7
        vmov            \d2, \r0
        transform01     \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm

/*
 * fft4_neon
 *
 * In-place four-point transform.
 *   r0 : pointer to 16 bytes of data (loaded and stored as eight
 *        16-bit values; no alignment qualifier is used here).
 * Clobbers d0-d3.  Leaves r0 unchanged and returns with bx lr.
 */
function fft4_neon
        vld1.16         {d0-d1},  [r0]
        fft4            d0,  d1,  d2,  d3
        vst1.16         {d0-d1},  [r0]
        bx              lr
endfunc

/*
 * fft8_neon
 *
 * In-place eight-point transform.
 *   r0 : pointer to 32 bytes of data; the :128 qualifier on the load
 *        and store requires 16-byte alignment.
 * Clobbers r1, d0-d3, d16-d21 (q8, q9, d20, d21) and d30-d31.
 * Leaves r0 unchanged and returns with bx lr.
 */
function fft8_neon
        vld1.16         {d0-d3},  [r0,:128]
        movrel          r1,  coefs
        vld1.16         {d30},    [r1,:64]      @ first row of coefs (signed sqrt(1/2) pattern)
        vdup.16         d31, d30[0]             @ d31 = first entry of that row in every lane
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d30, d20, d21, q8, q9
        vtrn.32         d0,  d1                 @ reorder the 32-bit elements before storing
        vtrn.32         d2,  d3
        vst1.16         {d0-d3},  [r0,:128]
        bx              lr
endfunc

/*
 * fft16_neon
 *
 * In-place sixteen-point transform: an fft8 on the first 32 bytes, a
 * q-register fft4 on the second 32 bytes, then one transform01 and one
 * transform2 pass to combine them.
 *   r0 : pointer to 64 bytes of data; the :128 qualifiers require
 *        16-byte alignment.
 * Clobbers r1, d0-d7, d16-d21 and d26-d31.  r0 ends up pointing at the
 * second 32-byte half (the first store post-increments it).  Returns
 * with bx lr.
 */
function fft16_neon
        vld1.16         {d0-d3},  [r0,:128]!
        vld1.16         {d4-d7},  [r0,:128]
        movrel          r1,  coefs
        sub             r0,  r0,  #32           @ undo the post-increment of the first load
        vld1.16         {d28-d31},[r1,:128]     @ all four rows of coefs
        vdup.16         d31, d28[0]             @ d31 = first coefs entry in every lane
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d28, d20, d21, q8, q9
        vswp            d5,  d6
        fft4            q2,  q3,  q8,  q9
        vswp            d5,  d6
        vtrn.32         q0,  q1             @ z0, z4, z2, z6, z1, z5, z3, z7
        vtrn.32         q2,  q3             @ z8, z12,z10,z14,z9, z13,z11,z15
        vswp            d1,  d2
        vdup.16         d31, d28[0]
        transform01     q0,  q2,  d5,  d31, d28, d20, q8, q9
        vdup.16         d26, d29[0]             @ first entry of the second coefs row
        vdup.16         d27, d30[0]             @ first entry of the third coefs row
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d26, d30, d27, d29, \
                        d20, d21, q8,  q9
        vtrn.32         q0,  q1
        vtrn.32         q2,  q3
        vst1.16         {d0-d3},  [r0,:128]!
        vst1.16         {d4-d7},  [r0,:128]
        bx              lr
endfunc

/*
 * fft_pass_neon
 *
 * Combining pass used by the fftN_neon functions generated by def_fft.
 *
 *   r0 : data pointer.  Each iteration reads and writes four 16-byte
 *        groups at r0, r0 + r12, r0 + 2*r12 and r0 + 3*r12, where
 *        r12 = r2 * 8 is computed once from the incoming r2.  The
 *        :128 qualifiers require 16-byte alignment.
 *   r1 : pointer to a table of 16-bit coefficients, read forwards
 *        8 bytes per iteration.  A second stream is read backwards
 *        from the same table, starting 8 bytes below r1 + r2 * 4.
 *   r2 : iteration counter; decremented by 2 per iteration, the loop
 *        runs while it stays above zero.
 *
 * q15 (d30/d31) holds the fourth row of coefs (1, -1, -1, 1) twice and
 * is used with vmul.s16 to apply that sign pattern to the duplicated
 * coefficients.
 *
 * The first iteration is peeled: it handles the d0/d1/d4/d5 half with
 * transform01 and then joins the common tail at label 2.  Subsequent
 * iterations start at label 1 and use transform2 for that half.
 *
 * Saves and restores r4 and lr.  Clobbers r0-r3, r12, d0-d7, d16-d21
 * and d24-d31.
 */
function fft_pass_neon
        push            {r4,lr}
        movrel          lr,  coefs+24           @ fourth 8-byte row of coefs: 1, -1, -1, 1
        vld1.16         {d30},    [lr,:64]
        lsl             r12, r2,  #3            @ r12 = r2 * 8, byte stride between the groups
        vmov            d31, d30                @ q15 = sign pattern in both halves
        add             r3,  r1,  r2,  lsl #2
        mov             lr,  #-8                @ post-index step of the backward stream
        sub             r3,  r3,  #2            @ r3 = r1 + r2 * 4 - 2
        mov             r4,  r0
        vld1.16         {d27[]},  [r3,:16]      @ that single halfword, replicated to all lanes
        sub             r3,  r3,  #6            @ r3 = r1 + r2 * 4 - 8, start of backward stream
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!     @ forward coefficients, r1 advances by 8
        vld1.16         {d29},    [r3,:64], lr  @ backward coefficients, r3 steps back by 8
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d25, d28[1]
        vmul.s16        d27, d27, d31           @ apply the sign pattern
        transform01     q0,  q2,  d5,  d25, d27, d20, q8,  q9
        b               2f                      @ first iteration joins the common tail
1:
        mov             r4,  r0
        vdup.16         d26, d29[0]             @ lane 0 of the d29 loaded by the previous iteration
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!
        vld1.16         {d29},    [r3,:64], lr
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d24, d28[0]
        vdup.16         d25, d28[1]
        vdup.16         d27, d29[3]
        vmul.s16        q13, q13, q15           @ sign pattern on d26/d27
        transform2      d0,  d4,  d1,  d5,  q0,  q2,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
2:
        vtrn.32         d2,  d3
        vtrn.32         d6,  d7
        vdup.16         d24, d28[2]
        vdup.16         d26, d29[2]
        vdup.16         d25, d28[3]
        vdup.16         d27, d29[1]
        vmul.s16        q13, q13, q15           @ sign pattern on d26/d27
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
        vtrn.32         d0,  d1                 @ undo the load-time permutation before storing
        vtrn.32         d2,  d3
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vswp            d1,  d2
        vswp            d5,  d6
        mov             r4,  r0
        vst1.16         {q0},     [r4,:128], r12
        vst1.16         {q1},     [r4,:128], r12
        vst1.16         {q2},     [r4,:128], r12
        vst1.16         {q3},     [r4,:128], r12
        add             r0,  r0,  #16           @ next 16-byte group
        subs            r2,  r2,  #2
        bgt             1b
        pop             {r4,pc}
endfunc

/*
 * Fixed-point constants, scaled by 2^15:
 *   F_SQRT1_2  = 23170 ~ 2^15 * sqrt(1/2)
 *   F_COS_16_1 = 30274 ~ 2^15 * cos(2*pi * 1/16)
 *   F_COS_16_3 = 12540 ~ 2^15 * cos(2*pi * 3/16)
 */
#define F_SQRT1_2   23170
#define F_COS_16_1  30274
#define F_COS_16_3  12540

/*
 * coefs: four rows of four signed halfwords (8 bytes per row), each row
 * being one constant with the sign pattern +, -, -, +.  The last row is
 * the bare sign pattern; fft_pass_neon loads it from coefs+24.
 * fft8_neon loads the first row only, fft16_neon loads all four.
 */
const   coefs, align=4
        .short          F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2,  F_SQRT1_2
        .short          F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
        .short          F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
        .short          1,         -1,         -1,          1
endconst

/*
 * def_fft n, n2, n4
 *
 * Emits the function fft<n>_neon, built from the smaller transforms:
 *   1. fft<n2>_neon on the buffer at r0,
 *   2. fft<n4>_neon at byte offset n4*2*4,
 *   3. fft<n4>_neon at byte offset n4*3*4,
 *   4. a tail branch to fft_pass_neon with r0 = original buffer,
 *      r1 = address of ff_cos_<n>_fixed and r2 = n4/2.
 *
 * Every instantiation in this file uses n2 = n/2 and n4 = n/4.  The
 * byte offsets suggest 4-byte elements (presumably an int16 pair per
 * complex value; confirm against the C side).
 *
 * r4 keeps the buffer pointer across the three calls.  r4 and lr are
 * restored before the branch, so fft_pass_neon returns directly to the
 * caller of fft<n>_neon.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon
        push            {r4, lr}
        mov             r4,  r0                 @ keep the buffer pointer across the calls
        bl              fft\n2\()_neon
        add             r0,  r4,  #\n4*2*4
        bl              fft\n4\()_neon
        add             r0,  r4,  #\n4*3*4
        bl              fft\n4\()_neon
        mov             r0,  r4
        pop             {r4, lr}                @ restore before the tail branch
        movrelx         r1,  X(ff_cos_\n\()_fixed)
        mov             r2,  #\n4/2
        b               fft_pass_neon
endfunc
.endm

        @ Instantiate fft32_neon .. fft65536_neon.  Each size builds on the
        @ two sizes below it (n/2 and n/4); fft16_neon and fft8_neon, which
        @ the first two entries need, are written out by hand in this file.
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_fixed_calc_neon (exported)
 *
 * Dispatches to the transform selected by the first 32-bit word of the
 * structure at r0.
 *   r0 : pointer to a structure; its first word minus 2 indexes
 *        fft_fixed_tab_neon, so 2 selects fft4_neon and 16 selects
 *        fft65536_neon.  Presumably this is the log2 transform size
 *        held in the FFT context (confirm against the C declaration).
 *        The value is not range-checked here.
 *   r1 : data pointer, handed to the selected function in r0.
 *
 * The selected function is entered with bx, so it returns directly to
 * the caller.  Clobbers r2 and r3 in addition to whatever the selected
 * function clobbers.
 */
function ff_fft_fixed_calc_neon, export=1
        ldr             r2,  [r0]               @ first word of the structure
        sub             r2,  r2,  #2            @ table starts at the entry for value 2
        movrel          r3,  fft_fixed_tab_neon
        ldr             r3,  [r3, r2, lsl #2]   @ 4-byte table entries
        mov             r0,  r1                 @ data pointer becomes the first argument
        bx              r3
endfunc

/*
 * fft_fixed_tab_neon: addresses of the transform functions, ordered by
 * size from 4 to 65536 points (each entry doubles the size).
 * ff_fft_fixed_calc_neon indexes this table with (selector - 2), one
 * 32-bit word per entry.
 */
const   fft_fixed_tab_neon
        .word fft4_neon
        .word fft8_neon
        .word fft16_neon
        .word fft32_neon
        .word fft64_neon
        .word fft128_neon
        .word fft256_neon
        .word fft512_neon
        .word fft1024_neon
        .word fft2048_neon
        .word fft4096_neon
        .word fft8192_neon
        .word fft16384_neon
        .word fft32768_neon
        .word fft65536_neon
endconst