Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
/*
 * x86-optimized AC-3 DSP utils
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 | |||
22 | #include "libavutil/mem.h" |
||
23 | #include "libavutil/x86/asm.h" |
||
24 | #include "libavutil/x86/cpu.h" |
||
25 | #include "dsputil_x86.h" |
||
26 | #include "libavcodec/ac3.h" |
||
27 | #include "libavcodec/ac3dsp.h" |
||
28 | |||
/*
 * Prototypes of the x86 SIMD routines referenced by ff_ac3dsp_init_x86().
 * None of them is defined in this file.  They are grouped by the
 * AC3DSPContext member they are assigned to; the name suffix is the
 * instruction-set extension whose EXTERNAL_*() check selects the variant.
 */

/* AC3DSPContext.ac3_exponent_min */
void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

/* AC3DSPContext.ac3_max_msb_abs_int16 */
int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

/* AC3DSPContext.ac3_lshift_int16 */
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

/* AC3DSPContext.ac3_rshift_int32 */
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

/* AC3DSPContext.float_to_fixed24 */
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

/* AC3DSPContext.compute_mantissa_size */
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

/*
 * AC3DSPContext.extract_exponents
 * The 3dnow variant is only declared: ff_ac3dsp_init_x86() in this file
 * installs the sse2 and ssse3 variants but never the 3dnow one.
 */
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
53 | |||
/*
 * When building for 32-bit x86 with the Intel compiler, force HAVE_7REGS
 * to 0.  Everything in this file that is guarded by
 * "HAVE_SSE_INLINE && HAVE_7REGS" (the inline-asm downmix and its hookup
 * in ff_ac3dsp_init_x86()) is then compiled out.
 * NOTE(review): presumably ICC cannot satisfy the register demand of that
 * inline asm on x86-32 -- confirm against the commit that added this.
 */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#   undef  HAVE_7REGS
#   define HAVE_7REGS 0
#endif
58 | |||
59 | #if HAVE_SSE_INLINE && HAVE_7REGS |
||
60 | |||
/*
 * Compile-time selectors handed to MIX5() and MIX_MISC() as macro
 * arguments: IF1(x) expands to x, IF0(x) expands to nothing.  Wrapping an
 * asm line in one of them keeps or drops that line for a given variant.
 */
#define IF1(x) x
#define IF0(x)
63 | |||
/*
 * MIX5(mono, stereo) -- SSE kernel for the two 5-input special cases of
 * ac3_downmix_sse().
 *
 * Each argument is a selector macro (IF1 keeps a line, IF0 drops it);
 * exactly one of the two is expected to be IF1.
 *
 * Names taken from the expansion site: i, matrix, samples, len.
 *   %0      i        byte offset, counts up in steps of 16 until >= 0
 *   %1      matrix   only the floats at byte offsets 0, 8 and 24 are read,
 *                    i.e. matrix[0][0], matrix[1][0], matrix[3][0] for a
 *                    float (*)[2]; they are broadcast into xmm5/xmm6/xmm7
 *   %2..%6  samples[0..4] + len
 *
 * Per iteration (4 floats of every channel):
 *   xmm0 = ch0 * xmm5        xmm1 = ch1 * xmm6      xmm2 = ch2 * xmm5
 *   xmm3 = ch3 * xmm7        xmm4 = ch4 * xmm7
 *   stereo: xmm0 = xmm0 + xmm1 + xmm3 -> samples[0]
 *           xmm2 = xmm2 + xmm1 + xmm4 -> samples[1]
 *   mono:   xmm0 = xmm0 + xmm3 + (xmm2 + xmm1 + xmm4) -> samples[0]
 *
 * The same coefficient is used for ch0/ch2 and for ch3/ch4, so the caller
 * must have verified that the matrix has that shape.
 *
 * The loop body runs before the first test, so i must be negative on
 * entry.  movaps requires every accessed address to be 16-byte aligned.
 * The expansion already ends in ';'.
 */
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss 0(%1), %%xmm5 \n"                                \
        "movss 8(%1), %%xmm6 \n"                                \
        "movss 24(%1), %%xmm7 \n"                               \
        "shufps $0, %%xmm5, %%xmm5 \n"                          \
        "shufps $0, %%xmm6, %%xmm6 \n"                          \
        "shufps $0, %%xmm7, %%xmm7 \n"                          \
        "1: \n"                                                 \
        "movaps (%0, %2), %%xmm0 \n"                            \
        "movaps (%0, %3), %%xmm1 \n"                            \
        "movaps (%0, %4), %%xmm2 \n"                            \
        "movaps (%0, %5), %%xmm3 \n"                            \
        "movaps (%0, %6), %%xmm4 \n"                            \
        "mulps %%xmm5, %%xmm0 \n"                               \
        "mulps %%xmm6, %%xmm1 \n"                               \
        "mulps %%xmm5, %%xmm2 \n"                               \
        "mulps %%xmm7, %%xmm3 \n"                               \
        "mulps %%xmm7, %%xmm4 \n"                               \
 stereo("addps %%xmm1, %%xmm0 \n")                              \
        "addps %%xmm1, %%xmm2 \n"                               \
        "addps %%xmm3, %%xmm0 \n"                               \
        "addps %%xmm4, %%xmm2 \n"                               \
   mono("addps %%xmm2, %%xmm0 \n")                              \
        "movaps %%xmm0, (%0, %2) \n"                            \
 stereo("movaps %%xmm2, (%0, %3) \n")                           \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i)                                              \
        : "r"(matrix),                                          \
          "r"(samples[0] + len),                                \
          "r"(samples[1] + len),                                \
          "r"(samples[2] + len),                                \
          "r"(samples[3] + len),                                \
          "r"(samples[4] + len)                                 \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );
103 | |||
/*
 * MIX_MISC(stereo) -- generic SSE downmix kernel used by ac3_downmix_sse()
 * for every case MIX5() does not cover.  Lines wrapped in stereo() exist
 * only in the two-output variant (pass IF1); pass IF0 for one output.
 *
 * Names taken from the expansion site: i, j, k, m, matrix_simd, samp, in_ch.
 *   %0  i   byte offset into every channel, +16 per outer iteration, < 0
 *   %1  j   scratch: pointer of the channel currently being accumulated
 *   %2  k   channel counter, runs from %5 up to 0 in steps of 4
 *   %3  m   scratch: samp[0], the channel that receives output 0
 *   %4  matrix_simd + in_ch     %5  -4 * (in_ch - 1)
 *   %6  samp + in_ch            %c7 sizeof(float *)   %c8 sizeof(float *)/4
 *
 * Outer loop "1" (one 4-float vector per pass):
 *   xmm0 = samp[0] * xmm4, and for stereo xmm1 = samp[0] * xmm5;
 *   inner loop "2" adds samp[c] * matrix_simd[c][0] (stereo: [c][1] into
 *   xmm1) for c = 1 .. in_ch - 1;
 *   xmm0 is stored to samp[0], stereo also stores xmm1 to samp[1].
 *
 * xmm4 and xmm5 are read but never loaded here: they must still hold the
 * broadcast coefficients of channel 0 left behind by the preceding asm
 * statement in ac3_downmix_sse().  No XMM register is listed as clobbered.
 * NOTE(review): nothing in the constraints tells the compiler about either
 * fact, so this depends on the compiler not touching XMM registers between
 * the two asm statements.
 *
 * Both loops test their condition after the body, so i must be negative
 * and in_ch at least 2 on entry.  movaps requires 16-byte aligned
 * addresses.  The expansion already ends in ';'.
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov %5, %2 \n"                                         \
        "1: \n"                                                 \
        "mov -%c7(%6, %2, %c8), %3 \n"                          \
        "movaps (%3, %0), %%xmm0 \n"                            \
 stereo("movaps %%xmm0, %%xmm1 \n")                             \
        "mulps %%xmm4, %%xmm0 \n"                               \
 stereo("mulps %%xmm5, %%xmm1 \n")                              \
        "2: \n"                                                 \
        "mov (%6, %2, %c8), %1 \n"                              \
        "movaps (%1, %0), %%xmm2 \n"                            \
 stereo("movaps %%xmm2, %%xmm3 \n")                             \
        "mulps (%4, %2, 8), %%xmm2 \n"                          \
 stereo("mulps 16(%4, %2, 8), %%xmm3 \n")                       \
        "addps %%xmm2, %%xmm0 \n"                               \
 stereo("addps %%xmm3, %%xmm1 \n")                              \
        "add $4, %2 \n"                                         \
        "jl 2b \n"                                              \
        "mov %5, %2 \n"                                         \
 stereo("mov (%6, %2, %c8), %1 \n")                             \
        "movaps %%xmm0, (%3, %0) \n"                            \
 stereo("movaps %%xmm1, (%1, %0) \n")                           \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        : "memory"                                              \
    );
136 | |||
137 | static void ac3_downmix_sse(float **samples, float (*matrix)[2], |
||
138 | int out_ch, int in_ch, int len) |
||
139 | { |
||
140 | int (*matrix_cmp)[2] = (int(*)[2])matrix; |
||
141 | intptr_t i, j, k, m; |
||
142 | |||
143 | i = -len * sizeof(float); |
||
144 | if (in_ch == 5 && out_ch == 2 && |
||
145 | !(matrix_cmp[0][1] | matrix_cmp[2][0] | |
||
146 | matrix_cmp[3][1] | matrix_cmp[4][0] | |
||
147 | (matrix_cmp[1][0] ^ matrix_cmp[1][1]) | |
||
148 | (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { |
||
149 | MIX5(IF0, IF1); |
||
150 | } else if (in_ch == 5 && out_ch == 1 && |
||
151 | matrix_cmp[0][0] == matrix_cmp[2][0] && |
||
152 | matrix_cmp[3][0] == matrix_cmp[4][0]) { |
||
153 | MIX5(IF1, IF0); |
||
154 | } else { |
||
155 | DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; |
||
156 | float *samp[AC3_MAX_CHANNELS]; |
||
157 | |||
158 | for (j = 0; j < in_ch; j++) |
||
159 | samp[j] = samples[j] + len; |
||
160 | |||
161 | j = 2 * in_ch * sizeof(float); |
||
162 | __asm__ volatile ( |
||
163 | "1: \n" |
||
164 | "sub $8, %0 \n" |
||
165 | "movss (%2, %0), %%xmm4 \n" |
||
166 | "movss 4(%2, %0), %%xmm5 \n" |
||
167 | "shufps $0, %%xmm4, %%xmm4 \n" |
||
168 | "shufps $0, %%xmm5, %%xmm5 \n" |
||
169 | "movaps %%xmm4, (%1, %0, 4) \n" |
||
170 | "movaps %%xmm5, 16(%1, %0, 4) \n" |
||
171 | "jg 1b \n" |
||
172 | : "+&r"(j) |
||
173 | : "r"(matrix_simd), "r"(matrix) |
||
174 | : "memory" |
||
175 | ); |
||
176 | if (out_ch == 2) { |
||
177 | MIX_MISC(IF1); |
||
178 | } else { |
||
179 | MIX_MISC(IF0); |
||
180 | } |
||
181 | } |
||
182 | } |
||
183 | |||
184 | #endif /* HAVE_SSE_INLINE && HAVE_7REGS */ |
||
185 | |||
186 | av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
||
187 | { |
||
188 | int cpu_flags = av_get_cpu_flags(); |
||
189 | |||
190 | if (EXTERNAL_MMX(cpu_flags)) { |
||
191 | c->ac3_exponent_min = ff_ac3_exponent_min_mmx; |
||
192 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; |
||
193 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; |
||
194 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; |
||
195 | } |
||
196 | if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
||
197 | if (!bit_exact) { |
||
198 | c->float_to_fixed24 = ff_float_to_fixed24_3dnow; |
||
199 | } |
||
200 | } |
||
201 | if (EXTERNAL_MMXEXT(cpu_flags)) { |
||
202 | c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; |
||
203 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; |
||
204 | } |
||
205 | if (EXTERNAL_SSE(cpu_flags)) { |
||
206 | c->float_to_fixed24 = ff_float_to_fixed24_sse; |
||
207 | } |
||
208 | if (EXTERNAL_SSE2(cpu_flags)) { |
||
209 | c->ac3_exponent_min = ff_ac3_exponent_min_sse2; |
||
210 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; |
||
211 | c->float_to_fixed24 = ff_float_to_fixed24_sse2; |
||
212 | c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; |
||
213 | c->extract_exponents = ff_ac3_extract_exponents_sse2; |
||
214 | if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
||
215 | c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; |
||
216 | c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; |
||
217 | } |
||
218 | } |
||
219 | if (EXTERNAL_SSSE3(cpu_flags)) { |
||
220 | c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; |
||
221 | if (!(cpu_flags & AV_CPU_FLAG_ATOM)) { |
||
222 | c->extract_exponents = ff_ac3_extract_exponents_ssse3; |
||
223 | } |
||
224 | } |
||
225 | |||
226 | #if HAVE_SSE_INLINE && HAVE_7REGS |
||
227 | if (INLINE_SSE(cpu_flags)) { |
||
228 | c->downmix = ac3_downmix_sse; |
||
229 | } |
||
230 | #endif |
||
231 | }> |