/*
 * MMX optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

DECL(sse)
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_SSE2_INLINE

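/* Scalar multiply-accumulate helpers and an unrolled 8-tap sum; the stride
 * of 64 floats matches the layout of the synthesis window and sample buffer,
 * mirroring the equivalent macros in the generic C mpegaudio DSP code. */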
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

#define SUM8(op, sum, w, p)               \
{                                         \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
}

static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;


#define MULT(a, b)                                 \
    "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
    "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
    "mulps         %%xmm2, %%xmm1           \n\t"  \
    "subps         %%xmm1, %%xmm0           \n\t"  \
    "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
    "subps         %%xmm2, %%xmm4           \n\t"  \

    __asm__ volatile(
            "1:                                   \n\t"
            "xorps       %%xmm0, %%xmm0           \n\t"
            "xorps       %%xmm4, %%xmm4           \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps      %%xmm0, (%4,%0)          \n\t"
            "movaps      %%xmm4, (%5,%0)          \n\t"
            "add            $16,  %0              \n\t"
            "jl              1b                   \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}

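/*
 * For reference, a scalar sketch of what the SSE loop in apply_window()
 * computes. This is an illustration only and is kept out of the build; it
 * assumes len is a multiple of 4, as it is for the len == 16 calls below.
 */
#if 0
static void apply_window_ref(const float *buf, const float *win1,
                             const float *win2, float *sum1, float *sum2,
                             int len)
{
    int i, k;
    for (i = 0; i < len; i++) {
        float a = 0, b = 0;
        /* eight taps: buf/win1 advance by 64 floats, win2 by 16 floats */
        for (k = 0; k < 8; k++) {
            a -= buf[i + 64 * k] * win1[i + 64 * k];
            b -= buf[i + 64 * k] * win2[i + 16 * k];
        }
        sum1[i] = a;
        sum2[i] = b;
    }
}
#endif
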
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0   \n\t" \
            "movaps   16(%0), %%xmm1   \n\t" \
            "movaps   32(%0), %%xmm2   \n\t" \
            "movaps   48(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,   0(%1) \n\t" \
            "movaps   %%xmm1,  16(%1) \n\t" \
            "movaps   %%xmm2,  32(%1) \n\t" \
            "movaps   %%xmm3,  48(%1) \n\t" \
            "movaps   64(%0), %%xmm0   \n\t" \
            "movaps   80(%0), %%xmm1   \n\t" \
            "movaps   96(%0), %%xmm2   \n\t" \
            "movaps  112(%0), %%xmm3   \n\t" \
            "movaps   %%xmm0,  64(%1) \n\t" \
            "movaps   %%xmm1,  80(%1) \n\t" \
            "movaps   %%xmm2,  96(%1) \n\t" \
            "movaps   %%xmm3, 112(%1) \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

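/* SUMS combines one 4-float group per half: it reverses a group of sumd
 * (resp. sumc) with shufps $0x1b, subtracts suma (resp. adds sumb) and
 * stores the result, i.e. the vector form of the scalar combination in the
 * else branch below. It stores contiguous samples, so it is only usable on
 * the incr == 1 fast path. */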
#define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
            "movups " #sumd "(%4),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "subps  " #suma "(%1),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out1 "(%0)          \n\t" \
\
            "movups " #sumc "(%3),       %%xmm0          \n\t" \
            "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
            "addps  " #sumb "(%2),       %%xmm0          \n\t" \
            "movaps        %%xmm0," #out2 "(%0)          \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0  ]  = -suma[   0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out  = -suma[   j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[   j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_SSE2_INLINE */

#if HAVE_YASM
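/* Wrappers around the external IMDCT kernels: the main loop processes four
 * blocks at a time with ff_four_imdct36_float_* and the interleaved
 * mdct_win_sse table, then the remaining 1-3 blocks use the single-block
 * ff_imdct36_float_* path, selecting the window as in the generic C code
 * (type-0 window for the first two blocks when switch_point is set, and the
 * +4 variant for odd-numbered blocks). */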
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                \
    int j;                                                              \
    for (j = 0; j < align_end; j+= 4) {                                 \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
        /* apply window & overlap with previous buffer */               \
                                                                        \
        /* select window */                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
        in      += 4*18;                                                \
        buf     += 4*18;                                                \
        out     += 4;                                                   \
    }                                                                   \
    for (; j < count; j++) {                                            \
        /* apply window & overlap with previous buffer */               \
                                                                        \
        /* select window */                                             \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
                                                                        \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
                                                                        \
        in  += 18;                                                      \
        buf++;                                                          \
        out++;                                                          \
    }                                                                   \
}

#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    int i, j;
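    /* Build the interleaved window tables: for each block type j, lanes 0/2
     * of every 4-float group take ff_mdct_win_float[j] (even blocks) and
     * lanes 1/3 take ff_mdct_win_float[j + 4] (odd blocks); the [1] variant
     * puts the type-0 window in the first two lanes, which the 4-wide loop
     * above uses for the first blocks when switch_point is set. */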
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_SSE2_INLINE
    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_SSE2_INLINE */

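    /* The checks below are cumulative: each successive test overwrites the
     * function pointer, so the most capable instruction set supported by the
     * CPU (and enabled at build time) ends up selected. */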
#if HAVE_YASM
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif /* HAVE_YASM */
}