Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6148 serge 1
/*
2
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20
 
21
#include "libavutil/attributes.h"
22
#include "libavutil/cpu.h"
23
#include "libavutil/x86/asm.h"
24
#include "libavutil/x86/cpu.h"
25
#include "libavcodec/h264dsp.h"
26
#include "dsputil_x86.h"
27
 
28
/***********************************/
29
/* IDCT */
30
/*
 * Declares the prototype
 *   void ff_h264_idct<NUM>_add_<DEPTH>_<OPT>(uint8_t *dst,
 *                                            int16_t *block, int stride);
 * NUM may be empty. Only declarations are produced here; no definition of
 * these routines appears in the visible source.
 */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT)                                  \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \
                                                       int16_t *block,  \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
47
 
48
 
49
/*
 * Declares the prototype
 *   void ff_h264_idct<NUM>_add<REP>_<DEPTH>_<OPT>(uint8_t *dst,
 *                                                 const int *block_offset,
 *                                                 int16_t *block, int stride,
 *                                                 const uint8_t nnzc[6 * 8]);
 * NUM may be empty. These variants take a single destination pointer.
 */
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT)                         \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT       \
    (uint8_t *dst, const int *block_offset,                             \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
69
 
70
 
71
/*
 * Same naming scheme as IDCT_ADD_REP_FUNC, but the declared routine takes
 * an array of destination pointers (uint8_t **dst) instead of a single
 * destination pointer.
 */
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT)                      \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT     \
    (uint8_t **dst, const int *block_offset,                          \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
81
 
82
/* Prototypes of the luma DC dequant + IDCT routines (MMX and SSE2 variants). */
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
84
 
85
/***********************************/
86
/* deblocking */
87
 
88
/* Prototype of the MMXEXT loop-filter strength routine. */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);
93
 
94
/*
 * Declares void ff_deblock_<DIR>_<TYPE>_<DEPTH>_<OPT>(pix, stride, alpha,
 * beta, tc0): the deblocking prototype that takes a tc0 table.
 */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT)                                        \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta,      \
                                                               int8_t *tc0);
/*
 * Declares void ff_deblock_<DIR>_<TYPE>_<DEPTH>_<OPT>(pix, stride, alpha,
 * beta): the deblocking prototype without a tc0 table (used below for the
 * *_intra names).
 */
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix,  \
                                                               int stride,    \
                                                               int alpha,     \
                                                               int beta);

/*
 * Declares the full set of deblocking prototypes for one bit depth.
 * The `type` argument does not appear in the expansion. Note that no
 * mmxext vertical-luma prototypes are produced by this list.
 */
#define LF_FUNCS(type, depth)                   \
LF_FUNC(h,  chroma,       depth, mmxext)        \
LF_IFUNC(h, chroma_intra, depth, mmxext)        \
LF_FUNC(v,  chroma,       depth, mmxext)        \
LF_IFUNC(v, chroma_intra, depth, mmxext)        \
LF_FUNC(h,  luma,         depth, mmxext)        \
LF_IFUNC(h, luma_intra,   depth, mmxext)        \
LF_FUNC(h,  luma,         depth, sse2)          \
LF_IFUNC(h, luma_intra,   depth, sse2)          \
LF_FUNC(v,  luma,         depth, sse2)          \
LF_IFUNC(v, luma_intra,   depth, sse2)          \
LF_FUNC(h,  chroma,       depth, sse2)          \
LF_IFUNC(h, chroma_intra, depth, sse2)          \
LF_FUNC(v,  chroma,       depth, sse2)          \
LF_IFUNC(v, chroma_intra, depth, sse2)          \
LF_FUNC(h,  luma,         depth, avx)           \
LF_IFUNC(h, luma_intra,   depth, avx)           \
LF_FUNC(v,  luma,         depth, avx)           \
LF_IFUNC(v, luma_intra,   depth, avx)           \
LF_FUNC(h,  chroma,       depth, avx)           \
LF_IFUNC(h, chroma_intra, depth, avx)           \
LF_FUNC(v,  chroma,       depth, avx)           \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNCS(uint8_t,   8)
LF_FUNCS(uint16_t, 10)
132
 
133
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
134
LF_FUNC(v8, luma, 8, mmxext)
/*
 * Vertical luma deblocking wrapper built on ff_deblock_v8_luma_8_mmxext.
 *
 * The v8 routine is invoked once at pix + 0 with tc0[0..1] and once at
 * pix + 8 with tc0[2..3]. A half is skipped when both of its tc0 entries
 * are negative: the bitwise AND of two int8_t values is negative only if
 * both operands have their sign bit set.
 */
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
                                    int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
143
LF_IFUNC(v8, luma_intra, 8, mmxext)
/*
 * Vertical intra luma deblocking wrapper: calls
 * ff_deblock_v8_luma_intra_8_mmxext unconditionally at pix + 0 and again
 * at pix + 8 with the same stride, alpha and beta.
 */
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
                                          int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
150
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
151
 
152
/*
 * LF_FUNCS() declares no mmxext vertical-luma variants, so the 10-bit ones
 * are declared individually here.
 */
LF_FUNC(v,  luma,       10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
154
 
155
/***********************************/
156
/* weighted prediction */
157
 
158
/* Declares void ff_h264_weight_<W>_<OPT>(dst, stride, height, log2_denom, weight, offset). */
#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,         \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

/* Declares void ff_h264_biweight_<W>_<OPT>(dst, src, stride, height, log2_denom, weightd, weights, offset). */
#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        int stride, int height,         \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

/* mmxext weight + biweight prototypes for width W. */
#define H264_BIWEIGHT_MMX(W)                    \
    H264_WEIGHT(W, mmxext)                      \
    H264_BIWEIGHT(W, mmxext)

/*
 * Everything from H264_BIWEIGHT_MMX plus sse2 weight/biweight and an
 * ssse3 biweight prototype (no ssse3 weight prototype is declared).
 */
#define H264_BIWEIGHT_MMX_SSE(W)                \
    H264_BIWEIGHT_MMX(W)                        \
    H264_WEIGHT(W, sse2)                        \
    H264_BIWEIGHT(W, sse2)                      \
    H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
182
 
183
/* Declares void ff_h264_weight_<W>_<DEPTH>_<OPT>(dst, stride, height, log2_denom, weight, offset). */
#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    int stride,         \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

/* Declares void ff_h264_biweight_<W>_<DEPTH>_<OPT>(dst, src, stride, height, log2_denom, weightd, weights, offset). */
#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      int stride,       \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

/* sse2 and sse4 weight/biweight prototypes for width W at the given depth. */
#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
    H264_WEIGHT_10(W, DEPTH, sse2)              \
    H264_WEIGHT_10(W, DEPTH, sse4)              \
    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8,  10)
H264_BIWEIGHT_10_SSE(4,  10)
210
 
211
/*
 * Install x86-optimized routines into the H.264 DSP context.
 *
 * c                 context whose function pointers are overwritten
 * bit_depth         only the values 8 and 10 select any per-depth routines
 * chroma_format_idc several routines are installed only when this equals 1
 *
 * The whole body is compiled only when HAVE_YASM is set. CPU capabilities
 * come from av_get_cpu_flags(). The instruction-set blocks are tested in
 * sequence and each one overwrites pointers set by an earlier block, so the
 * last matching block determines the final value of a given pointer.
 */
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    /* Independent of bit_depth. */
    if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags))
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            /* The DC-only slots initially share the full IDCT routines. */
            c->h264_idct_dc_add   =
            c->h264_idct_add      = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add  =
            c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            /* The MMX luma DC routine is installed only if CMOV is reported. */
            if (cpu_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
        }
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
            c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
            c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
            if (chroma_format_idc == 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
            }
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
            /* Vertical luma uses the local wrappers around the v8 routines. */
            c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
            c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct8_add  = ff_h264_idct8_add_8_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
            c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;

            /* Only table entries 0 and 1 are replaced; entry 2 is left as is. */
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
            c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct_add     = ff_h264_idct_add_10_sse2;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */

            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
            c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            /* With AVX the DC-only slot is pointed at the full IDCT routine. */
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_10_avx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

            c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_avx;
            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_avx;
            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_avx;
            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
        }
    }
#endif /* HAVE_YASM */
}