Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6148 serge 1
;*****************************************************************************
2
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
3
;*****************************************************************************
4
;* Copyright (C) 2005-2011 x264 project
5
;*
6
;* Authors: Daniel Kang 
7
;*
8
;* This file is part of FFmpeg.
9
;*
10
;* FFmpeg is free software; you can redistribute it and/or
11
;* modify it under the terms of the GNU Lesser General Public
12
;* License as published by the Free Software Foundation; either
13
;* version 2.1 of the License, or (at your option) any later version.
14
;*
15
;* FFmpeg is distributed in the hope that it will be useful,
16
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
;* Lesser General Public License for more details.
19
;*
20
;* You should have received a copy of the GNU Lesser General Public
21
;* License along with FFmpeg; if not, write to the Free Software
22
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
;******************************************************************************
24
 
25
%include "libavutil/x86/x86util.asm"
26
 
27
SECTION_RODATA 32
28
 
29
; pw_pixel_max: eight words holding (1 << 10)-1 = 1023, the largest 10-bit
; sample value. Loaded by the setup macros and used as the upper clamp.
pw_pixel_max: times 8 dw ((1 << 10)-1)
30
; sq_1: 16-byte constant with 1 in the low qword and 0 in the high qword.
; Added to the vector register holding log2_denom to form log2_denom+1.
sq_1: dq 1
31
      dq 0
32
 
33
cextern pw_1
34
 
35
SECTION .text
36
 
37
;-----------------------------------------------------------------------------
38
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
39
;                  int weight, int offset);
40
;-----------------------------------------------------------------------------
41
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE: shared entry sequence of the h264_weight_*_10 functions.
; Invokes x86inc's PROLOGUE with 0 auto-loaded arguments, 6 general-purpose
; registers and 8 vector registers, then fetches the arguments by hand.
; Argument 3 (log2_denom) is not loaded into r3 here; WEIGHT_SETUP reads it
; through r3m instead.
;-----------------------------------------------------------------------------
%macro WEIGHT_PROLOGUE 0
42
.prologue:
43
    PROLOGUE 0,6,8
44
    movifnidn  r0, r0mp     ; dst
45
    movifnidn r1d, r1m      ; stride
46
    movifnidn r2d, r2m      ; height
47
    movifnidn r4d, r4m      ; weight
48
    movifnidn r5d, r5m      ; offset
49
%endmacro
50
 
51
;-----------------------------------------------------------------------------
; WEIGHT_SETUP: builds the loop-invariant vectors used by WEIGHT_OP.
;
; In:   r3m = log2_denom, r4 = weight, r5 = offset (see WEIGHT_PROLOGUE)
; Out:  m0 = words of 1<<log2_denom
;       m2 = shift count, log2_denom+1
;       m3 = dwords of { low word: weight<<1, high word: (offset<<3)+1 }
;       m4 = words of the 10-bit pixel maximum
;       m7 = zero (only when not building for SSE4)
; Clobbers: r4, r5, flags
;
; WEIGHT_OP interleaves each sample with a word of m0 and uses pmaddwd with
; m3, giving  sample*(weight<<1) + (1<<log2_denom)*((offset<<3)+1)  per dword.
; The "+1" in the high word is what supplies the rounding term after the
; final shift by log2_denom+1.
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m      ; log2_denom
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r5, 19       ; *8, move to upper half of dword
    ; Reduce weight<<1 to exactly 16 bits before merging it with the upper
    ; half. Adding a sign-extended negative value here would borrow from the
    ; upper word and cancel the +1 (0x10000) rounding term.
    add       r4d, r4d      ; weight<<1
    movzx     r4d, r4w      ; keep the low word only
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower clamp bound for CLIPW
%endif
%endmacro
66
 
67
;-----------------------------------------------------------------------------
; WEIGHT_OP off          - weight one full vector of words at [r0+off]
; WEIGHT_OP off_a, off_b - weight two 8-byte groups at [r0+off_a], [r0+off_b]
;
; Uses the registers prepared by WEIGHT_SETUP: m0 (multiplier words), m2
; (shift count), m3 (packed pmaddwd coefficients), m4 (pixel maximum) and,
; on non-SSE4 builds, m7 (zero).
; Result: clamped, packed words in m5. m6 is scratch.
;-----------------------------------------------------------------------------
%macro WEIGHT_OP 1-2
68
%if %0==1
69
    mova        m5, [r0+%1]
70
    punpckhwd   m6, m5, m0  ; high half: samples interleaved with m0 words
71
    punpcklwd   m5, m0      ; low half:  samples interleaved with m0 words
72
%else
73
    movq        m5, [r0+%1] ; 8 bytes from the first address
74
    movq        m6, [r0+%2] ; 8 bytes from the second address
75
    punpcklwd   m5, m0
76
    punpcklwd   m6, m0
77
%endif
78
    pmaddwd     m5, m3      ; per dword: sample*m3.lo + m0 word*m3.hi
79
    pmaddwd     m6, m3
80
    psrad       m5, m2      ; arithmetic shift by the count in m2
81
    psrad       m6, m2
82
%if cpuflag(sse4)
83
    packusdw    m5, m6      ; unsigned-saturating pack handles the lower bound
84
    pminsw      m5, m4      ; upper bound: pixel maximum
85
%else
86
    packssdw    m5, m6
87
    CLIPW       m5, m7, m4  ; clamp between m7 (zero) and m4 (pixel maximum)
88
%endif
89
%endmacro
90
 
91
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_DBL: emits h264_weight_16_10 for the current INIT_XMM target.
; Every iteration weights one row in place as two vectors (byte offsets 0 and
; 16 from r0) and then advances r0 by the stride in r1. r2d is the row
; counter. The body executes before the counter is tested, so the row count
; must be at least 1.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_DBL 0
92
cglobal h264_weight_16_10
93
    WEIGHT_PROLOGUE
94
    WEIGHT_SETUP
95
.nextrow:
96
    WEIGHT_OP  0
97
    mova [r0   ], m5
98
    WEIGHT_OP 16
99
    mova [r0+16], m5
100
    add       r0, r1        ; next row
101
    dec       r2d
102
    jnz .nextrow
103
    REP_RET
104
%endmacro
105
 
106
; Emit h264_weight_16_10 twice: once for the sse2 target and once for sse4.
; The sse4 build takes the cpuflag(sse4) branches inside the macros.
INIT_XMM sse2
107
WEIGHT_FUNC_DBL
108
INIT_XMM sse4
109
WEIGHT_FUNC_DBL
110
 
111
 
112
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_MM: emits h264_weight_8_10 for the current INIT_XMM target.
; Every iteration weights one vector at [r0] in place and advances r0 by the
; stride in r1; r2d counts rows. The body runs before the counter is tested,
; so the row count must be at least 1.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_MM 0
113
cglobal h264_weight_8_10
114
    WEIGHT_PROLOGUE
115
    WEIGHT_SETUP
116
.nextrow:
117
    WEIGHT_OP   0
118
    mova     [r0], m5
119
    add        r0, r1       ; next row
120
    dec        r2d
121
    jnz .nextrow
122
    REP_RET
123
%endmacro
124
 
125
; Emit h264_weight_8_10 for the sse2 and sse4 targets.
INIT_XMM sse2
126
WEIGHT_FUNC_MM
127
INIT_XMM sse4
128
WEIGHT_FUNC_MM
129
 
130
 
131
;-----------------------------------------------------------------------------
; WEIGHT_FUNC_HALF_MM: emits h264_weight_4_10 for the current INIT_XMM target.
; Two rows are handled per iteration: 8 bytes from [r0] and 8 bytes from
; [r0+r1] are weighted together in one vector. The row count in r2d is
; therefore halved up front (an odd trailing row would not be processed), and
; it must be at least 2 because the body runs before the counter is tested.
; r3 is overwritten with 2*stride only after WEIGHT_SETUP has consumed
; log2_denom through r3m.
;-----------------------------------------------------------------------------
%macro WEIGHT_FUNC_HALF_MM 0
132
cglobal h264_weight_4_10
133
    WEIGHT_PROLOGUE
134
    sar         r2d, 1      ; iterations = height/2
135
    WEIGHT_SETUP
136
    lea         r3, [r1*2]  ; two-row step
137
.nextrow:
138
    WEIGHT_OP    0, r1
139
    movh      [r0], m5      ; low 8 bytes  -> first row
140
    movhps [r0+r1], m5      ; high 8 bytes -> second row
141
    add         r0, r3
142
    dec         r2d
143
    jnz .nextrow
144
    REP_RET
145
%endmacro
146
 
147
; Emit h264_weight_4_10 for the sse2 and sse4 targets.
INIT_XMM sse2
148
WEIGHT_FUNC_HALF_MM
149
INIT_XMM sse4
150
WEIGHT_FUNC_HALF_MM
151
 
152
 
153
;-----------------------------------------------------------------------------
154
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
155
;                    int log2_denom, int weightd, int weights, int offset);
156
;-----------------------------------------------------------------------------
157
; Select the register behind the temporary alias t0 used by the biweight
; macros: register 3 on 32-bit x86, register 7 otherwise. On x86-32 this means
; t0 shares r3, which is why BIWEIGHT_SETUP loads height into r3 only after
; it has finished with t0.
%if ARCH_X86_32
158
DECLARE_REG_TMP 3
159
%else
160
DECLARE_REG_TMP 7
161
%endif
162
 
163
;-----------------------------------------------------------------------------
; BIWEIGHT_PROLOGUE: shared entry sequence of the h264_biweight_*_10
; functions. Invokes x86inc's PROLOGUE with 0 auto-loaded arguments, 8
; general-purpose registers and 8 vector registers, then fetches the
; arguments by hand. Arguments 3 (height) and 4 (log2_denom) are not loaded
; here; BIWEIGHT_SETUP picks them up through r3m and r4m.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_PROLOGUE 0
164
.prologue:
165
    PROLOGUE 0,8,8
166
    movifnidn  r0, r0mp     ; dst
167
    movifnidn  r1, r1mp     ; src
168
    movifnidn r2d, r2m      ; stride
169
    movifnidn r5d, r5m      ; weightd
170
    movifnidn r6d, r6m      ; weights
171
    movifnidn t0d, r7m      ; offset, kept in the temporary register t0
172
%endmacro
173
 
174
;-----------------------------------------------------------------------------
; BIWEIGHT_SETUP: builds the loop-invariant vectors used by BIWEIGHT.
;
; In:   r5 = weightd, r6 = weights, t0 = offset (see BIWEIGHT_PROLOGUE),
;       r4m = log2_denom, r3m = height
; Out:  m4 = dwords of { low word: weightd, high word: weights }
;       m5 = dwords of (((offset<<2)+1)|1) << log2_denom
;       m6 = shift count, log2_denom+1
;       m3 = words of the 10-bit pixel maximum
;       m7 = zero (only when not building for SSE4)
;       r3d = height
; Clobbers: r5, r6, t0, flags
;
; BIWEIGHT interleaves dst and src words (dst low, src high), so pmaddwd with
; m4 yields dst*weightd + src*weights per dword.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1
    ; weightd must occupy only the low word before weights is merged in.
    ; A negative weightd has bits 16..31 set, and OR-ing weights<<16 on top
    ; of them would force the packed weights word to 0xFFFF.
    movzx     r5d, r5w
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weightd | weights<<16
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; loaded last because t0 may alias r3
%if notcpuflag(sse4)
    pxor       m7, m7       ; lower clamp bound for CLIPW
%endif
%endmacro
192
 
193
;-----------------------------------------------------------------------------
; BIWEIGHT off          - blend one full vector from [r0+off] and [r1+off]
; BIWEIGHT off_a, off_b - blend two 8-byte groups, taken at off_a and off_b
;                         from both r0 and r1
;
; Uses the registers prepared by BIWEIGHT_SETUP: m4 (packed weight pair), m5
; (additive term), m6 (shift count), m3 (pixel maximum) and, on non-SSE4
; builds, m7 (zero).
; Result: clamped, packed words in m0. m1 and m2 are scratch.
;-----------------------------------------------------------------------------
%macro BIWEIGHT 1-2
194
%if %0==1
195
    mova       m0, [r0+%1]
196
    mova       m1, [r1+%1]
197
    punpckhwd  m2, m0, m1   ; high half: r0 words interleaved with r1 words
198
    punpcklwd  m0, m1       ; low half:  r0 words interleaved with r1 words
199
%else
200
    movq       m0, [r0+%1]
201
    movq       m1, [r1+%1]
202
    punpcklwd  m0, m1       ; first group
203
    movq       m2, [r0+%2]
204
    movq       m1, [r1+%2]
205
    punpcklwd  m2, m1       ; second group
206
%endif
207
    pmaddwd    m0, m4       ; per dword: r0 word*m4.lo + r1 word*m4.hi
208
    pmaddwd    m2, m4
209
    paddd      m0, m5       ; add the pre-shifted offset/rounding term
210
    paddd      m2, m5
211
    psrad      m0, m6       ; arithmetic shift by the count in m6
212
    psrad      m2, m6
213
%if cpuflag(sse4)
214
    packusdw   m0, m2       ; unsigned-saturating pack handles the lower bound
215
    pminsw     m0, m3       ; upper bound: pixel maximum
216
%else
217
    packssdw   m0, m2
218
    CLIPW      m0, m7, m3   ; clamp between m7 (zero) and m3 (pixel maximum)
219
%endif
220
%endmacro
221
 
222
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_DBL: emits h264_biweight_16_10 for the current INIT_XMM
; target. Every iteration blends one row as two vectors (byte offsets 0 and
; 16), writes the result to [r0], and advances both r0 and r1 by the stride
; in r2. r3d counts rows; the body runs before the counter is tested, so the
; row count must be at least 1.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_DBL 0
223
cglobal h264_biweight_16_10
224
    BIWEIGHT_PROLOGUE
225
    BIWEIGHT_SETUP
226
.nextrow:
227
    BIWEIGHT   0
228
    mova [r0   ], m0
229
    BIWEIGHT  16
230
    mova [r0+16], m0
231
    add       r0, r2        ; next row in the buffer at r0
232
    add       r1, r2        ; next row in the buffer at r1
233
    dec       r3d
234
    jnz .nextrow
235
    REP_RET
236
%endmacro
237
 
238
; Emit h264_biweight_16_10 for the sse2 and sse4 targets.
INIT_XMM sse2
239
BIWEIGHT_FUNC_DBL
240
INIT_XMM sse4
241
BIWEIGHT_FUNC_DBL
242
 
243
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC: emits h264_biweight_8_10 for the current INIT_XMM target.
; Every iteration blends one vector from [r0] and [r1], stores it to [r0],
; and advances both pointers by the stride in r2. r3d counts rows; the body
; runs before the counter is tested, so the row count must be at least 1.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC 0
244
cglobal h264_biweight_8_10
245
    BIWEIGHT_PROLOGUE
246
    BIWEIGHT_SETUP
247
.nextrow:
248
    BIWEIGHT  0
249
    mova   [r0], m0
250
    add      r0, r2         ; next row in the buffer at r0
251
    add      r1, r2         ; next row in the buffer at r1
252
    dec      r3d
253
    jnz .nextrow
254
    REP_RET
255
%endmacro
256
 
257
; Emit h264_biweight_8_10 for the sse2 and sse4 targets.
INIT_XMM sse2
258
BIWEIGHT_FUNC
259
INIT_XMM sse4
260
BIWEIGHT_FUNC
261
 
262
;-----------------------------------------------------------------------------
; BIWEIGHT_FUNC_HALF: emits h264_biweight_4_10 for the current INIT_XMM
; target. Two rows are handled per iteration: 8 bytes at offset 0 and 8 bytes
; at offset r2 (one stride further) are blended together in one vector. The
; row count in r3d is halved up front (an odd trailing row would not be
; processed) and must be at least 2, because the body runs before the counter
; is tested. r4 is reused as the two-row step once BIWEIGHT_SETUP has read
; log2_denom through r4m.
;-----------------------------------------------------------------------------
%macro BIWEIGHT_FUNC_HALF 0
263
cglobal h264_biweight_4_10
264
    BIWEIGHT_PROLOGUE
265
    BIWEIGHT_SETUP
266
    sar        r3d, 1       ; iterations = height/2
267
    lea        r4, [r2*2]   ; two-row step
268
.nextrow:
269
    BIWEIGHT     0, r2
270
    movh   [r0   ], m0      ; low 8 bytes  -> first row
271
    movhps [r0+r2], m0      ; high 8 bytes -> second row
272
    add         r0, r4
273
    add         r1, r4
274
    dec         r3d
275
    jnz .nextrow
276
    REP_RET
277
%endmacro
278
 
279
; Emit h264_biweight_4_10 for the sse2 and sse4 targets.
INIT_XMM sse2
280
BIWEIGHT_FUNC_HALF
281
INIT_XMM sse4
282
BIWEIGHT_FUNC_HALF