Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6148 serge 1
;******************************************************************************
2
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
3
;* Copyright (C) 2012 Christophe Gisquet 
4
;*
5
;* This file is part of FFmpeg.
6
;*
7
;* FFmpeg is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
11
;*
12
;* FFmpeg is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
;* Lesser General Public License for more details.
16
;*
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with FFmpeg; if not, write to the Free Software
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
21
 
22
%include "libavutil/x86/x86util.asm"
23
 
24
SECTION_RODATA
; Word multipliers used by ROW_TRANSFORM, one group of four words per constant:
;   +0  -> 13 (even part), +8 -> 17, +16 -> 7 (odd part)
pw_row_coeffs:  dw 13, 13, 13, 13
                dw 17, 17, 17, 17
                dw  7,  7,  7,  7
; 0x200 in both dwords: the rounder ROW_TRANSFORM leaves in mm5 and
; COL_TRANSFORM adds before its arithmetic shift right by 10.
pd_512:         dd 0x200, 0x200
; Word pairs fed to pmaddwd by COL_TRANSFORM:
;   +0 -> (13,13 | 13,-13) for the even inputs, +8 -> (17,7 | 7,-17) for the odd
; The rows at +16 and +24 are not referenced by the code visible in this file.
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7
33
 
34
SECTION .text
35
 
36
; IDCT_DC_NOROUND reg
; In place: reg = (reg * 3*13*13) >> 11, arithmetic shift, no rounding term.
%macro IDCT_DC_NOROUND 1
    imul   %1, %1, 3*13*13
    sar    %1, 11
%endmacro
40
 
41
; IDCT_DC_ROUND reg
; In place: reg = (reg * 13*13 + 0x200) >> 10, arithmetic shift.
; 0x200 is half of 1 << 10, so the shift rounds to nearest.
%macro IDCT_DC_ROUND 1
    imul   %1, %1, 13*13
    add    %1, 1 << 9
    sar    %1, 10
%endmacro
46
 
47
; rv34_idct suffix
; Emits the function rv34_idct_<suffix>, taking one pointer argument (r0).
; It reads the 16-bit value at [r0], scales it with whatever IDCT_DC expands
; to at instantiation time, and overwrites the 32 bytes at r0 with the low
; 16 bits of the result replicated into every word.
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
    movsx   r1, word [r0]       ; sign-extend the first coefficient
    IDCT_DC r1                  ; scale it (variant chosen by the %define below)
    movd    m0, r1d
    pshufw  m0, m0, 0           ; low word -> all four words of m0
    movq    [r0+0*8], m0
    movq    [r0+1*8], m0
    movq    [r0+2*8], m0
    movq    [r0+3*8], m0
    REP_RET
%endmacro

INIT_MMX mmxext
; rv34_idct_dc: scaling done by IDCT_DC_ROUND
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
; rv34_idct_dc_noround: scaling done by IDCT_DC_NOROUND
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
65
 
66
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
; Scales dc with IDCT_DC_ROUND, then adds it to 4 bytes on each of the 4 rows
; dst, dst+stride, dst+2*stride and dst+3*stride. Results saturate to 0..255.
; r0 = dst, r1 = stride, r2 = dc on entry (r2 is reused as a row pointer).
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
    IDCT_DC_ROUND r2                ; r2 = (dc*13*13 + 0x200) >> 10

    ; Split the signed DC into two unsigned byte vectors so that byte-wise
    ; saturating add/sub can be used:
    ;   low 4 bytes of m0 = DC  clamped to 0..255 (0 when DC is negative)
    ;   low 4 bytes of m1 = -DC clamped to 0..255 (0 when DC is positive)
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0               ; low word of m1 = -DC
    packuswb   m0, m0               ; signed word -> unsigned saturated byte
    punpcklbw  m0, m0
    punpcklwd  m0, m0               ; byte 0 replicated into bytes 0..3
    packuswb   m1, m1
    punpcklbw  m1, m1
    punpcklwd  m1, m1

    ; All four rows are loaded before any of them is written back.
    lea        r2, [r0+r1*2]        ; r2 -> third row
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]

    paddusb    m2, m0               ; + positive part
    psubusb    m2, m1               ; - negative part
    paddusb    m3, m0
    psubusb    m3, m1
    paddusb    m4, m0
    psubusb    m4, m1
    paddusb    m5, m0
    psubusb    m5, m1

    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET
100
 
101
; ROW_TRANSFORM ptr
; Loads the four quadwords b0..b3 at ptr (4 words each), overwrites those
; 32 bytes with zero, and runs the first (row) pass word-wise with saturating
; adds/subs and 16-bit low multiplies.
; Output: mm0 = z0+z3, mm4 = z1+z2, mm6 = z1-z2, mm7 = z0-z3,
;         mm5 = rounder (pd_512)
; Also overwrites mm1, mm2 and mm3.
%macro ROW_TRANSFORM  1
    ; fetch each quadword, then clear it in memory
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova  [%1+ 0*8], mm7
    mova        mm1, [%1+ 1*8]
    mova  [%1+ 1*8], mm7
    mova        mm2, [%1+ 2*8]
    mova  [%1+ 2*8], mm7
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 3*8], mm7

    ; even part
    mova        mm6, [pw_row_coeffs+ 0]
    mova        mm4, mm0
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; z0 = 13*(b0 + b2)
    pmullw      mm4, mm6                ; z1 = 13*(b0 - b2)

    ; odd part
    mova        mm5, mm1
    mova        mm7, mm3
    pmullw      mm1, [pw_row_coeffs+ 8] ; 17*b1
    pmullw      mm3, [pw_row_coeffs+ 8] ; 17*b3
    pmullw      mm5, [pw_row_coeffs+16] ;  7*b1
    pmullw      mm7, [pw_row_coeffs+16] ;  7*b3
    paddsw      mm1, mm7                ; z3 = 17*b1 +  7*b3
    psubsw      mm5, mm3                ; z2 =  7*b1 - 17*b3

    ; butterflies
    mova        mm7, mm0
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    mova        mm6, mm4
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2

    mova        mm5, [pd_512]           ; rounder for the second pass
%endmacro
135
 
136
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
137
; COL_TRANSFORM dst, src, even_coeffs, odd_coeffs
;   %1  4-byte memory operand: read, updated and written back
;   %2  MMX register holding four 16-bit inputs c0..c3 (overwritten)
;   %3  pmaddwd operand for the even inputs (13,13 | 13,-13)
;   %4  pmaddwd operand for the odd inputs  (17, 7 |  7,-17)
; Expects the rounder in mm5. Overwrites mm1, mm2 and mm3.
; Computes four 32-bit sums, shifts each right by 10, adds them to the four
; bytes at %1 and stores the result with unsigned byte saturation.
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 11011101b   ; c1 c3 c1 c3
    pshufw       %2, %2, 10001000b   ; c0 c2 c0 c2
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pshufw      mm2, mm3, 0x4E       ; dwords swapped: z2 | z3
    paddd        %2, mm5             ; fold the rounder into z0 and z1
    pshufw      mm1,  %2, 0x4E       ; dwords swapped: z1 | z0
    paddd        %2, mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    psrad        %2, 10
    psrad       mm1, 10
    packssdw     %2, mm1             ; four signed 16-bit residuals
    pxor        mm2, mm2
    movd        mm3, %1              ; current destination bytes
    punpcklbw   mm3, mm2             ; zero-extend them to words
    paddw        %2, mm3
    packuswb     %2, %2              ; saturate to 0..255
    movd         %1, %2
%endmacro
157
; rv34_idct_add: named arguments d, s, b (used below as dq, sq, bq).
; Runs ROW_TRANSFORM on the block at b (which also zeroes it), then applies
; COL_TRANSFORM to the four destination rows d, d+s, d+2*s and d+3*s.
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq              ; results in mm0, mm4, mm6, mm7; rounder in mm5
    ; mm0 still holds data for the first call, so its coefficients come
    ; straight from memory; afterwards mm0 is free to cache the even pairs.
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova               mm0, [pw_col_coeffs+ 0]
    ; mm4 is this call's data register; once consumed it caches the odd pairs.
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq] ; advance to the third row
    mova               mm4, [pw_col_coeffs+ 8]
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret
168
 
169
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; Scales dc with IDCT_DC_ROUND, then adds it to 4 bytes on each of the 4 rows
; dst, dst+stride, dst+2*stride and dst+3*stride. Results saturate to 0..255.
; r0 = dst, r1 = stride, r2 = dc on entry (r2 is reused as a row pointer).
INIT_XMM sse4
cglobal rv34_idct_dc_add, 3, 3, 6
    ; calculate DC and replicate it into all eight words of m0
    IDCT_DC_ROUND r2                ; r2 = (dc*13*13 + 0x200) >> 10
    movd       m0, r2d
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0

    ; load data: four bytes from each row
    pxor       m1, m1               ; zero, used to widen bytes to words
    lea        r2, [r0+r1*2]        ; r2 -> third row
    movd       m2, [r0]
    movd       m3, [r0+r1]
    movd       m4, [r2]
    movd       m5, [r2+r1]

    ; rows 0|1 in m2 and rows 2|3 in m4, widened to 16-bit
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1

    ; add DC and narrow back to bytes with unsigned saturation
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4               ; dwords 0..3 = rows 0..3

    ; store one dword per row
    movd      [r0], m2
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
    RET