Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | ;***************************************************************************** |
2 | ;* SSE2/SSE4-optimized 10-bit H.264 weighted prediction code |
||
3 | ;***************************************************************************** |
||
4 | ;* Copyright (C) 2005-2011 x264 project |
||
5 | ;* |
||
6 | ;* Authors: Daniel Kang |
||
7 | ;* |
||
8 | ;* This file is part of FFmpeg. |
||
9 | ;* |
||
10 | ;* FFmpeg is free software; you can redistribute it and/or |
||
11 | ;* modify it under the terms of the GNU Lesser General Public |
||
12 | ;* License as published by the Free Software Foundation; either |
||
13 | ;* version 2.1 of the License, or (at your option) any later version. |
||
14 | ;* |
||
15 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
18 | ;* Lesser General Public License for more details. |
||
19 | ;* |
||
20 | ;* You should have received a copy of the GNU Lesser General Public |
||
21 | ;* License along with FFmpeg; if not, write to the Free Software |
||
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
23 | ;****************************************************************************** |
||
24 | |||
25 | %include "libavutil/x86/x86util.asm" |
||
26 | |||
27 | SECTION_RODATA 32 ; read-only constants (32 is presumably the section alignment -- confirm in x86inc) |
||
28 | |||
29 | pw_pixel_max: times 8 dw ((1 << 10)-1) ; 8 words of 1023, the largest 10-bit value; used as the upper clamp |
||
30 | sq_1: dq 1 ; 128-bit constant 1 (low qword 1, high qword 0 below); added to the shift count |
||
31 | dq 0 |
||
32 | |||
33 | cextern pw_1 ; defined outside this file; WEIGHT_SETUP loads a vector from it |
||
34 | |||
35 | SECTION .text |
||
36 | |||
37 | ;----------------------------------------------------------------------------- |
||
38 | ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, |
||
39 | ; int weight, int offset); |
||
40 | ;----------------------------------------------------------------------------- |
||
41 | %macro WEIGHT_PROLOGUE 0 ; common entry code of the h264_weight_*_10 functions: get the arguments into registers |
||
42 | .prologue: |
||
43 | PROLOGUE 0,6,8 ; macro from the included helpers; presumably 0 auto-loaded args, 6 GPRs, 8 vector regs -- confirm |
||
44 | movifnidn r0, r0mp ; arg 0 (dst) |
||
45 | movifnidn r1d, r1m ; arg 1 (stride) |
||
46 | movifnidn r2d, r2m ; arg 2 (height) |
||
47 | movifnidn r4d, r4m ; arg 4 (weight); arg 3 (log2_denom) is not loaded here, WEIGHT_SETUP reads r3m itself |
||
48 | movifnidn r5d, r5m ; arg 5 (offset) |
||
49 | %endmacro |
||
50 | |||
51 | %macro WEIGHT_SETUP 0 ; build the loop-invariant vectors used by WEIGHT_OP (m0, m2, m3, m4 and, without SSE4, m7) |
||
52 | mova m0, [pw_1] |
||
53 | movd m2, r3m ; log2_denom |
||
54 | pslld m0, m2 ; 1<<log2_denom |
||
55 | SPLATW m0, m0 ; external macro; presumably replicates the low word across m0 -- confirm |
||
56 | shl r5, 19 ; *8, move to upper half of dword |
||
57 | lea r5, [r5+r4*2+0x10000] ; low word += weight*2, high word += 1. NOTE(review): a negative weight borrows from the high word and cancels the +1 -- confirm the weight range callers pass |
||
58 | movd m3, r5d ; weight<<1 | 1+(offset<<(3)) |
||
59 | pshufd m3, m3, 0 ; copy dword 0 to all four dwords |
||
60 | mova m4, [pw_pixel_max] ; upper clamp |
||
61 | paddw m2, [sq_1] ; log2_denom+1 |
||
62 | %if notcpuflag(sse4) |
||
63 | pxor m7, m7 ; zero vector; second operand of CLIPW in the non-SSE4 path of WEIGHT_OP (presumably the lower clamp) |
||
64 | %endif |
||
65 | %endmacro |
||
66 | |||
67 | %macro WEIGHT_OP 1-2 ; weight one vector of samples using the registers prepared by WEIGHT_SETUP, result in m5. 1 arg: full vector at [r0+%1]; 2 args: 8 bytes each at [r0+%1] and [r0+%2] |
||
68 | %if %0==1 |
||
69 | mova m5, [r0+%1] |
||
70 | punpckhwd m6, m5, m0 ; m6 = high words of the samples interleaved with the words of m0 |
||
71 | punpcklwd m5, m0 ; m5 = low words of the samples interleaved with the words of m0 |
||
72 | %else |
||
73 | movq m5, [r0+%1] ; 8 bytes from the first location |
||
74 | movq m6, [r0+%2] ; 8 bytes from the second location |
||
75 | punpcklwd m5, m0 |
||
76 | punpcklwd m6, m0 |
||
77 | %endif |
||
78 | pmaddwd m5, m3 ; per dword: sample * low word of m3 + word of m0 * high word of m3 |
||
79 | pmaddwd m6, m3 |
||
80 | psrad m5, m2 ; arithmetic shift right by the count in m2 |
||
81 | psrad m6, m2 |
||
82 | %if cpuflag(sse4) |
||
83 | packusdw m5, m6 ; dwords to words with unsigned saturation, so negative results become 0 |
||
84 | pminsw m5, m4 ; cap at the values in m4 |
||
85 | %else |
||
86 | packssdw m5, m6 ; dwords to words with signed saturation |
||
87 | CLIPW m5, m7, m4 ; external macro; presumably clamps the words of m5 to [m7, m4] -- confirm |
||
88 | %endif |
||
89 | %endmacro |
||
90 | |||
91 | %macro WEIGHT_FUNC_DBL 0 ; emits h264_weight_16_10: per row, two 16-byte vectors (32 bytes) are weighted in place |
||
92 | cglobal h264_weight_16_10 |
||
93 | WEIGHT_PROLOGUE |
||
94 | WEIGHT_SETUP |
||
95 | .nextrow: |
||
96 | WEIGHT_OP 0 |
||
97 | mova [r0 ], m5 |
||
98 | WEIGHT_OP 16 |
||
99 | mova [r0+16], m5 |
||
100 | add r0, r1 ; advance dst by stride |
||
101 | dec r2d ; one row done |
||
102 | jnz .nextrow ; the count is tested only after a row has been processed, so height 0 is not handled |
||
103 | REP_RET |
||
104 | %endmacro |
||
105 | |||
106 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
107 | WEIGHT_FUNC_DBL |
||
108 | INIT_XMM sse4 |
||
109 | WEIGHT_FUNC_DBL |
||
110 | |||
111 | |||
112 | %macro WEIGHT_FUNC_MM 0 ; emits h264_weight_8_10: per row, one 16-byte vector is weighted in place |
||
113 | cglobal h264_weight_8_10 |
||
114 | WEIGHT_PROLOGUE |
||
115 | WEIGHT_SETUP |
||
116 | .nextrow: |
||
117 | WEIGHT_OP 0 |
||
118 | mova [r0], m5 |
||
119 | add r0, r1 ; advance dst by stride |
||
120 | dec r2d ; one row done |
||
121 | jnz .nextrow ; the count is tested only after a row has been processed, so height 0 is not handled |
||
122 | REP_RET |
||
123 | %endmacro |
||
124 | |||
125 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
126 | WEIGHT_FUNC_MM |
||
127 | INIT_XMM sse4 |
||
128 | WEIGHT_FUNC_MM |
||
129 | |||
130 | |||
131 | %macro WEIGHT_FUNC_HALF_MM 0 ; emits h264_weight_4_10: 8 bytes per row, two rows are handled in one vector per iteration |
||
132 | cglobal h264_weight_4_10 |
||
133 | WEIGHT_PROLOGUE |
||
134 | sar r2d, 1 ; two rows per iteration, so loop height/2 times (no check that height is even and at least 2) |
||
135 | WEIGHT_SETUP |
||
136 | lea r3, [r1*2] ; r3 = 2*stride; r3 is overwritten only after WEIGHT_SETUP has read log2_denom through r3m |
||
137 | .nextrow: |
||
138 | WEIGHT_OP 0, r1 ; rows at r0 and r0+r1 |
||
139 | movh [r0], m5 ; result for the row at r0 |
||
140 | movhps [r0+r1], m5 ; high 8 bytes: result for the row at r0+r1 |
||
141 | add r0, r3 ; advance dst by two rows |
||
142 | dec r2d |
||
143 | jnz .nextrow |
||
144 | REP_RET |
||
145 | %endmacro |
||
146 | |||
147 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
148 | WEIGHT_FUNC_HALF_MM |
||
149 | INIT_XMM sse4 |
||
150 | WEIGHT_FUNC_HALF_MM |
||
151 | |||
152 | |||
153 | ;----------------------------------------------------------------------------- |
||
154 | ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, |
||
155 | ; int log2_denom, int weightd, int weights, int offset); |
||
156 | ;----------------------------------------------------------------------------- |
||
157 | %if ARCH_X86_32 ; choose the register behind the temporary t0 used by the biweight macros |
||
158 | DECLARE_REG_TMP 3 ; presumably makes t0 an alias of r3 on 32-bit x86 -- confirm against x86inc |
||
159 | %else |
||
160 | DECLARE_REG_TMP 7 ; presumably makes t0 an alias of r7 otherwise |
||
161 | %endif |
||
162 | |||
163 | %macro BIWEIGHT_PROLOGUE 0 ; common entry code of the h264_biweight_*_10 functions: get the arguments into registers |
||
164 | .prologue: |
||
165 | PROLOGUE 0,8,8 ; macro from the included helpers; presumably 0 auto-loaded args, 8 GPRs, 8 vector regs -- confirm |
||
166 | movifnidn r0, r0mp ; arg 0 (dst) |
||
167 | movifnidn r1, r1mp ; arg 1 (src) |
||
168 | movifnidn r2d, r2m ; arg 2 (stride) |
||
169 | movifnidn r5d, r5m ; arg 5 (weightd) |
||
170 | movifnidn r6d, r6m ; arg 6 (weights) |
||
171 | movifnidn t0d, r7m ; arg 7 (offset) goes to the temporary t0; args 3 and 4 are fetched later by BIWEIGHT_SETUP |
||
172 | %endmacro |
||
173 | |||
174 | %macro BIWEIGHT_SETUP 0 ; build the loop-invariant vectors used by BIWEIGHT (m3, m4, m5, m6 and, without SSE4, m7) and load height into r3d |
||
175 | lea t0, [t0*4+1] ; (offset<<2)+1 |
||
176 | or t0, 1 ; bit 0 is already set by the +1 above, so the value is unchanged |
||
177 | shl r6, 16 ; weights to the upper half of the dword |
||
178 | or r5, r6 ; NOTE(review): assumes weightd has no bits set above bit 15 (e.g. is not negative) -- confirm the range callers pass |
||
179 | movd m4, r5d ; weightd | weights |
||
180 | movd m5, t0d ; ((offset<<2)+1)|1 |
||
181 | movd m6, r4m ; log2_denom |
||
182 | pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom |
||
183 | paddd m6, [sq_1] ; log2_denom+1 |
||
184 | pshufd m4, m4, 0 ; copy dword 0 to all four dwords |
||
185 | pshufd m5, m5, 0 |
||
186 | mova m3, [pw_pixel_max] ; upper clamp |
||
187 | movifnidn r3d, r3m ; height; loaded only after t0 has been copied into m5 (t0 presumably aliases r3 on 32-bit x86, see DECLARE_REG_TMP) |
||
188 | %if notcpuflag(sse4) |
||
189 | pxor m7, m7 ; zero vector; second operand of CLIPW in the non-SSE4 path of BIWEIGHT (presumably the lower clamp) |
||
190 | %endif |
||
191 | %endmacro |
||
192 | |||
193 | %macro BIWEIGHT 1-2 ; blend dst and src samples using the registers prepared by BIWEIGHT_SETUP, result in m0. 1 arg: full vectors at [r0+%1] and [r1+%1]; 2 args: 8 bytes each at offsets %1 and %2 |
||
194 | %if %0==1 |
||
195 | mova m0, [r0+%1] ; dst samples |
||
196 | mova m1, [r1+%1] ; src samples |
||
197 | punpckhwd m2, m0, m1 ; m2 = high dst/src words interleaved |
||
198 | punpcklwd m0, m1 ; m0 = low dst/src words interleaved |
||
199 | %else |
||
200 | movq m0, [r0+%1] ; dst, first location |
||
201 | movq m1, [r1+%1] ; src, first location |
||
202 | punpcklwd m0, m1 |
||
203 | movq m2, [r0+%2] ; dst, second location |
||
204 | movq m1, [r1+%2] ; src, second location |
||
205 | punpcklwd m2, m1 |
||
206 | %endif |
||
207 | pmaddwd m0, m4 ; per dword: dst * low word of m4 + src * high word of m4 |
||
208 | pmaddwd m2, m4 |
||
209 | paddd m0, m5 ; add the constant prepared in m5 |
||
210 | paddd m2, m5 |
||
211 | psrad m0, m6 ; arithmetic shift right by the count in m6 |
||
212 | psrad m2, m6 |
||
213 | %if cpuflag(sse4) |
||
214 | packusdw m0, m2 ; dwords to words with unsigned saturation, so negative results become 0 |
||
215 | pminsw m0, m3 ; cap at the values in m3 |
||
216 | %else |
||
217 | packssdw m0, m2 ; dwords to words with signed saturation |
||
218 | CLIPW m0, m7, m3 ; external macro; presumably clamps the words of m0 to [m7, m3] -- confirm |
||
219 | %endif |
||
220 | %endmacro |
||
221 | |||
222 | %macro BIWEIGHT_FUNC_DBL 0 ; emits h264_biweight_16_10: per row, two 16-byte vectors (32 bytes) of dst are blended with src and written back to dst |
||
223 | cglobal h264_biweight_16_10 |
||
224 | BIWEIGHT_PROLOGUE |
||
225 | BIWEIGHT_SETUP |
||
226 | .nextrow: |
||
227 | BIWEIGHT 0 |
||
228 | mova [r0 ], m0 |
||
229 | BIWEIGHT 16 |
||
230 | mova [r0+16], m0 |
||
231 | add r0, r2 ; advance dst by stride |
||
232 | add r1, r2 ; advance src by the same stride |
||
233 | dec r3d ; one row done |
||
234 | jnz .nextrow ; the count is tested only after a row has been processed, so height 0 is not handled |
||
235 | REP_RET |
||
236 | %endmacro |
||
237 | |||
238 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
239 | BIWEIGHT_FUNC_DBL |
||
240 | INIT_XMM sse4 |
||
241 | BIWEIGHT_FUNC_DBL |
||
242 | |||
243 | %macro BIWEIGHT_FUNC 0 ; emits h264_biweight_8_10: per row, one 16-byte vector of dst is blended with src and written back to dst |
||
244 | cglobal h264_biweight_8_10 |
||
245 | BIWEIGHT_PROLOGUE |
||
246 | BIWEIGHT_SETUP |
||
247 | .nextrow: |
||
248 | BIWEIGHT 0 |
||
249 | mova [r0], m0 |
||
250 | add r0, r2 ; advance dst by stride |
||
251 | add r1, r2 ; advance src by the same stride |
||
252 | dec r3d ; one row done |
||
253 | jnz .nextrow ; the count is tested only after a row has been processed, so height 0 is not handled |
||
254 | REP_RET |
||
255 | %endmacro |
||
256 | |||
257 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
258 | BIWEIGHT_FUNC |
||
259 | INIT_XMM sse4 |
||
260 | BIWEIGHT_FUNC |
||
261 | |||
262 | %macro BIWEIGHT_FUNC_HALF 0 ; emits h264_biweight_4_10: 8 bytes per row, two rows are handled in one vector per iteration |
||
263 | cglobal h264_biweight_4_10 |
||
264 | BIWEIGHT_PROLOGUE |
||
265 | BIWEIGHT_SETUP |
||
266 | sar r3d, 1 ; two rows per iteration, so loop height/2 times (no check that height is even and at least 2) |
||
267 | lea r4, [r2*2] ; r4 = 2*stride; r4 is overwritten only after BIWEIGHT_SETUP has read log2_denom through r4m |
||
268 | .nextrow: |
||
269 | BIWEIGHT 0, r2 ; rows at offset 0 and offset r2 of both dst and src |
||
270 | movh [r0 ], m0 ; result for the row at r0 |
||
271 | movhps [r0+r2], m0 ; high 8 bytes: result for the row at r0+r2 |
||
272 | add r0, r4 ; advance dst by two rows |
||
273 | add r1, r4 ; advance src by two rows |
||
274 | dec r3d |
||
275 | jnz .nextrow |
||
276 | REP_RET |
||
277 | %endmacro |
||
278 | |||
279 | INIT_XMM sse2 ; the macro is expanded once per instruction set |
||
280 | BIWEIGHT_FUNC_HALF |
||
281 | INIT_XMM sse4 |
||
282 | BIWEIGHT_FUNC_HALF |