;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje
;*                    Kieran Kunhya
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | |||
23 | %include "libavutil/x86/x86util.asm" |
||
24 | |||
25 | SECTION_RODATA |
||
26 | |||
27 | minshort: times 8 dw 0x8000 |
||
28 | yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 |
||
29 | yuv2yuvX_10_start: times 4 dd 0x10000 |
||
30 | yuv2yuvX_9_start: times 4 dd 0x20000 |
||
31 | yuv2yuvX_10_upper: times 8 dw 0x3ff |
||
32 | yuv2yuvX_9_upper: times 8 dw 0x1ff |
||
33 | pd_4: times 4 dd 4 |
||
34 | pd_4min0x40000:times 4 dd 4 - (0x40000) |
||
35 | pw_16: times 8 dw 16 |
||
36 | pw_32: times 8 dw 32 |
||
37 | pw_512: times 8 dw 512 |
||
38 | pw_1024: times 8 dw 1024 |
||
39 | |||
40 | SECTION .text |
||
41 | |||
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
||
57 | |||
58 | %macro yuv2planeX_fn 3 |
||
59 | |||
60 | %if ARCH_X86_32 |
||
61 | %define cntr_reg fltsizeq |
||
62 | %define movsx mov |
||
63 | %else |
||
64 | %define cntr_reg r7 |
||
65 | %define movsx movsxd |
||
66 | %endif |
||
67 | |||
68 | cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset |
||
69 | %if %1 == 8 || %1 == 9 || %1 == 10 |
||
70 | pxor m6, m6 |
||
71 | %endif ; %1 == 8/9/10 |
||
72 | |||
73 | %if %1 == 8 |
||
74 | %if ARCH_X86_32 |
||
75 | %assign pad 0x2c - (stack_offset & 15) |
||
76 | SUB rsp, pad |
||
77 | %define m_dith m7 |
||
78 | %else ; x86-64 |
||
79 | %define m_dith m9 |
||
80 | %endif ; x86-32 |
||
81 | |||
82 | ; create registers holding dither |
||
83 | movq m_dith, [ditherq] ; dither |
||
84 | test offsetd, offsetd |
||
85 | jz .no_rot |
||
86 | %if mmsize == 16 |
||
87 | punpcklqdq m_dith, m_dith |
||
88 | %endif ; mmsize == 16 |
||
89 | PALIGNR m_dith, m_dith, 3, m0 |
||
90 | .no_rot: |
||
91 | %if mmsize == 16 |
||
92 | punpcklbw m_dith, m6 |
||
93 | %if ARCH_X86_64 |
||
94 | punpcklwd m8, m_dith, m6 |
||
95 | pslld m8, 12 |
||
96 | %else ; x86-32 |
||
97 | punpcklwd m5, m_dith, m6 |
||
98 | pslld m5, 12 |
||
99 | %endif ; x86-32/64 |
||
100 | punpckhwd m_dith, m6 |
||
101 | pslld m_dith, 12 |
||
102 | %if ARCH_X86_32 |
||
103 | mova [rsp+ 0], m5 |
||
104 | mova [rsp+16], m_dith |
||
105 | %endif |
||
106 | %else ; mmsize == 8 |
||
107 | punpcklbw m5, m_dith, m6 |
||
108 | punpckhbw m_dith, m6 |
||
109 | punpcklwd m4, m5, m6 |
||
110 | punpckhwd m5, m6 |
||
111 | punpcklwd m3, m_dith, m6 |
||
112 | punpckhwd m_dith, m6 |
||
113 | pslld m4, 12 |
||
114 | pslld m5, 12 |
||
115 | pslld m3, 12 |
||
116 | pslld m_dith, 12 |
||
117 | mova [rsp+ 0], m4 |
||
118 | mova [rsp+ 8], m5 |
||
119 | mova [rsp+16], m3 |
||
120 | mova [rsp+24], m_dith |
||
121 | %endif ; mmsize == 8/16 |
||
122 | %endif ; %1 == 8 |
||
123 | |||
124 | xor r5, r5 |
||
125 | |||
126 | .pixelloop: |
||
127 | %assign %%i 0 |
||
128 | ; the rep here is for the 8bit output mmx case, where dither covers |
||
129 | ; 8 pixels but we can only handle 2 pixels per register, and thus 4 |
||
130 | ; pixels per iteration. In order to not have to keep track of where |
||
131 | ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. |
||
132 | %if %1 == 8 |
||
133 | %assign %%repcnt 16/mmsize |
||
134 | %else |
||
135 | %assign %%repcnt 1 |
||
136 | %endif |
||
137 | |||
138 | %rep %%repcnt |
||
139 | |||
140 | %if %1 == 8 |
||
141 | %if ARCH_X86_32 |
||
142 | mova m2, [rsp+mmsize*(0+%%i)] |
||
143 | mova m1, [rsp+mmsize*(1+%%i)] |
||
144 | %else ; x86-64 |
||
145 | mova m2, m8 |
||
146 | mova m1, m_dith |
||
147 | %endif ; x86-32/64 |
||
148 | %else ; %1 == 9/10/16 |
||
149 | mova m1, [yuv2yuvX_%1_start] |
||
150 | mova m2, m1 |
||
151 | %endif ; %1 == 8/9/10/16 |
||
152 | movsx cntr_reg, fltsizem |
||
153 | .filterloop_ %+ %%i: |
||
154 | ; input pixels |
||
155 | mov r6, [srcq+gprsize*cntr_reg-2*gprsize] |
||
156 | %if %1 == 16 |
||
157 | mova m3, [r6+r5*4] |
||
158 | mova m5, [r6+r5*4+mmsize] |
||
159 | %else ; %1 == 8/9/10 |
||
160 | mova m3, [r6+r5*2] |
||
161 | %endif ; %1 == 8/9/10/16 |
||
162 | mov r6, [srcq+gprsize*cntr_reg-gprsize] |
||
163 | %if %1 == 16 |
||
164 | mova m4, [r6+r5*4] |
||
165 | mova m6, [r6+r5*4+mmsize] |
||
166 | %else ; %1 == 8/9/10 |
||
167 | mova m4, [r6+r5*2] |
||
168 | %endif ; %1 == 8/9/10/16 |
||
169 | |||
170 | ; coefficients |
||
171 | movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1] |
||
172 | %if %1 == 16 |
||
173 | pshuflw m7, m0, 0 ; coeff[0] |
||
174 | pshuflw m0, m0, 0x55 ; coeff[1] |
||
175 | pmovsxwd m7, m7 ; word -> dword |
||
176 | pmovsxwd m0, m0 ; word -> dword |
||
177 | |||
178 | pmulld m3, m7 |
||
179 | pmulld m5, m7 |
||
180 | pmulld m4, m0 |
||
181 | pmulld m6, m0 |
||
182 | |||
183 | paddd m2, m3 |
||
184 | paddd m1, m5 |
||
185 | paddd m2, m4 |
||
186 | paddd m1, m6 |
||
187 | %else ; %1 == 10/9/8 |
||
188 | punpcklwd m5, m3, m4 |
||
189 | punpckhwd m3, m4 |
||
190 | SPLATD m0 |
||
191 | |||
192 | pmaddwd m5, m0 |
||
193 | pmaddwd m3, m0 |
||
194 | |||
195 | paddd m2, m5 |
||
196 | paddd m1, m3 |
||
197 | %endif ; %1 == 8/9/10/16 |
||
198 | |||
199 | sub cntr_reg, 2 |
||
200 | jg .filterloop_ %+ %%i |
||
201 | |||
202 | %if %1 == 16 |
||
203 | psrad m2, 31 - %1 |
||
204 | psrad m1, 31 - %1 |
||
205 | %else ; %1 == 10/9/8 |
||
206 | psrad m2, 27 - %1 |
||
207 | psrad m1, 27 - %1 |
||
208 | %endif ; %1 == 8/9/10/16 |
||
209 | |||
210 | %if %1 == 8 |
||
211 | packssdw m2, m1 |
||
212 | packuswb m2, m2 |
||
213 | movh [dstq+r5*1], m2 |
||
214 | %else ; %1 == 9/10/16 |
||
215 | %if %1 == 16 |
||
216 | packssdw m2, m1 |
||
217 | paddw m2, [minshort] |
||
218 | %else ; %1 == 9/10 |
||
219 | %if cpuflag(sse4) |
||
220 | packusdw m2, m1 |
||
221 | %else ; mmxext/sse2 |
||
222 | packssdw m2, m1 |
||
223 | pmaxsw m2, m6 |
||
224 | %endif ; mmxext/sse2/sse4/avx |
||
225 | pminsw m2, [yuv2yuvX_%1_upper] |
||
226 | %endif ; %1 == 9/10/16 |
||
227 | mova [dstq+r5*2], m2 |
||
228 | %endif ; %1 == 8/9/10/16 |
||
229 | |||
230 | add r5, mmsize/2 |
||
231 | sub wd, mmsize/2 |
||
232 | |||
233 | %assign %%i %%i+2 |
||
234 | %endrep |
||
235 | jg .pixelloop |
||
236 | |||
237 | %if %1 == 8 |
||
238 | %if ARCH_X86_32 |
||
239 | ADD rsp, pad |
||
240 | RET |
||
241 | %else ; x86-64 |
||
242 | REP_RET |
||
243 | %endif ; x86-32/64 |
||
244 | %else ; %1 == 9/10/16 |
||
245 | REP_RET |
||
246 | %endif ; %1 == 8/9/10/16 |
||
247 | %endmacro |
||
248 | |||
249 | %if ARCH_X86_32 |
||
250 | INIT_MMX mmxext |
||
251 | yuv2planeX_fn 8, 0, 7 |
||
252 | yuv2planeX_fn 9, 0, 5 |
||
253 | yuv2planeX_fn 10, 0, 5 |
||
254 | %endif |
||
255 | |||
256 | INIT_XMM sse2 |
||
257 | yuv2planeX_fn 8, 10, 7 |
||
258 | yuv2planeX_fn 9, 7, 5 |
||
259 | yuv2planeX_fn 10, 7, 5 |
||
260 | |||
261 | INIT_XMM sse4 |
||
262 | yuv2planeX_fn 8, 10, 7 |
||
263 | yuv2planeX_fn 9, 7, 5 |
||
264 | yuv2planeX_fn 10, 7, 5 |
||
265 | yuv2planeX_fn 16, 8, 5 |
||
266 | |||
267 | %if HAVE_AVX_EXTERNAL |
||
268 | INIT_XMM avx |
||
269 | yuv2planeX_fn 8, 10, 7 |
||
270 | yuv2planeX_fn 9, 7, 5 |
||
271 | yuv2planeX_fn 10, 7, 5 |
||
272 | %endif |
||
273 | |||
274 | ; %1=outout-bpc, %2=alignment (u/a) |
||
275 | %macro yuv2plane1_mainloop 2 |
||
276 | .loop_%2: |
||
277 | %if %1 == 8 |
||
278 | paddsw m0, m2, [srcq+wq*2+mmsize*0] |
||
279 | paddsw m1, m3, [srcq+wq*2+mmsize*1] |
||
280 | psraw m0, 7 |
||
281 | psraw m1, 7 |
||
282 | packuswb m0, m1 |
||
283 | mov%2 [dstq+wq], m0 |
||
284 | %elif %1 == 16 |
||
285 | paddd m0, m4, [srcq+wq*4+mmsize*0] |
||
286 | paddd m1, m4, [srcq+wq*4+mmsize*1] |
||
287 | paddd m2, m4, [srcq+wq*4+mmsize*2] |
||
288 | paddd m3, m4, [srcq+wq*4+mmsize*3] |
||
289 | psrad m0, 3 |
||
290 | psrad m1, 3 |
||
291 | psrad m2, 3 |
||
292 | psrad m3, 3 |
||
293 | %if cpuflag(sse4) ; avx/sse4 |
||
294 | packusdw m0, m1 |
||
295 | packusdw m2, m3 |
||
296 | %else ; mmx/sse2 |
||
297 | packssdw m0, m1 |
||
298 | packssdw m2, m3 |
||
299 | paddw m0, m5 |
||
300 | paddw m2, m5 |
||
301 | %endif ; mmx/sse2/sse4/avx |
||
302 | mov%2 [dstq+wq*2+mmsize*0], m0 |
||
303 | mov%2 [dstq+wq*2+mmsize*1], m2 |
||
304 | %else ; %1 == 9/10 |
||
305 | paddsw m0, m2, [srcq+wq*2+mmsize*0] |
||
306 | paddsw m1, m2, [srcq+wq*2+mmsize*1] |
||
307 | psraw m0, 15 - %1 |
||
308 | psraw m1, 15 - %1 |
||
309 | pmaxsw m0, m4 |
||
310 | pmaxsw m1, m4 |
||
311 | pminsw m0, m3 |
||
312 | pminsw m1, m3 |
||
313 | mov%2 [dstq+wq*2+mmsize*0], m0 |
||
314 | mov%2 [dstq+wq*2+mmsize*1], m1 |
||
315 | %endif |
||
316 | add wq, mmsize |
||
317 | jl .loop_%2 |
||
318 | %endmacro |
||
319 | |||
320 | %macro yuv2plane1_fn 3 |
||
321 | cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset |
||
322 | movsxdifnidn wq, wd |
||
323 | add wq, mmsize - 1 |
||
324 | and wq, ~(mmsize - 1) |
||
325 | %if %1 == 8 |
||
326 | add dstq, wq |
||
327 | %else ; %1 != 8 |
||
328 | lea dstq, [dstq+wq*2] |
||
329 | %endif ; %1 == 8 |
||
330 | %if %1 == 16 |
||
331 | lea srcq, [srcq+wq*4] |
||
332 | %else ; %1 != 16 |
||
333 | lea srcq, [srcq+wq*2] |
||
334 | %endif ; %1 == 16 |
||
335 | neg wq |
||
336 | |||
337 | %if %1 == 8 |
||
338 | pxor m4, m4 ; zero |
||
339 | |||
340 | ; create registers holding dither |
||
341 | movq m3, [ditherq] ; dither |
||
342 | test offsetd, offsetd |
||
343 | jz .no_rot |
||
344 | %if mmsize == 16 |
||
345 | punpcklqdq m3, m3 |
||
346 | %endif ; mmsize == 16 |
||
347 | PALIGNR m3, m3, 3, m2 |
||
348 | .no_rot: |
||
349 | %if mmsize == 8 |
||
350 | mova m2, m3 |
||
351 | punpckhbw m3, m4 ; byte->word |
||
352 | punpcklbw m2, m4 ; byte->word |
||
353 | %else |
||
354 | punpcklbw m3, m4 |
||
355 | mova m2, m3 |
||
356 | %endif |
||
357 | %elif %1 == 9 |
||
358 | pxor m4, m4 |
||
359 | mova m3, [pw_512] |
||
360 | mova m2, [pw_32] |
||
361 | %elif %1 == 10 |
||
362 | pxor m4, m4 |
||
363 | mova m3, [pw_1024] |
||
364 | mova m2, [pw_16] |
||
365 | %else ; %1 == 16 |
||
366 | %if cpuflag(sse4) ; sse4/avx |
||
367 | mova m4, [pd_4] |
||
368 | %else ; mmx/sse2 |
||
369 | mova m4, [pd_4min0x40000] |
||
370 | mova m5, [minshort] |
||
371 | %endif ; mmx/sse2/sse4/avx |
||
372 | %endif ; %1 == .. |
||
373 | |||
374 | ; actual pixel scaling |
||
375 | %if mmsize == 8 |
||
376 | yuv2plane1_mainloop %1, a |
||
377 | %else ; mmsize == 16 |
||
378 | test dstq, 15 |
||
379 | jnz .unaligned |
||
380 | yuv2plane1_mainloop %1, a |
||
381 | REP_RET |
||
382 | .unaligned: |
||
383 | yuv2plane1_mainloop %1, u |
||
384 | %endif ; mmsize == 8/16 |
||
385 | REP_RET |
||
386 | %endmacro |
||
387 | |||
388 | %if ARCH_X86_32 |
||
389 | INIT_MMX mmx |
||
390 | yuv2plane1_fn 8, 0, 5 |
||
391 | yuv2plane1_fn 16, 0, 3 |
||
392 | |||
393 | INIT_MMX mmxext |
||
394 | yuv2plane1_fn 9, 0, 3 |
||
395 | yuv2plane1_fn 10, 0, 3 |
||
396 | %endif |
||
397 | |||
398 | INIT_XMM sse2 |
||
399 | yuv2plane1_fn 8, 5, 5 |
||
400 | yuv2plane1_fn 9, 5, 3 |
||
401 | yuv2plane1_fn 10, 5, 3 |
||
402 | yuv2plane1_fn 16, 6, 3 |
||
403 | |||
404 | INIT_XMM sse4 |
||
405 | yuv2plane1_fn 16, 5, 3 |
||
406 | |||
407 | %if HAVE_AVX_EXTERNAL |
||
408 | INIT_XMM avx |
||
409 | yuv2plane1_fn 8, 5, 5 |
||
410 | yuv2plane1_fn 9, 5, 3 |
||
411 | yuv2plane1_fn 10, 5, 3 |
||
412 | yuv2plane1_fn 16, 5, 3 |
||
413 | %endif |