;; NOTE(review): this file was extracted from a ViewVC/SVN revision view
;; (rev 4349); table markup has been stripped to restore plain NASM source.
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
21 | |||
; x86util.asm provides the cglobal/INIT_MMX/INIT_XMM abstraction layer
; (register name mapping, mmsize, cpuflag, RET/REP_RET) used below.
%include "libavutil/x86/x86util.asm"

SECTION .text
25 | |||
; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

; V_COPY_ROW type, h
;   %1 = label suffix (top/body/bottom), %2 = register holding the row count
; Copies %2 rows of w bytes from srcq to dstq, mmsize bytes at a time.
; On entry (set up by vvar_fn below) srcq/dstq point one-past-the-end of a
; row and r7mp holds -w, so wq counts upward from -w toward 0; the final
; unaligned store at [-mmsize] handles the tail by overlapping the previous
; store. src is only advanced for the "body" type; top/bottom re-read the
; same source row. Clobbers m0, wq, flags.
%macro V_COPY_ROW 2
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)
    movu  [dstq-mmsize], m0                     ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro
47 | |||
; void ff_emu_edge_vvar(uint8_t *dst, x86_reg dst_stride,
;                       const uint8_t *src, x86_reg src_stride,
;                       x86_reg start_y, x86_reg end_y, x86_reg bh, x86_reg w)
; Variable-width vertical edge extension:
;   .----. <- zero
;   |    | <- top is copied from first line in body of source
;   |----| <- start_y
;   |    | <- body is copied verbatim (line-by-line) from source
;   |----| <- end_y
;   |    | <- bottom is copied from last line in body of source
;   '----' <- bh
%macro vvar_fn 0
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32: too few registers; load args from the stack (r*mp) instead
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y (rows below body)
    sub          end_yq, start_yq               ; end_y -= start_y (body row count)
    ; bias pointers to end-of-row and negate w so V_COPY_ROW can count
    ; an index register up toward zero (see V_COPY_ROW above)
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride (last body row)
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET
%endmacro
85 | |||
; Instantiate emu_edge_vvar: MMX version only needed on x86-32
; (x86-64 always has SSE), plus an SSE version for both arches.
%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif

INIT_XMM sse
vvar_fn
93 | |||
; void ff_emu_edge_hvar(uint8_t *dst, x86_reg dst_stride,
;                       x86_reg start_x, x86_reg n_words, x86_reg h)
; Variable-width horizontal edge extension: for each of h rows, splat the
; byte at dst[start_x] across n_words*2 bytes starting at dst. Pointers are
; biased to the end of the run and the count negated so the inner loop
; counts up toward zero; the final store at [-mmsize] overlaps to cover
; the tail. Clobbers m0, wq, flags.
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
    ; FIXME also write a ssse3 version using pshufb
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
    imul             wd, 0x01010101             ;   w *= 0x01010101 (byte -> dword splat)
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
%if cpuflag(sse2)
    pshufd           m0, m0, q0000              ;   splat dword across xmm
%else ; mmx
    punpckldq        m0, m0                     ;   splat dword across mm
%endif ; mmx/sse
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -mmsize/2              ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize) (tail, overlaps)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (h--)
    jnz .y_loop
    RET
%endmacro
121 | |||
; Instantiate emu_edge_hvar: MMX fallback on x86-32, SSE2 everywhere
; (sse2 needed for pshufd in hvar_fn).
%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif

INIT_XMM sse2
hvar_fn
129 | |||
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse,  - fills xmm0-15 for consecutive sets of 16 pixels
;          - if (%2 & 8)  fills 8 bytes into xmm$next
;          - if (%2 & 4)  fills 4 bytes into xmm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; on mmx,  - fills mm0-7 for consecutive sets of 8 pixels
;          - if (%2 & 4)  fills 4 bytes into mm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out is in the same way

; READ_NUM_BYTES type, n
;   %1 = top/body/bottom (selects how the awkward n%4==3 tail is read),
;   %2 = compile-time byte count.
; Register allocation is done at assembly time via %%mmx_idx/%%xmm_idx;
; WRITE_NUM_BYTES below must be expanded with the same %1/%2 so it walks
; the registers in the same order.
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

; full mmsize-wide loads
%rep %2/mmsize
%if mmsize == 16
    movu   xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

; 8-byte tail: either an overlapping 16-byte load (when a full xmm load
; already covered the start) or a movq into the next mmx register
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   xmm %+ %%xmm_idx, [srcq+%2-16]       ; overlapping tail load
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

; 4-byte tail, same overlapping-load trick with movq
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    mm %+ %%mmx_idx, [srcq+%2-8]        ; overlapping tail load
%assign %%off %2
%else
    movd    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

; 1-3 byte tail; when %2 >= 4 an overlapping 4-byte load is safe. The
; 3-byte case differs between body (read 3 bytes + 1 past-the-end byte
; into vald; safe because the next row follows) and top/bottom (movd,
; no scalar register needed).
%if (%2-%%off) >= 1
%if %2 >= 4
    movd mm %+ %%mmx_idx, [srcq+%2-4]           ; overlapping tail load
%elif (%2-%%off) == 1
    mov            valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov            valw, [srcq+%2-2]
%elifidn %1, body
    mov            vald, [srcq+%2-3]
%else
    movd mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
194 | |||
; WRITE_NUM_BYTES type, n
;   %1 = top/body/bottom, %2 = compile-time byte count.
; Mirror of READ_NUM_BYTES: writes out the registers filled by a prior
; READ_NUM_BYTES expansion with identical %1/%2, using the same
; compile-time register-index walk and the same overlapping-store tricks.
%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

; full mmsize-wide stores
%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

; 8-byte tail (overlapping 16-byte store when possible)
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx       ; overlapping tail store
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

; 4-byte tail
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    [dstq+%2-8], mm %+ %%mmx_idx        ; overlapping tail store
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

; 1-3 byte tail. For 3 bytes we cannot over-store past the row end, so
; body writes vald as word+byte; top/bottom first extract the dword from
; the mmx register into vald, then do the same word+byte split.
%if (%2-%%off) >= 1
%if %2 >= 4
    movd    [dstq+%2-4], mm %+ %%mmx_idx        ; overlapping tail store
%elif (%2-%%off) == 1
    mov     [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov     [dstq+%2-2], valw
%elifidn %1, body
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%else
    movd           vald, mm %+ %%mmx_idx
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
255 | |||
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer

; VERTICAL_EXTEND from, to
; Generates one emu_edge_vfix<N> function for each fixed width N in
; [%1, %2]. Widths <= 3 need the scalar val register (no SIMD reg), hence
; the separate cglobal declarations with an extra gpr and 0 SIMD regs.
; Layout handled (same as emu_edge_vvar):
;   rows [0, start_y)    <- replicated first body row
;   rows [start_y, end_y) <- copied from src line by line
;   rows [end_y, bh)     <- replicated last body row
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body: read first body row once, write it
    ; start_y times
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels line by line
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; extend pixels below body: re-read last body row, write it bh times
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES bottom, %%n                  ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
331 | |||
; Widths 1..15 fit in mmx registers; 16..22 need xmm (SSE), with an MMX
; multi-register fallback only on x86-32.
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22
340 | |||
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.

; READ_V_PIXEL n, src
;   %1 = compile-time pixel count, %2 = memory operand of the edge byte.
; Loads one byte and splats it: always into vald (dword splat via imul);
; additionally into all lanes of m0 when %1 >= 8 so WRITE_V_PIXEL can use
; SIMD stores. Clobbers vald, m0 (when %1 >= 8), flags.
%macro READ_V_PIXEL 2
    movzx          vald, byte %2
    imul           vald, 0x01010101             ; byte -> dword splat
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000              ; splat dword across xmm
%else
    punpckldq        m0, m0                     ; splat dword across mm
%endif ; mmsize == 16
%endif ; %1 >= 8
%endmacro ; READ_V_PIXEL
358 | |||
; WRITE_V_PIXEL n, dst
;   %1 = compile-time pixel count, %2 = destination base (register).
; Writes %1 bytes of the splatted pixel prepared by READ_V_PIXEL:
; SIMD stores from m0 for %1 >= 8 (with overlapping tail stores, same
; trick as WRITE_NUM_BYTES), dword stores from vald for %1 < 8, plus a
; final word store for a 2-byte remainder. %1 is always even here
; (H_EXTEND steps by 2), so no single-byte tail is needed.
%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0                     ; overlapping tail store
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0                     ; overlapping tail store
%assign %%off %1
%else
    movd     [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

%rep %1/4
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

%if %1-%%off == 2
    mov      [%2+%%off], valw
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
404 | |||
; H_EXTEND from, to
; Generates one emu_edge_hfix<N> function for each even fixed width N in
; [%1, %2] (step 2):
;   void ff_emu_edge_hfix<N>(uint8_t *dst, x86_reg dst_stride,
;                            x86_reg start_x, x86_reg bh)
; Each row replicates the byte at dst[start_x] across N bytes at dst.
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
419 | |||
; Widths 2..14 use mmx; 16..22 use sse2 (xmm), with an MMX fallback only
; on x86-32.
INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif

INIT_XMM sse2
H_EXTEND 16, 22
428 | |||
; PREFETCH_FN op
;   %1 = prefetch instruction to use (prefetcht0 / 3dnow prefetch)
; void ff_prefetch(const uint8_t *buf, ptrdiff_t stride, int h)
; Issues one cache prefetch per row for h rows.
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1      [bufq]
    add    bufq, strideq
    dec    hd
    jg .loop
    REP_RET
%endmacro
438 | |||
; mmxext provides SSE-style prefetcht0; the 3dnow `prefetch` variant is
; only relevant on x86-32 CPUs.
INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif