Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6147 | serge | 1 | ;****************************************************************************** |
2 | ;* Core video DSP functions |
||
3 | ;* Copyright (c) 2012 Ronald S. Bultje |
||
4 | ;* |
||
5 | ;* This file is part of FFmpeg. |
||
6 | ;* |
||
7 | ;* FFmpeg is free software; you can redistribute it and/or |
||
8 | ;* modify it under the terms of the GNU Lesser General Public |
||
9 | ;* License as published by the Free Software Foundation; either |
||
10 | ;* version 2.1 of the License, or (at your option) any later version. |
||
11 | ;* |
||
12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | ;* Lesser General Public License for more details. |
||
16 | ;* |
||
17 | ;* You should have received a copy of the GNU Lesser General Public |
||
18 | ;* License along with FFmpeg; if not, write to the Free Software |
||
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | ;****************************************************************************** |
||
21 | |||
22 | %include "libavutil/x86/x86util.asm" |
||
23 | |||
24 | SECTION .text |
||
25 | |||
26 | ; slow vertical extension loop function. Works with variable-width, and |
||
27 | ; does per-line reading/writing of source data |
||
28 | |||
; V_COPY_ROW type, h
;   %1 = row type (top/body/bottom); also used as the prefix of the local labels
;   %2 = register holding the number of rows; decremented until it reaches zero
; For every row, w is reloaded from r7mp and $mmsize-byte chunks are copied
; from [srcq+w] to [dstq+w]; afterwards the $mmsize bytes ending at srcq are
; copied to the $mmsize bytes ending at dstq. vvar_fn biases srcq/dstq by +w
; and negates r7mp before expanding this macro, so w is a negative offset here.
; Only the 'body' type advances srcq after a row; dstq advances for every type.
; Both loops are do/while: one chunk and one row are processed before any
; counter is tested. Modifies wq, m0, %2 and the flags.
29 | %macro V_COPY_ROW 2 ; type (top/body/bottom), h |
||
30 | .%1_y_loop: ; do { |
||
31 | mov wq, r7mp ; initialize w (r7mp = wmp, already negated by vvar_fn) |
||
32 | .%1_x_loop: ; do { |
||
33 | movu m0, [srcq+wq] ; m0 = read($mmsize) |
||
34 | movu [dstq+wq], m0 ; write(m0, $mmsize) |
||
35 | add wq, mmsize ; w += $mmsize (w is negative and counts up towards 0) |
||
36 | cmp wq, -mmsize ; } while (w < -$mmsize); signed compare |
||
37 | jl .%1_x_loop |
||
38 | movu m0, [srcq-mmsize] ; m0 = read($mmsize): the $mmsize bytes ending at src |
||
39 | movu [dstq-mmsize], m0 ; write(m0, $mmsize): the $mmsize bytes ending at dst |
||
40 | %ifidn %1, body ; if ($type == body) { |
||
41 | add srcq, src_strideq ; src += src_stride |
||
42 | %endif ; } |
||
43 | add dstq, dst_strideq ; dst += dst_stride |
||
44 | dec %2 ; } while (--$h); |
||
45 | jnz .%1_y_loop |
||
46 | %endmacro |
||
47 | |||
; vvar_fn: emits emu_edge_vvar for the register set chosen by the preceding
; INIT_* line. The function body is three V_COPY_ROW expansions (top, body,
; bottom). The top and bottom expansions are skipped when their row count is
; zero; the body expansion is always entered.
; On x86-32 only dst is loaded by cglobal; the strides are used directly as
; stack operands (r1mp/r3mp) and the remaining arguments are loaded by hand.
48 | %macro vvar_fn 0 |
||
49 | ; .----. <- zero |
||
50 | ; | | <- top is copied from first line in body of source |
||
51 | ; |----| <- start_y |
||
52 | ; | | <- body is copied verbatim (line-by-line) from source |
||
53 | ; |----| <- end_y |
||
54 | ; | | <- bottom is copied from last line in body of source |
||
55 | ; '----' <- bh |
||
56 | %if ARCH_X86_64 |
||
57 | cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ |
||
58 | start_y, end_y, bh, w |
||
59 | %else ; x86-32 |
||
60 | cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w |
||
61 | %define src_strideq r3mp |
||
62 | %define dst_strideq r1mp |
||
63 | mov srcq, r2mp |
||
64 | mov start_yq, r4mp |
||
65 | mov end_yq, r5mp |
||
66 | mov bhq, r6mp |
||
67 | %endif |
||
68 | sub bhq, end_yq ; bh -= end_y (row count passed to the bottom copy) |
||
69 | sub end_yq, start_yq ; end_y -= start_y (row count passed to the body copy) |
||
70 | add srcq, r7mp ; src += w (r7mp = wmp) |
||
71 | add dstq, r7mp ; dst += w (r7mp = wmp) |
||
72 | neg r7mp ; w = -w, stored back to the argument slot (r7mp = wmp) |
||
73 | test start_yq, start_yq ; if (start_y) { |
||
74 | jz .body |
||
75 | V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) |
||
76 | .body: ; } |
||
77 | V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) |
||
78 | test bhq, bhq ; if (bh) { |
||
79 | jz .end |
||
80 | sub srcq, src_strideq ; src -= src_stride (undo the advance made after the last body row) |
||
81 | V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) |
||
82 | .end: ; } |
||
83 | RET |
||
84 | %endmacro |
||
85 | |||
; emu_edge_vvar instantiations: an mmx version on x86-32 only, and an sse
; version on every target.
86 | %if ARCH_X86_32 |
||
87 | INIT_MMX mmx |
||
88 | vvar_fn |
||
89 | %endif |
||
90 | |||
91 | INIT_XMM sse |
||
92 | vvar_fn |
||
93 | |||
; hvar_fn: emits emu_edge_hvar for the register set chosen by the preceding
; INIT_* line. For each of h rows, the byte at [dst+start_x] is replicated
; across m0 and stored over the 2*n_words bytes starting at dst, in $mmsize
; chunks, followed by one store of the $mmsize bytes ending at dst+2*n_words.
; Both loops are do/while: one store and one row happen before any counter is
; tested.
94 | %macro hvar_fn 0 |
||
95 | cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w |
||
; Bias dst to the end of the fill region and negate n_words so that w can run
; from -n_words up towards 0. start_x is rebased by the same amount, so
; [dstq+start_xq] still addresses the original dst+start_x byte.
96 | lea dstq, [dstq+n_wordsq*2] |
||
97 | neg n_wordsq |
||
98 | lea start_xq, [start_xq+n_wordsq*2] |
||
99 | .y_loop: ; do { |
||
100 | %if cpuflag(avx2) |
||
101 | vpbroadcastb m0, [dstq+start_xq] |
||
102 | mov wq, n_wordsq ; initialize w (= -n_words after the neg above) |
||
103 | %else |
||
104 | movzx wd, byte [dstq+start_xq] ; w = read(1) |
||
105 | imul wd, 0x01010101 ; w *= 0x01010101 (byte repeated in all 4 bytes) |
||
106 | movd m0, wd |
||
107 | mov wq, n_wordsq ; initialize w (= -n_words after the neg above) |
||
108 | %if cpuflag(sse2) |
||
109 | pshufd m0, m0, q0000 ; splat |
||
110 | %else ; mmx |
||
111 | punpckldq m0, m0 ; splat |
||
112 | %endif ; mmx/sse |
||
113 | %endif ; avx2 |
||
114 | .x_loop: ; do { |
||
115 | movu [dstq+wq*2], m0 ; write($reg, $mmsize) |
||
116 | add wq, mmsize/2 ; w += $mmsize/2 (w is negative and counts up towards 0) |
||
117 | cmp wq, -mmsize/2 ; } while (w < -$mmsize/2); signed compare |
||
118 | jl .x_loop |
||
119 | movu [dstq-mmsize], m0 ; write($reg, $mmsize): the $mmsize bytes ending at dst |
||
120 | add dstq, dst_strideq ; dst += dst_stride |
||
121 | dec hq ; } while (--h) |
||
122 | jnz .y_loop |
||
123 | RET |
||
124 | %endmacro |
||
125 | |||
; emu_edge_hvar instantiations: mmx on x86-32 only, sse2 on every target, and
; avx2 (using xmm registers) when HAVE_AVX2_EXTERNAL is set.
126 | %if ARCH_X86_32 |
||
127 | INIT_MMX mmx |
||
128 | hvar_fn |
||
129 | %endif |
||
130 | |||
131 | INIT_XMM sse2 |
||
132 | hvar_fn |
||
133 | |||
134 | %if HAVE_AVX2_EXTERNAL |
||
135 | INIT_XMM avx2 |
||
136 | hvar_fn |
||
137 | %endif |
||
138 | |||
139 | ; macro to read/write a horizontal number of pixels (%2) to/from registers; %1 is the row type (top/body/bottom) |
||
140 | ; on sse, - fills xmm0, xmm1, ... with consecutive sets of 16 pixels |
||
141 | ; - if 8 or more bytes remain: one 16-byte load ending at byte %2 into xmm$next (when %2 > 16 and more than 8 remain), else 8 bytes into mm$next |
||
142 | ; - if 4 or more bytes remain: one 8-byte load ending at byte %2 (when %2 > 8 and more than 4 remain), else 4 bytes, into mm$next |
||
143 | ; - if 1-3 bytes remain: one 4-byte load ending at byte %2 into mm$next when %2 >= 4; else 1 or 2 bytes into valb/valw; a 3-byte row is type-specific |
||
144 | ; on mmx, - fills mm0, mm1, ... with consecutive sets of 8 pixels |
||
145 | ; - remainders of 4 or more bytes, and of 1-3 bytes, are handled as described for sse |
||
146 | ; 3-byte row (%2 == 3): body packs byte 2 and bytes 0-1 into val; bottom loads 4 bytes from [srcq-1]; top loads 4 bytes from [srcq] |
||
147 | ; writing data out is done the same way (WRITE_NUM_BYTES) |
||
148 | %macro READ_NUM_BYTES 2 |
||
149 | %assign %%off 0 ; offset in source buffer |
||
150 | %assign %%mmx_idx 0 ; mmx register index |
||
151 | %assign %%xmm_idx 0 ; xmm register index |
||
152 | |||
153 | %rep %2/mmsize |
||
154 | %if mmsize == 16 |
||
155 | movu xmm %+ %%xmm_idx, [srcq+%%off] |
||
156 | %assign %%xmm_idx %%xmm_idx+1 |
||
157 | %else ; mmx |
||
158 | movu mm %+ %%mmx_idx, [srcq+%%off] |
||
159 | %assign %%mmx_idx %%mmx_idx+1 |
||
160 | %endif |
||
161 | %assign %%off %%off+mmsize |
||
162 | %endrep ; %2/mmsize |
||
163 | |||
164 | %if mmsize == 16 |
||
165 | %if (%2-%%off) >= 8 |
||
166 | %if %2 > 16 && (%2-%%off) > 8 |
||
167 | movu xmm %+ %%xmm_idx, [srcq+%2-16] |
||
168 | %assign %%xmm_idx %%xmm_idx+1 |
||
169 | %assign %%off %2 |
||
170 | %else |
||
171 | movq mm %+ %%mmx_idx, [srcq+%%off] |
||
172 | %assign %%mmx_idx %%mmx_idx+1 |
||
173 | %assign %%off %%off+8 |
||
174 | %endif |
||
175 | %endif ; (%2-%%off) >= 8 |
||
176 | %endif |
||
177 | |||
178 | %if (%2-%%off) >= 4 |
||
179 | %if %2 > 8 && (%2-%%off) > 4 |
||
180 | movq mm %+ %%mmx_idx, [srcq+%2-8] |
||
181 | %assign %%off %2 |
||
182 | %else |
||
183 | movd mm %+ %%mmx_idx, [srcq+%%off] |
||
184 | %assign %%off %%off+4 |
||
185 | %endif |
||
186 | %assign %%mmx_idx %%mmx_idx+1 |
||
187 | %endif ; (%2-%%off) >= 4 |
||
188 | |||
189 | %if (%2-%%off) >= 1 |
||
190 | %if %2 >= 4 |
||
191 | movd mm %+ %%mmx_idx, [srcq+%2-4] |
||
192 | %elif (%2-%%off) == 1 |
||
193 | mov valb, [srcq+%2-1] |
||
194 | %elif (%2-%%off) == 2 |
||
195 | mov valw, [srcq+%2-2] |
||
196 | %elifidn %1, body |
||
197 | mov valb, [srcq+%2-1] |
||
198 | sal vald, 16 |
||
199 | mov valw, [srcq+%2-3] |
||
200 | %elifidn %1, bottom |
||
201 | movd mm %+ %%mmx_idx, [srcq+%2-4] |
||
202 | %else ; top |
||
203 | movd mm %+ %%mmx_idx, [srcq+%2-3] |
||
204 | %endif |
||
205 | %endif ; (%2-%%off) >= 1 |
||
206 | %endmacro ; READ_NUM_BYTES |
||
207 | |||
; WRITE_NUM_BYTES type, n
;   %1 = row type (top/body/bottom), %2 = number of bytes per row
; Stores to dstq the %2 bytes that READ_NUM_BYTES loaded for the same type and
; width. It walks the same chunk sequence with the same register indices and
; uses the same overlapping stores that end at byte %2 for the tail.
; For a 3-byte row the low word and the third byte are stored separately:
;   body       - straight from val; val is shifted right by 16 in the process
;   top/bottom - val is first reloaded from mm$next; bottom additionally
;                shifts right by 8, dropping the byte loaded from [srcq-1]
208 | %macro WRITE_NUM_BYTES 2 |
||
209 | %assign %%off 0 ; offset in destination buffer |
||
210 | %assign %%mmx_idx 0 ; mmx register index |
||
211 | %assign %%xmm_idx 0 ; xmm register index |
||
212 | |||
213 | %rep %2/mmsize |
||
214 | %if mmsize == 16 |
||
215 | movu [dstq+%%off], xmm %+ %%xmm_idx |
||
216 | %assign %%xmm_idx %%xmm_idx+1 |
||
217 | %else ; mmx |
||
218 | movu [dstq+%%off], mm %+ %%mmx_idx |
||
219 | %assign %%mmx_idx %%mmx_idx+1 |
||
220 | %endif |
||
221 | %assign %%off %%off+mmsize |
||
222 | %endrep ; %2/mmsize |
||
223 | |||
224 | %if mmsize == 16 |
||
225 | %if (%2-%%off) >= 8 |
||
226 | %if %2 > 16 && (%2-%%off) > 8 |
||
227 | movu [dstq+%2-16], xmm %+ %%xmm_idx |
||
228 | %assign %%xmm_idx %%xmm_idx+1 |
||
229 | %assign %%off %2 |
||
230 | %else |
||
231 | movq [dstq+%%off], mm %+ %%mmx_idx |
||
232 | %assign %%mmx_idx %%mmx_idx+1 |
||
233 | %assign %%off %%off+8 |
||
234 | %endif |
||
235 | %endif ; (%2-%%off) >= 8 |
||
236 | %endif |
||
237 | |||
238 | %if (%2-%%off) >= 4 |
||
239 | %if %2 > 8 && (%2-%%off) > 4 |
||
240 | movq [dstq+%2-8], mm %+ %%mmx_idx |
||
241 | %assign %%off %2 |
||
242 | %else |
||
243 | movd [dstq+%%off], mm %+ %%mmx_idx |
||
244 | %assign %%off %%off+4 |
||
245 | %endif |
||
246 | %assign %%mmx_idx %%mmx_idx+1 |
||
247 | %endif ; (%2-%%off) >= 4 |
||
248 | |||
249 | %if (%2-%%off) >= 1 |
||
250 | %if %2 >= 4 |
||
251 | movd [dstq+%2-4], mm %+ %%mmx_idx |
||
252 | %elif (%2-%%off) == 1 |
||
253 | mov [dstq+%2-1], valb |
||
254 | %elif (%2-%%off) == 2 |
||
255 | mov [dstq+%2-2], valw |
||
256 | %elifidn %1, body |
||
257 | mov [dstq+%2-3], valw |
||
258 | sar vald, 16 |
||
259 | mov [dstq+%2-1], valb |
||
260 | %else |
||
261 | movd vald, mm %+ %%mmx_idx |
||
262 | %ifidn %1, bottom |
||
263 | sar vald, 8 |
||
264 | %endif |
||
265 | mov [dstq+%2-3], valw |
||
266 | sar vald, 16 |
||
267 | mov [dstq+%2-1], valb |
||
268 | %endif |
||
269 | %endif ; (%2-%%off) >= 1 |
||
270 | %endmacro ; WRITE_NUM_BYTES |
||
271 | |||
272 | ; vertical top/bottom extend and body copy fast loops |
||
273 | ; these are function pointers to set-width line copy functions, i.e. |
||
274 | ; they read a fixed number of pixels into set registers, and write |
||
275 | ; those out into the destination buffer |
||
; VERTICAL_EXTEND first, last
;   Emits one emu_edge_vfix<n> function for every width n in [%1, %2].
;   Each function writes start_y rows that repeat the first source row, then
;   copies end_y-start_y body rows, then writes bh-end_y rows that repeat the
;   last body row. The top and bottom parts are skipped when their row count
;   is zero; the body loop is do/while and always runs at least once.
;   Widths <= 3 declare an extra general-purpose register, val, which
;   READ_NUM_BYTES/WRITE_NUM_BYTES use for 1-3 byte rows; those variants load
;   bh (and on x86-32 every argument) from the stack by hand.
276 | %macro VERTICAL_EXTEND 2 |
||
277 | %assign %%n %1 |
||
278 | %rep 1+%2-%1 |
||
279 | %if %%n <= 3 |
||
280 | %if ARCH_X86_64 |
||
281 | cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ |
||
282 | start_y, end_y, val, bh |
||
283 | mov bhq, r6mp ; r6mp = bhmp |
||
284 | %else ; x86-32 |
||
285 | cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh |
||
286 | mov dstq, r0mp |
||
287 | mov srcq, r2mp |
||
288 | mov start_yq, r4mp |
||
289 | mov end_yq, r5mp |
||
290 | mov bhq, r6mp |
||
291 | %define dst_strideq r1mp |
||
292 | %define src_strideq r3mp |
||
293 | %endif ; x86-64/32 |
||
294 | %else |
||
295 | %if ARCH_X86_64 |
||
296 | cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ |
||
297 | start_y, end_y, bh |
||
298 | %else ; x86-32 |
||
299 | cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh |
||
300 | mov srcq, r2mp |
||
301 | mov start_yq, r4mp |
||
302 | mov end_yq, r5mp |
||
303 | mov bhq, r6mp |
||
304 | %define dst_strideq r1mp |
||
305 | %define src_strideq r3mp |
||
306 | %endif ; x86-64/32 |
||
307 | %endif |
||
308 | ; FIXME move this to c wrapper? |
||
309 | sub bhq, end_yq ; bh -= end_y (rows below the body) |
||
310 | sub end_yq, start_yq ; end_y -= start_y (rows in the body) |
||
311 | |||
312 | ; extend pixels above body |
||
313 | test start_yq, start_yq ; if (start_y) { |
||
314 | jz .body_loop |
||
315 | READ_NUM_BYTES top, %%n ; $variable_regs = read($n), once, from the first source row |
||
316 | .top_loop: ; do { |
||
317 | WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) |
||
318 | add dstq, dst_strideq ; dst += dst_stride |
||
319 | dec start_yq ; } while (--start_y) |
||
320 | jnz .top_loop ; } |
||
321 | |||
322 | ; copy body pixels |
||
323 | .body_loop: ; do { |
||
324 | READ_NUM_BYTES body, %%n ; $variable_regs = read($n) |
||
325 | WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) |
||
326 | add dstq, dst_strideq ; dst += dst_stride |
||
327 | add srcq, src_strideq ; src += src_stride |
||
328 | dec end_yq ; } while (--end_y) |
||
329 | jnz .body_loop |
||
330 | |||
331 | ; extend pixels below body |
||
332 | test bhq, bhq ; if (bh) { |
||
333 | jz .end |
||
334 | sub srcq, src_strideq ; src -= src_stride (back to the last body row) |
||
335 | READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n), once, from the last body row |
||
336 | .bottom_loop: ; do { |
||
337 | WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) |
||
338 | add dstq, dst_strideq ; dst += dst_stride |
||
339 | dec bhq ; } while (--bh) |
||
340 | jnz .bottom_loop ; } |
||
341 | |||
342 | .end: |
||
343 | RET |
||
344 | %assign %%n %%n+1 |
||
345 | %endrep ; 1+%2-%1 |
||
346 | %endmacro ; VERTICAL_EXTEND |
||
347 | |||
; emu_edge_vfix instantiations: widths 1-15 with mmx registers on every
; target; widths 16-22 with mmx registers on x86-32 only, and with sse (xmm
; registers) on every target.
348 | INIT_MMX mmx |
||
349 | VERTICAL_EXTEND 1, 15 |
||
350 | %if ARCH_X86_32 |
||
351 | VERTICAL_EXTEND 16, 22 |
||
352 | %endif |
||
353 | |||
354 | INIT_XMM sse |
||
355 | VERTICAL_EXTEND 16, 22 |
||
356 | |||
357 | ; left/right (horizontal) fast extend functions |
||
358 | ; these are essentially identical to the vertical extend ones above, |
||
359 | ; just left/right separated because number of pixels to extend is |
||
360 | ; obviously not the same on both sides. |
||
361 | |||
; READ_V_PIXEL n, src_byte
;   %1 = fill width in bytes (H_EXTEND passes the same value to WRITE_V_PIXEL)
;   %2 = memory operand addressing the byte to replicate
; avx2:      broadcasts the byte to every byte of m0; val is not used.
; otherwise: val = byte * 0x01010101, i.e. the byte repeated in all 4 bytes;
;            for widths >= 8 that dword is also replicated across all of m0.
362 | %macro READ_V_PIXEL 2 |
||
363 | %if cpuflag(avx2) |
||
364 | vpbroadcastb m0, %2 |
||
365 | %else |
||
366 | movzx vald, byte %2 |
||
367 | imul vald, 0x01010101 |
||
368 | %if %1 >= 8 |
||
369 | movd m0, vald |
||
370 | %if mmsize == 16 |
||
371 | pshufd m0, m0, q0000 |
||
372 | %else |
||
373 | punpckldq m0, m0 |
||
374 | %endif ; mmsize == 16 |
||
375 | %endif ; %1 >= 8 |
||
376 | %endif ; avx2 |
||
377 | %endmacro ; READ_V_PIXEL |
||
378 | |||
; WRITE_V_PIXEL n, dst
;   %1 = number of bytes to store, %2 = destination base register
; Stores the value prepared by READ_V_PIXEL for the same width:
;   n >= 8: whole-m0 stores, then 8/4-byte stores (or one overlapping store
;           that ends at byte n) for the tail
;   n <  8: 4-byte stores from val
; A final 2-byte remainder is stored from valw. Under avx2, where H_EXTEND
; declares no val register, it is stored by a 4-byte movd that starts 2 bytes
; earlier and so rewrites 2 bytes that already hold the same value.
; A 1- or 3-byte remainder is not handled; H_EXTEND steps widths by 2.
379 | %macro WRITE_V_PIXEL 2 |
||
380 | %assign %%off 0 |
||
381 | |||
382 | %if %1 >= 8 |
||
383 | |||
384 | %rep %1/mmsize |
||
385 | movu [%2+%%off], m0 |
||
386 | %assign %%off %%off+mmsize |
||
387 | %endrep ; %1/mmsize |
||
388 | |||
389 | %if mmsize == 16 |
||
390 | %if %1-%%off >= 8 |
||
391 | %if %1 > 16 && %1-%%off > 8 |
||
392 | movu [%2+%1-16], m0 |
||
393 | %assign %%off %1 |
||
394 | %else |
||
395 | movq [%2+%%off], m0 |
||
396 | %assign %%off %%off+8 |
||
397 | %endif |
||
398 | %endif ; %1-%%off >= 8 |
||
399 | %endif ; mmsize == 16 |
||
400 | |||
401 | %if %1-%%off >= 4 |
||
402 | %if %1 > 8 && %1-%%off > 4 |
||
403 | movq [%2+%1-8], m0 |
||
404 | %assign %%off %1 |
||
405 | %else |
||
406 | movd [%2+%%off], m0 |
||
407 | %assign %%off %%off+4 |
||
408 | %endif |
||
409 | %endif ; %1-%%off >= 4 |
||
410 | |||
411 | %else ; %1 < 8 |
||
412 | |||
413 | %rep %1/4 |
||
414 | mov [%2+%%off], vald |
||
415 | %assign %%off %%off+4 |
||
416 | %endrep ; %1/4 |
||
417 | |||
418 | %endif ; %1 >=/< 8 |
||
419 | |||
420 | %if %1-%%off == 2 |
||
421 | %if cpuflag(avx2) |
||
422 | movd [%2+%%off-2], m0 |
||
423 | %else |
||
424 | mov [%2+%%off], valw |
||
425 | %endif ; avx2 |
||
426 | %endif ; %1-%%off == 2 |
||
427 | %endmacro ; WRITE_V_PIXEL |
||
428 | |||
; H_EXTEND first, last
;   Emits emu_edge_hfix<n> for n = %1, %1+2, %1+4, ... (1+(%2-%1)/2 functions).
;   Each function, for bh rows, replicates the byte at [dst+start_x] over the
;   first n bytes of the row at dst, then advances dst by dst_stride.
;   The row loop is do/while, so one row is written before bh is tested.
;   The avx2 variant needs no val register, so it declares only 4 GPRs.
429 | %macro H_EXTEND 2 |
||
430 | %assign %%n %1 |
||
431 | %rep 1+(%2-%1)/2 |
||
432 | %if cpuflag(avx2) |
||
433 | cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh |
||
434 | %else |
||
435 | cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val |
||
436 | %endif |
||
437 | .loop_y: ; do { |
||
438 | READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = splat(read(1)) |
||
439 | WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) |
||
440 | add dstq, dst_strideq ; dst += dst_stride |
||
441 | dec bhq ; } while (--bh) |
||
442 | jnz .loop_y |
||
443 | RET |
||
444 | %assign %%n %%n+2 |
||
445 | %endrep ; 1+(%2-%1)/2 |
||
446 | %endmacro ; H_EXTEND |
||
447 | |||
; emu_edge_hfix instantiations (even widths only): 2-14 with mmx on every
; target; 16-22 with mmx on x86-32 only and with sse2 on every target; 8-22
; with avx2 (xmm registers) when HAVE_AVX2_EXTERNAL is set.
448 | INIT_MMX mmx |
||
449 | H_EXTEND 2, 14 |
||
450 | %if ARCH_X86_32 |
||
451 | H_EXTEND 16, 22 |
||
452 | %endif |
||
453 | |||
454 | INIT_XMM sse2 |
||
455 | H_EXTEND 16, 22 |
||
456 | |||
457 | %if HAVE_AVX2_EXTERNAL |
||
458 | INIT_XMM avx2 |
||
459 | H_EXTEND 8, 22 |
||
460 | %endif |
||
461 | |||
; PREFETCH_FN insn
;   %1 = prefetch instruction mnemonic to use
; Emits prefetch(buf, stride, h): issues %1 on [buf], advances buf by stride,
; and repeats while the decremented 32-bit h is still greater than zero
; (signed test). The first prefetch is issued before h is examined.
462 | %macro PREFETCH_FN 1 |
||
463 | cglobal prefetch, 3, 3, 0, buf, stride, h |
||
464 | .loop: |
||
465 | %1 [bufq] |
||
466 | add bufq, strideq |
||
467 | dec hd |
||
468 | jg .loop |
||
469 | REP_RET |
||
470 | %endmacro |
||
471 | |||
; prefetch instantiations: an mmxext version using prefetcht0 on every target,
; and on x86-32 additionally a 3dnow version using the prefetch instruction.
472 | INIT_MMX mmxext |
||
473 | PREFETCH_FN prefetcht0 |
||
474 | %if ARCH_X86_32 |
||
475 | INIT_MMX 3dnow |
||
476 | PREFETCH_FN prefetch |
||
477 | %endif |