;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas
;* Copyright (C) 2009 Zuxy Meng
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | |||
23 | %include "libavutil/x86/x86util.asm" |
||
24 | |||
25 | cextern pw_64 |
||
26 | |||
27 | SECTION .text |
||
28 | |||
;------------------------------------------------------------------------------
; DIAG4  base, off0, off1, off2, off3, dst
;
; Apply a 4-tap weighted filter to one row of 8 pixels:
;   dst[x] = clip_u8((p0*w0 + p1*w1 + p2*w2 + p3*w3 + 64) >> 7)
; where p0..p3 are the 8-byte groups loaded from [%1+%2] .. [%1+%5].
; The same macro serves both passes: the horizontal pass calls it with
; byte offsets -1,0,1,2 and the vertical pass with row offsets -8,0,8,16
; (the scratch buffer has an 8-byte stride), hence the src[x-8]/src[x+8]
; naming in the per-tap comments below.
;
; Preconditions (set up by vp6_filter_diag4 / SPLAT4REGS):
;   m7 = 0                               (zero reg for byte->word unpack)
;   MMX:  the four splatted weights live at [rsp+8*11..8*14],
;         m6 holds pw_64 (rounding constant); m0-m5 are scratch.
;   SSE2: the splatted weights live in m4 (w0), m5 (w1), m6 (w2), m3 (w3);
;         rounding uses the pw_64 memory constant directly.
;
; Writes exactly 8 bytes to [%6]. Clobbers m0-m5 (MMX) / m0-m2 (SSE2).
;------------------------------------------------------------------------------
%macro DIAG4 6
%if mmsize == 8
    ; MMX: only 64-bit regs, so process the 8 pixels as two 4-pixel halves
    ; (low half in m0/m1, high half in m3/m4 after the unpacks).
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    movq        m3, m0
    movq        m4, m1
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpckhbw   m3, m7
    punpckhbw   m4, m7
    pmullw      m0, [rsp+8*11]          ; src[x-8 ] * biweight [0]
    pmullw      m1, [rsp+8*12]          ; src[x   ] * biweight [1]
    pmullw      m3, [rsp+8*11]          ; src[x-8 ] * biweight [0]
    pmullw      m4, [rsp+8*12]          ; src[x   ] * biweight [1]
    paddw       m0, m1
    paddw       m3, m4
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    movq        m4, m1
    movq        m5, m2
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpckhbw   m4, m7
    punpckhbw   m5, m7
    pmullw      m1, [rsp+8*13]          ; src[x+8 ] * biweight [2]
    pmullw      m2, [rsp+8*14]          ; src[x+16] * biweight [3]
    pmullw      m4, [rsp+8*13]          ; src[x+8 ] * biweight [2]
    pmullw      m5, [rsp+8*14]          ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddw       m4, m5
    ; saturating adds guard against overflow of the weighted sums
    paddsw      m0, m1
    paddsw      m3, m4
    paddsw      m0, m6                  ; Add 64 (rounding before >>7)
    paddsw      m3, m6                  ; Add 64
    psraw       m0, 7
    psraw       m3, 7
    packuswb    m0, m3                  ; clip to u8 and re-pair the halves
    movq        [%6], m0
%else ; mmsize == 16
    ; SSE2: a 128-bit reg fits all 8 pixels as words, so one pass suffices
    ; and the splatted weights stay in registers (m4/m5/m6/m3).
    movq        m0, [%1+%2]
    movq        m1, [%1+%3]
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    pmullw      m0, m4                  ; src[x-8 ] * biweight [0]
    pmullw      m1, m5                  ; src[x   ] * biweight [1]
    paddw       m0, m1
    movq        m1, [%1+%4]
    movq        m2, [%1+%5]
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    pmullw      m1, m6                  ; src[x+8 ] * biweight [2]
    pmullw      m2, m3                  ; src[x+16] * biweight [3]
    paddw       m1, m2
    paddsw      m0, m1
    paddsw      m0, [pw_64]             ; Add 64 (rounding before >>7)
    psraw       m0, 7
    packuswb    m0, m0                  ; clip to u8; only low qword is stored
    movq        [%6], m0
%endif ; mmsize == 8/16
%endmacro
||
89 | |||
;------------------------------------------------------------------------------
; SPLAT4REGS
;
; Broadcast the four 16-bit filter weights packed in m3 (w0 w1 w2 w3 in the
; low 64 bits) so each weight fills a full vector, in the layout DIAG4
; expects:
;   MMX:  stores the splatted qwords to the stack scratch area,
;         [rsp+8*11]=w0, [rsp+8*12]=w1, [rsp+8*13]=w2, [rsp+8*14]=w3
;         (MMX has too few registers to keep them resident).
;   SSE2: leaves them in registers: m4=w0, m5=w1, m6=w2, m3=w3.
;
; Clobbers m2-m6 (MMX) / m3-m6 (SSE2).
;------------------------------------------------------------------------------
%macro SPLAT4REGS 0
%if mmsize == 8
    movq        m5, m3                  ; keep a copy for the high pair w2,w3
    punpcklwd   m3, m3                  ; m3 = w0 w0 w1 w1
    movq        m4, m3
    punpckldq   m3, m3                  ; m3 = w0 w0 w0 w0
    punpckhdq   m4, m4                  ; m4 = w1 w1 w1 w1
    punpckhwd   m5, m5                  ; m5 = w2 w2 w3 w3
    movq        m2, m5
    punpckhdq   m2, m2                  ; m2 = w3 w3 w3 w3
    punpckldq   m5, m5                  ; m5 = w2 w2 w2 w2
    movq        [rsp+8*11], m3
    movq        [rsp+8*12], m4
    movq        [rsp+8*13], m5
    movq        [rsp+8*14], m2
%else ; mmsize == 16
    ; splat each word across the low qword, then duplicate into the high one
    pshuflw     m4, m3, 0x0             ; m4.lo = w0 x4
    pshuflw     m5, m3, 0x55            ; m5.lo = w1 x4
    pshuflw     m6, m3, 0xAA            ; m6.lo = w2 x4
    pshuflw     m3, m3, 0xFF            ; m3.lo = w3 x4
    punpcklqdq  m4, m4
    punpcklqdq  m5, m5
    punpcklqdq  m6, m6
    punpcklqdq  m3, m3
%endif ; mmsize == 8/16
%endmacro
||
116 | |||
%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
;
; Diagonal interpolation for VP6 motion compensation, done as two separable
; 4-tap passes over an 8x8 block:
;   1) horizontal pass: filter 11 source rows (rows -1..9 are needed so the
;      vertical taps -1..+2 are available for all 8 output rows) into an
;      8-byte-stride scratch buffer on the stack;
;   2) vertical pass: filter the scratch buffer down to the 8 output rows.
;
; r0=dst, r1=src, r2=stride, r3=h_weights, r4=v_weights
; r5 backs up the unaligned rsp, r6 is the loop counter.
cglobal vp6_filter_diag4, 5, 7, 8
    mov     r5, rsp                     ; backup stack pointer
    and     rsp, ~(mmsize-1)            ; align stack
%if mmsize == 16
    sub     rsp, 8*11                   ; scratch: 11 rows x 8 bytes
%else
    ; MMX additionally needs 4 qword slots ([rsp+8*11..8*14]) for the
    ; splatted weights, since they cannot stay in registers.
    sub     rsp, 8*15
    movq    m6, [pw_64]                 ; rounding constant for DIAG4
%endif
%if ARCH_X86_64
    movsxd  r2, r2d                     ; sign-extend 32-bit stride to 64 bits
%endif

    sub     r1, r2                      ; start one row above: tap at y = -1

    pxor    m7, m7                      ; zero reg for byte->word unpacking
    movq    m3, [r3]                    ; load the 4 horizontal weights
    SPLAT4REGS

    ; horizontal pass: 11 rows of src -> scratch buffer at rsp
    mov     r3, rsp
    mov     r6, 11
.nextrow:
    DIAG4   r1, -1, 0, 1, 2, r3         ; taps at x-1 .. x+2
    add     r3, 8                       ; next scratch row (stride 8)
    add     r1, r2
    dec     r6
    jnz     .nextrow

    movq    m3, [r4]                    ; load the 4 vertical weights
    SPLAT4REGS

    ; vertical pass: scratch buffer -> 8 rows of dst
    lea     r3, [rsp+8]                 ; point at scratch row 1 (y = 0)
    mov     r6, 8
.nextcol:
    DIAG4   r3, -8, 0, 8, 16, r0        ; taps at y-1 .. y+2 (row stride 8)
    add     r3, 8
    add     r0, r2
    dec     r6
    jnz     .nextcol

    mov     rsp, r5                     ; restore stack pointer
    RET
%endmacro
||
163 | |||
; Instantiate the function for each SIMD flavour.  The MMX version only
; exists on 32-bit x86; 64-bit builds always have SSE2 available.
%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4                        ; emits ff_vp6_filter_diag4_mmx
%endif

INIT_XMM sse2
vp6_filter_diag4                        ; emits ff_vp6_filter_diag4_sse2