Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | ;****************************************************************************** |
2 | ;* MMX optimized discrete wavelet transform |
||
3 | ;* Copyright (c) 2010 David Conrad |
||
4 | ;* |
||
5 | ;* This file is part of FFmpeg. |
||
6 | ;* |
||
7 | ;* FFmpeg is free software; you can redistribute it and/or |
||
8 | ;* modify it under the terms of the GNU Lesser General Public |
||
9 | ;* License as published by the Free Software Foundation; either |
||
10 | ;* version 2.1 of the License, or (at your option) any later version. |
||
11 | ;* |
||
12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | ;* Lesser General Public License for more details. |
||
16 | ;* |
||
17 | ;* You should have received a copy of the GNU Lesser General Public |
||
18 | ;* License along with FFmpeg; if not, write to the Free Software |
||
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | ;****************************************************************************** |
||
21 | |||
22 | %include "libavutil/x86/x86util.asm" |
||
23 | |||
SECTION_RODATA

; Packed signed 16-bit constants, eight words each, so one label serves both
; 8-byte and 16-byte vector loads.
pw_1:       times 8 dw 1            ; rounding term for the >>1 steps
pw_2:       times 8 dw 2            ; rounding term for COMPOSE_53iL0 (>>2)
pw_8:       times 8 dw 8            ; rounding term for the dd97 step (>>4)
pw_16:      times 8 dw 16           ; rounding term for the dd137 step (>>5)
pw_1991:    times 4 dw 9, -1        ; (9, -1) word pairs used as pmaddwd multiplier

section .text
||
32 | |||
; COMPOSE_53iL0 dst, a, b, round
;   dst -= (a + b + round) >> 2        per signed 16-bit lane (arithmetic shift)
;
;   %1  dst    vector register, updated in place
;   %2  a      vector register, clobbered (holds the shifted sum on exit)
;   %3  b      vector register or memory operand, read only
;   %4  round  rounding vector; callers pass pw_2
%macro COMPOSE_53iL0 4
    paddw       %2, %3
    paddw       %2, %4
    psraw       %2, 2
    psubw       %1, %2
%endmacro
||
40 | |||
; COMPOSE_DD97iH0 acc, a, b [, acc_src]
;   m1 = acc + ((-m0 + 9*m1 + 9*a - b + 8) >> 4)
;
;   in:   m0, m1   outer / inner taps already loaded by the caller
;         m3       pw_8    (rounding term, folded in by subtracting it from m0)
;         m4       pw_1991 ((9, -1) pairs for pmaddwd)
;   %1    acc      operand added to the shifted result
;   %2    a        added to m1 before the multiply
;   %3    b        added to m0 before the multiply
;   %4    optional: when given, %1 is first loaded (unaligned) from %4
;   out:  m1
;   clobbers: m0, m2 (and %1 when %4 is supplied)
;
; The products are formed in 32 bits by pmaddwd and packed back to words
; with signed saturation.
%macro COMPOSE_DD97iH0 3-4
    paddw       m0, %3
    paddw       m1, %2
    psubw       m0, m3              ; -(m0 - 8) supplies the "+ 8" below
    mova        m2, m1
    punpcklwd   m1, m0              ; (m1, m0) pairs, low half
    punpckhwd   m2, m0              ; (m1, m0) pairs, high half
    pmaddwd     m1, m4              ; 9*m1 - m0
    pmaddwd     m2, m4
%if %0 > 3
    movu        %1, %4              ; m0 is no longer needed, so %1 may be m0
%endif
    psrad       m1, 4
    psrad       m2, 4
    packssdw    m1, m2
    paddw       m1, %1
%endmacro
||
61 | |||
; COMPOSE_VERTICAL cpu_suffix
;
; Emits the vertical lifting steps for the vector size currently selected by
; INIT_MMX / INIT_XMM; %1 is appended to every function name.
;
; Common structure of all five functions:
;   * width is a 32-bit int; it is zero-extended on x86-64 before being used
;     as a 64-bit index.
;   * The loop walks from the end of the row towards the start, mmsize/2
;     elements per iteration, and runs while the remaining count is > 0.
;     The body executes before the count is tested, so width rounded up to a
;     multiple of mmsize/2 elements is processed.
;     NOTE(review): this presumes width > 0 and rows padded accordingly - confirm
;     against the callers.
;   * All row accesses use aligned loads/stores (mova).
;   * "sub" sets the flags consumed by "jg"; the vector instructions in between
;     leave the flags untouched.
;
; The third cglobal argument is the number of vector registers used; the values
; below match the registers each body actually touches.
%macro COMPOSE_VERTICAL 1

; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
;   b1[i] -= (b0[i] + b2[i] + 2) >> 2
cglobal vertical_compose53iL0_%1, 4,4,3, b0, b1, b2, width
    mova        m2, [pw_2]
%if ARCH_X86_64
    mov         widthd, widthd      ; zero-extend the int argument
%endif
.loop:
    sub         widthq, mmsize/2
    mova        m1, [b0q+2*widthq]
    mova        m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova        [b1q+2*widthq], m0
    jg          .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
;   b1[i] += (b0[i] + b2[i] + 1) >> 1
cglobal vertical_compose_dirac53iH0_%1, 4,4,2, b0, b1, b2, width
    mova        m1, [pw_1]
%if ARCH_X86_64
    mov         widthd, widthd      ; zero-extend the int argument
%endif
.loop:
    sub         widthq, mmsize/2
    mova        m0, [b0q+2*widthq]
    paddw       m0, [b2q+2*widthq]
    paddw       m0, m1
    psraw       m0, 1
    paddw       m0, [b1q+2*widthq]
    mova        [b1q+2*widthq], m0
    jg          .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
;   b2[i] += (9*(b1[i] + b3[i]) - (b0[i] + b4[i]) + 8) >> 4
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova        m3, [pw_8]
    mova        m4, [pw_1991]
%if ARCH_X86_64
    mov         widthd, widthd      ; zero-extend the int argument
%endif
.loop:
    sub         widthq, mmsize/2
    mova        m0, [b0q+2*widthq]
    mova        m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova        [b2q+2*widthq], m1
    jg          .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
;   b2[i] -= (9*(b1[i] + b3[i]) - (b0[i] + b4[i]) + 16) >> 5
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova        m3, [pw_16]
    mova        m4, [pw_1991]
%if ARCH_X86_64
    mov         widthd, widthd      ; zero-extend the int argument
%endif
.loop:
    sub         widthq, mmsize/2
    mova        m0, [b0q+2*widthq]
    mova        m1, [b1q+2*widthq]
    mova        m5, [b2q+2*widthq]
    paddw       m0, [b4q+2*widthq]  ; outer taps
    paddw       m1, [b3q+2*widthq]  ; inner taps
    psubw       m0, m3              ; -(m0 - 16) supplies the "+ 16"
    mova        m2, m1
    punpcklwd   m1, m0              ; (inner, outer) pairs for pmaddwd
    punpckhwd   m2, m0
    pmaddwd     m1, m4              ; 9*inner - outer, 32-bit
    pmaddwd     m2, m4
    psrad       m1, 5
    psrad       m2, 5
    packssdw    m1, m2              ; back to words, signed saturation
    psubw       m5, m1
    mova        [b2q+2*widthq], m5
    jg          .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
;   b0[i] -= (b1[i] + 1) >> 1
;   b1[i] += b0[i]                  (using the updated b0[i])
cglobal vertical_compose_haar_%1, 3,4,4, b0, b1, width
    mova        m3, [pw_1]
%if ARCH_X86_64
    mov         widthd, widthd      ; zero-extend the int argument
%endif
.loop:
    sub         widthq, mmsize/2
    mova        m1, [b1q+2*widthq]
    mova        m0, [b0q+2*widthq]
    mova        m2, m1              ; keep the original b1 for the second step
    paddw       m1, m3
    psraw       m1, 1
    psubw       m0, m1
    mova        [b0q+2*widthq], m0
    paddw       m2, m0
    mova        [b1q+2*widthq], m2
    jg          .loop
    REP_RET
%endmacro
||
163 | |||
; EDGE_EXTENSION left, right, scratch
;
; Pads the tmp array on both sides by replicating its edge values:
;   tmp[-1] .. tmp[-left]          = tmp[0]
;   tmp[w2] .. tmp[w2 + right - 1] = tmp[w2 - 1]
;
;   %1  left     number of copies stored below tmp (assemble-time constant)
;   %2  right    number of copies stored from tmp[w2] upwards (assemble-time constant)
;   %3  scratch  general-purpose register used to carry the value; clobbered.
;                Its width selects the store size; offsets advance by 2 bytes,
;                so a 16-bit register is expected (the caller in this file
;                passes xw).
;
; Relies on tmpq and w2q being defined by the enclosing function.
%macro EDGE_EXTENSION 3
    mov         %3, [tmpq]
%assign %%n 1
%rep %1
    mov         [tmpq-2*%%n], %3
%assign %%n %%n+1
%endrep
    mov         %3, [tmpq+2*w2q-2]
%assign %%n 0
%rep %2
    mov         [tmpq+2*w2q+2*%%n], %3
%assign %%n %%n+1
%endrep
%endmacro
||
179 | |||
180 | |||
; HAAR_HORIZONTAL cpu_suffix, shift
;
; Emits
;   void horizontal_compose_haar<shift>i_<cpu_suffix>(IDWTELEM *b, IDWTELEM *tmp,
;                                                     int width)
;
;   %1  cpu_suffix  appended to the function name (mmx, sse2)
;   %2  shift       0 or 1; when 1 both outputs are rounded and halved,
;                   (v + 1) >> 1, before being interleaved
;
; With w2 = width/2 and hi = b + w2 (the second half of the row):
;   pass 1:  tmp[x] = b[x] - ((hi[x] + 1) >> 1)
;   pass 2:  b[2x]   = tmp[x]
;            b[2x+1] = hi[x] + tmp[x]        (each optionally (v + 1) >> 1)
;
; Pass 1 works in whole vectors and always runs at least once. Pass 2 only
; covers w2 rounded down to a multiple of mmsize/2; the remaining outputs are
; not written here.
; b and tmp are accessed with aligned loads/stores, hi with unaligned loads.
;
; width is a 32-bit int: the 64-bit address arithmetic is done through the
; zero-extended copy in w2 (a 32-bit mov clears the upper half on x86-64), the
; same way the vertical functions zero-extend their width argument.
%macro HAAR_HORIZONTAL 2
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov         w2d, wd                 ; w2q = zero-extended width
    xor         xd, xd
    lea         b_w2q, [bq+w2q]         ; b + width bytes = &b[width/2]
    shr         w2d, 1                  ; w2 = width/2 elements
    mova        m3, [pw_1]
.lowpass_loop:
    movu        m1, [b_w2q + 2*xq]
    mova        m0, [bq    + 2*xq]
    paddw       m1, m3
    psraw       m1, 1
    psubw       m0, m1
    mova        [tmpq + 2*xq], m0
    add         xq, mmsize/2
    cmp         xq, w2q
    jl          .lowpass_loop

    ; pass 2 handles whole vectors only
    xor         xd, xd
    and         w2q, ~(mmsize/2 - 1)
    cmp         w2q, mmsize/2
    jl          .end

.highpass_loop:
    movu        m1, [b_w2q + 2*xq]
    mova        m0, [tmpq  + 2*xq]
    paddw       m1, m0

    ; shift and interleave
%if %2 == 1
    paddw       m0, m3
    paddw       m1, m3
    psraw       m0, 1
    psraw       m1, 1
%endif
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    mova        [bq+4*xq], m0
    mova        [bq+4*xq+mmsize], m2

    add         xq, mmsize/2
    cmp         xq, w2q
    jl          .highpass_loop
.end:
    REP_RET
%endmacro
||
229 | |||
230 | |||
INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; With w2 = width/2 and hi = b + w2 (the second half of the row):
;
;   pass 1 (.lowpass_loop), 8 elements per iteration:
;       tmp[x] = b[x] - ((hi[x-1] + hi[x] + 2) >> 2)
;     where hi[-1] is taken to be hi[0].
;
;   edge extension:
;       tmp[-1] = tmp[0],  tmp[w2] = tmp[w2+1] = tmp[w2-1]
;
;   pass 2 (.highpass_loop), 8 elements per iteration:
;       h       = hi[x] + ((9*(tmp[x] + tmp[x+1]) - (tmp[x-1] + tmp[x+2]) + 8) >> 4)
;       b[2x]   = (tmp[x] + 1) >> 1
;       b[2x+1] = (h      + 1) >> 1
;
; Pass 1 always runs at least once and works in whole vectors. Pass 2 only
; covers w2 rounded down to a multiple of 8.
; b and tmp are accessed with aligned vector loads/stores (including one full
; vector below tmp and one above the current position); hi is read unaligned.
;
; width is a 32-bit int: 64-bit address arithmetic goes through the
; zero-extended copy in w2 (a 32-bit mov clears the upper half on x86-64), the
; same way the vertical functions zero-extend their width argument.
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov         w2d, wd                 ; w2q = zero-extended width
    xor         xd, xd
    lea         b_w2q, [bq+w2q]         ; b + width bytes = &b[width/2]
    shr         w2d, 1                  ; w2 = width/2 elements
    movu        m4, [b_w2q]
    mova        m7, [pw_2]
    pslldq      m4, 14                  ; hi[0] into the top word: acts as hi[-1]
.lowpass_loop:
    movu        m1, [b_w2q + 2*xq]
    mova        m0, [bq    + 2*xq]
    mova        m2, m1
    palignr     m1, m4, 14              ; m1 = hi[x-1 .. x+6]
    mova        m4, m2                  ; remember this vector for the next round
    COMPOSE_53iL0 m0, m1, m2, m7
    mova        [tmpq + 2*xq], m0
    add         xd, mmsize/2
    cmp         xd, w2d
    jl          .lowpass_loop

    EDGE_EXTENSION 1, 2, xw             ; x is free here; it is reset just below
    ; pass 2 handles whole vectors only
    xor         xd, xd
    and         w2d, ~(mmsize/2 - 1)
    cmp         w2d, mmsize/2
    jl          .end

    mova        m7, [tmpq-mmsize]       ; previous vector; only tmp[-1] is used
    mova        m0, [tmpq]
    mova        m5, [pw_1]
    mova        m3, [pw_8]
    mova        m4, [pw_1991]
.highpass_loop:
    mova        m6, m0                  ; m6 = tmp[x   .. x+7]
    palignr     m0, m7, 14              ; m0 = tmp[x-1 .. x+6]
    mova        m7, [tmpq + 2*xq + mmsize] ; next vector
    mova        m1, m7
    mova        m2, m7
    palignr     m1, m6, 2               ; m1 = tmp[x+1 .. x+8]
    palignr     m2, m6, 4               ; m2 = tmp[x+2 .. x+9]
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova        m0, m7                  ; next becomes current
    mova        m7, m6                  ; current becomes previous

    ; shift and interleave
    paddw       m6, m5
    paddw       m1, m5
    psraw       m6, 1
    psraw       m1, 1
    mova        m2, m6
    punpcklwd   m6, m1
    punpckhwd   m2, m1
    mova        [bq+4*xq], m6
    mova        [bq+4*xq+mmsize], m2

    add         xd, mmsize/2
    cmp         xd, w2d
    jl          .highpass_loop
.end:
    REP_RET
||
293 | |||
294 | |||
; Instantiate the templates once per instruction set.

; MMX versions are only assembled for 32-bit builds.
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL  mmx, 0
HAAR_HORIZONTAL  mmx, 1
%endif

; SSE2 versions are assembled for every build.
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL  sse2, 0
HAAR_HORIZONTAL  sse2, 1