;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_3:   times 8 dw 3
pw_7:   times 8 dw 7
pw_16:  times 8 dw 16
pw_32:  times 8 dw 32
pb_128: times 16 db 128

section .text

%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro

%macro HPEL_FILTER 1
; dirac_hpel_filter_v_%1(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov      src0q, srcq
    lea      stridex3q, [3*strideq]
    sub      src0q, stridex3q
    pxor     m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw   m0, [pw_7]
    pmullw   m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw    m0, m2
    paddw    m1, m3
    pmullw   m0, [pw_3]
    pmullw   m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw   m2, [pw_7]
    pmullw   m3, [pw_7]
    psubw    m0, m2
    psubw    m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw    m0, m2
    psubw    m1, m3

    paddw    m0, [pw_16]
    paddw    m1, [pw_16]
    psraw    m0, 5
    psraw    m1, 5
    packuswb m0, m1
    mova     [dstq], m0
    add      dstq, mmsize
    add      srcq, mmsize
    add      src0q, mmsize
    sub      widthd, mmsize
    jg       .loop
    RET

; dirac_hpel_filter_h_%1(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec      widthd
    pxor     m7, m7
    and      widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw   m0, [pw_7]
    pmullw   m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw    m0, m2
    paddw    m1, m3
    pmullw   m0, [pw_3]
    pmullw   m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw   m2, [pw_7]
    pmullw   m3, [pw_7]
    psubw    m0, m2
    psubw    m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw    m0, m2
    psubw    m1, m3

    paddw    m0, [pw_16]
    paddw    m1, [pw_16]
    psraw    m0, 5
    psraw    m1, 5
    packuswb m0, m1
    mova     [dstq + widthq], m0
    sub      widthd, mmsize
    jge      .loop
    RET
%endmacro

%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova     m0, [pb_128]
    add      wd, (mmsize-1)
    and      wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   dst_strideq, dst_strided
    movsxd   src_strideq, src_strided
    mov      r7d, r5m
    mov      r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov      r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea      src2q, [srcq+src_strideq*2]
    lea      dst2q, [dstq+dst_strideq]
.loopx:
    sub      wd, mmsize
    mova     m1, [srcq +2*wq]
    mova     m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb    m1, m0
    paddb    m2, m0
    mova     [dstq +wq], m1
    mova     [dst2q+wq], m2
    jg       .loopx

    lea      srcq, [srcq+src_strideq*4]
    lea      dstq, [dstq+dst_strideq*2]
    sub      hd, 2
    mov      wd, wspill
    jg       .loopy
    RET
%endmacro

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova     m0, [pw_32]
    add      wd, (mmsize-1)
    and      wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd   strideq, strided
    movsxd   idwt_strideq, idwt_strided
    mov      r8d, wd
    %define wspill r8d
%else
    mov      r5m, wd
    %define wspill r5m
%endif

.loop:
    sub      wd, mmsize
    movu     m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw    m1, m0
    psraw    m1, 6
    movu     m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw    m2, m0
    psraw    m2, 6
    paddw    m1, [idwtq+2*wq]
    paddw    m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova     [dstq +wq], m1
    jg       .loop

    lea      srcq, [srcq + 2*strideq]
    add      dstq, strideq
    lea      idwtq, [idwtq+ 2*idwt_strideq]
    sub      hd, 1
    mov      wd, wspill
    jg       .loop
    RET
%endmacro

%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor     m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova     m0, [srcq+i]
    mova     m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova     m2, [obmcq+i]
    mova     m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw   m0, m2
    pmullw   m1, m3
    movu     m2, [dstq+2*i]
    movu     m3, [dstq+2*i+mmsize]
    paddw    m0, m2
    paddw    m1, m3
    movu     [dstq+2*i], m0
    movu     [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea      srcq, [srcq+strideq]
    lea      dstq, [dstq+2*strideq]
    add      obmcq, 32
    sub      yblend, 1
    jg       .loop
    RET
%endmacro

INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2