;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text

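; MV0_PIXELS_MC8: fast path for mx == my == 0 (no interpolation needed).
; Copies (put) or averages into dst (avg, via CHROMAMC_AVG) four rows of
; eight pixels per iteration; r4/r5 cache 3*stride and 4*stride so all four
; rows are addressed without extra pointer updates. h (r3d) is assumed to
; be a multiple of 4.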
%macro MV0_PIXELS_MC8 0
    lea           r4, [r2*3   ]
    lea           r5, [r2*4   ]
.next4rows:
    movu          m0, [r1     ]
    movu          m1, [r1+r2  ]
    CHROMAMC_AVG  m0, [r0     ]
    CHROMAMC_AVG  m1, [r0+r2  ]
    mova [r0     ], m0
    mova [r0+r2  ], m1
    movu          m0, [r1+r2*2]
    movu          m1, [r1+r4  ]
    CHROMAMC_AVG  m0, [r0+r2*2]
    CHROMAMC_AVG  m1, [r0+r4  ]
    mova [r0+r2*2], m0
    mova [r0+r4  ], m1
    add           r1, r5
    add           r0, r5
    sub          r3d, 4
    jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
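; Argument-to-register mapping (cglobal convention): r0 = dst, r1 = src,
; r2 = stride (sign-extended to pointer width below), r3d = h, r4d = mx,
; r5d = my.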
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(uint16_t *dst /* align 8 */, uint16_t *src /* align 1 */,
;                           int stride, int h, int mx, int my)
cglobal %1_h264_chroma_mc8_10, 6,7,8
    movsxdifnidn r2, r2d
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    MV0_PIXELS_MC8
    REP_RET

.at_least_one_non_zero:
    mov          r6d, 2
    test         r5d, r5d
    je .x_interpolation
    mov           r6, r2         ; dxy = x ? 2 : stride (pixels are 2 bytes wide)
    test         r4d, r4d
    jne .xy_interpolation
.x_interpolation:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
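    ; The same loop serves both directions: r6 holds the offset of the
    ; second tap (2 bytes = next pixel horizontally, or stride = next row
    ; vertically), and since one of mx/my is zero, x + y below yields the
    ; single non-zero filter coefficient.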
    or           r4d, r5d        ; x + y
    movd          m5, r4d
    mova          m4, [pw_8]
    mova          m6, [pw_4]     ; mm6 = 4 (rounding: rnd >> 3, rnd = 32)
    SPLATW        m5, m5         ; mm5 = B = x
    psubw         m4, m5         ; mm4 = A = 8-x

.next1drow:
    movu          m0, [r1   ]    ; mm0 = src[0..7]
    movu          m2, [r1+r6]    ; mm2 = src[1..8]

    pmullw        m0, m4         ; mm0 = A * src[0..7]
    pmullw        m2, m5         ; mm2 = B * src[1..8]

    paddw         m0, m6
    paddw         m0, m2
    psrlw         m0, 3
    CHROMAMC_AVG  m0, [r0]
    mova        [r0], m0         ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add           r0, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.xy_interpolation: ; general case, bilinear
    movd          m4, r4m        ; x
    movd          m6, r5m        ; y

    SPLATW        m4, m4         ; mm4 = x words
    SPLATW        m6, m6         ; mm6 = y words
    psllw         m5, m4, 3      ; mm5 = 8x
    pmullw        m4, m6         ; mm4 = x * y
    psllw         m6, 3          ; mm6 = 8y
    paddw         m1, m5, m6     ; mm1 = 8x+8y
    mova          m7, m4         ; DD = x * y
    psubw         m5, m4         ; mm5 = B = 8x - xy
    psubw         m6, m4         ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m1         ; mm4 = A = xy - (8x+8y) + 64
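    ; In factored form: A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y,
    ; so each output pixel is the standard bilinear blend
    ; dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+1+stride] + 32) >> 6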

    movu          m0, [r1  ]     ; mm0 = src[0..7]
    movu          m1, [r1+2]     ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    pmullw        m2, m0, m4
    pmullw        m1, m5
    paddw         m2, m1         ; mm2 = A * src[0..7] + B * src[1..8]

    movu          m0, [r1]
    movu          m1, [r1+2]
    pmullw        m3, m0, m6
    paddw         m2, m3         ; mm2 += C * src[0..7+stride]
    pmullw        m3, m1, m7
    paddw         m2, m3         ; mm2 += D * src[1..8+stride]

    paddw         m2, [pw_32]
    psrlw         m2, 6
    CHROMAMC_AVG  m2, [r0]
    mova        [r0], m2         ; dst[0..7] = (mm2 + 32) >> 6

    add           r0, r2
    dec          r3d
    jne .next2drow
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
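; MC4_OP: one row of the 4-wide bilinear filter. %1 receives the new row's
; horizontal blend (8-x)*src[0..3] + x*src[1..4]; %2 holds the previous
; row's blend, and the two are combined vertically with the 8-y/y taps in
; m5/m3. Callers alternate the operands so each row's result is reused as
; the "previous row" of the next call.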
%macro MC4_OP 2
    movq          %1, [r1  ]
    movq          m1, [r1+2]
    add           r1, r2
    pmullw        %1, m4         ; (8-x) * src[0..3]
    pmullw        m1, m2         ; x * src[1..4]
    paddw         m1, %1
    mova          %1, m1         ; save the new row's horizontal blend

    pmullw        %2, m5         ; (8-y) * previous row
    pmullw        m1, m3         ; y * new row
    paddw         %2, [pw_32]
    paddw         m1, %2
    psrlw         m1, 6
    CHROMAMC_AVG  m1, %2, [r0]
    movq        [r0], m1
    add           r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
    movsxdifnidn r2, r2d
    movd          m2, r4m        ; x
    movd          m3, r5m        ; y
    mova          m4, [pw_8]
    mova          m5, m4
    SPLATW        m2, m2
    SPLATW        m3, m3
    psubw         m4, m2         ; m4 = 8-x
    psubw         m5, m3         ; m5 = 8-y

    movq          m0, [r1  ]
    movq          m6, [r1+2]
    add           r1, r2
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0         ; m6 = first row's horizontal blend

.next2rows:
    MC4_OP m0, m6                ; m0/m6 ping-pong as new/previous row
    MC4_OP m6, m0
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
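; The 2-wide case packs the four bilinear coefficients into two dwords so
; that pmaddwd does the filtering: r4d/r5d are shaped into the word pairs
; {A,B} = {(8-x)*(8-y), x*(8-y)} and {C,D} = {(8-x)*y, x*y}, one pair per
; output pixel lane.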
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
    movsxdifnidn r2, r2d
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d        ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d        ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5         ; mm5 = {A,B,A,B}
    punpckldq     m6, m6         ; mm6 = {C,D,C,D}
    pxor          m7, m7         ; mm7 = 0, for packssdw below
    pshufw        m2, [r1], 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5         ; mm1 = A * src[0,1] + B * src[1,2]
    pshufw        m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [pw_32]
    paddw         m1, m0         ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    CHROMAMC_AVG  m1, m3, [r0]
    movd        [r0], m1
    add           r0, r2
    dec          r3d
    jnz .nextrow
    REP_RET
%endmacro

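; CHROMAMC_AVG hook: %define'd to NOTHING (expands to nothing) for the put
; variants and to AVG for the avg variants, which pavgw-averages the result
; with dst; the optional third operand loads the dst pixels first.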
%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
    movq          %2, %3
%endif
    pavgw         %1, %2
%endmacro

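; Instantiations. With FFmpeg's cglobal each CHROMA_MC* invocation below
; emits a public symbol named from the macro argument, the _10 suffix in
; the name, and the active instruction set, e.g. ff_put_h264_chroma_mc8_10_sse2.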
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg