Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6147 | serge | 1 | ;****************************************************************************** |
2 | ;* SIMD optimized SAO functions for HEVC decoding |
||
3 | ;* |
||
4 | ;* Copyright (c) 2013 Pierre-Edouard LEPERE |
||
5 | ;* Copyright (c) 2014 James Almer |
||
6 | ;* |
||
7 | ;* This file is part of FFmpeg. |
||
8 | ;* |
||
9 | ;* FFmpeg is free software; you can redistribute it and/or |
||
10 | ;* modify it under the terms of the GNU Lesser General Public |
||
11 | ;* License as published by the Free Software Foundation; either |
||
12 | ;* version 2.1 of the License, or (at your option) any later version. |
||
13 | ;* |
||
14 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
17 | ;* Lesser General Public License for more details. |
||
18 | ;* |
||
19 | ;* You should have received a copy of the GNU Lesser General Public |
||
20 | ;* License along with FFmpeg; if not, write to the Free Software |
||
21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
22 | ;****************************************************************************** |
||
23 | |||
24 | %include "libavutil/x86/x86util.asm" |
||
25 | |||
26 | SECTION_RODATA 32 |
||
27 | |||
; Largest 10-bit and 12-bit sample value in every 16-bit lane. Selected through
; `pw_mask %+ bitdepth` and used as the upper bound of CLIPW in the 16-bit paths.
28 | pw_mask10: times 16 dw 0x03FF |
||
29 | pw_mask12: times 16 dw 0x0FFF |
||
; -2 in every word lane: one of the five edge-class values tested in
; HEVC_SAO_EDGE_FILTER_COMPUTE_10 (the others are pw_m1, zero, pw_1 and pw_2).
30 | pw_m2: times 16 dw -2 |
||
; pshufb control applied to the byte-packed offset table in HEVC_SAO_EDGE_FILTER_8:
; lanes 0..4 select source bytes 1, 2, 0, 3, 4. The -1 lanes have bit 7 set, so
; pshufb writes zero there. Repeated twice so it also covers a 32-byte register.
31 | pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
||
; Four signed bytes per edge class, indexed as pb_eo + eo*4. In
; HEVC_SAO_EDGE_FILTER_INIT bytes 1 and 3 are multiplied by the row stride and
; bytes 0 and 2 are added unscaled, giving the two neighbour displacements.
32 | pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1 |
||
; Constants declared external here; their definitions are not part of this file.
33 | cextern pw_m1 |
||
34 | cextern pw_1 |
||
35 | cextern pw_2 |
||
36 | cextern pb_1 |
||
37 | cextern pb_2 |
||
38 | |||
39 | SECTION .text |
||
40 | |||
; In this file these two values are only combined into EDGE_SRCSTRIDE, the
; fixed row pitch the edge filters use when stepping through their source.
41 | %define MAX_PB_SIZE 64 |
||
42 | %define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE |
||
43 | |||
44 | ;****************************************************************************** |
||
45 | ;SAO Band Filter |
||
46 | ;****************************************************************************** |
||
47 | |||
; HEVC_SAO_BAND_FILTER_INIT bitdepth
; Shared prologue of the band-filter functions. Expects the cglobal argument
; names offsetq and leftq to be live. After it has run:
;   m0..m3 = band indices left, left+1, left+2, left+3 (each masked to 0..31),
;            broadcast to every word lane
;   m4..m7 = the 16-bit words stored at offsetq+2, +4, +6 and +8, broadcast
;   m14    = zero
;   m13    = pw_mask10 / pw_mask12 (only loaded when bitdepth is above 8)
; On x86-32 m0..m6 are spilled to the stack area reserved by cglobal, and
; m14/m13/m9/m8 are aliased onto m0/m1/m2/m3, so only m7 keeps its offset.
; The macro ends by renaming the argument registers: the register that held
; `left` is called `height` afterwards and is loaded from argument slot r7m.
48 | %macro HEVC_SAO_BAND_FILTER_INIT 1 |
||
; Four consecutive band indices, wrapping around at 32.
49 | and leftq, 31 |
||
50 | movd xm0, leftd |
||
51 | add leftq, 1 |
||
52 | and leftq, 31 |
||
53 | movd xm1, leftd |
||
54 | add leftq, 1 |
||
55 | and leftq, 31 |
||
56 | movd xm2, leftd |
||
57 | add leftq, 1 |
||
58 | and leftq, 31 |
||
59 | movd xm3, leftd |
||
60 | |||
61 | SPLATW m0, xm0 |
||
62 | SPLATW m1, xm1 |
||
63 | SPLATW m2, xm2 |
||
64 | SPLATW m3, xm3 |
||
; Broadcast the four offsets that follow the first table entry. With registers
; wider than 16 bytes each word is splatted straight from memory; otherwise
; all four are fetched with one 8-byte load and splatted by word index.
65 | %if mmsize > 16 |
||
66 | SPLATW m4, [offsetq + 2] |
||
67 | SPLATW m5, [offsetq + 4] |
||
68 | SPLATW m6, [offsetq + 6] |
||
69 | SPLATW m7, [offsetq + 8] |
||
70 | %else |
||
71 | movq m7, [offsetq + 2] |
||
72 | SPLATW m4, m7, 0 |
||
73 | SPLATW m5, m7, 1 |
||
74 | SPLATW m6, m7, 2 |
||
75 | SPLATW m7, m7, 3 |
||
76 | %endif |
||
77 | |||
78 | %if ARCH_X86_64 |
||
79 | %if %1 > 8 |
||
80 | mova m13, [pw_mask %+ %1] |
||
81 | %endif |
||
82 | pxor m14, m14 |
||
83 | |||
84 | %else ; ARCH_X86_32 |
||
; Spill band indices (slots 0-3) and the first three offsets (slots 4-6);
; HEVC_SAO_BAND_FILTER_COMPUTE reads them back from these slots.
85 | mova [rsp+mmsize*0], m0 |
||
86 | mova [rsp+mmsize*1], m1 |
||
87 | mova [rsp+mmsize*2], m2 |
||
88 | mova [rsp+mmsize*3], m3 |
||
89 | mova [rsp+mmsize*4], m4 |
||
90 | mova [rsp+mmsize*5], m5 |
||
91 | mova [rsp+mmsize*6], m6 |
||
92 | pxor m0, m0 |
||
93 | %if %1 > 8 |
||
94 | mova m1, [pw_mask %+ %1] |
||
95 | %endif |
||
; MMSIZE records the vector size in effect while the slots were written; the
; COMPUTE macro uses it (not mmsize) to address them.
96 | %assign MMSIZE mmsize |
||
97 | %define m14 m0 |
||
98 | %define m13 m1 |
||
99 | %define m9 m2 |
||
100 | %define m8 m3 |
||
101 | %endif ; ARCH |
||
; `left` is no longer needed: its register becomes the row counter.
102 | DEFINE_ARGS dst, src, dststride, srcstride, offset, height |
||
103 | mov heightd, r7m |
||
104 | %endmacro |
||
105 | |||
; HEVC_SAO_BAND_FILTER_COMPUTE bitdepth, tmp, samples
; Adds the matching band offset to each 16-bit sample of %3, in place.
; %2 is scratch: it first receives every sample shifted right by (bitdepth - 5),
; which is then compared against the four band indices prepared by
; HEVC_SAO_BAND_FILTER_INIT. Each comparison mask selects its offset, the four
; masked offsets are OR-ed together and the result is added to %3. A sample
; whose shifted value matches none of the four indices gets zero added.
; Clobbers %2 plus m10-m12 on x86-64, or m4-m6 on x86-32.
106 | %macro HEVC_SAO_BAND_FILTER_COMPUTE 3 |
||
107 | psraw %2, %3, %1-5 |
||
108 | %if ARCH_X86_64 |
||
109 | pcmpeqw m10, %2, m0 |
||
110 | pcmpeqw m11, %2, m1 |
||
111 | pcmpeqw m12, %2, m2 |
||
112 | pcmpeqw %2, m3 |
||
113 | pand m10, m4 |
||
114 | pand m11, m5 |
||
115 | pand m12, m6 |
||
116 | pand %2, m7 |
||
117 | por m10, m11 |
||
118 | por m12, %2 |
||
119 | por m10, m12 |
||
120 | paddw %3, m10 |
||
121 | %else ; ARCH_X86_32 |
||
; Band indices come from stack slots 0-3 and the first three offsets from slots
; 4-6 (written by INIT, addressed with the MMSIZE it recorded); the fourth
; offset is still held in m7.
122 | pcmpeqw m4, %2, [rsp+MMSIZE*0] |
||
123 | pcmpeqw m5, %2, [rsp+MMSIZE*1] |
||
124 | pcmpeqw m6, %2, [rsp+MMSIZE*2] |
||
125 | pcmpeqw %2, [rsp+MMSIZE*3] |
||
126 | pand m4, [rsp+MMSIZE*4] |
||
127 | pand m5, [rsp+MMSIZE*5] |
||
128 | pand m6, [rsp+MMSIZE*6] |
||
129 | pand %2, m7 |
||
130 | por m4, m5 |
||
131 | por m6, %2 |
||
132 | por m4, m6 |
||
133 | paddw %3, m4 |
||
134 | %endif ; ARCH |
||
135 | %endmacro |
||
136 | |||
137 | ;void ff_hevc_sao_band_filter_{width}_8_{opt}(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, |
||
138 | ; int16_t *sao_offset_val, int sao_left_class, int width, int height); |
||
; HEVC_SAO_BAND_FILTER_8 width, iterations
; Emits the 8-bit band filter for one block width.
;   %1 = width in pixels; it selects the special 8-wide and 48-wide code below
;   %2 = number of full vector registers (mmsize bytes each) handled per row
; Each row: bytes are widened to words against the zero register m14, run
; through HEVC_SAO_BAND_FILTER_COMPUTE, packed back to bytes with unsigned
; saturation and stored. dst and src then advance by their own strides.
139 | %macro HEVC_SAO_BAND_FILTER_8 2 |
||
140 | cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left |
||
141 | HEVC_SAO_BAND_FILTER_INIT 8 |
||
142 | |||
143 | align 16 |
||
144 | .loop: |
||
; Width 8: a single 8-byte load/store; the %rep below is empty for this width.
145 | %if %1 == 8 |
||
146 | movq m8, [srcq] |
||
147 | punpcklbw m8, m14 |
||
148 | HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 |
||
149 | packuswb m8, m14 |
||
150 | movq [dstq], m8 |
||
151 | %endif ; %1 == 8 |
||
152 | |||
; Full vectors: split into low and high halves (m8 / m13), filter each and
; re-pack. These use aligned loads and stores.
153 | %assign i 0 |
||
154 | %rep %2 |
||
155 | mova m13, [srcq + i] |
||
156 | punpcklbw m8, m13, m14 |
||
157 | HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 |
||
158 | punpckhbw m13, m14 |
||
159 | HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13 |
||
160 | packuswb m8, m13 |
||
161 | mova [dstq + i], m8 |
||
162 | %assign i i+mmsize |
||
163 | %endrep |
||
164 | |||
; Width 48: the bytes left after the %rep are handled with one extra XMM-sized
; step at offset i; YMM mode is restored afterwards for AVX2 builds.
165 | %if %1 == 48 |
||
166 | INIT_XMM cpuname |
||
167 | |||
168 | mova m13, [srcq + i] |
||
169 | punpcklbw m8, m13, m14 |
||
170 | HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8 |
||
171 | punpckhbw m13, m14 |
||
172 | HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13 |
||
173 | packuswb m8, m13 |
||
174 | mova [dstq + i], m8 |
||
175 | %if cpuflag(avx2) |
||
176 | INIT_YMM cpuname |
||
177 | %endif |
||
178 | %endif ; %1 == 48 |
||
179 | |||
180 | add dstq, dststrideq ; dst += dststride |
||
181 | add srcq, srcstrideq ; src += srcstride |
||
182 | dec heightd ; cmp height |
||
183 | jnz .loop ; height loop |
||
184 | REP_RET |
||
185 | %endmacro |
||
186 | |||
187 | ;void ff_hevc_sao_band_filter_{width}_{depth}_{opt}(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, |
||
188 | ; int16_t *sao_offset_val, int sao_left_class, int width, int height); |
||
; HEVC_SAO_BAND_FILTER_16 bitdepth, width, iterations
; Emits the band filter for samples stored as 16-bit words.
;   %1 = bit depth (10 or 12 in this file); selects the shift and the clip mask
;   %2 = width in pixels; it selects the special 8-wide and 48-wide code below
;   %3 = number of %rep iterations per row, each covering TWO vector registers
; After HEVC_SAO_BAND_FILTER_COMPUTE every result is clipped with
; CLIPW to the range [m14, m13], i.e. [0, pw_mask10 / pw_mask12].
189 | %macro HEVC_SAO_BAND_FILTER_16 3 |
||
190 | cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left |
||
191 | HEVC_SAO_BAND_FILTER_INIT %1 |
||
192 | |||
193 | align 16 |
||
194 | .loop: |
||
; Width 8: one register, accessed with unaligned moves; %rep is empty here.
195 | %if %2 == 8 |
||
196 | movu m8, [srcq] |
||
197 | HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 |
||
198 | CLIPW m8, m14, m13 |
||
199 | movu [dstq], m8 |
||
200 | %endif |
||
201 | |||
; m8 and m9 swap roles between the two halves: while one carries the samples
; the other serves as the scratch operand of COMPUTE.
202 | %assign i 0 |
||
203 | %rep %3 |
||
204 | mova m8, [srcq + i] |
||
205 | HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 |
||
206 | CLIPW m8, m14, m13 |
||
207 | mova [dstq + i], m8 |
||
208 | |||
209 | mova m9, [srcq + i + mmsize] |
||
210 | HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9 |
||
211 | CLIPW m9, m14, m13 |
||
212 | mova [dstq + i + mmsize], m9 |
||
213 | %assign i i+mmsize*2 |
||
214 | %endrep |
||
215 | |||
; Width 48: the samples left after the %rep are handled as two XMM registers
; starting at offset i; YMM mode is restored afterwards for AVX2 builds.
216 | %if %2 == 48 |
||
217 | INIT_XMM cpuname |
||
218 | mova m8, [srcq + i] |
||
219 | HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8 |
||
220 | CLIPW m8, m14, m13 |
||
221 | mova [dstq + i], m8 |
||
222 | |||
223 | mova m9, [srcq + i + mmsize] |
||
224 | HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9 |
||
225 | CLIPW m9, m14, m13 |
||
226 | mova [dstq + i + mmsize], m9 |
||
227 | %if cpuflag(avx2) |
||
228 | INIT_YMM cpuname |
||
229 | %endif |
||
230 | %endif ; %2 == 48 |
||
231 | |||
232 | add dstq, dststrideq |
||
233 | add srcq, srcstrideq |
||
234 | dec heightd |
||
235 | jg .loop |
||
236 | REP_RET |
||
237 | %endmacro |
||
238 | |||
; HEVC_SAO_BAND_FILTER_FUNCS
; Instantiates every band-filter width (8, 16, 32, 48, 64) for 8-, 10- and
; 12-bit samples under whatever INIT_* mode is active at the call site.
; The last argument of each line is the %rep count: whole registers per row for
; the 8-bit macro, register pairs per row for the 16-bit macro. Width 8 uses 0
; because it is handled entirely by the dedicated 8-wide path; width 48 uses
; the count of width 32 and relies on the extra 48-wide tail code.
239 | %macro HEVC_SAO_BAND_FILTER_FUNCS 0 |
||
240 | HEVC_SAO_BAND_FILTER_8 8, 0 |
||
241 | HEVC_SAO_BAND_FILTER_8 16, 1 |
||
242 | HEVC_SAO_BAND_FILTER_8 32, 2 |
||
243 | HEVC_SAO_BAND_FILTER_8 48, 2 |
||
244 | HEVC_SAO_BAND_FILTER_8 64, 4 |
||
245 | |||
246 | HEVC_SAO_BAND_FILTER_16 10, 8, 0 |
||
247 | HEVC_SAO_BAND_FILTER_16 10, 16, 1 |
||
248 | HEVC_SAO_BAND_FILTER_16 10, 32, 2 |
||
249 | HEVC_SAO_BAND_FILTER_16 10, 48, 2 |
||
250 | HEVC_SAO_BAND_FILTER_16 10, 64, 4 |
||
251 | |||
252 | HEVC_SAO_BAND_FILTER_16 12, 8, 0 |
||
253 | HEVC_SAO_BAND_FILTER_16 12, 16, 1 |
||
254 | HEVC_SAO_BAND_FILTER_16 12, 32, 2 |
||
255 | HEVC_SAO_BAND_FILTER_16 12, 48, 2 |
||
256 | HEVC_SAO_BAND_FILTER_16 12, 64, 4 |
||
257 | %endmacro |
||
258 | |||
; Band filter instantiations. SSE2 and AVX get the full set through
; HEVC_SAO_BAND_FILTER_FUNCS, using 16-byte registers.
259 | INIT_XMM sse2 |
||
260 | HEVC_SAO_BAND_FILTER_FUNCS |
||
261 | INIT_XMM avx |
||
262 | HEVC_SAO_BAND_FILTER_FUNCS |
||
263 | |||
; AVX2 set, spelled out by hand: widths 8 and 16 stay on XMM registers, widths
; 32, 48 and 64 switch to YMM and therefore need half the %rep count of the
; XMM versions above.
264 | %if HAVE_AVX2_EXTERNAL |
||
265 | INIT_XMM avx2 |
||
266 | HEVC_SAO_BAND_FILTER_8 8, 0 |
||
267 | HEVC_SAO_BAND_FILTER_8 16, 1 |
||
268 | INIT_YMM avx2 |
||
269 | HEVC_SAO_BAND_FILTER_8 32, 1 |
||
270 | HEVC_SAO_BAND_FILTER_8 48, 1 |
||
271 | HEVC_SAO_BAND_FILTER_8 64, 2 |
||
272 | |||
273 | INIT_XMM avx2 |
||
274 | HEVC_SAO_BAND_FILTER_16 10, 8, 0 |
||
275 | HEVC_SAO_BAND_FILTER_16 10, 16, 1 |
||
276 | INIT_YMM avx2 |
||
277 | HEVC_SAO_BAND_FILTER_16 10, 32, 1 |
||
278 | HEVC_SAO_BAND_FILTER_16 10, 48, 1 |
||
279 | HEVC_SAO_BAND_FILTER_16 10, 64, 2 |
||
280 | |||
281 | INIT_XMM avx2 |
||
282 | HEVC_SAO_BAND_FILTER_16 12, 8, 0 |
||
283 | HEVC_SAO_BAND_FILTER_16 12, 16, 1 |
||
284 | INIT_YMM avx2 |
||
285 | HEVC_SAO_BAND_FILTER_16 12, 32, 1 |
||
286 | HEVC_SAO_BAND_FILTER_16 12, 48, 1 |
||
287 | HEVC_SAO_BAND_FILTER_16 12, 64, 2 |
||
288 | %endif |
||
289 | |||
290 | ;****************************************************************************** |
||
291 | ;SAO Edge Filter |
||
292 | ;****************************************************************************** |
||
293 | |||
; Number of bytes the edge filters add to srcq after every row.
; Parenthesised so the sum stays a single term wherever the macro expands: it
; is used as `EDGE_SRCSTRIDE>>%1` in HEVC_SAO_EDGE_FILTER_INIT, which would
; otherwise depend on the assembler ranking `>>` below `+`.
294 | %define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE) |
||
295 | |||
; HEVC_SAO_EDGE_FILTER_INIT shift
; Computes the displacements of the two neighbour samples for edge class `eo`:
;   a_stride = pb_eo[eo*4 + 1] * (EDGE_SRCSTRIDE >> shift) + pb_eo[eo*4 + 0]
;   b_stride = pb_eo[eo*4 + 3] * (EDGE_SRCSTRIDE >> shift) + pb_eo[eo*4 + 2]
; %1 is 0 for the 8-bit filter. The 16-bit filter passes 1 and doubles both
; results afterwards, so the row term ends up as a full EDGE_SRCSTRIDE while
; the unscaled term becomes two bytes per step.
; Needs eoq, tmpq, tmp2q, a_strideq and b_strideq to be defined by the caller;
; tmpq and tmp2q are clobbered.
296 | %macro HEVC_SAO_EDGE_FILTER_INIT 1 |
||
; eo is the fifth argument while cglobal loads fewer: WIN64 reads it from its
; memory operand, other x86-64 targets sign-extend the register that already
; holds it, and x86-32 loads argument slot r4m.
297 | %if WIN64 |
||
298 | movsxd eoq, dword eom |
||
299 | %elif ARCH_X86_64 |
||
300 | movsxd eoq, eod |
||
301 | %else |
||
302 | mov eoq, r4m |
||
303 | %endif |
||
304 | lea tmp2q, [pb_eo] |
||
305 | movsx a_strideq, byte [tmp2q+eoq*4+1] |
||
306 | movsx b_strideq, byte [tmp2q+eoq*4+3] |
||
307 | imul a_strideq, EDGE_SRCSTRIDE>>%1 |
||
308 | imul b_strideq, EDGE_SRCSTRIDE>>%1 |
||
309 | movsx tmpq, byte [tmp2q+eoq*4] |
||
310 | add a_strideq, tmpq |
||
311 | movsx tmpq, byte [tmp2q+eoq*4+2] |
||
312 | add b_strideq, tmpq |
||
313 | %endmacro |
||
314 | |||
; HEVC_SAO_EDGE_FILTER_COMPUTE_8 width
; Filters one register of 8-bit samples.
;   in : m1 = centre samples c, m2 = neighbour a, m3 = neighbour b
;        m0 = byte offset table (already permuted by pb_edge_shuffle)
;        m6 = pb_2, m7 = pb_1
;   out: m3 = filtered bytes (only the low 8 bytes are meaningful when the
;        width is 8, because only the low half is processed)
; Clobbers m2, m4 and m5.
315 | %macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1 |
||
; Build sign(c - a) and sign(c - b) from unsigned minima: for each neighbour
; n, (min(c,n) == c) - (min(c,n) == n) yields -1, 0 or +1 per byte.
316 | pminub m4, m1, m2 |
||
317 | pminub m5, m1, m3 |
||
318 | pcmpeqb m2, m4 |
||
319 | pcmpeqb m3, m5 |
||
320 | pcmpeqb m4, m1 |
||
321 | pcmpeqb m5, m1 |
||
322 | psubb m4, m2 |
||
323 | psubb m5, m3 |
||
; m4 = sign(c - a) + sign(c - b) + 2, a table index in 0..4.
324 | paddb m4, m6 |
||
325 | paddb m4, m5 |
||
326 | |||
; Per-byte table lookup: m2 = offset selected by the index.
327 | pshufb m2, m0, m4 |
||
; Interleave (1, c) with (offset, 1) and let pmaddubsw form 1*offset + c*1 as
; a signed word; its first operand is treated as unsigned bytes and the second
; as signed bytes. packuswb then saturates the sums back to unsigned bytes.
328 | %if %1 > 8 |
||
329 | punpckhbw m5, m7, m1 |
||
330 | punpckhbw m4, m2, m7 |
||
331 | punpcklbw m3, m7, m1 |
||
332 | punpcklbw m2, m7 |
||
333 | pmaddubsw m5, m4 |
||
334 | pmaddubsw m3, m2 |
||
335 | packuswb m3, m5 |
||
336 | %else |
||
337 | punpcklbw m3, m7, m1 |
||
338 | punpcklbw m2, m7 |
||
339 | pmaddubsw m3, m2 |
||
340 | packuswb m3, m3 |
||
341 | %endif |
||
342 | %endmacro |
||
343 | |||
344 | ;void ff_hevc_sao_edge_filter_{width}_8_{opt}(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, |
||
345 | ; int eo, int width, int height); |
||
; HEVC_SAO_EDGE_FILTER_8 width, iterations [, store-suffix]
; Emits the 8-bit edge filter for one block width.
;   %1 = width in pixels; it selects the special 8-wide and 48-wide code below
;   %2 = number of full vector registers handled per row
;   %3 = suffix appended to `mov` for the store inside %rep (`a` or `u`);
;        may be omitted when %2 is 0 because the %rep body is then never emitted
; The source is walked with the fixed pitch EDGE_SRCSTRIDE, the destination
; with dststride.
346 | %macro HEVC_SAO_EDGE_FILTER_8 2-3 |
||
347 | %if ARCH_X86_64 |
||
348 | cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp |
||
; height is not loaded yet, so its register doubles as the second scratch
; register of the INIT macro.
349 | %define tmp2q heightq |
||
350 | HEVC_SAO_EDGE_FILTER_INIT 0 |
||
351 | mov heightd, r6m |
||
352 | |||
353 | %else ; ARCH_X86_32 |
||
; Only six registers: eo borrows src's register, tmp and offset share
; height's, and tmp2 borrows dststride's. The real src, offset and dststride
; are loaded once INIT is done; height is loaded last, after offset was used.
354 | cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height |
||
355 | %define eoq srcq |
||
356 | %define tmpq heightq |
||
357 | %define tmp2q dststrideq |
||
358 | %define offsetq heightq |
||
359 | HEVC_SAO_EDGE_FILTER_INIT 0 |
||
360 | mov srcq, srcm |
||
361 | mov offsetq, r3m |
||
362 | mov dststrideq, dststridem |
||
363 | %endif ; ARCH |
||
364 | |||
; m0 = offset table: 16 bytes of 16-bit offsets are loaded (duplicated into
; both 128-bit halves when the register is wider), narrowed to signed bytes
; with saturation, then permuted so the pshufb index computed by
; HEVC_SAO_EDGE_FILTER_COMPUTE_8 selects the right entry.
365 | %if mmsize > 16 |
||
366 | vbroadcasti128 m0, [offsetq] |
||
367 | %else |
||
368 | movu m0, [offsetq] |
||
369 | %endif |
||
370 | mova m1, [pb_edge_shuffle] |
||
371 | packsswb m0, m0 |
||
372 | mova m7, [pb_1] |
||
373 | pshufb m0, m1 |
||
374 | mova m6, [pb_2] |
||
375 | %if ARCH_X86_32 |
||
376 | mov heightd, r6m |
||
377 | %endif |
||
378 | |||
379 | align 16 |
||
380 | .loop: |
||
381 | |||
; Width 8: 8-byte loads and store; the %rep below is empty for this width.
382 | %if %1 == 8 |
||
383 | movq m1, [srcq] |
||
384 | movq m2, [srcq + a_strideq] |
||
385 | movq m3, [srcq + b_strideq] |
||
386 | HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 |
||
387 | movq [dstq], m3 |
||
388 | %endif |
||
389 | |||
; Full vectors: the centre row is read with an aligned load, the two
; neighbours with unaligned loads.
390 | %assign i 0 |
||
391 | %rep %2 |
||
392 | mova m1, [srcq + i] |
||
393 | movu m2, [srcq + a_strideq + i] |
||
394 | movu m3, [srcq + b_strideq + i] |
||
395 | HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 |
||
396 | mov%3 [dstq + i], m3 |
||
397 | %assign i i+mmsize |
||
398 | %endrep |
||
399 | |||
; Width 48: the bytes left after the %rep are handled with one extra XMM-sized
; step at offset i; YMM mode is restored afterwards for AVX2 builds.
400 | %if %1 == 48 |
||
401 | INIT_XMM cpuname |
||
402 | |||
403 | mova m1, [srcq + i] |
||
404 | movu m2, [srcq + a_strideq + i] |
||
405 | movu m3, [srcq + b_strideq + i] |
||
406 | HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1 |
||
407 | mova [dstq + i], m3 |
||
408 | %if cpuflag(avx2) |
||
409 | INIT_YMM cpuname |
||
410 | %endif |
||
411 | %endif |
||
412 | |||
413 | add dstq, dststrideq |
||
414 | add srcq, EDGE_SRCSTRIDE |
||
415 | dec heightd |
||
416 | jg .loop |
||
417 | RET |
||
418 | %endmacro |
||
419 | |||
; PMINUW dst, a, b, tmp
; Unsigned 16-bit minimum of a and b. Uses the pminuw instruction when the
; active instruction set includes SSE4; otherwise emulates it as
; a - saturating_sub(a, b), which equals b where a > b and a elsewhere.
; tmp is only written by the emulation.
420 | %macro PMINUW 4 |
||
421 | %if cpuflag(sse4) |
||
422 | pminuw %1, %2, %3 |
||
423 | %else |
||
424 | psubusw %4, %2, %3 |
||
425 | psubw %1, %2, %4 |
||
426 | %endif |
||
427 | %endmacro |
||
428 | |||
; HEVC_SAO_EDGE_FILTER_COMPUTE_10
; Filters one register of 16-bit samples (shared by the 10- and 12-bit builds).
;   in : m1 = centre samples c, m2 = neighbour a, m3 = neighbour b, m0 = zero
;        x86-64: m8..m12 = broadcast offsets paired with the class values
;                -2, -1, 0, +1, +2, and m13/m14/m15 = pw_m1/pw_1/pw_2
;        x86-32: the same five offsets in stack slots 0..4
;   out: m2 = c + selected offset, NOT yet clipped
; Clobbers m3-m7.
429 | %macro HEVC_SAO_EDGE_FILTER_COMPUTE_10 0 |
||
; m4 = sign(c - a), m5 = sign(c - b), built from unsigned minima as
; (min == c) - (min == neighbour); m6/m7 are scratch for the PMINUW fallback.
430 | PMINUW m4, m1, m2, m6 |
||
431 | PMINUW m5, m1, m3, m7 |
||
432 | pcmpeqw m2, m4 |
||
433 | pcmpeqw m3, m5 |
||
434 | pcmpeqw m4, m1 |
||
435 | pcmpeqw m5, m1 |
||
436 | psubw m4, m2 |
||
437 | psubw m5, m3 |
||
438 | |||
; m4 = edge class in -2..+2. Compare it with each possible value and keep the
; offset belonging to the one that matches.
439 | paddw m4, m5 |
||
440 | pcmpeqw m2, m4, [pw_m2] |
||
441 | %if ARCH_X86_64 |
||
442 | pcmpeqw m3, m4, m13 |
||
443 | pcmpeqw m5, m4, m0 |
||
444 | pcmpeqw m6, m4, m14 |
||
445 | pcmpeqw m7, m4, m15 |
||
446 | pand m2, m8 |
||
447 | pand m3, m9 |
||
448 | pand m5, m10 |
||
449 | pand m6, m11 |
||
450 | pand m7, m12 |
||
451 | %else |
||
452 | pcmpeqw m3, m4, [pw_m1] |
||
453 | pcmpeqw m5, m4, m0 |
||
454 | pcmpeqw m6, m4, [pw_1] |
||
455 | pcmpeqw m7, m4, [pw_2] |
||
456 | pand m2, [rsp+MMSIZE*0] |
||
457 | pand m3, [rsp+MMSIZE*1] |
||
458 | pand m5, [rsp+MMSIZE*2] |
||
459 | pand m6, [rsp+MMSIZE*3] |
||
460 | pand m7, [rsp+MMSIZE*4] |
||
461 | %endif |
||
; Exactly one mask is set per lane, so the sum of the masked offsets is the
; selected offset; add the centre sample to it.
462 | paddw m2, m3 |
||
463 | paddw m5, m6 |
||
464 | paddw m2, m7 |
||
465 | paddw m2, m1 |
||
466 | paddw m2, m5 |
||
467 | %endmacro |
||
468 | |||
469 | ;void ff_hevc_sao_edge_filter_{width}_{depth}_{opt}(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, |
||
470 | ; int eo, int width, int height); |
||
; HEVC_SAO_EDGE_FILTER_16 bitdepth, width, iterations
; Emits the edge filter for samples stored as 16-bit words.
;   %1 = bit depth (10 or 12 in this file); selects the clip mask pw_mask10/12
;   %2 = width in pixels; it selects the special 8-wide and 48-wide code below
;   %3 = number of %rep iterations per row, each covering TWO vector registers
; Every result of HEVC_SAO_EDGE_FILTER_COMPUTE_10 is clipped with CLIPW to
; [0, pw_mask] before it is stored. The source is walked with the fixed pitch
; EDGE_SRCSTRIDE, the destination with dststride.
471 | %macro HEVC_SAO_EDGE_FILTER_16 3 |
||
472 | %if ARCH_X86_64 |
||
473 | cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp |
||
474 | %define tmp2q heightq |
||
; INIT is run with half the row pitch and both displacements are doubled
; afterwards, so the unscaled (horizontal) term becomes 2 bytes per sample.
475 | HEVC_SAO_EDGE_FILTER_INIT 1 |
||
476 | mov heightd, r6m |
||
477 | add a_strideq, a_strideq |
||
478 | add b_strideq, b_strideq |
||
479 | |||
480 | %else ; ARCH_X86_32 |
||
; Only six general and eight vector registers: scratch names are aliased onto
; argument registers that are loaded later, m8..m12 are mapped onto m1..m5
; just long enough to build the offsets, and 5*mmsize bytes of stack hold them.
481 | cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height |
||
482 | %assign MMSIZE mmsize |
||
483 | %define eoq srcq |
||
484 | %define tmpq heightq |
||
485 | %define tmp2q dststrideq |
||
486 | %define offsetq heightq |
||
487 | %define m8 m1 |
||
488 | %define m9 m2 |
||
489 | %define m10 m3 |
||
490 | %define m11 m4 |
||
491 | %define m12 m5 |
||
492 | HEVC_SAO_EDGE_FILTER_INIT 1 |
||
493 | mov srcq, srcm |
||
494 | mov offsetq, r3m |
||
495 | mov dststrideq, dststridem |
||
496 | add a_strideq, a_strideq |
||
497 | add b_strideq, b_strideq |
||
498 | |||
499 | %endif ; ARCH |
||
500 | |||
; Broadcast the five offsets in the order the COMPUTE macro pairs them with
; the class values -2, -1, 0, +1, +2: words 1, 2, 0, 3, 4 of the table.
; Without AVX2, words 0..3 are fetched with one 8-byte load and words 3..4
; with one 4-byte load, then splatted by word index.
501 | %if cpuflag(avx2) |
||
502 | SPLATW m8, [offsetq+2] |
||
503 | SPLATW m9, [offsetq+4] |
||
504 | SPLATW m10, [offsetq+0] |
||
505 | SPLATW m11, [offsetq+6] |
||
506 | SPLATW m12, [offsetq+8] |
||
507 | %else |
||
508 | movq m10, [offsetq+0] |
||
509 | movd m12, [offsetq+6] |
||
510 | SPLATW m8, xm10, 1 |
||
511 | SPLATW m9, xm10, 2 |
||
512 | SPLATW m10, xm10, 0 |
||
513 | SPLATW m11, xm12, 0 |
||
514 | SPLATW m12, xm12, 1 |
||
515 | %endif |
||
; m0 = zero: the class value 0 in COMPUTE and the lower CLIPW bound.
516 | pxor m0, m0 |
||
517 | %if ARCH_X86_64 |
||
518 | mova m13, [pw_m1] |
||
519 | mova m14, [pw_1] |
||
520 | mova m15, [pw_2] |
||
521 | %else |
||
; height shares its register with offsetq, so it is loaded only now; the
; offsets move to the stack because m1..m5 are reused inside the loop.
522 | mov heightd, r6m |
||
523 | mova [rsp+mmsize*0], m8 |
||
524 | mova [rsp+mmsize*1], m9 |
||
525 | mova [rsp+mmsize*2], m10 |
||
526 | mova [rsp+mmsize*3], m11 |
||
527 | mova [rsp+mmsize*4], m12 |
||
528 | %endif |
||
529 | |||
530 | align 16 |
||
531 | .loop: |
||
532 | |||
; Width 8: one register per row; the %rep below is empty for this width.
533 | %if %2 == 8 |
||
534 | mova m1, [srcq] |
||
535 | movu m2, [srcq+a_strideq] |
||
536 | movu m3, [srcq+b_strideq] |
||
537 | |||
538 | HEVC_SAO_EDGE_FILTER_COMPUTE_10 |
||
539 | CLIPW m2, m0, [pw_mask %+ %1] |
||
540 | movu [dstq], m2 |
||
541 | %endif |
||
542 | |||
; Two registers per iteration; the centre row uses aligned loads, the
; neighbours unaligned ones.
543 | %assign i 0 |
||
544 | %rep %3 |
||
545 | mova m1, [srcq + i] |
||
546 | movu m2, [srcq+a_strideq + i] |
||
547 | movu m3, [srcq+b_strideq + i] |
||
548 | HEVC_SAO_EDGE_FILTER_COMPUTE_10 |
||
549 | CLIPW m2, m0, [pw_mask %+ %1] |
||
550 | mova [dstq + i], m2 |
||
551 | |||
552 | mova m1, [srcq + i + mmsize] |
||
553 | movu m2, [srcq+a_strideq + i + mmsize] |
||
554 | movu m3, [srcq+b_strideq + i + mmsize] |
||
555 | HEVC_SAO_EDGE_FILTER_COMPUTE_10 |
||
556 | CLIPW m2, m0, [pw_mask %+ %1] |
||
557 | mova [dstq + i + mmsize], m2 |
||
558 | %assign i i+mmsize*2 |
||
559 | %endrep |
||
560 | |||
; Width 48: the samples left after the %rep are handled as two XMM registers
; starting at offset i; YMM mode is restored afterwards for AVX2 builds.
561 | %if %2 == 48 |
||
562 | INIT_XMM cpuname |
||
563 | mova m1, [srcq + i] |
||
564 | movu m2, [srcq+a_strideq + i] |
||
565 | movu m3, [srcq+b_strideq + i] |
||
566 | HEVC_SAO_EDGE_FILTER_COMPUTE_10 |
||
567 | CLIPW m2, m0, [pw_mask %+ %1] |
||
568 | mova [dstq + i], m2 |
||
569 | |||
570 | mova m1, [srcq + i + mmsize] |
||
571 | movu m2, [srcq+a_strideq + i + mmsize] |
||
572 | movu m3, [srcq+b_strideq + i + mmsize] |
||
573 | HEVC_SAO_EDGE_FILTER_COMPUTE_10 |
||
574 | CLIPW m2, m0, [pw_mask %+ %1] |
||
575 | mova [dstq + i + mmsize], m2 |
||
576 | %if cpuflag(avx2) |
||
577 | INIT_YMM cpuname |
||
578 | %endif |
||
579 | %endif |
||
580 | |||
581 | add dstq, dststrideq |
||
582 | add srcq, EDGE_SRCSTRIDE |
||
583 | dec heightd |
||
584 | jg .loop |
||
585 | RET |
||
586 | %endmacro |
||
587 | |||
; 8-bit edge filters. Arguments: width, full registers per row, and the `mov`
; suffix used for the store inside the %rep loop (omitted for width 8, whose
; %rep count is 0). Built for SSSE3 because the 8-bit kernel relies on pshufb
; and pmaddubsw.
588 | INIT_XMM ssse3 |
||
589 | HEVC_SAO_EDGE_FILTER_8 8, 0 |
||
590 | HEVC_SAO_EDGE_FILTER_8 16, 1, a |
||
591 | HEVC_SAO_EDGE_FILTER_8 32, 2, a |
||
592 | HEVC_SAO_EDGE_FILTER_8 48, 2, a |
||
593 | HEVC_SAO_EDGE_FILTER_8 64, 4, a |
||
594 | |||
; AVX2 versions exist only for widths 32 and up; the 48-wide one stores its
; YMM register with an unaligned move.
595 | %if HAVE_AVX2_EXTERNAL |
||
596 | INIT_YMM avx2 |
||
597 | HEVC_SAO_EDGE_FILTER_8 32, 1, a |
||
598 | HEVC_SAO_EDGE_FILTER_8 48, 1, u |
||
599 | HEVC_SAO_EDGE_FILTER_8 64, 2, a |
||
600 | %endif |
||
601 | |||
; 10- and 12-bit edge filters. Arguments: bit depth, width, register pairs per
; row. Under SSE2 the PMINUW helper falls back to its emulation.
602 | INIT_XMM sse2 |
||
603 | HEVC_SAO_EDGE_FILTER_16 10, 8, 0 |
||
604 | HEVC_SAO_EDGE_FILTER_16 10, 16, 1 |
||
605 | HEVC_SAO_EDGE_FILTER_16 10, 32, 2 |
||
606 | HEVC_SAO_EDGE_FILTER_16 10, 48, 2 |
||
607 | HEVC_SAO_EDGE_FILTER_16 10, 64, 4 |
||
608 | |||
609 | HEVC_SAO_EDGE_FILTER_16 12, 8, 0 |
||
610 | HEVC_SAO_EDGE_FILTER_16 12, 16, 1 |
||
611 | HEVC_SAO_EDGE_FILTER_16 12, 32, 2 |
||
612 | HEVC_SAO_EDGE_FILTER_16 12, 48, 2 |
||
613 | HEVC_SAO_EDGE_FILTER_16 12, 64, 4 |
||
614 | |||
; AVX2 versions for widths 32 and up, with half the iteration count of the
; XMM versions because each register is twice as wide.
615 | %if HAVE_AVX2_EXTERNAL |
||
616 | INIT_YMM avx2 |
||
617 | HEVC_SAO_EDGE_FILTER_16 10, 32, 1 |
||
618 | HEVC_SAO_EDGE_FILTER_16 10, 48, 1 |
||
619 | HEVC_SAO_EDGE_FILTER_16 10, 64, 2 |
||
620 | |||
621 | HEVC_SAO_EDGE_FILTER_16 12, 32, 1 |
||
622 | HEVC_SAO_EDGE_FILTER_16 12, 48, 1 |
||
623 | HEVC_SAO_EDGE_FILTER_16 12, 64, 2 |
||
624 | %endif |
624 | %endif |