;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro
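
; DIFF_PIXELS_1 above loads pixels from each source and leaves the byte
; differences widened to signed words in %1, without needing a zero register:
; after the two unpacks each word of %1 is p1*257 and each word of %2 is
; p2 + p1*256, so the psubw leaves exactly p1 - p2.  A scalar C sketch of
; the result (names are illustrative, not part of FFmpeg's API):
;
;   static void diff_pixels_1(int16_t d[8],
;                             const uint8_t *pix1, const uint8_t *pix2)
;   {
;       for (int i = 0; i < 8; i++)
;           d[i] = pix1[i] - pix2[i];   // signed 16-bit difference
;   }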

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro
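
; What DIFF_PIXELS_8 leaves in m0..m7, sketched in scalar C.  Each register
; holds one row of word-sized differences; a row is mmsize/2 pixels wide
; (4 for MMX, 8 for SSE2) and starts at the static column offset %3.
; Names below are illustrative only:
;
;   static void diff_pixels_8(int16_t rows[8][8], const uint8_t *pix1,
;                             const uint8_t *pix2, int offset,
;                             int stride, int width /* mmsize/2 */)
;   {
;       for (int i = 0; i < 8; i++)
;           for (int j = 0; j < width; j++)
;               rows[i][j] = pix1[i * stride + offset + j]
;                          - pix2[i * stride + offset + j];
;   }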

%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro
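
; HADAMARD8 runs three levels of sum/difference butterflies across registers
; m0..m7, i.e. an unnormalized 8-point Walsh-Hadamard transform applied to
; every word lane (every column) at once.  The sign/order conventions of the
; intermediate stages do not matter for hadamard8_diff, since only the sum of
; absolute values of the result is used.  Rough per-column C sketch
; (illustrative only):
;
;   static void wht8(int16_t v[8])          // one column, in place
;   {
;       for (int len = 1; len < 8; len *= 2)
;           for (int i = 0; i < 8; i += 2 * len)
;               for (int j = i; j < i + len; j++) {
;                   int16_t a = v[j], b = v[j + len];
;                   v[j]       = a + b;
;                   v[j + len] = a - b;
;               }
;   }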

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that is very unlikely to occur in natural video,
; and it is even more unlikely that no alternative mvs/modes with lower cost exist.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
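
; HSUM horizontally adds the 4 (MMX) or 8 (SSE2) unsigned words of %1 into a
; GPR with saturating adds, hence the 64k clamp mentioned in the FIXME above.
; A hedged scalar sketch of the SSE2 path (tree reduction, names illustrative):
;
;   static unsigned hsum8_sat(uint16_t w[8])
;   {
;       for (int step = 4; step >= 1; step /= 2)      // movhlps/pshuflw tree
;           for (int i = 0; i < step; i++) {
;               unsigned s = (unsigned)w[i] + w[i + step];
;               w[i] = s > 0xFFFF ? 0xFFFF : s;       // paddusw saturation
;           }
;       return w[0];
;   }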

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro
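
; hadamard8_16_wrapper emits the two public entry points around the
; hadamard8x8_diff core: hadamard8_diff scores one 8x8 block, and
; hadamard8_diff16 scores a 16-wide area as two 8x8 blocks side by side,
; plus the lower two blocks when h == 16.  Control flow in C, roughly
; (hadamard8x8_diff stands for the asm core; other names are illustrative):
;
;   static int hadamard8_diff16(void *s, uint8_t *src1, uint8_t *src2,
;                               int stride, int h)
;   {
;       int sum = hadamard8x8_diff(s, src1,     src2,     stride, 8)
;               + hadamard8x8_diff(s, src1 + 8, src2 + 8, stride, 8);
;       if (h == 16) {
;           src1 += 8 * stride;
;           src2 += 8 * stride;
;           sum += hadamard8x8_diff(s, src1,     src2,     stride, 8)
;                + hadamard8x8_diff(s, src1 + 8, src2 + 8, stride, 8);
;       }
;       return sum;
;   }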

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                        m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this 2x2 times (and that's why we access
; rsp+gprsize everywhere, which is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro
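
; Both code paths above implement the same measure: the sum of absolute
; values of the 2-D 8x8 Hadamard transform of the pixel difference block,
; clamped to 16 bits (a SATD score).  Hedged C reference of the core, using
; the wht8 sketch given after HADAMARD8 above (names are illustrative; the
; asm accumulates with saturating adds, so scores near the 64k clamp can
; differ slightly):
;
;   static int hadamard8x8_diff_ref(const uint8_t *src1, const uint8_t *src2,
;                                   int stride)
;   {
;       int16_t d[8][8];
;       unsigned sum = 0;
;       for (int i = 0; i < 8; i++)
;           for (int j = 0; j < 8; j++)
;               d[i][j] = src1[i * stride + j] - src2[i * stride + j];
;       for (int j = 0; j < 8; j++) {                 // transform columns
;           int16_t col[8];
;           for (int i = 0; i < 8; i++) col[i] = d[i][j];
;           wht8(col);
;           for (int i = 0; i < 8; i++) d[i][j] = col[i];
;       }
;       for (int i = 0; i < 8; i++) wht8(d[i]);       // then rows
;       for (int i = 0; i < 8; i++)
;           for (int j = 0; j < 8; j++) {
;               int v = d[i][j];
;               sum += v < 0 ? -v : v;
;           }
;       return sum > 0xFFFF ? 0xFFFF : (int)sum;
;   }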

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16, 5, 5, 8
    shr      r4d, 1
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

    dec       r4
    jnz .next2lines

    mova      m1, m7
    psrldq    m7, 8          ; shift hi qword to lo
    paddd     m7, m1
    mova      m1, m7
    psrldq    m7, 4          ; shift hi dword to lo
    paddd     m7, m1
    movd     eax, m7         ; return value
    RET
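
; Scalar reference sketch of sse16: the sum of squared differences over h
; rows of 16 pixels.  The psubusb/por pair above yields |pix1 - pix2| per
; byte, and pmaddwd squares and pairwise-adds the widened words.  Local
; names below are illustrative:
;
;   static int sse16_ref(const uint8_t *pix1, const uint8_t *pix2,
;                        int line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }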

INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
    movsxdifnidn r2, r2d
    add          r0, 128
    mov          r3, -128
    pxor         m7, m7
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    lea          r1, [r1+r2*2]
    add          r3, 32
    js .loop
    REP_RET
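
; Scalar sketch of get_pixels: copy an 8x8 block of bytes into a 64-entry
; int16_t block, row-major (matches the prototype in the comment above;
; names are illustrative):
;
;   static void get_pixels_ref(int16_t *block, const uint8_t *pixels,
;                              int line_size)
;   {
;       for (int i = 0; i < 8; i++) {
;           for (int j = 0; j < 8; j++)
;               block[i * 8 + j] = pixels[j];
;           pixels += line_size;
;       }
;   }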

INIT_XMM sse2
cglobal get_pixels, 3, 4
    movsxdifnidn r2, r2d
    lea          r3, [r2*3]
    pxor         m4, m4
    movh         m0, [r1]
    movh         m1, [r1+r2]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    lea          r1, [r1+r2*4]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova       [r0], m0
    mova  [r0+0x10], m1
    mova  [r0+0x20], m2
    mova  [r0+0x30], m3
    movh         m0, [r1]
    movh         m1, [r1+r2*1]
    movh         m2, [r1+r2*2]
    movh         m3, [r1+r3]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova  [r0+0x40], m0
    mova  [r0+0x50], m1
    mova  [r0+0x60], m2
    mova  [r0+0x70], m3
    RET

INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
cglobal diff_pixels, 4,5
    movsxdifnidn r3, r3d
    pxor         m7, m7
    add          r0,  128
    mov          r4, -128
.loop:
    mova         m0, [r1]
    mova         m2, [r2]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    psubw        m0, m2
    psubw        m1, m3
    mova  [r0+r4+0], m0
    mova  [r0+r4+8], m1
    add          r1, r3
    add          r2, r3
    add          r4, 16
    jne .loop
    REP_RET
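
; Scalar sketch of diff_pixels: 8x8 byte difference of s1 and s2 into a
; 64-entry int16_t block (names are illustrative):
;
;   static void diff_pixels_ref(int16_t *block, const uint8_t *s1,
;                               const uint8_t *s2, int stride)
;   {
;       for (int i = 0; i < 8; i++) {
;           for (int j = 0; j < 8; j++)
;               block[i * 8 + j] = s1[j] - s2[j];
;           s1 += stride;
;           s2 += stride;
;       }
;   }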

INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
cglobal pix_sum16, 2, 3
    movsxdifnidn r1, r1d
    mov          r2, r1
    neg          r2
    shl          r2, 4
    sub          r0, r2
    pxor         m7, m7
    pxor         m6, m6
.loop:
    mova         m0, [r0+r2+0]
    mova         m1, [r0+r2+0]
    mova         m2, [r0+r2+8]
    mova         m3, [r0+r2+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m6, m3
    add          r2, r1
    js .loop
    mova         m5, m6
    psrlq        m6, 32
    paddw        m6, m5
    mova         m5, m6
    psrlq        m6, 16
    paddw        m6, m5
    movd        eax, m6
    and         eax, 0xffff
    RET
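
; Scalar sketch of pix_sum16: sum of all 256 pixels of a 16x16 block,
; returned in the low 16 bits (names are illustrative; the maximum possible
; sum, 256*255, still fits in 16 bits):
;
;   static int pix_sum16_ref(const uint8_t *pix, int line_size)
;   {
;       int sum = 0;
;       for (int i = 0; i < 16; i++) {
;           for (int j = 0; j < 16; j++)
;               sum += pix[j];
;           pix += line_size;
;       }
;       return sum & 0xffff;
;   }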

INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
cglobal pix_norm1, 2, 4
    movsxdifnidn r1, r1d
    mov          r2, 16
    pxor         m0, m0
    pxor         m7, m7
.loop:
    mova         m2, [r0+0]
    mova         m3, [r0+8]
    mova         m1, m2
    punpckhbw    m1, m0
    punpcklbw    m2, m0
    mova         m4, m3
    punpckhbw    m3, m0
    punpcklbw    m4, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m7, m2
    add          r0, r1
    paddd        m7, m4
    dec r2
    jne .loop
    mova         m1, m7
    psrlq        m7, 32
    paddd        m1, m7
    movd        eax, m1
    RET
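
; Scalar sketch of pix_norm1: sum of squared pixels over a 16x16 block
; (names are illustrative):
;
;   static int pix_norm1_ref(const uint8_t *pix, int line_size)
;   {
;       int sum = 0;
;       for (int i = 0; i < 16; i++) {
;           for (int j = 0; j < 16; j++)
;               sum += pix[j] * pix[j];
;           pix += line_size;
;       }
;       return sum;
;   }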