;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2  8867 ; W6 = 35468 =  8867<<2
%define W7sh2  4520 ; W7 = 18081 =  4520<<2 + 1
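
; The full-precision weights Wn are those of the "simple" IDCT; they are
; stored pre-shifted right by 2 (Wnsh2) so that a pair of weights fits in
; signed 16-bit words for pmaddwd. The "<<2 +/- n" remainders noted above
; are each weight's truncation error; the "a0:"/"b0:" comments inside
; IDCT_1D below track how those error terms accumulate per output.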

%if ARCH_X86_64

SECTION_RODATA

w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2:  times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6:  times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1:  times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7:  times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1:  times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5:  times 4 dw W7sh2, -W5sh2
pw_88:      times 8 dw 0x2008
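
; Each "times 4 dw A, B" table above is the word pattern {A,B,A,B,A,B,A,B};
; after SBUTTERFLY3 interleaves two rows as {x0,y0,x1,y1,...}, pmaddwd with
; one of these tables produces dwords A*x + B*y, i.e. one weighted
; coefficient pair of the 1-D IDCT per lane.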

cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019

section .text align=16

; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1   m%2, m%4, m%5
    punpckh%1   m%3, m%4, m%5
%endmacro
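
; e.g. SBUTTERFLY3 wd, 2, 3, 4, 5 with m4 = {a0..a7}, m5 = {b0..b7} (words)
; gives m2 = {a0,b0,a1,b1,a2,b2,a3,b3} and m3 = {a4,b4,a5,b5,a6,b6,a7,b7},
; leaving the source m4 intact (unlike x86util's plain SBUTTERFLY).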

; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd       %3,  %1,  %5       ; { a0 - b0 }[0-3]
    psubd       %4,  %2,  %6       ; { a0 - b0 }[4-7]
    paddd       %1,  %5            ; { a0 + b0 }[0-3]
    paddd       %2,  %6            ; { a0 + b0 }[4-7]
    psrad       %1,  %7
    psrad       %2,  %7
    psrad       %3,  %7
    psrad       %4,  %7
    packssdw    %1,  %2            ; row[0]
    packssdw    %3,  %4            ; row[7]
%endmacro
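
; e.g. in the row pass below, SUMSUB_SHPK m8, m9, m10, m11, m0, m1, 15
; leaves m8 = row[0] and m10 = row[7] as eight words each; packssdw
; saturates the shifted dwords to signed 16 bits rather than wrapping.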

; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
%macro IDCT_1D 2
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifidn %1, col
    paddw       m10,[pw_88]
%endif
%ifidn %1, row
    paddw       m10,[pw_1]
%endif
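    ; (A note on the two bias adds above, from the arithmetic alone: m10
    ; holds row[0], which each of a0..a3 multiplies by W4sh2 = 16384 = 1<<14.
    ; Row pass: +1 per word adds W4sh2 = 1 << (15 - 1), the rounding term in
    ; the C pseudocode. Col pass: +0x2008 adds 8*W4sh2 = 1<<17, rounding for
    ; the >>18 column shift, plus 0x2000*W4sh2 = 1<<27, i.e. a +512 bias on
    ; every output sample after the shift, presumably the 10-bit DC offset
    ; folded in.)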
    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w4_plus_w6]
    pmaddwd     m3,  m1, [w4_plus_w6]
    pmaddwd     m4,  m0, [w4_min_w6]
    pmaddwd     m5,  m1, [w4_min_w6]
    pmaddwd     m6,  m0, [w4_min_w2]
    pmaddwd     m7,  m1, [w4_min_w2]
    pmaddwd     m0, [w4_plus_w2]
    pmaddwd     m1, [w4_plus_w2]

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd     m10, m8, [w4_plus_w6]
    pmaddwd     m11, m9, [w4_plus_w6]
    paddd       m0,  m10           ; a0[0-3]
    paddd       m1,  m11           ; a0[4-7]
    pmaddwd     m10, m8, [w4_min_w6]
    pmaddwd     m11, m9, [w4_min_w6]
    paddd       m6,  m10           ; a3[0-3]
    paddd       m7,  m11           ; a3[4-7]
    pmaddwd     m10, m8, [w4_min_w2]
    pmaddwd     m11, m9, [w4_min_w2]
    pmaddwd     m8, [w4_plus_w2]
    pmaddwd     m9, [w4_plus_w2]
    psubd       m4,  m10           ; a2[0-3] intermediate
    psubd       m5,  m11           ; a2[4-7] intermediate
    psubd       m2,  m8            ; a1[0-3] intermediate
    psubd       m3,  m9            ; a1[4-7] intermediate

    ; load/store
    mova   [r2+  0], m0
    mova   [r2+ 32], m2
    mova   [r2+ 64], m4
    mova   [r2+ 96], m6
    mova        m10,[r2+ 16]       ; { row[1] }[0-7]
    mova        m8, [r2+ 48]       ; { row[3] }[0-7]
    mova        m13,[r2+ 80]       ; { row[5] }[0-7]
    mova        m14,[r2+112]       ; { row[7] }[0-7]
    mova   [r2+ 16], m1
    mova   [r2+ 48], m3
    mova   [r2+ 80], m5
    mova   [r2+112], m7
%ifidn %1, row
    pmullw      m10,[r3+ 16]
    pmullw      m8, [r3+ 48]
    pmullw      m13,[r3+ 80]
    pmullw      m14,[r3+112]
%endif
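    ; (a0..a3 were just parked in the block buffer at r2, overwriting rows
    ; already consumed, to free registers for the odd half; in the row pass
    ; the odd rows are dequantized on load with the qmat in r3, matching
    ; what idct_put_fn did for rows 0/2/4/6 before invoking this macro.)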

    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd     m2,  m0, [w3_min_w7]
    pmaddwd     m3,  m1, [w3_min_w7]
    pmaddwd     m4,  m0, [w5_min_w1]
    pmaddwd     m5,  m1, [w5_min_w1]
    pmaddwd     m6,  m0, [w7_min_w5]
    pmaddwd     m7,  m1, [w7_min_w5]
    pmaddwd     m0, [w1_plus_w3]
    pmaddwd     m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0,  W5, row[5]);
    ; MAC(b0,  W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2,  W7, row[5]);
    ; MAC(b2,  W3, row[7]);
    ; MAC(b3,  W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]

    pmaddwd     m10, m8, [w1_plus_w5]
    pmaddwd     m11, m9, [w1_plus_w5]
    pmaddwd     m12, m8, [w5_plus_w7]
    pmaddwd     m13, m9, [w5_plus_w7]
    psubd       m2,  m10           ; b1[0-3]
    psubd       m3,  m11           ; b1[4-7]
    paddd       m0,  m12           ; b0[0-3]
    paddd       m1,  m13           ; b0[4-7]
    pmaddwd     m12, m8, [w7_plus_w3]
    pmaddwd     m13, m9, [w7_plus_w3]
    pmaddwd     m8, [w3_min_w1]
    pmaddwd     m9, [w3_min_w1]
    paddd       m4,  m12           ; b2[0-3]
    paddd       m5,  m13           ; b2[4-7]
    paddd       m6,  m8            ; b3[0-3]
    paddd       m7,  m9            ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova        m8, [r2+ 0]        ; a0[0-3]
    mova        m9, [r2+16]        ; a0[4-7]
    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2
    mova        m0, [r2+32]        ; a1[0-3]
    mova        m1, [r2+48]        ; a1[4-7]
    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2
    mova        m1, [r2+64]        ; a2[0-3]
    mova        m2, [r2+80]        ; a2[4-7]
    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2
    mova        m2, [r2+96]        ; a3[0-3]
    mova        m3, [r2+112]       ; a3[4-7]
    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2
%endmacro
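
; IDCT_1D is instantiated twice by idct_put_fn below: a row pass (shift 15,
; +1 rounding bias) on the dequantized coefficients, then, after an 8x8
; word transpose, a column pass (shift 18, pw_88 bias) on the intermediates.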

; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
;                               int16_t *block, const int16_t *qmat);
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
    movsxd      r1,  r1d
    pxor        m15, m15           ; zero

    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova        m10,[r2+ 0]        ; { row[0] }[0-7]
    mova        m8, [r2+32]        ; { row[2] }[0-7]
    mova        m13,[r2+64]        ; { row[4] }[0-7]
    mova        m12,[r2+96]        ; { row[6] }[0-7]

    pmullw      m10,[r3+ 0]
    pmullw      m8, [r3+32]
    pmullw      m13,[r3+64]
    pmullw      m12,[r3+96]

    IDCT_1D     row, 15

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova   [r2+ 16], m0
    mova   [r2+ 48], m2
    mova   [r2+ 80], m11
    mova   [r2+112], m10
    SWAP         8,  10
    SWAP         1,   8
    SWAP         4,  13
    SWAP         9,  12

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D     col, 18

    ; clip/store
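    ; (output samples are clamped to [4, 1019] via pw_4/pw_1019 below; this
    ; clip range is the one difference from the plain "simple" IDCT that the
    ; file header mentions. 10-bit codes 0-3 and 1020-1023 are reserved in
    ; video interfaces, which is presumably why ProRes avoids them.)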
    mova        m3, [pw_4]
    mova        m5, [pw_1019]
    pmaxsw      m8,  m3
    pmaxsw      m0,  m3
    pmaxsw      m1,  m3
    pmaxsw      m2,  m3
    pmaxsw      m4,  m3
    pmaxsw      m11, m3
    pmaxsw      m9,  m3
    pmaxsw      m10, m3
    pminsw      m8,  m5
    pminsw      m0,  m5
    pminsw      m1,  m5
    pminsw      m2,  m5
    pminsw      m4,  m5
    pminsw      m11, m5
    pminsw      m9,  m5
    pminsw      m10, m5

    lea         r2, [r1*3]
    mova  [r0     ], m8
    mova  [r0+r1  ], m0
    mova  [r0+r1*2], m1
    mova  [r0+r2  ], m2
    lea         r0, [r0+r1*4]
    mova  [r0     ], m4
    mova  [r0+r1  ], m11
    mova  [r0+r1*2], m9
    mova  [r0+r2  ], m10
    RET
%endmacro

%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
    movhlps     %2,  %1
    pmovsxwd    %1,  %1
    pmovsxwd    %2,  %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
    pxor        %3,  %3
    pcmpgtw     %3,  %1
    mova        %2,  %1
    punpcklwd   %1,  %3
    punpckhwd   %2,  %3
%endif
%endmacro
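
; e.g. with %1 = {w0..w7} (eight words), SIGNEXTEND leaves %1 = {(int32)w0..w3}
; and %2 = {(int32)w4..w7}: the SSE4 path uses pmovsxwd directly, while the
; SSE2 path builds a sign mask with pcmpgtw and unpacks it in. (The macro
; appears unused in this file.)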

INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif
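
; Each INIT_XMM <cpu> + "idct_put_fn 16" pair above emits one CPU-specific
; variant of prores_idct_put_10 (the 16 is the XMM register count passed to
; cglobal); on the non-AVX targets, x86inc's emulation rewrites the
; 3-operand forms used here, e.g. "pmaddwd m2, m0, [w4_plus_w6]", into a
; mova plus the 2-operand instruction.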

%endif