Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
/*
2
 * This file is part of the Independent JPEG Group's software.
3
 *
4
 * The authors make NO WARRANTY or representation, either express or implied,
5
 * with respect to this software, its quality, accuracy, merchantability, or
6
 * fitness for a particular purpose.  This software is provided "AS IS", and
7
 * you, its user, assume the entire risk as to its quality and accuracy.
8
 *
9
 * This software is copyright (C) 1991, 1992, Thomas G. Lane.
10
 * All Rights Reserved except as specified below.
11
 *
12
 * Permission is hereby granted to use, copy, modify, and distribute this
13
 * software (or portions thereof) for any purpose, without fee, subject to
14
 * these conditions:
15
 * (1) If any part of the source code for this software is distributed, then
16
 * this README file must be included, with this copyright and no-warranty
17
 * notice unaltered; and any additions, deletions, or changes to the original
18
 * files must be clearly indicated in accompanying documentation.
19
 * (2) If only executable code is distributed, then the accompanying
20
 * documentation must state that "this software is based in part on the work
21
 * of the Independent JPEG Group".
22
 * (3) Permission for use of this software is granted only if the user accepts
23
 * full responsibility for any undesirable consequences; the authors accept
24
 * NO LIABILITY for damages of any kind.
25
 *
26
 * These conditions apply to any software derived from or based on the IJG
27
 * code, not just to the unmodified library.  If you use our work, you ought
28
 * to acknowledge us.
29
 *
30
 * Permission is NOT granted for the use of any IJG author's name or company
31
 * name in advertising or publicity relating to this software or products
32
 * derived from it.  This software may be referred to only as "the Independent
33
 * JPEG Group's software".
34
 *
35
 * We specifically permit and encourage the use of this software as the basis
36
 * of commercial products, provided that all warranty or liability claims are
37
 * assumed by the product vendor.
38
 *
39
 * This file contains the basic inverse-DCT transformation subroutine.
40
 *
41
 * This implementation is based on an algorithm described in
42
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
43
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
44
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
45
 * The primary algorithm described there uses 11 multiplies and 29 adds.
46
 * We use their alternate method with 12 multiplies and 32 adds.
47
 * The advantage of this method is that no data path contains more than one
48
 * multiplication; this allows a very simple and accurate implementation in
49
 * scaled fixed-point arithmetic, with a minimal number of shifts.
50
 *
51
 * I've made lots of modifications to attempt to take advantage of the
52
 * sparse nature of the DCT matrices we're getting.  Although the logic
53
 * is cumbersome, it's straightforward and the resulting code is much
54
 * faster.
55
 *
56
 * A better way to do this would be to pass in the DCT block as a sparse
57
 * matrix, perhaps with the different cases encoded.
58
 */
59
 
60
/**
61
 * @file
62
 * Independent JPEG Group's LLM idct.
63
 */
64
 
65
#include "libavutil/common.h"
66
#include "dct.h"
67
 
68
/* Build for 8-bit samples: this selects PASS1_BITS == 2 and allows the
 * optional 16-bit MULTIPLY variants (SHORTxSHORT_32 / SHORTxLCONST_32). */
#define EIGHT_BIT_SAMPLES
69
 
70
/* Edge length of one DCT block; this file only supports the value 8. */
#define DCTSIZE 8
71
/* Number of coefficients in one block (DCTSIZE * DCTSIZE). */
#define DCTSIZE2 64
72
 
73
#define GLOBAL
74
 
75
/* Shift x right by n bits.  DESCALE assumes this rounds toward minus
 * infinity for negative x (an arithmetic shift); C leaves >> on negative
 * signed values implementation-defined. */
#define RIGHT_SHIFT(x, n) ((x) >> (n))
76
 
77
/* One block of DCTSIZE2 16-bit coefficients, laid out as DCTSIZE rows of
 * DCTSIZE values; ff_j_rev_dct() reads and overwrites it in place. */
typedef int16_t DCTBLOCK[DCTSIZE2];
78
 
79
/* Fractional bits kept in the fixed-point multiplier constants
 * (CONST_SCALE is 1 << CONST_BITS). */
#define CONST_BITS 13
80
 
81
/*
82
 * This routine is specialized to the case DCTSIZE = 8.
83
 */
84
 
85
#if DCTSIZE != 8
86
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
87
#endif
88
 
89
 
90
/*
91
 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
92
 * on each column.  Direct algorithms are also available, but they are
93
 * much more complex and seem not to be any faster when reduced to code.
94
 *
95
 * The poop on this scaling stuff is as follows:
96
 *
97
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
98
 * larger than the true IDCT outputs.  The final outputs are therefore
99
 * a factor of N larger than desired; since N=8 this can be cured by
100
 * a simple right shift at the end of the algorithm.  The advantage of
101
 * this arrangement is that we save two multiplications per 1-D IDCT,
102
 * because the y0 and y4 inputs need not be divided by sqrt(N).
103
 *
104
 * We have to do addition and subtraction of the integer inputs, which
105
 * is no problem, and multiplication by fractional constants, which is
106
 * a problem to do in integer arithmetic.  We multiply all the constants
107
 * by CONST_SCALE and convert them to integer constants (thus retaining
108
 * CONST_BITS bits of precision in the constants).  After doing a
109
 * multiplication we have to divide the product by CONST_SCALE, with proper
110
 * rounding, to produce the correct output.  This division can be done
111
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
112
 * as long as possible so that partial sums can be added together with
113
 * full fractional precision.
114
 *
115
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
116
 * they are represented to better-than-integral precision.  These outputs
117
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
118
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
119
 * intermediate int32 array would be needed.)
120
 *
121
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
122
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
123
 * shows that the values given below are the most effective.
124
 */
125
 
126
#ifdef EIGHT_BIT_SAMPLES
127
#define PASS1_BITS  2
128
#else
129
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
130
#endif
131
 
132
/* The constant 1 typed as int32_t; used to build CONST_SCALE and the
 * rounding term in DESCALE. */
#define ONE         ((int32_t) 1)
133
 
134
/* Scale factor applied to real constants by FIX(): 2**CONST_BITS. */
#define CONST_SCALE (ONE << CONST_BITS)
135
 
136
/* Convert a positive real constant to an integer scaled by CONST_SCALE.
137
 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
138
 * you will pay a significant penalty in run time.  In that case, figure
139
 * the correct integer constant values and insert them by hand.
140
 */
141
 
142
/* Actually FIX is no longer used, we precomputed them all */
143
#define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5))
144
 
145
/* Descale and correctly round an int32_t value that's scaled by N bits.
146
 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
147
 * the fudge factor is correct for either sign of X.
148
 */
149
 
150
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
151
 
152
/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
153
 * For 8-bit samples with the recommended scaling, all the variable
154
 * and constant values involved are no more than 16 bits wide, so a
155
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
156
 * this provides a useful speedup on many machines.
157
 * There is no way to specify a 16x16->32 multiply in portable C, but
158
 * some C compilers will do the right thing if you provide the correct
159
 * combination of casts.
160
 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
161
 */
162
 
163
#ifdef EIGHT_BIT_SAMPLES
164
#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
165
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const)))
166
#endif
167
#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
168
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const)))
169
#endif
170
#endif
171
 
172
#ifndef MULTIPLY                /* default definition */
173
#define MULTIPLY(var,const)  ((var) * (const))
174
#endif
175
 
176
 
177
/*
178
  Unlike our decoder where we approximate the FIXes, we need to use exact
179
ones here or successive P-frames will drift too much with Reference frame coding
180
*/
181
#define FIX_0_211164243 1730
182
#define FIX_0_275899380 2260
183
#define FIX_0_298631336 2446
184
#define FIX_0_390180644 3196
185
#define FIX_0_509795579 4176
186
#define FIX_0_541196100 4433
187
#define FIX_0_601344887 4926
188
#define FIX_0_765366865 6270
189
#define FIX_0_785694958 6436
190
#define FIX_0_899976223 7373
191
#define FIX_1_061594337 8697
192
#define FIX_1_111140466 9102
193
#define FIX_1_175875602 9633
194
#define FIX_1_306562965 10703
195
#define FIX_1_387039845 11363
196
#define FIX_1_451774981 11893
197
#define FIX_1_501321110 12299
198
#define FIX_1_662939225 13623
199
#define FIX_1_847759065 15137
200
#define FIX_1_961570560 16069
201
#define FIX_2_053119869 16819
202
#define FIX_2_172734803 17799
203
#define FIX_2_562915447 20995
204
#define FIX_3_072711026 25172
205
 
206
/*
207
 * Perform the inverse DCT on one block of coefficients.
208
 */
209
 
210
void ff_j_rev_dct(DCTBLOCK data)
211
{
212
  int32_t tmp0, tmp1, tmp2, tmp3;
213
  int32_t tmp10, tmp11, tmp12, tmp13;
214
  int32_t z1, z2, z3, z4, z5;
215
  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
216
  register int16_t *dataptr;
217
  int rowctr;
218
 
219
  /* Pass 1: process rows. */
220
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
221
  /* furthermore, we scale the results by 2**PASS1_BITS. */
222
 
223
  dataptr = data;
224
 
225
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
226
    /* Due to quantization, we will usually find that many of the input
227
     * coefficients are zero, especially the AC terms.  We can exploit this
228
     * by short-circuiting the IDCT calculation for any row in which all
229
     * the AC terms are zero.  In that case each output is equal to the
230
     * DC coefficient (with scale factor as needed).
231
     * With typical images and quantization tables, half or more of the
232
     * row DCT calculations can be simplified this way.
233
     */
234
 
235
    register int *idataptr = (int*)dataptr;
236
 
237
    /* WARNING: we do the same permutation as MMX idct to simplify the
238
       video core */
239
    d0 = dataptr[0];
240
    d2 = dataptr[1];
241
    d4 = dataptr[2];
242
    d6 = dataptr[3];
243
    d1 = dataptr[4];
244
    d3 = dataptr[5];
245
    d5 = dataptr[6];
246
    d7 = dataptr[7];
247
 
248
    if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
249
      /* AC terms all zero */
250
      if (d0) {
251
          /* Compute a 32 bit value to assign. */
252
          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
253
          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
254
 
255
          idataptr[0] = v;
256
          idataptr[1] = v;
257
          idataptr[2] = v;
258
          idataptr[3] = v;
259
      }
260
 
261
      dataptr += DCTSIZE;       /* advance pointer to next row */
262
      continue;
263
    }
264
 
265
    /* Even part: reverse the even part of the forward DCT. */
266
    /* The rotator is sqrt(2)*c(-6). */
267
{
268
    if (d6) {
269
            if (d2) {
270
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
271
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
272
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
273
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
274
 
275
                    tmp0 = (d0 + d4) << CONST_BITS;
276
                    tmp1 = (d0 - d4) << CONST_BITS;
277
 
278
                    tmp10 = tmp0 + tmp3;
279
                    tmp13 = tmp0 - tmp3;
280
                    tmp11 = tmp1 + tmp2;
281
                    tmp12 = tmp1 - tmp2;
282
            } else {
283
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
284
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
285
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
286
 
287
                    tmp0 = (d0 + d4) << CONST_BITS;
288
                    tmp1 = (d0 - d4) << CONST_BITS;
289
 
290
                    tmp10 = tmp0 + tmp3;
291
                    tmp13 = tmp0 - tmp3;
292
                    tmp11 = tmp1 + tmp2;
293
                    tmp12 = tmp1 - tmp2;
294
            }
295
    } else {
296
            if (d2) {
297
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
298
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
299
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
300
 
301
                    tmp0 = (d0 + d4) << CONST_BITS;
302
                    tmp1 = (d0 - d4) << CONST_BITS;
303
 
304
                    tmp10 = tmp0 + tmp3;
305
                    tmp13 = tmp0 - tmp3;
306
                    tmp11 = tmp1 + tmp2;
307
                    tmp12 = tmp1 - tmp2;
308
            } else {
309
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
310
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
311
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
312
            }
313
      }
314
 
315
    /* Odd part per figure 8; the matrix is unitary and hence its
316
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
317
     */
318
 
319
    if (d7) {
320
        if (d5) {
321
            if (d3) {
322
                if (d1) {
323
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
324
                    z1 = d7 + d1;
325
                    z2 = d5 + d3;
326
                    z3 = d7 + d3;
327
                    z4 = d5 + d1;
328
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
329
 
330
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
331
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
332
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
333
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
334
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
335
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
336
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
337
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
338
 
339
                    z3 += z5;
340
                    z4 += z5;
341
 
342
                    tmp0 += z1 + z3;
343
                    tmp1 += z2 + z4;
344
                    tmp2 += z2 + z3;
345
                    tmp3 += z1 + z4;
346
                } else {
347
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
348
                    z2 = d5 + d3;
349
                    z3 = d7 + d3;
350
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
351
 
352
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
353
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
354
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
355
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
356
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
357
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
358
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
359
 
360
                    z3 += z5;
361
                    z4 += z5;
362
 
363
                    tmp0 += z1 + z3;
364
                    tmp1 += z2 + z4;
365
                    tmp2 += z2 + z3;
366
                    tmp3 = z1 + z4;
367
                }
368
            } else {
369
                if (d1) {
370
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
371
                    z1 = d7 + d1;
372
                    z4 = d5 + d1;
373
                    z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
374
 
375
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
376
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
377
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
378
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
379
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
380
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
381
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
382
 
383
                    z3 += z5;
384
                    z4 += z5;
385
 
386
                    tmp0 += z1 + z3;
387
                    tmp1 += z2 + z4;
388
                    tmp2 = z2 + z3;
389
                    tmp3 += z1 + z4;
390
                } else {
391
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
392
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
393
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
394
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
395
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
396
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
397
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
398
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
399
 
400
                    z3 += z5;
401
                    z4 += z5;
402
 
403
                    tmp0 += z3;
404
                    tmp1 += z4;
405
                    tmp2 = z2 + z3;
406
                    tmp3 = z1 + z4;
407
                }
408
            }
409
        } else {
410
            if (d3) {
411
                if (d1) {
412
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
413
                    z1 = d7 + d1;
414
                    z3 = d7 + d3;
415
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
416
 
417
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
418
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
419
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
420
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
421
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
422
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
423
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
424
 
425
                    z3 += z5;
426
                    z4 += z5;
427
 
428
                    tmp0 += z1 + z3;
429
                    tmp1 = z2 + z4;
430
                    tmp2 += z2 + z3;
431
                    tmp3 += z1 + z4;
432
                } else {
433
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
434
                    z3 = d7 + d3;
435
 
436
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
437
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
438
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
439
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
440
                    z5 = MULTIPLY(z3, FIX_1_175875602);
441
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
442
 
443
                    tmp0 += z3;
444
                    tmp1 = z2 + z5;
445
                    tmp2 += z3;
446
                    tmp3 = z1 + z5;
447
                }
448
            } else {
449
                if (d1) {
450
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
451
                    z1 = d7 + d1;
452
                    z5 = MULTIPLY(z1, FIX_1_175875602);
453
 
454
                    z1 = MULTIPLY(z1, FIX_0_275899380);
455
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
456
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
457
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
458
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
459
 
460
                    tmp0 += z1;
461
                    tmp1 = z4 + z5;
462
                    tmp2 = z3 + z5;
463
                    tmp3 += z1;
464
                } else {
465
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
466
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
467
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
468
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
469
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
470
                }
471
            }
472
        }
473
    } else {
474
        if (d5) {
475
            if (d3) {
476
                if (d1) {
477
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
478
                    z2 = d5 + d3;
479
                    z4 = d5 + d1;
480
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
481
 
482
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
483
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
484
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
485
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
486
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
487
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
488
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
489
 
490
                    z3 += z5;
491
                    z4 += z5;
492
 
493
                    tmp0 = z1 + z3;
494
                    tmp1 += z2 + z4;
495
                    tmp2 += z2 + z3;
496
                    tmp3 += z1 + z4;
497
                } else {
498
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
499
                    z2 = d5 + d3;
500
 
501
                    z5 = MULTIPLY(z2, FIX_1_175875602);
502
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
503
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
504
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
505
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
506
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
507
 
508
                    tmp0 = z3 + z5;
509
                    tmp1 += z2;
510
                    tmp2 += z2;
511
                    tmp3 = z4 + z5;
512
                }
513
            } else {
514
                if (d1) {
515
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
516
                    z4 = d5 + d1;
517
 
518
                    z5 = MULTIPLY(z4, FIX_1_175875602);
519
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
520
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
521
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
522
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
523
                    z4 = MULTIPLY(z4, FIX_0_785694958);
524
 
525
                    tmp0 = z1 + z5;
526
                    tmp1 += z4;
527
                    tmp2 = z2 + z5;
528
                    tmp3 += z4;
529
                } else {
530
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
531
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
532
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
533
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
534
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
535
                }
536
            }
537
        } else {
538
            if (d3) {
539
                if (d1) {
540
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
541
                    z5 = d1 + d3;
542
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
543
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
544
                    z1 = MULTIPLY(d1, FIX_1_061594337);
545
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
546
                    z4 = MULTIPLY(z5, FIX_0_785694958);
547
                    z5 = MULTIPLY(z5, FIX_1_175875602);
548
 
549
                    tmp0 = z1 - z4;
550
                    tmp1 = z2 + z4;
551
                    tmp2 += z5;
552
                    tmp3 += z5;
553
                } else {
554
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
555
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
556
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
557
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
558
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
559
                }
560
            } else {
561
                if (d1) {
562
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
563
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
564
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
565
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
566
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
567
                } else {
568
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
569
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
570
                }
571
            }
572
        }
573
    }
574
}
575
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
576
 
577
    dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
578
    dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
579
    dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
580
    dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
581
    dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
582
    dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
583
    dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
584
    dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
585
 
586
    dataptr += DCTSIZE;         /* advance pointer to next row */
587
  }
588
 
589
  /* Pass 2: process columns. */
590
  /* Note that we must descale the results by a factor of 8 == 2**3, */
591
  /* and also undo the PASS1_BITS scaling. */
592
 
593
  dataptr = data;
594
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
595
    /* Columns of zeroes can be exploited in the same way as we did with rows.
596
     * However, the row calculation has created many nonzero AC terms, so the
597
     * simplification applies less often (typically 5% to 10% of the time).
598
     * On machines with very fast multiplication, it's possible that the
599
     * test takes more time than it's worth.  In that case this section
600
     * may be commented out.
601
     */
602
 
603
    d0 = dataptr[DCTSIZE*0];
604
    d1 = dataptr[DCTSIZE*1];
605
    d2 = dataptr[DCTSIZE*2];
606
    d3 = dataptr[DCTSIZE*3];
607
    d4 = dataptr[DCTSIZE*4];
608
    d5 = dataptr[DCTSIZE*5];
609
    d6 = dataptr[DCTSIZE*6];
610
    d7 = dataptr[DCTSIZE*7];
611
 
612
    /* Even part: reverse the even part of the forward DCT. */
613
    /* The rotator is sqrt(2)*c(-6). */
614
    if (d6) {
615
            if (d2) {
616
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
617
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
618
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
619
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
620
 
621
                    tmp0 = (d0 + d4) << CONST_BITS;
622
                    tmp1 = (d0 - d4) << CONST_BITS;
623
 
624
                    tmp10 = tmp0 + tmp3;
625
                    tmp13 = tmp0 - tmp3;
626
                    tmp11 = tmp1 + tmp2;
627
                    tmp12 = tmp1 - tmp2;
628
            } else {
629
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
630
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
631
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
632
 
633
                    tmp0 = (d0 + d4) << CONST_BITS;
634
                    tmp1 = (d0 - d4) << CONST_BITS;
635
 
636
                    tmp10 = tmp0 + tmp3;
637
                    tmp13 = tmp0 - tmp3;
638
                    tmp11 = tmp1 + tmp2;
639
                    tmp12 = tmp1 - tmp2;
640
            }
641
    } else {
642
            if (d2) {
643
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
644
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
645
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
646
 
647
                    tmp0 = (d0 + d4) << CONST_BITS;
648
                    tmp1 = (d0 - d4) << CONST_BITS;
649
 
650
                    tmp10 = tmp0 + tmp3;
651
                    tmp13 = tmp0 - tmp3;
652
                    tmp11 = tmp1 + tmp2;
653
                    tmp12 = tmp1 - tmp2;
654
            } else {
655
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
656
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
657
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
658
            }
659
    }
660
 
661
    /* Odd part per figure 8; the matrix is unitary and hence its
662
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
663
     */
664
    if (d7) {
665
        if (d5) {
666
            if (d3) {
667
                if (d1) {
668
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
669
                    z1 = d7 + d1;
670
                    z2 = d5 + d3;
671
                    z3 = d7 + d3;
672
                    z4 = d5 + d1;
673
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
674
 
675
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
676
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
677
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
678
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
679
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
680
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
681
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
682
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
683
 
684
                    z3 += z5;
685
                    z4 += z5;
686
 
687
                    tmp0 += z1 + z3;
688
                    tmp1 += z2 + z4;
689
                    tmp2 += z2 + z3;
690
                    tmp3 += z1 + z4;
691
                } else {
692
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
693
                    z2 = d5 + d3;
694
                    z3 = d7 + d3;
695
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
696
 
697
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
698
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
699
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
700
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
701
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
702
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
703
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
704
 
705
                    z3 += z5;
706
                    z4 += z5;
707
 
708
                    tmp0 += z1 + z3;
709
                    tmp1 += z2 + z4;
710
                    tmp2 += z2 + z3;
711
                    tmp3 = z1 + z4;
712
                }
713
            } else {
714
                if (d1) {
715
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
716
                    z1 = d7 + d1;
717
                    z3 = d7;
718
                    z4 = d5 + d1;
719
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
720
 
721
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
722
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
723
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
724
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
725
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
726
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
727
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
728
 
729
                    z3 += z5;
730
                    z4 += z5;
731
 
732
                    tmp0 += z1 + z3;
733
                    tmp1 += z2 + z4;
734
                    tmp2 = z2 + z3;
735
                    tmp3 += z1 + z4;
736
                } else {
737
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
738
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
739
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
740
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
741
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
742
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
743
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
744
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
745
 
746
                    z3 += z5;
747
                    z4 += z5;
748
 
749
                    tmp0 += z3;
750
                    tmp1 += z4;
751
                    tmp2 = z2 + z3;
752
                    tmp3 = z1 + z4;
753
                }
754
            }
755
        } else {
756
            if (d3) {
757
                if (d1) {
758
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
759
                    z1 = d7 + d1;
760
                    z3 = d7 + d3;
761
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
762
 
763
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
764
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
765
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
766
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
767
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
768
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
769
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
770
 
771
                    z3 += z5;
772
                    z4 += z5;
773
 
774
                    tmp0 += z1 + z3;
775
                    tmp1 = z2 + z4;
776
                    tmp2 += z2 + z3;
777
                    tmp3 += z1 + z4;
778
                } else {
779
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
780
                    z3 = d7 + d3;
781
 
782
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
783
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
784
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
785
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
786
                    z5 = MULTIPLY(z3, FIX_1_175875602);
787
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
788
 
789
                    tmp0 += z3;
790
                    tmp1 = z2 + z5;
791
                    tmp2 += z3;
792
                    tmp3 = z1 + z5;
793
                }
794
            } else {
795
                if (d1) {
796
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
797
                    z1 = d7 + d1;
798
                    z5 = MULTIPLY(z1, FIX_1_175875602);
799
 
800
                    z1 = MULTIPLY(z1, FIX_0_275899380);
801
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
802
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
803
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
804
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
805
 
806
                    tmp0 += z1;
807
                    tmp1 = z4 + z5;
808
                    tmp2 = z3 + z5;
809
                    tmp3 += z1;
810
                } else {
811
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
812
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
813
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
814
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
815
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
816
                }
817
            }
818
        }
819
    } else {
820
        if (d5) {
821
            if (d3) {
822
                if (d1) {
823
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
824
                    z2 = d5 + d3;
825
                    z4 = d5 + d1;
826
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
827
 
828
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
829
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
830
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
831
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
832
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
833
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
834
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
835
 
836
                    z3 += z5;
837
                    z4 += z5;
838
 
839
                    tmp0 = z1 + z3;
840
                    tmp1 += z2 + z4;
841
                    tmp2 += z2 + z3;
842
                    tmp3 += z1 + z4;
843
                } else {
844
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
845
                    z2 = d5 + d3;
846
 
847
                    z5 = MULTIPLY(z2, FIX_1_175875602);
848
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
849
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
850
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
851
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
852
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
853
 
854
                    tmp0 = z3 + z5;
855
                    tmp1 += z2;
856
                    tmp2 += z2;
857
                    tmp3 = z4 + z5;
858
                }
859
            } else {
860
                if (d1) {
861
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
862
                    z4 = d5 + d1;
863
 
864
                    z5 = MULTIPLY(z4, FIX_1_175875602);
865
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
866
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
867
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
868
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
869
                    z4 = MULTIPLY(z4, FIX_0_785694958);
870
 
871
                    tmp0 = z1 + z5;
872
                    tmp1 += z4;
873
                    tmp2 = z2 + z5;
874
                    tmp3 += z4;
875
                } else {
876
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
877
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
878
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
879
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
880
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
881
                }
882
            }
883
        } else {
884
            if (d3) {
885
                if (d1) {
886
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
887
                    z5 = d1 + d3;
888
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
889
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
890
                    z1 = MULTIPLY(d1, FIX_1_061594337);
891
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
892
                    z4 = MULTIPLY(z5, FIX_0_785694958);
893
                    z5 = MULTIPLY(z5, FIX_1_175875602);
894
 
895
                    tmp0 = z1 - z4;
896
                    tmp1 = z2 + z4;
897
                    tmp2 += z5;
898
                    tmp3 += z5;
899
                } else {
900
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
901
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
902
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
903
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
904
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
905
                }
906
            } else {
907
                if (d1) {
908
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
909
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
910
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
911
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
912
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
913
                } else {
914
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
915
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
916
                }
917
            }
918
        }
919
    }
920
 
921
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
922
 
923
    dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
924
                                           CONST_BITS+PASS1_BITS+3);
925
    dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
926
                                           CONST_BITS+PASS1_BITS+3);
927
    dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
928
                                           CONST_BITS+PASS1_BITS+3);
929
    dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
930
                                           CONST_BITS+PASS1_BITS+3);
931
    dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
932
                                           CONST_BITS+PASS1_BITS+3);
933
    dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
934
                                           CONST_BITS+PASS1_BITS+3);
935
    dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
936
                                           CONST_BITS+PASS1_BITS+3);
937
    dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
938
                                           CONST_BITS+PASS1_BITS+3);
939
 
940
    dataptr++;                  /* advance pointer to next column */
941
  }
942
}
943
 
944
#undef DCTSIZE
945
#define DCTSIZE 4
946
#define DCTSTRIDE 8
947
 
948
void ff_j_rev_dct4(DCTBLOCK data)
949
{
950
  int32_t tmp0, tmp1, tmp2, tmp3;
951
  int32_t tmp10, tmp11, tmp12, tmp13;
952
  int32_t z1;
953
  int32_t d0, d2, d4, d6;
954
  register int16_t *dataptr;
955
  int rowctr;
956
 
957
  /* Pass 1: process rows. */
958
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
959
  /* furthermore, we scale the results by 2**PASS1_BITS. */
960
 
961
  data[0] += 4;
962
 
963
  dataptr = data;
964
 
965
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
966
    /* Due to quantization, we will usually find that many of the input
967
     * coefficients are zero, especially the AC terms.  We can exploit this
968
     * by short-circuiting the IDCT calculation for any row in which all
969
     * the AC terms are zero.  In that case each output is equal to the
970
     * DC coefficient (with scale factor as needed).
971
     * With typical images and quantization tables, half or more of the
972
     * row DCT calculations can be simplified this way.
973
     */
974
 
975
    register int *idataptr = (int*)dataptr;
976
 
977
    d0 = dataptr[0];
978
    d2 = dataptr[1];
979
    d4 = dataptr[2];
980
    d6 = dataptr[3];
981
 
982
    if ((d2 | d4 | d6) == 0) {
983
      /* AC terms all zero */
984
      if (d0) {
985
          /* Compute a 32 bit value to assign. */
986
          int16_t dcval = (int16_t) (d0 << PASS1_BITS);
987
          register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
988
 
989
          idataptr[0] = v;
990
          idataptr[1] = v;
991
      }
992
 
993
      dataptr += DCTSTRIDE;     /* advance pointer to next row */
994
      continue;
995
    }
996
 
997
    /* Even part: reverse the even part of the forward DCT. */
998
    /* The rotator is sqrt(2)*c(-6). */
999
    if (d6) {
1000
            if (d2) {
1001
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1002
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1003
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1004
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1005
 
1006
                    tmp0 = (d0 + d4) << CONST_BITS;
1007
                    tmp1 = (d0 - d4) << CONST_BITS;
1008
 
1009
                    tmp10 = tmp0 + tmp3;
1010
                    tmp13 = tmp0 - tmp3;
1011
                    tmp11 = tmp1 + tmp2;
1012
                    tmp12 = tmp1 - tmp2;
1013
            } else {
1014
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1015
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1016
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
1017
 
1018
                    tmp0 = (d0 + d4) << CONST_BITS;
1019
                    tmp1 = (d0 - d4) << CONST_BITS;
1020
 
1021
                    tmp10 = tmp0 + tmp3;
1022
                    tmp13 = tmp0 - tmp3;
1023
                    tmp11 = tmp1 + tmp2;
1024
                    tmp12 = tmp1 - tmp2;
1025
            }
1026
    } else {
1027
            if (d2) {
1028
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1029
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1030
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1031
 
1032
                    tmp0 = (d0 + d4) << CONST_BITS;
1033
                    tmp1 = (d0 - d4) << CONST_BITS;
1034
 
1035
                    tmp10 = tmp0 + tmp3;
1036
                    tmp13 = tmp0 - tmp3;
1037
                    tmp11 = tmp1 + tmp2;
1038
                    tmp12 = tmp1 - tmp2;
1039
            } else {
1040
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1041
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1042
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1043
            }
1044
      }
1045
 
1046
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1047
 
1048
    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1049
    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1050
    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1051
    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1052
 
1053
    dataptr += DCTSTRIDE;       /* advance pointer to next row */
1054
  }
1055
 
1056
  /* Pass 2: process columns. */
1057
  /* Note that we must descale the results by a factor of 8 == 2**3, */
1058
  /* and also undo the PASS1_BITS scaling. */
1059
 
1060
  dataptr = data;
1061
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1062
    /* Columns of zeroes can be exploited in the same way as we did with rows.
1063
     * However, the row calculation has created many nonzero AC terms, so the
1064
     * simplification applies less often (typically 5% to 10% of the time).
1065
     * On machines with very fast multiplication, it's possible that the
1066
     * test takes more time than it's worth.  In that case this section
1067
     * may be commented out.
1068
     */
1069
 
1070
    d0 = dataptr[DCTSTRIDE*0];
1071
    d2 = dataptr[DCTSTRIDE*1];
1072
    d4 = dataptr[DCTSTRIDE*2];
1073
    d6 = dataptr[DCTSTRIDE*3];
1074
 
1075
    /* Even part: reverse the even part of the forward DCT. */
1076
    /* The rotator is sqrt(2)*c(-6). */
1077
    if (d6) {
1078
            if (d2) {
1079
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1080
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1081
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1082
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1083
 
1084
                    tmp0 = (d0 + d4) << CONST_BITS;
1085
                    tmp1 = (d0 - d4) << CONST_BITS;
1086
 
1087
                    tmp10 = tmp0 + tmp3;
1088
                    tmp13 = tmp0 - tmp3;
1089
                    tmp11 = tmp1 + tmp2;
1090
                    tmp12 = tmp1 - tmp2;
1091
            } else {
1092
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1093
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1094
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
1095
 
1096
                    tmp0 = (d0 + d4) << CONST_BITS;
1097
                    tmp1 = (d0 - d4) << CONST_BITS;
1098
 
1099
                    tmp10 = tmp0 + tmp3;
1100
                    tmp13 = tmp0 - tmp3;
1101
                    tmp11 = tmp1 + tmp2;
1102
                    tmp12 = tmp1 - tmp2;
1103
            }
1104
    } else {
1105
            if (d2) {
1106
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1107
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1108
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1109
 
1110
                    tmp0 = (d0 + d4) << CONST_BITS;
1111
                    tmp1 = (d0 - d4) << CONST_BITS;
1112
 
1113
                    tmp10 = tmp0 + tmp3;
1114
                    tmp13 = tmp0 - tmp3;
1115
                    tmp11 = tmp1 + tmp2;
1116
                    tmp12 = tmp1 - tmp2;
1117
            } else {
1118
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1119
                    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1120
                    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1121
            }
1122
    }
1123
 
1124
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1125
 
1126
    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1127
    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1128
    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1129
    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1130
 
1131
    dataptr++;                  /* advance pointer to next column */
1132
  }
1133
}
1134
 
1135
/*
 * In-place 2x2 inverse transform of the top-left corner of a block.
 *
 * Reads and writes the first two elements of the first two rows, where a
 * row is DCTSTRIDE elements long.
 */
void ff_j_rev_dct2(DCTBLOCK data){
  const int row0 = 0*DCTSTRIDE;
  const int row1 = 1*DCTSTRIDE;
  int top_left, top_right, bottom_left, bottom_right;
  int top_sum, top_diff, bottom_sum, bottom_diff;

  /* Bias the DC term in place before it is read back below. */
  data[0] += 4;

  top_left     = data[row0 + 0];
  top_right    = data[row0 + 1];
  bottom_left  = data[row1 + 0];
  bottom_right = data[row1 + 1];

  /* Horizontal butterflies, one per row. */
  top_sum     = top_left + top_right;
  top_diff    = top_left - top_right;
  bottom_sum  = bottom_left + bottom_right;
  bottom_diff = bottom_left - bottom_right;

  /* Vertical butterflies, then drop the three scaling bits. */
  data[row0 + 0] = (top_sum  + bottom_sum)  >> 3;
  data[row0 + 1] = (top_diff + bottom_diff) >> 3;
  data[row1 + 0] = (top_sum  - bottom_sum)  >> 3;
  data[row1 + 1] = (top_diff - bottom_diff) >> 3;
}
1149
 
1150
/*
 * In-place 1x1 inverse transform: only the DC term data[0] is touched.
 * It is biased by 4 and then shifted right by 3.
 */
void ff_j_rev_dct1(DCTBLOCK data){
  const int biased_dc = data[0] + 4;

  data[0] = biased_dc >> 3;
}
1153
 
1154
#undef FIX
1155
#undef CONST_BITS