Subversion Repositories Kolibri OS

Rev

Rev 1102 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
999 diamond 1
;;================================================================================================;;
2
;;//// jpeg.asm //// (c) diamond, 2008-2009 //////////////////////////////////////////////////////;;
3
;;================================================================================================;;
4
;;                                                                                                ;;
5
;; This file is part of Common development libraries (Libs-Dev).                                  ;;
6
;;                                                                                                ;;
7
;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
8
;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
9
;; of the License, or (at your option) any later version.                                         ;;
10
;;                                                                                                ;;
11
;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without  ;;
12
;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  ;;
13
;; Lesser General Public License for more details.                                                ;;
14
;;                                                                                                ;;
15
;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev.  ;;
16
;; If not, see <http://www.gnu.org/licenses/>.                                                    ;;
17
;;                                                                                                ;;
18
;;================================================================================================;;
19
 
20
include 'jpeg.inc'
21
 
22
img.is.jpg:
; Quick signature test: does the buffer start with a JPEG SOI marker?
; in:  [esp+4] -> data, [esp+8] = data size (two dword arguments, removed by "ret 8")
; out: eax = 1 if get_marker succeeds and returns 0xD8, eax = 0 otherwise
; esi and ebp are saved and restored
	push	esi ebp
	mov	esi, [esp+12]	; esi -> JPEG data
	mov	ebp, [esp+16]	; ebp = data size
	call	get_marker
	jc	.no
	cmp	al, 0xD8	; SOI marker?
	push	1
	pop	eax		; eax = 1; push/pop keep the flags of the cmp above
	jz	.ok
.no:
	xor	eax, eax
.ok:
	pop	ebp esi
	ret	8
37
 
38
img.decode.jpg:
; Decode a JPEG file held in memory.
; in:  [esp+4] -> JPEG data, [esp+8] = data size; "ret 12" removes a third
;      dword argument that is not read in this part of the code
; out: eax = jpeg.work.image as left by the parser (0 if no image was created
;      or the work area could not be allocated); all other registers are
;      restored by popad
; While parsing: ebx -> jpeg.work area, esi -> unread input, ebp = bytes left.
	finit
	pushad
	mov	esi, [esp+20h+4]	; esi -> JPEG data
	mov	ebp, [esp+20h+8]	; ebp = data size
; allocate area for JPEG processing
	push	sizeof.jpeg.work
	call	[mem.alloc]
	test	eax, eax
	jz	.ret			; eax = 0 is returned to the caller
	mov	ebx, eax
	xor	ecx, ecx
	mov	[ebx + jpeg.work.image], ecx
	mov	[ebx + jpeg.work.dct_buffer], ecx
	mov	[ebx + jpeg.work._esp], esp	; stack level right after pushad
; check for SOI [Start-Of-Image] marker
	call	get_marker
	jc	.end
	cmp	al, 0xD8	; SOI?
	jz	.soi_ok
.end:
; general exit from the function
; N.B. must be reached with ebx -> jpeg.work and with the stack exactly as it
; was after pushad, because .ret addresses the pushad frame through esp
; for progressive mode: convert loaded DCT coefficients to image
	call	handle_progressive
; convert full-color images to RGB
	call	convert_to_rgb
	push	[ebx + jpeg.work.image]	; kept on the stack across mem.free
	push	ebx
	call	[mem.free]		; presumably removes its one argument - confirm
	pop	eax			; eax = image pointer (or 0)
.ret:
	mov	[esp+28], eax		; eax slot of the pushad frame
	popad
	ret	12
999 diamond 73
.soi_ok:
; SOI seen; ecx is still 0 here (cleared before the SOI check)
	mov	[ebx + jpeg.work.restart_interval], ecx
	mov	[ebx + jpeg.work.adobe_ycck], cl
; loop until start of frame (real data), parse markers
.markers_loop:
	call	get_marker
	jc	.end
; markers RSTn do not have parameters
; N.B. They can not exist in this part of JPEG, but let's be liberal :)
	cmp	al, 0xD0
	jb	@f
	cmp	al, 0xD8
	jb	.markers_loop
@@:
	cmp	al, 0xD9	; EOI? [invalid here]
	jz	.end
; ok, this is marker segment
; first word is length of the segment (big-endian, counts itself)
	cmp	ebp, 2
	jb	.end
	xor	edx, edx
	mov	dl, [esi+1]
	mov	dh, [esi]	; edx = marker length, al = marker value
	sub	ebp, edx	; the whole segment is charged to ebp up front;
	jb	.end		; esi still points to the length field
	cmp	al, 0xDB	; DQT?
	jz	.dqt
	cmp	al, 0xC4	; DHT?
	jz	.dht
	cmp	al, 0xCC	; DAC? [ignored - no arithmetic coding]
	jz	.next_marker
	cmp	al, 0xDD	; DRI?
	jz	.dri
	cmp	al, 0xDA	; SOS?
	jz	.sos
	cmp	al, 0xC0	; remaining 0xC0..0xCF values are SOFn
	jb	@f
	cmp	al, 0xD0
	jb	.sofn
@@:
	cmp	al, 0xEE	; APP14?
	jz	.app14
; unrecognized marker; let's skip it and hope for the best
.next_marker:
	add	esi, edx
	jmp	.markers_loop
.app14:
; check for special Adobe marker
	cmp	dx, 14		; need offsets 0..13 of the segment
	jb	.next_marker
	cmp	byte [esi+2], 'A'
	jnz	.next_marker
	cmp	dword [esi+3], 'dobe'
	jnz	.next_marker
	cmp	byte [esi+13], 2	; byte 13 == 2 enables the YCCK handling
	setz	[ebx + jpeg.work.adobe_ycck]
	jmp	.next_marker
130
.dqt:
; DQT marker found
; in: esi -> length field, edx = segment length
; length: 2 bytes for length field + 65 bytes per table
	sub	edx, 2
	jc	.end
	lodsw			; skip the length field
.dqt_loop:
	test	edx, edx
	jz	.markers_loop	; segment fully consumed
	sub	edx, 1+64
	jc	.end
	lodsb			; al = precision (high nibble), table id (low nibble)
; 8-bit DCT-based process shall not use a 16-bit precision quantization table.
	test	al, 0xF0
	jnz	.end
	and	eax, 3
	mov	[ebx+jpeg.work.quant_tables_defined+eax], 1
	shl	eax, 8		; 256 bytes per table = 64 dwords
	lea	edi, [ebx+eax+jpeg.work.quant_tables]
	xor	ecx, ecx	; ecx = index in zigzag order
@@:
; load the next 8-bit quantizer as an integer onto the FPU stack
	xor	eax, eax
	lodsb
	push	eax
	fild	dword [esp]
	pop	eax
; zigzag[ecx]*2 is used as the dword offset of the element inside the table;
; the two 7*4 masks pick the two idct_pre_table factors applied to it
	movzx	eax, byte [zigzag+ecx]
	add	eax, eax
	push	eax
	and	eax, 7*4
	fmul	dword [idct_pre_table+eax]
	pop	eax
	push	eax
	shr	eax, 3
	and	eax, 7*4
	fmul	dword [idct_pre_table+eax]
	pop	eax
	fstp	dword [edi+eax]	; store the prescaled quantizer as a float
	inc	ecx
	cmp	ecx, 64
	jb	@b
	jmp	.dqt_loop
172
.dri:
; DRI marker found
; in: esi -> length field, edx = segment length
	cmp	edx, 4		; length must be 4
	jnz	.end2
	movzx	eax, word [esi+2]
	xchg	al, ah		; big-endian -> native byte order
	mov	[ebx+jpeg.work.restart_interval], eax
	jmp	.next_marker
180
.dht:
; DHT marker found
; in: esi -> length field, edx = segment length
; Every table is expanded into a multi-level lookup table made of 2-byte
; entries "db length, db value":
;   level 1: 256 entries indexed by the next 8 bits (codes of 1..8 bits)
;   level 2: 16-entry subtables for codes of 9..12 bits
;   level 3: 16-entry subtables for codes of 13..16 bits
; An entry whose length byte is larger than the bits covered so far holds the
; index of a 32-byte subtable (located after the first 0x200 bytes) in its
; value byte. Unused bytes are filled with 0xFF.
	sub	edx, 2
	jc	.end2
	lodsw			; skip the length field
.dht_loop:
	test	edx, edx
	jz	.markers_loop	; segment fully consumed
	sub	edx, 17		; class/id byte + 16 length counters
	jc	.end2
; next Huffman table; find place for it
	lodsb
	mov	edi, eax
	and	eax, 0x10	; table class: 0 = DC, 1 = AC
	and	edi, 3		; table id
	shr	eax, 2
	or	edi, eax	; edi = class*4 + id
	mov	[ebx+jpeg.work.dc_huffman_defined+edi], 1
;	shl	edi, 11
	imul	edi, max_hufftable_size
	lea	edi, [ebx+edi+jpeg.work.dc_huffman]	; edi -> destination table
; get table size
	xor	eax, eax
	push	16
	pop	ecx
@@:
	add	al, [esi]
	adc	ah, 0
	inc	esi
	loop	@b
	cmp	ax, 0x100	; no more than 256 symbols
	ja	.end2
	sub	edx, eax
	jc	.end2
; construct Huffman tree
; esi -> symbol values, edx (below) walks the 16 length counters
	push	ebx edx
	lea	eax, [edi+256*2]
	push	eax		; [esp] = end of the first-level table
	lea	edx, [esi-16]
	mov	ah, 1		; ah = current code length
	mov	ecx, 128	; a code of length L owns 2^(8-L) first-level entries
.dht_l1:
; codes of 1..8 bits
	movzx	ebx, byte [edx]	; ebx = number of codes of this length
	inc	edx
	test	ebx, ebx
	jz	.dht_l3
.dht_l2:
	cmp	edi, [esp]
	jae	.dhterr1	; more codes than the table can hold
	lodsb			; al = symbol value
	xchg	al, ah		; stored word: low byte = length, high byte = value
	push	ecx
	rep	stosw
	pop	ecx
	xchg	al, ah
	dec	ebx
	jnz	.dht_l2
.dht_l3:
	inc	ah
	shr	ecx, 1
	jnz	.dht_l1
; codes of 9..12 bits
; stack from here: [esp] = end of the current 16-entry subtable,
; [esp+4] = next free first-level entry, [esp+8] = end of the first level
	push	edi
	mov	edi, [esp+4]
	push	edi
	mov	eax, 0x00090100	; high word: link entry (length 9, subtable index 0);
				; low word: ah = code length - 8, al = scratch
	mov	cl, 8
.dht_l4:
	movzx	ebx, byte [edx]
	inc	edx
	test	ebx, ebx
	jz	.dht_l6
.dht_l5:
	cmp	edi, [esp]
	jb	@f
; current subtable is full (or none is open yet): link a new one
	mov	edi, [esp+4]
	rol	eax, 16
	cmp	edi, [esp+8]
	jae	.dhterr2
	stosw			; first-level link entry
	inc	ah		; next subtable index
	mov	[esp+4], edi
	pop	edi
	push	edi
	rol	eax, 16
	add	dword [esp], 16*2
@@:
	lodsb
	xchg	al, ah
	push	ecx
	rep	stosw
	pop	ecx
	xchg	al, ah
	dec	ebx
	jnz	.dht_l5
.dht_l6:
	inc	ah
	shr	ecx, 1
	jnz	.dht_l4
; codes of 13..16 bits
; reserve room for the second-level entries that will link third-level
; subtables, rounded up to whole 32-byte subtables
	push	edi
	movzx	ebx, byte [edx]
	add	ebx, ebx
	add	bl, [edx+1]
	adc	bh, 0
	add	ebx, ebx
	add	bl, [edx+2]
	adc	bh, 0
	add	ebx, ebx
	add	bl, [edx+3]
	adc	bh, 0
	add	ebx, 15
	shr	ebx, 4
	mov	cl, 8
	lea	ebx, [edi+ebx*2]
	sub	ebx, [esp+12]
	add	ebx, 31
	shr	ebx, 5
	mov	edi, ebx
	shl	edi, 5
	add	edi, [esp+12]
	xor	ebx, 9
	shl	ebx, 16
	xor	eax, ebx
	push	edi
.dht_l7:
	movzx	ebx, byte [edx]
	inc	edx
	test	ebx, ebx
	jz	.dht_l10
.dht_l8:
	cmp	edi, [esp]
	jb	.dht_l9
; third-level subtable is full: link a new one from the second level
	mov	edi, [esp+4]
	cmp	edi, [esp+8]
	jb	@f
; second-level subtable is full too: link a new one from the first level
	mov	edi, [esp+12]
	cmp	edi, [esp+16]
	jae	.dhterr3
	mov	al, 9
	stosb
	rol	eax, 8
	stosb
	inc	eax
	ror	eax, 8
	mov	[esp+12], edi
	mov	edi, [esp+8]
	add	dword [esp+8], 16*2
@@:
	mov	al, 9
	stosb
	rol	eax, 16
	stosb
	inc	eax
	ror	eax, 16
	mov	[esp+4], edi
	pop	edi
	push	edi
	add	dword [esp], 16*2
.dht_l9:
	lodsb
	xchg	al, ah
	push	ecx
	rep	stosw
	pop	ecx
	xchg	al, ah
	dec	ebx
	jnz	.dht_l8
.dht_l10:
	inc	ah
	shr	ecx, 1
	jnz	.dht_l7
; fill the unused tail of every level with 0xFF bytes
	push	-1
	pop	eax
	pop	ecx
	sub	ecx, edi
	rep	stosb
	pop	edi
	pop	ecx
	sub	ecx, edi
	rep	stosb
	pop	edi
	pop	ecx
	sub	ecx, edi
	rep	stosb
	pop	edx ebx
	jmp	.dht_loop
; error exits: drop the temporaries of the stage that failed,
; then restore edx and ebx
.dhterr3:
	pop	eax eax
.dhterr2:
	pop	eax eax
.dhterr1:
	pop	eax
	pop	edx ebx
.end2:
	jmp	.end
442
.sofn:
; SOFn marker found
; in: al = marker value, esi -> length field, edx = segment length
; Creates the image, fills jpeg.work.components (6 bytes per component:
; db id, db V, db H, db VFactor, db HFactor, db quantization table id)
; and the MCU geometry fields of jpeg.work.
	cmp	[ebx+jpeg.work.image], 0
	jnz	.end2	; only one frame is allowed
; only SOF0 [baseline sequential], SOF1 [extended sequential], SOF2 [progressive]
; nobody supports other compression methods
	cmp	al, 0xC2
	ja	.end2
	setz	[ebx+jpeg.work.progressive]
; Length must be at least 8
	sub	edx, 8
	jb	.end2
; Sample precision in JFIF must be 8 bits
	cmp	byte [esi+2], 8
	jnz	.end2
; Color space in JFIF is either YCbCr (color images, 3 components)
;                        or Y (grey images, 1 component)
	movzx	eax, byte [esi+7]
	cmp	al, 1
	jz	@f
	cmp	al, 3
	jz	@f
; Adobe products sometimes use YCCK color space with 4 components
	cmp	al, 4
	jnz	.end2
	cmp	[ebx+jpeg.work.adobe_ycck], 0
	jz	.end2
@@:
	mov	edi, eax	; edi = number of components
	lea	eax, [eax*3]
	sub	edx, eax	; the rest must be exactly 3 bytes per component
	jnz	.end2
; image type: 8 bpp for grayscale JPEGs, 24 bpp for normal,
; 32 bpp for Adobe YCCK
	push	Image.bpp8i
	pop	eax	; Image.bpp8i = 1
	cmp	edi, eax
	jz	@f
	inc	eax	; Image.bpp24 = 2
	cmp	edi, 3
	jz	@f
	inc	eax	; Image.bpp32 = 3
@@:
	push	eax	; pushed ahead of the two stdcall arguments below
; get width and height
; width must be nonzero
; height must be nonzero - nobody supports DNL markers
; NOTE(review): no zero check is made here; presumably img.create rejects
; such sizes - confirm
	mov	ah, [esi+3]
	mov	al, [esi+4]	; eax = height
	xor	ecx, ecx
	mov	ch, [esi+5]
	mov	cl, [esi+6]	; ecx = width
; allocate memory for image
	stdcall img.create, ecx, eax
	test	eax, eax
	jz	.end2
	mov	[ebx + jpeg.work.image], eax
; create grayscale palette if needed
	cmp	edi, 1
	jnz	.no_create_palette
	push	ecx edi
	mov	edi, [eax + Image.Palette]
	xor	eax, eax
	mov	ecx, 256
@@:
	stosd			; entry i = i*0x010101
	add	eax, 0x010101
	loop	@b
	pop	edi ecx
.no_create_palette:
; other image characteristics
; N.B. ecx is used as the image width below, so img.create must preserve it
	mov	eax, edi
	shl	eax, 3
	mov	[ebx + jpeg.work.delta_x], eax		; 8 pixels in bytes
	mov	[ebx + jpeg.work.pixel_size], edi
	;mov	eax, edi
	imul	eax, ecx
	mov	[ebx + jpeg.work.delta_y], eax		; 8 lines in bytes
	shr	eax, 3
	mov	[ebx + jpeg.work.line_size], eax	; 1 line in bytes
	add	esi, 8
	mov	ecx, edi
	lea	edi, [ebx + jpeg.work.components]
	xor	eax, eax
	xor	edx, edx	; dl = max V, dh = max H collected in the loop
.sof_parse_comp:
	movsb	; db ComponentIdentifier
	lodsb
	mov	ah, al
	and	al, 0xF		; al = V (low nibble)
	jz	.end3
	shr	ah, 4		; ah = H (high nibble)
	jz	.end3
	stosd	; db V, db H, db ?, db ? (will be filled later)
	cmp	dl, al
	ja	@f
	mov	dl, al
@@:
	cmp	dh, ah
	ja	@f
	mov	dh, ah
@@:
	movsb	; db QuantizationTableID
		; (stored over the last byte written by stosd, so a record is 6 bytes)
	loop	.sof_parse_comp
	mov	word [ebx + jpeg.work.max_v], dx
	movzx	eax, dh		; eax = HMax
	movzx	edx, dl		; edx = VMax
	push	eax edx
	shl	eax, 3
	shl	edx, 3
	mov	[ebx + jpeg.work.block_width], eax
	mov	[ebx + jpeg.work.block_height], edx
	pop	edx eax
	push	eax edx
	imul	eax, [ebx + jpeg.work.delta_x]
	mov	[ebx + jpeg.work.block_delta_x], eax
	imul	edx, [ebx + jpeg.work.delta_y]
	mov	[ebx + jpeg.work.block_delta_y], edx
; number of MCUs per row and per column, rounded up
	mov	ecx, [ebx + jpeg.work.image]
	mov	eax, [ecx + Image.Width]
	add	eax, [ebx + jpeg.work.block_width]
	dec	eax
	xor	edx, edx
	div	[ebx + jpeg.work.block_width]
	mov	[ebx + jpeg.work.x_num_blocks], eax
	mov	eax, [ecx + Image.Height]
	add	eax, [ebx + jpeg.work.block_height]
	dec	eax
	xor	edx, edx
	div	[ebx + jpeg.work.block_height]
	mov	[ebx + jpeg.work.y_num_blocks], eax
; fill the per-component scaling factors
	mov	ecx, [ebx + jpeg.work.pixel_size]
	pop	edx		; edx = VMax, [esp] = HMax
	lea	edi, [ebx + jpeg.work.components]
@@:
	mov	eax, edx
	div	byte [edi+1]	; VMax / V_i = VFactor_i
	mov	byte [edi+3], al	; db VFactor
	pop	eax
	push	eax
	div	byte [edi+2]	; HMax / H_i = HFactor_i
	mov	byte [edi+4], al	; db HFactor
	add	edi, 6
	loop	@b
	pop	eax
	cmp	[ebx + jpeg.work.progressive], 0
	jz	.sof_noprogressive
; progressive mode: allocate a buffer for the DCT coefficients,
; one word per sample of the padded image for every component
	mov	eax, [ebx + jpeg.work.x_num_blocks]
	mul	[ebx + jpeg.work.block_width]
	mul	[ebx + jpeg.work.y_num_blocks]
	mul	[ebx + jpeg.work.block_height]
	add	eax, eax
	mov	[ebx + jpeg.work.dct_buffer_size], eax
	mul	[ebx + jpeg.work.pixel_size]
	push	eax
	call	[mem.alloc]
	test	eax, eax
	jnz	@f
; no memory: destroy the image, jpeg.work.image becomes 0
	xchg	eax, [ebx + jpeg.work.image]
	push	eax
	call	img.destroy
	jmp	.end
@@:
	mov	[ebx + jpeg.work.dct_buffer], eax
.sof_noprogressive:
	jmp	.markers_loop
608
.sos_drop_counter:
; error exit for the part of .sos_find_comp where the component counter
; (push ecx) is still on the stack: .end finishes with popad/ret and
; therefore needs a balanced stack
	pop	ecx
.end3:
	jmp	.end
.sos:
; SOS marker found
; in: esi -> length field, edx = segment length
; Builds one 56-byte record per scan component in jpeg.work.cur_components:
;   +0  db V, db H, db VFactor, db HFactor
;   +4  dd HIncrement		+8  dd VIncrement
;   +12 dd QuantizationTable	+16 dd DCTable		+20 dd ACTable
;   +24 dd width / HFactor, +28 dd the same value once more
;   +32 dd HFactor+1 - (width % HFactor)
;   +36 dd height / VFactor, +40 dd the same value once more
;   +44 dd VFactor+1 - (height % VFactor)
;   +48 dd DCPrediction		+52 dd ComponentOffset
; frame must be already opened
	cmp	[ebx + jpeg.work.image], 0
	jz	.end3
	cmp	edx, 6
	jb	.end3
; parse marker
	movzx	eax, byte [esi+2]	; number of components in this scan
	test	eax, eax
	jz	.end3		; must be nonzero
	cmp	al, byte [ebx + jpeg.work.pixel_size]
	ja	.end3		; must be <= total number of components
;	mov	[ns], eax
	cmp	al, 1
	setz	[ebx + jpeg.work.not_interleaved]
	lea	ecx, [6+eax+eax]
	cmp	edx, ecx	; length must be exactly 6 + 2*components
	jnz	.end3
	mov	ecx, eax
	lea	edi, [ebx + jpeg.work.cur_components]
	add	esi, 3
.sos_find_comp:
	lodsb	; got ComponentID, look for component info
	push	ecx esi
	mov	ecx, [ebx + jpeg.work.pixel_size]
	lea	esi, [ebx + jpeg.work.components]
	and	dword [edi+48], 0
	and	dword [edi+52], 0
@@:
	cmp	[esi], al
	jz	@f
	inc	dword [edi+52]	; counts the frame components skipped so far
	add	esi, 6
	loop	@b
@@:
; ZF=1 here if the component was found (from cmp); if the loop ran out,
; ZF=0 is left by "add esi, 6" - mov, pop and loop do not change flags
	mov	eax, [esi+1]
	mov	dl, [esi+5]	; dl = quantization table id
	pop	esi ecx
	jnz	.end3	; bad ComponentID
	cmp	[ebx + jpeg.work.not_interleaved], 0
	jz	@f
	mov	ax, 0x0101	; single-component scan: V = H = 1
@@:
	stosd		; db V, db H, db VFactor, db HFactor
	push	ecx	; component counter; every exit up to "pop ecx" must drop it
	xor	eax, eax
	mov	al, byte [edi-1]	; get HFactor
	mul	byte [ebx+jpeg.work.pixel_size]	; number of components
	stosd			; HIncrement_i = HFactor_i * sizeof(pixel)
	mov	al, byte [edi-4-2]	; get VFactor
	mul	byte [ebx+jpeg.work.pixel_size]	; number of components
	mov	ecx, [ebx+jpeg.work.image]
	imul	eax, [ecx+Image.Width]	; image width
	stosd			; VIncrement_i = VFactor_i * sizeof(row)
	xchg	eax, edx
	and	eax, 3
	cmp	[ebx+jpeg.work.quant_tables_defined+eax], 0
	jz	.sos_drop_counter
	shl	eax, 8
	lea	eax, [ebx+eax+jpeg.work.quant_tables]
	stosd		; dd QuantizationTable
	lodsb		; high nibble = DC table id, low nibble = AC table id
	movzx	eax, al
	mov	edx, eax
	shr	eax, 4
	and	edx, 3
	and	eax, 3
; a missing table is fatal in sequential mode; in progressive mode a zero
; pointer is stored instead and checked later for the scan type in use
	cmp	[ebx+jpeg.work.dc_huffman_defined+eax], 0
	jnz	.dc_table_ok
	cmp	[ebx+jpeg.work.progressive], 0
	jz	.sos_drop_counter
	xor	eax, eax
	jmp	.dc_table_done
.dc_table_ok:
;	shl	eax, 11
	imul	eax, max_hufftable_size
	lea	eax, [ebx+jpeg.work.dc_huffman+eax]
.dc_table_done:
	cmp	[ebx+jpeg.work.ac_huffman_defined+edx], 0
	jnz	.ac_table_ok
	cmp	[ebx+jpeg.work.progressive], 0
	jz	.sos_drop_counter
	xor	edx, edx
	jmp	.ac_table_done
.ac_table_ok:
;	shl	edx, 11
	imul	edx, max_hufftable_size
	lea	edx, [ebx+jpeg.work.ac_huffman+edx]
.ac_table_done:
	stosd		; dd DCTable
	xchg	eax, edx
	stosd		; dd ACTable
	mov	eax, [ecx+Image.Width]
	movzx	ecx, byte [edi-21]	; get HFactor
	cdq	; edx:eax = width (width<0x10000, so as dword it is unsigned)
	div	ecx
	stosd		; dd width / HFactor_i
	stosd
	xchg	eax, ecx
	inc	eax
	sub	eax, edx
	stosd		; dd HFactor_i+1 - (width % HFactor_i)
	mov	ecx, [ebx+jpeg.work.image]
	mov	eax, [ecx+Image.Height]
	movzx	ecx, byte [edi-34]	; get VFactor
	cdq
	div	ecx
	stosd		; dd height / VFactor_i
	stosd
	xchg	eax, ecx
	inc	eax
	sub	eax, edx
	stosd		; dd VFactor_i+1 - (height % VFactor_i)
	pop	ecx
	scasd		; dd DCPrediction
; the top byte of DCPrediction becomes 0x80 when the component index stored
; at +52 has odd bit parity (indices 1 and 2) and 0 otherwise
	cmp	dword [edi], 0
	setnp	al
	ror	al, 1
	mov	byte [edi-1], al
	scasd		; dd ComponentOffset
	dec	ecx
	jnz	.sos_find_comp
	mov	[ebx+jpeg.work.cur_components_end], edi
; the last three bytes of the header: start and end of the spectral
; selection, then the successive approximation nibbles
	lea	edi, [ebx+jpeg.work.ScanStart]
	movsb
	cmp	byte [esi], 63
	ja	.end3
	movsb
	lodsb
	push	eax
	and	al, 0xF
	stosb
	pop	eax
	shr	al, 4
	stosb
; now unpack data
	call	init_limits
	and	[ebx+jpeg.work.decoded_MCUs], 0
	mov	[ebx+jpeg.work.cur_rst_marker], 7	; first expected marker is RST0
	and	[ebx+jpeg.work.huffman_bits], 0
	cmp	[ebx+jpeg.work.progressive], 0
	jz	.sos_noprogressive
753
; progressive mode - only decode DCT coefficients
; initialize pointers to coefficients data
; zero number of EOBs for AC coefficients
; redefine HIncrement and VIncrement
; In this mode the fields of a 56-byte component record are reused:
;   +4  = byte step between horizontally adjacent MCUs (H * 64 words)
;   +8  = byte size of one row of blocks
;   +12 = pointer to the current coefficient block
;   +52 = pending end-of-band run
	lea	edi, [ebx+jpeg.work.cur_components]
.coeff_init:
	mov	eax, [ebx+jpeg.work.dct_buffer_size]
	mul	dword [edi+52]		; component index selects its plane
	add	eax, [ebx+jpeg.work.dct_buffer]
	mov	[edi+12], eax
	and	dword [edi+52], 0
; the Huffman table needed by this kind of scan must have been defined
	cmp	[ebx+jpeg.work.ScanStart], 0
	jz	.scan_dc
	cmp	dword [edi+20], 0	; AC table
	jz	.end3
	jmp	@f
.scan_dc:
	cmp	dword [edi+16], 0	; DC table
	jz	.end3
@@:
	movzx	eax, byte [edi+1]
	shl	eax, 7
	mov	[edi+4], eax
	mov	eax, [edi+28]
	mov	cl, [edi+3]
	cmp	cl, [edi+32]
	sbb	eax, -7-1	; eax += 8 - CF
	shr	eax, 3		; number of 8-sample blocks per row
	shl	eax, 7		; 64 words per block
	mov	[edi+8], eax
	add	edi, 56
	cmp	edi, [ebx+jpeg.work.cur_components_end]
	jb	.coeff_init
; unpack coefficients
; N.B. Speed optimization has sense here.
.coeff_decode_loop:
	lea	edx, [ebx+jpeg.work.cur_components]
.coeff_components_loop:
	mov	edi, [edx+12]
	movzx	ecx, byte [edx]		; V blocks per MCU
	push	dword [edx+40]
	push	edi
.coeff_y_loop:
	push	ecx
	movzx	eax, byte [edx+1]	; H blocks per MCU
	push	dword [edx+28]
	push	edi
.coeff_x_loop:
; blocks for which [edx+40] or [edx+28] is already negative are padding
	cmp	dword [edx+40], 0
	jl	@f
	cmp	dword [edx+28], 0
	jge	.realdata
@@:
; padding block: absent from non-interleaved scans, otherwise decoded
; into the scratch area jpeg.work.dct_coeff
	cmp	[ebx+jpeg.work.not_interleaved], 0
	jnz	.norealdata
	push	eax edi
	lea	edi, [ebx+jpeg.work.dct_coeff]
	call	decode_progressive_coeff
	pop	edi eax
	jmp	.norealdata
.realdata:
	push	eax
	call	decode_progressive_coeff
	add	edi, 64*2
	pop	eax
.norealdata:
	sub	dword [edx+28], 8
	sub	eax, 1
	jnz	.coeff_x_loop
	pop	edi
	pop	dword [edx+28]
	add	edi, [edx+8]
	pop	ecx
	sub	dword [edx+40], 8
	sub	ecx, 1
	jnz	.coeff_y_loop
	movzx	eax, byte [edx+1]
	shl	eax, 3
	pop	edi
	add	edi, [edx+4]
	pop	dword [edx+40]
	sub	[edx+28], eax
	mov	[edx+12], edi
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jnz	.coeff_components_loop
	call	next_MCU
	jc	.norst
	sub	[ebx+jpeg.work.cur_x], 1
	jnz	.coeff_decode_loop
; end of an MCU row: rewind every block pointer to the row start
; and step it down by V rows of blocks
	call	next_line
	lea	edx, [ebx+jpeg.work.cur_components]
@@:
	mov	eax, [ebx+jpeg.work.max_x]
	imul	eax, [edx+4]
	sub	[edx+12], eax
	movzx	eax, byte [edx]
	imul	eax, [edx+8]
	add	[edx+12], eax
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jnz	@b
	sub	[ebx+jpeg.work.cur_y], 1
	jnz	.coeff_decode_loop
	jmp	.markers_loop
.norst:
.end4:
	jmp	.end3
861
.sos_noprogressive:
; normal mode - unpack JPEG image
; MCUs are decoded straight into the image data, cur_x per row, cur_y rows
	mov	edi, [ebx+jpeg.work.image]
	mov	edi, [edi+Image.Data]
	mov	[ebx+jpeg.work.cur_out_ptr], edi
; N.B. Speed optimization has sense here.
.decode_loop:
	call	decode_MCU
	call	next_MCU
	jc	.end4		; missing or wrong restart marker
	sub	[ebx+jpeg.work.cur_x], 1
	jnz	.decode_loop
	call	next_line
	sub	[ebx+jpeg.work.cur_y], 1
	jnz	.decode_loop
	jmp	.markers_loop
877
 
878
get_marker:
; Read the next marker from the input stream.
; in: esi -> data, ebp = number of bytes available at esi
; out: CF=0, al=marker value - ok; esi and ebp are advanced past the marker
;      CF=1 - no marker
; Only al (eax low byte), esi, ebp and flags are changed.
	sub	ebp, 1
	jc	.ret
	lodsb
if 1
	cmp	al, 0xFF
	jae	@f
; Some stupid men, which do not read specifications and manuals,
; sometimes create markers with length field two less than true
; value (in JPEG length of marker = length of data INCLUDING
; length field itself). To open such files, allow 2 bytes
; before next marker.
	cmp	ebp, 2
	jb	.ret		; CF=1 from the comparison
; the two bytes skipped here must be charged to ebp as well, otherwise
; ebp stays 2 too large and later checks allow reads past the end of data
	sub	ebp, 2
	lodsb
	lodsb
end if
	cmp	al, 0xFF
	jb	.ret		; not 0xFF: CF=1 from the comparison
@@:
; skip any further 0xFF bytes; the first other byte is the marker value
	sub	ebp, 1
	jc	.ret
	lodsb
	cmp	al, 0xFF
	jz	@b
	clc			; the comparison above has set CF
.ret:
	ret
909
 
910
align 16
decode_MCU:
; Decode one MCU of a sequential scan and write its samples to the image.
; in: ebx -> jpeg.work, esi/ebp = input stream state used by decode_data_unit
; For every scan component (56-byte records, see the offsets below) the
; H_i * V_i data units are decoded and each sample is replicated
; HFactor * VFactor times. [edx+28] and [edx+40] are compared with 0 and
; reduced by 8 per block; blocks for which either is negative are decoded
; but not copied.
	lea	edx, [ebx+jpeg.work.cur_components]
.components_loop:
; decode each component
	push	[ebx+jpeg.work.cur_out_ptr]
	movzx	ecx, byte [edx]		; V_i
	push	dword [edx+40]
; we have H_i * V_i blocks of packed data, decode them
.y_loop_1:
	push	[ebx+jpeg.work.cur_out_ptr]
	push	ecx
	movzx	eax, byte [edx+1]	; H_i
	push	dword [edx+28]
.x_loop_1:
	push	eax
	call	decode_data_unit
	cmp	dword [edx+40], 0
	jl	.nocopyloop
	cmp	dword [edx+28], 0
	jl	.nocopyloop
; now we have decoded block 8*8 in decoded_data
; H_i * V_i packed blocks 8*8 make up one block (8*HMax) * (8*VMax)
; so each pixel in packed block corresponds to HFact * VFact pixels
	movzx	ecx, byte [edx+2]	; VFactor
	push	esi ebp
	mov	edi, [ebx+jpeg.work.cur_out_ptr]
	add	edi, [edx+52]		; ComponentOffset
.y_loop_2:
	push	ecx edi
; ecx = number of rows to copy in this pass: 8, or fewer for the last
; block row of the image
	cmp	ecx, [edx+44]
	mov	ecx, [edx+40]
	sbb	ecx, 8-1
	sbb	eax, eax
	and	ecx, eax
	add	ecx, 8
	jz	.skip_x_loop_2
	movzx	eax, byte [edx+3]	; HFactor
.x_loop_2:
	push	eax ecx edi
; eax = min(0, number of columns to copy - 8); it selects the entry point
; into the unrolled copy below
	cmp	eax, [edx+32]
	mov	eax, [edx+28]
	sbb	eax, 8-1
	sbb	ebp, ebp
	and	eax, ebp
	mov	ebp, .copyiter_all
	lea	esi, [ebx+jpeg.work.decoded_data]
; each skipped column moves the entry point 3 bytes forward; this relies on
; "add edi, eax" + "movsb" being assembled as exactly 3 bytes
	sub	ebp, eax
	sub	ebp, eax
	sub	ebp, eax
	mov	eax, [edx+4]		; HIncrement
	sub	eax, 1			; movsb itself advances edi by 1
.copyloop:
	push	esi edi
	jmp	ebp
.copyiter_all:
	movsb
repeat 7
	add	edi, eax
	movsb
end repeat
	nop
	nop
	pop	edi esi
	add	edi, [edx+8]		; VIncrement
	add	esi, 8			; next row of decoded_data
	sub	ecx, 1
	jnz	.copyloop
	pop	edi ecx eax
	add	edi, [ebx+jpeg.work.pixel_size]
	sub	eax, 1
	jnz	.x_loop_2
.skip_x_loop_2:
	pop	edi ecx
	add	edi, [ebx+jpeg.work.line_size]
	sub	ecx, 1
	jnz	.y_loop_2
	pop	ebp esi
.nocopyloop:
	mov	eax, [ebx+jpeg.work.delta_x]
	add	[ebx+jpeg.work.cur_out_ptr], eax
	pop	eax
	sub	dword [edx+28], 8
	sub	eax, 1
	jnz	.x_loop_1
	pop	dword [edx+28]
	pop	ecx
	pop	eax
	sub	dword [edx+40], 8
	add	eax, [ebx+jpeg.work.delta_y]
	mov	[ebx+jpeg.work.cur_out_ptr], eax
	sub	ecx, 1
	jnz	.y_loop_1
	movzx	eax, byte [edx+1]
	pop	dword [edx+40]
	shl	eax, 3
	pop	[ebx+jpeg.work.cur_out_ptr]
	sub	dword [edx+28], eax	; this MCU has consumed H_i*8 columns
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jb	.components_loop
	mov	eax, [ebx+jpeg.work.cur_block_dx]
	add	[ebx+jpeg.work.cur_out_ptr], eax
	ret
1014
 
1015
align 16
next_MCU:
; Account for one decoded MCU and consume a restart marker when it is due.
; in: ebx -> jpeg.work, esi -> input, ebp = bytes left
; out: CF=0 - ok, CF=1 - the expected RSTn marker is missing or wrong
; Changes eax and, when a marker is processed, edx, esi and ebp.
	add	[ebx+jpeg.work.decoded_MCUs], 1
	mov	eax, [ebx+jpeg.work.restart_interval]
	test	eax, eax
	jz	.no_restart		; restarts are not used
	cmp	[ebx+jpeg.work.decoded_MCUs], eax
	jb	.no_restart
	and	[ebx+jpeg.work.decoded_MCUs], 0
	and	[ebx+jpeg.work.huffman_bits], 0	; drop the unread bits
; no marker follows the very last MCU of the scan
	cmp	[ebx+jpeg.work.cur_x], 1
	jnz	@f
	cmp	[ebx+jpeg.work.cur_y], 1
	jz	.no_restart
@@:
; restart marker must be present
	sub	ebp, 2
	js	.error
	cmp	byte [esi], 0xFF
	jnz	.error
; markers must come in the order RST0, RST1, ..., RST7, RST0, ...
	mov	al, [ebx+jpeg.work.cur_rst_marker]
	inc	eax
	and	al, 7
	mov	[ebx+jpeg.work.cur_rst_marker], al
	add	al, 0xD0
	cmp	[esi+1], al
	jnz	.error
	add	esi, 2
; handle restart marker - zero all DC predictions
	lea	edx, [ebx+jpeg.work.cur_components]
@@:
	and	word [edx+48], 0
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jb	@b
.no_restart:
	clc
	ret
.error:
	stc
	ret
1056
 
1057
next_line:
; Start the next row of MCUs.
; in: ebx -> jpeg.work
; Reloads cur_x from max_x, moves cur_out_ptr back by max_x*cur_block_dx and
; forward by cur_block_dy, and for every scan component reloads [+28] from
; [+24] and lowers [+40] by 8*V_i.
; Changes eax and edx.
	mov	eax, [ebx+jpeg.work.max_x]
	mov	[ebx+jpeg.work.cur_x], eax
	mul	[ebx+jpeg.work.cur_block_dx]
	sub	eax, [ebx+jpeg.work.cur_block_dy]
	sub	[ebx+jpeg.work.cur_out_ptr], eax
	lea	edx, [ebx+jpeg.work.cur_components]
@@:
	mov	eax, [edx+24]
	mov	[edx+28], eax
	movzx	eax, byte [edx]
	shl	eax, 3
	sub	[edx+40], eax
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jb	@b
	ret
1074
 
1075
init_limits:
; Set up the MCU counters and output steps for the scan that is about to start.
; in: ebx -> jpeg.work, cur_components already filled for the scan
; out: max_x/max_y, cur_x/cur_y, cur_block_dx/cur_block_dy are set
; Changes eax, ecx and edx when the scan is not interleaved.
; defaults for an interleaved scan: the MCU grid of the whole frame
	push	[ebx+jpeg.work.x_num_blocks]
	pop	[ebx+jpeg.work.max_x]
	push	[ebx+jpeg.work.y_num_blocks]
	pop	[ebx+jpeg.work.max_y]
	push	[ebx+jpeg.work.block_delta_x]
	pop	[ebx+jpeg.work.cur_block_dx]
	push	[ebx+jpeg.work.block_delta_y]
	pop	[ebx+jpeg.work.cur_block_dy]
	cmp	[ebx+jpeg.work.not_interleaved], 0
	jz	@f
; single-component scan: an MCU is one 8*8 block of that component, so the
; counts come from the component's own size, rounded up to whole blocks
	mov	eax, dword [ebx+jpeg.work.cur_components+28]
	movzx	ecx, byte [ebx+jpeg.work.cur_components+3]	; HFactor
	cmp	cl, [ebx+jpeg.work.cur_components+32]
	sbb	eax, -7-1	; eax += 8 - CF
	shr	eax, 3
	mov	[ebx+jpeg.work.max_x], eax
	mov	eax, dword [ebx+jpeg.work.cur_components+40]
	movzx	edx, byte [ebx+jpeg.work.cur_components+2]	; VFactor
	cmp	dl, [ebx+jpeg.work.cur_components+44]
	sbb	eax, -7-1
	shr	eax, 3
	mov	[ebx+jpeg.work.max_y], eax
	imul	ecx, [ebx+jpeg.work.delta_x]
	mov	[ebx+jpeg.work.cur_block_dx], ecx
	imul	edx, [ebx+jpeg.work.delta_y]
	mov	[ebx+jpeg.work.cur_block_dy], edx
@@:
	push	[ebx+jpeg.work.max_x]
	pop	[ebx+jpeg.work.cur_x]
	push	[ebx+jpeg.work.max_y]
	pop	[ebx+jpeg.work.cur_y]
	ret
1108
 
1109
;macro get_bit
1110
;{
1111
;local .l1,.l2,.marker
1112
;	add	cl, cl
1113
;	jnz	.l1
1114
;	sub	ebp, 1
1115
;	js	decode_data_unit.eof
1116
;	mov	cl, [esi]
1117
;	cmp	cl, 0xFF
1118
;	jnz	.l2
1119
;.marker:
1120
;	add	esi, 1
1121
;	sub	ebp, 1
1122
;	js	decode_data_unit.eof
1123
;	cmp	byte [esi], 0xFF
1124
;	jz	.marker
1125
;	cmp	byte [esi], 0
1126
;	jnz	decode_data_unit.eof
1127
;.l2:
1128
;	sub	esi, -1
1129
;	adc	cl, cl
1130
;.l1:
1131
;}
1132
; Read one bit of entropy-coded data.
; in: esi -> input, ebp = bytes left, ch = unread bits (next bit in bit 7),
;     cl = number of unread bits in ch
; out: CF = the bit; esi, ebp, ch and cl are updated
; A 0xFF byte must be followed by 0x00 (any extra 0xFF bytes in between are
; skipped); the 0xFF is then taken as data.
; On end of data or on a marker the macro jumps to the local label
; .eof_pop<stack_depth> of the routine that uses it; stack_depth is the
; number of dwords that routine has on the stack at that point.
macro get_bit stack_depth
{
local .l1,.l2,.marker
	sub	cl, 1
	jns	.l1		; ch still has bits
	sub	ebp, 1
	js	.eof_pop#stack_depth
	mov	ch, [esi]
	cmp	ch, 0xFF
	jnz	.l2
.marker:
	add	esi, 1
	sub	ebp, 1
	js	.eof_pop#stack_depth
	cmp	byte [esi], 0xFF
	jz	.marker
	cmp	byte [esi], 0
	jnz	.eof_pop#stack_depth
.l2:
	add	esi, 1
	mov	cl, 7		; 8 new bits, one of them is taken right now
.l1:
	add	ch, ch		; top bit goes to CF
}
1156
; Read bl bits of entropy-coded data as a signed coefficient value.
; in: bl = number of bits, esi/ebp/ch/cl = bit reader state as for get_bit
; out: eax = value; a bit string starting with 1 is returned as is, one
;      starting with 0 comes out negative (presumably JPEG's EXTEND
;      procedure); esi, ebp, ch and cl are updated; dl is changed
; ebx is saved on the stack while the macro runs, so the error exit is
; .eof_pop<stack_depth_p1>, where stack_depth_p1 = caller's depth + 1;
; stack_depth itself is not used inside the macro.
; With restore_edx = true, edx is popped from the caller's stack near the end.
macro get_bits stack_depth,stack_depth_p1,restore_edx
{
local .l1,.l2,.l3,.marker2
	movzx	eax, ch
	mov	dl, cl		; dl = bits available in the current byte
	shl	eax, 24		; collect the bits left-aligned in eax
	neg	cl
	push	ebx
	add	cl, 24		; cl = shift for the next input byte
.l1:
	cmp	bl, dl
	jbe	.l2		; enough bits collected
	sub	bl, dl
	sub	ebp, 1
	js	.eof_pop#stack_depth_p1
	mov	ch, [esi]
	cmp	ch, 0xFF
	jnz	.l3
.marker2:
	add	esi, 1
	sub	ebp, 1
	js	.eof_pop#stack_depth_p1
	cmp	byte [esi], 0xFF
	jz	.marker2
	cmp	byte [esi], 0
	jnz	.eof_pop#stack_depth_p1
.l3:
	movzx	edx, ch
	add	esi, 1
	shl	edx, cl
	sub	cl, 8
	or	eax, edx
	mov	dl, 8
	jmp	.l1
.l2:
	mov	cl, bl
	sub	dl, bl		; dl = bits that stay unread in ch
	shl	ch, cl
	pop	ebx		; bl = total number of bits again
; rcr inserts the inverted first bit as a sign, sar reduces the value
; to bl bits
	cmp	eax, 80000000h
	rcr	eax, 1
	mov	cl, 31
	sub	cl, bl
	sar	eax, cl
	mov	cl, dl
if restore_edx eq true
	pop	edx
end if
; add 2^32 in two halves: the carry of the first half adds 1 to negative values
	add	eax, 80000000h
	adc	eax, 80000000h
}
1207
; macro get_huffman_code
1208
; {
1209
; local .l1
1210
	; xor	ebx, ebx
1211
; .l1:
1212
	; get_bit
1213
	; adc	ebx, ebx
1214
	; mov	eax, [eax+4*ebx]
1215
	; xor	ebx, ebx
1216
	; cmp	eax, -1
1217
	; jz	.eof_pop
1218
	; cmp	eax, 0x1000
1219
	; jae	.l1
1220
	; mov	ebx, eax
1221
; }
1222
; Decode one Huffman symbol with the multi-level lookup table at eax.
; in: eax -> table (2-byte entries: db length, db value),
;     esi/ebp/ch/cl = bit reader state as for get_bit
; out: eax = ebx = decoded value; esi, ebp, ch and cl are updated;
;      edx is changed
; Error exits are .eof_pop<stack_depth>; stack_depth_p1 = stack_depth + 1 is
; used at the one point where the macro has an extra dword on the stack.
macro get_huffman_code stack_depth,stack_depth_p1
{
local .l1,.l2,.l3,.l4,.l5,.l6,.nomarker1,.marker1,.nomarker2,.marker2,.nomarker3,.marker3,.done
; 1. (First level in Huffman table) Does the current Huffman code fit in 8 bits
; and have we got enough bits?
	movzx	ebx, ch
	cmp	byte [eax+ebx*2], cl
	jbe	.l1
; 2a. No; load next byte
	sub	ebp, 1
	js	.eof_pop#stack_depth
	mov	ch, [esi]
	movzx	edx, ch
	cmp	ch, 0xFF
	jnz	.nomarker1
.marker1:
	add	esi, 1
	sub	ebp, 1
	js	.eof_pop#stack_depth
	cmp	byte [esi], 0xFF
	jz	.marker1
	cmp	byte [esi], 0
	jnz	.eof_pop#stack_depth
.nomarker1:
	shr	edx, cl
	add	esi, 1
	or	ebx, edx
; 3a. (First level in Huffman table, >=8 bits known) Does the current Huffman code fit in 8 bits?
	cmp	byte [eax+ebx*2], 8
	jbe	.l2
	jl	.eof_pop#stack_depth	; length >= 0x80, i.e. an unused (0xFF) entry
; 4aa. No; go to next level
	movzx	ebx, byte [eax+ebx*2+1]
	mov	dl, ch
	shl	ebx, 5		; 32 bytes per subtable
	ror	edx, cl
	lea	ebx, [eax+ebx+0x200]
	shr	edx, 24
	push	edx
	shr	edx, 4
; 5aa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits
; and have we got enough bits?
	cmp	byte [ebx+edx*2], cl
	jbe	.l3
; 6aaa. No; have we got 12 bits?
	cmp	cl, 4
	jae	.l4
; 7aaaa. No; load next byte
	pop	edx
	sub	ebp, 1
	js	.eof_pop#stack_depth
	mov	ch, [esi]
	cmp	ch, 0xFF
	jnz	.nomarker2
.marker2:
	add	esi, 1
	sub	ebp, 1
	js	.eof_pop#stack_depth
	cmp	byte [esi], 0xFF
	jz	.marker2
	cmp	byte [esi], 0
	jnz	.eof_pop#stack_depth
.nomarker2:
	push	ecx
	shr	ch, cl
	add	esi, 1
	or	dl, ch
	pop	ecx
	push	edx
	shr	edx, 4
; 8aaaa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits?
	cmp	byte [ebx+edx*2], 4
	jbe	.l5
	jl	.eof_pop#stack_depth_p1	; unused entry; edx is still on the stack
; 9aaaaa. No; go to next level
	movzx	ebx, byte [ebx+edx*2+1]
	pop	edx
	shl	ebx, 5
	and	edx, 0xF
	lea	ebx, [eax+ebx+0x200]
; 10aaaaa. Get current code length and value
	sub	cl, [ebx+edx*2]
	movzx	eax, byte [ebx+edx*2+1]
	neg	cl
	shl	ch, cl
	neg	cl
	add	cl, 8
	jmp	.done
.l5:
; 9aaaab. Yes; get current code length and value
	sub	cl, [ebx+edx*2]
	movzx	eax, byte [ebx+edx*2+1]
	neg	cl
	pop	edx
	shl	ch, cl
	neg	cl
	add	cl, 8
	jmp	.done
.l4:
; 7aaab. Yes; go to next level
	movzx	ebx, byte [ebx+edx*2+1]
	pop	edx
	shl	ebx, 5
	and	edx, 0xF
	lea	ebx, [eax+ebx+0x200]
; 8aaab. (Third level in Huffman table) Have we got enough bits?
	cmp	[ebx+edx*2], cl
	jbe	.l6
; 9aaaba. No; load next byte
	sub	ebp, 1
	js	.eof_pop#stack_depth
	mov	ch, [esi]
	cmp	ch, 0xFF
	jnz	.nomarker3
.marker3:
	add	esi, 1
	sub	ebp, 1
	js	.eof_pop#stack_depth
	cmp	byte [esi], 0xFF
	jz	.marker3
	cmp	byte [esi], 0
	jnz	.eof_pop#stack_depth
.nomarker3:
	push	ecx
	shr	ch, cl
	add	esi, 1
	or	dl, ch
	pop	ecx
; 10aaaba. Get current code length and value
	sub	cl, [ebx+edx*2]
	movzx	eax, byte [ebx+edx*2+1]
	neg	cl
	shl	ch, cl
	neg	cl
	add	cl, 8
	jmp	.done
.l3:
; 6aab. Yes; get current code length and value
	pop	eax		; drop the dword pushed in step 4aa
.l6:
; 9aaabb. Yes; get current code length and value
	sub	cl, [ebx+edx*2]
	movzx	eax, byte [ebx+edx*2+1]
	xor	cl, 7
	shl	ch, cl
	xor	cl, 7
	add	ch, ch
	jmp	.done
.l2:
; 3ab. Yes; get current code length and value
	sub	cl, [eax+ebx*2]
	movzx	eax, byte [eax+ebx*2+1]
	neg	cl
	shl	ch, cl
	neg	cl
	add	cl, 8
	jmp	.done
.l1:
; 3b. Yes; get current code length and value
	mov	dl, [eax+ebx*2]
	movzx	eax, byte [eax+ebx*2+1]
	xchg	cl, dl
	sub	dl, cl
	shl	ch, cl
	mov	cl, dl
.done:
	mov	ebx, eax
}
1390
; Decode DCT coefficients for one 8*8 block in progressive mode
; from input stream, given by pointer esi and length ebp
; in: ebx -> jpeg.work, edx -> 56-byte record of the current scan component,
;     edi -> 64 words of coefficients of the block
; out: coefficients at edi updated; esi, ebp and jpeg.work.huffman_bits
;      advanced; eax and ecx are changed
; On end of data or invalid data the .eof_popN chain below drops N dwords
; from the stack, loading each of them into ebx, and leaves through
; decode_data_unit.eof_pop0. The dword pushed first must therefore always be
; the jpeg.work pointer, so that ebx is valid again on that exit.
; N.B. Speed optimization has sense here.
align 16
decode_progressive_coeff:
	mov	ecx, [ebx+jpeg.work.huffman_bits]
	cmp	[ebx+jpeg.work.ScanStart], 0
	jnz	.ac
; DC coefficient
	cmp	[ebx+jpeg.work.ApproxPosHigh], 0
	jz	.dc_first
; DC coefficient, subsequent passes
; one more bit of precision is added at position ApproxPosLow
	xor	eax, eax
	get_bit 0
	adc	eax, eax
	mov	[ebx+jpeg.work.huffman_bits], ecx
	mov	cl, [ebx+jpeg.work.ApproxPosLow]
	shl	eax, cl
	or	[edi], ax
	ret
.dc_first:
; DC coefficient, first pass
	mov	eax, [edx+16]		; DC table
	push	ebx
	push	edx
	get_huffman_code 2,3
	get_bits 2,3,true		; also pops edx
	pop	ebx
	add	eax, [edx+48]		; add the DC prediction
	mov	[edx+48], ax
	mov	[ebx+jpeg.work.huffman_bits], ecx
	mov	cl, [ebx+jpeg.work.ApproxPosLow]
	shl	eax, cl
	mov	[edi], ax
	ret
.ac:
; AC coefficients
; eax = index of the current coefficient in zigzag order
	movzx	eax, [ebx+jpeg.work.ScanStart]
	cmp	al, [ebx+jpeg.work.ScanEnd]
	ja	.ret
	cmp	dword [edx+52], 0
	jnz	.was_eob		; inside an end-of-band run
	push	ebx
.acloop:
	push	edx
	push	eax
	mov	eax, [edx+20]		; AC table
	get_huffman_code 3,4
	pop	eax
; ebx = symbol: high nibble = run length, low nibble = size of the value
	test	ebx, 15
	jz	.band
	push	eax ebx
	and	ebx, 15
	get_bits 4,5,false
	pop	ebx
	xchg	eax, [esp]		; [esp] = new value, eax = index again
	shr	ebx, 4			; ebx = number of zero coefficients to skip
	mov	edx, [esp+8]		; edx -> jpeg.work (saved ebx)
.zeroloop1:
; skip ebx coefficients that are still zero; every nonzero coefficient met
; on the way takes one correction bit from the stream
	push	eax ebx
	movzx	eax, byte [zigzag+eax]
	xor	ebx, ebx
	cmp	word [edi+eax], bx
	jz	.zeroloop2
	get_bit 5
	jnc	@f
; correction bit is set: move the coefficient away from zero by 1 shl ApproxPosLow
	push	ecx
	mov	cl, [edx+jpeg.work.ApproxPosLow]
	xor	ebx, ebx
	cmp	byte [edi+eax+1], 80h	; CF=1 if the coefficient is positive
	adc	ebx, 0
	add	ebx, ebx
	sub	ebx, 1			; ebx = +1 or -1
	shl	ebx, cl
	pop	ecx
	add	[edi+eax], bx
@@:
	pop	ebx eax
@@:
	add	eax, 1
	cmp	al, [edx+jpeg.work.ScanEnd]
	ja	decode_data_unit.eof_pop3	; run goes beyond the band
	jmp	.zeroloop1
.zeroloop2:
	pop	ebx eax
	sub	ebx, 1
	jns	@b
.nozero1:
	pop	ebx			; ebx = new value (0 after a run of 16)
	test	ebx, ebx
	jz	@f
	push	eax
	movzx	eax, byte [zigzag+eax]
	push	ecx
	mov	cl, [edx+jpeg.work.ApproxPosLow]
	shl	ebx, cl
	pop	ecx
	mov	[edi+eax], bx
	pop	eax
@@:
	add	eax, 1
	cmp	al, [edx+jpeg.work.ScanEnd]
	pop	edx			; pop does not change the flags
	jbe	.acloop
	pop	ebx
	mov	[ebx+jpeg.work.huffman_bits], ecx
.ret:
	ret
.eof_pop5:
	pop	ebx
.eof_pop4:
	pop	ebx
.eof_pop3:
	pop	ebx
.eof_pop2:
	pop	ebx
.eof_pop1:
	pop	ebx
.eof_pop0:
	jmp	decode_data_unit.eof_pop0
.band:
; size 0: either a run of 16 coefficients (run = 15) or an end-of-band run
	shr	ebx, 4
	cmp	ebx, 15
	jnz	.eob
	mov	edx, [esp+4]		; edx -> jpeg.work (saved ebx)
	push	0			; no new value after the run
	jmp	.zeroloop1
.eob:
; end-of-band run: its length is 1 followed by the next ebx bits
	pop	edx
	push	eax
	mov	eax, 1
	test	ebx, ebx
	jz	.eob0
@@:
	get_bit 2
	adc	eax, eax
	sub	ebx, 1
	jnz	@b
.eob0:
	mov	[edx+52], eax
	pop	eax
	pop	ebx
.was_eob:
; this block is covered by an end-of-band run: no new coefficients, only
; correction bits for the coefficients that are already nonzero
	sub	dword [edx+52], 1
	cmp	al, [ebx+jpeg.work.ScanEnd]
	ja	.ret2
; the work pointer is pushed first, so that .eof_pop3 ends by restoring it;
; with only edx and eax on the stack the error exit would leave ebx
; pointing to the component record
	push	ebx
	push	edx
.zeroloop3:
	push	eax
	movzx	eax, byte [zigzag+eax]
	xor	edx, edx
	cmp	word [edi+eax], dx
	jz	@f
	get_bit 3
	jnc	@f
	push	ecx
	mov	cl, [ebx+jpeg.work.ApproxPosLow]
	xor	edx, edx
	cmp	byte [edi+eax+1], 80h	; CF=1 if the coefficient is positive
	adc	edx, 0
	add	edx, edx
	sub	edx, 1			; edx = +1 or -1
	shl	edx, cl
	pop	ecx
	add	[edi+eax], dx
@@:
	pop	eax
	add	eax, 1
	cmp	al, [ebx+jpeg.work.ScanEnd]
	jbe	.zeroloop3
	pop	edx
	pop	ebx
.ret2:
	mov	[ebx+jpeg.work.huffman_bits], ecx
	ret
1564
 
1565
; Final pass for progressive images: runs the IDCT over the accumulated
; coefficients and writes the pixels, one component at a time.
; In: ebx -> jpeg.work structure. Returns immediately if jpeg.work.dct_buffer is 0.
; For every component a record is built at jpeg.work.cur_components
; (fields are listed in the comments next to each stosd), the output is
; rewound to Image.Data, not_interleaved is set and decode_MCU / next_line
; are called until cur_x and cur_y run out. The coefficient buffer is freed
; at the end, also when a component refers to an undefined quantization table.
handle_progressive:
	cmp	[ebx+jpeg.work.dct_buffer], 0
	jnz	@f
	ret
@@:
; information for all components
	lea	esi, [ebx+jpeg.work.components]
	xor	ebp, ebp		; ebp = index of current component
	mov	ecx, [ebx+jpeg.work.pixel_size]	; ecx = components left
.next_component:
	lea	edi, [ebx+jpeg.work.cur_components]
	lodsb	; ComponentID
	lodsd
	mov	ax, 0x0101	; force V = H = 1, keep the two factor bytes in the high word
	stosd	; db V, db H, db VFactor, db HFactor
	xor	eax, eax
	mov	al, byte [edi-1]	; get HFactor
	mul	byte [ebx+jpeg.work.pixel_size]	; number of components
	stosd			; HIncrement_i = HFactor_i * sizeof(pixel)
	movzx	eax, byte [edi-4-2]	; get VFactor
	mul	[ebx+jpeg.work.line_size]	; number of components * image width
	stosd			; VIncrement_i = VFactor_i * sizeof(row)
	lodsb
	and	eax, 3		; eax = quantization table index
	cmp	[ebx+jpeg.work.quant_tables_defined+eax], 0
	jz	.error
	shl	eax, 8		; 256 bytes per quantization table
	lea	eax, [ebx+jpeg.work.quant_tables+eax]
	stosd		; dd QuantizationTable
	stosd		; dd DCTable - ignored
	mov	eax, ebp
	mul	[ebx+jpeg.work.dct_buffer_size]
	add	eax, [ebx+jpeg.work.dct_buffer]
	stosd		; instead of dd ACTable - pointer to current DCT coefficients
	push	ecx
	mov	eax, [ebx+jpeg.work.image]
	mov	eax, [eax+Image.Width]
	movzx	ecx, byte [edi-21]	; get HFactor
;	cdq	; edx = 0 as a result of previous mul
	div	ecx
	stosd		; dd width / HFactor_i
	stosd
	xchg	eax, ecx
	inc	eax
	sub	eax, edx
	stosd		; dd HFactor_i+1 - (width % HFactor_i)
	mov	eax, [ebx+jpeg.work.image]
	mov	eax, [eax+Image.Height]
	movzx	ecx, byte [edi-34]	; get VFactor
	cdq
	div	ecx
	stosd		; dd height / VFactor_i
	stosd
	xchg	eax, ecx
	inc	eax
	sub	eax, edx
	stosd		; dd VFactor_i+1 - (height % VFactor_i)
	pop	ecx
; DCPrediction = 80000000h when the low byte of the component index has odd
; parity (indices 1 and 2), 0 otherwise (indices 0 and 3); decode_data_unit
; subtracts the top byte of this dword from every output sample
	xor	eax, eax
	test	ebp, ebp
	setnp	al
	ror	eax, 1
	stosd		; dd DCPrediction
	mov	eax, ebp
	stosd		; dd ComponentOffset
	inc	ebp
	push	ecx
	mov	[ebx+jpeg.work.cur_components_end], edi
	lea	edx, [edi-56]	; edx -> start of the record just built (56 bytes)
; do IDCT and unpack
	mov	edi, [ebx+jpeg.work.image]
	mov	edi, [edi+Image.Data]
	mov	[ebx+jpeg.work.cur_out_ptr], edi
	mov	[ebx+jpeg.work.not_interleaved], 1
	call	init_limits
.decode_loop:
	call	decode_MCU
	sub	[ebx+jpeg.work.cur_x], 1
	jnz	.decode_loop
	call	next_line
	sub	[ebx+jpeg.work.cur_y], 1
	jnz	.decode_loop
	pop	ecx
	dec	ecx
	jnz	.next_component
; image unpacked, return
.error:
	push	[ebx+jpeg.work.dct_buffer]
	call	[mem.free]
	ret

; Support for YCbCr -> RGB conversion
; R = Y                          + 1.402 * (Cr - 128)
; G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
; B = Y +   1.772 * (Cb - 128)
; When converting YCbCr -> RGB, we need to do some multiplications;
; to be faster, we precalculate the table for all 256 possible values
; Also we approximate fractions with N/65536, this gives sufficient precision
;
; Fills memory starting at color_table_1 with consecutive dwords; all
; registers are preserved (pushad/popad). Each of the four sections below
; writes 128 entries going up from zero, then negates its accumulator and
; writes further runs of 128 entries until the exit test of that section
; succeeds. Sections 1 and 4 keep the fraction in dx and store only the
; integer part (eax); sections 2 and 3 store the value scaled by 65536.
; NOTE(review): convert_to_rgb reads color_table_2..color_table_4 as separate
; labels - presumably they directly follow color_table_1, 256 dwords each; confirm.
img.initialize.jpeg:
;initialize_color_table:
; 1.402 = 1 + 26345/65536, -0.71414 = -46802/65536
; -0.34414 = -22554/65536, 1.772 = 1 + 50594/65536
	pushad
	mov	edi, color_table_1
	mov	ecx, 128
; 1. Cb -> 1.772*Cb
	xor	eax, eax
	mov	dx, 8000h	; fraction accumulator starts at one half
.l1:
	push	ecx
@@:
	stosd
	add	dx, 50594
	adc	eax, 1		; +1 plus the carry out of the 16-bit fraction
	loop	@b
	neg	dx		; negate the 16.16 pair eax:dx
	adc	eax, -1
	neg	eax
	pop	ecx		; pop keeps the flags of neg eax
	jnz	.l1		; repeat until the integer part is back at zero
; 2. Cb -> -0.34414*Cb
	mov	ax, dx
.l2:
	push	ecx
@@:
	stosd
	sub	eax, 22554
	loop	@b
	neg	eax
	pop	ecx
	cmp	ax, dx
	jnz	.l2		; repeat until the low word is back at its starting value
	xor	eax, eax
; 3. Cr -> -0.71414*Cr
.l3:
	push	ecx
@@:
	stosd
	sub	eax, 46802
	loop	@b
	neg	eax
	pop	ecx		; pop keeps the flags of neg eax
	jnz	.l3
; 4. Cr -> 1.402*Cr
.l4:
	push	ecx
@@:
	stosd
	add	dx, 26345
	adc	eax, 1
	loop	@b
	neg	dx
	adc	eax, -1
	neg	eax
	pop	ecx
	jnz	.l4
	popad
	ret

; this function is called at the end of image loading
; In: ebx -> jpeg.work structure.
; Does nothing unless jpeg.work.image is set and the low byte of
; jpeg.work.pixel_size is 3 or 4.
;   pixel_size = 3: every 3-byte pixel (Y, Cb, Cr) is replaced in place by
;                   the bytes (B, G, R), each clamped to 0..255.
;   pixel_size = 4: every 4-byte pixel (Y, Cb, Cr, K) gets its first three
;                   bytes replaced by (255-B)*K/255, (255-G)*K/255 and
;                   (255-R)*K/255 (rounded); the fourth byte is left as is.
; ebx is preserved; eax, ecx, edx, esi, edi, ebp are modified on the
; conversion paths.
convert_to_rgb:
; some checks
	mov	eax, [ebx+jpeg.work.image]
	test	eax, eax	; image exists?
	jz	.ret
	cmp	byte [ebx+jpeg.work.pixel_size], 3	; full-color image?
	jz	.ycc2rgb
	cmp	byte [ebx+jpeg.work.pixel_size], 4
	jz	.ycck2rgb
.ret:
	ret
.ycc2rgb:
; conversion is needed
	mov	esi, [eax+Image.Width]
	imul	esi, [eax+Image.Height]	; esi = number of pixels
; defensive: with zero pixels the counter below would wrap around
	test	esi, esi
	jz	.ret
	mov	edi, [eax+Image.Data]
	push	ebx
; N.B. Speed optimization makes sense here.
align 16
.loop:
;	mov	ebx, [edi]
;	mov	edx, ebx
;	mov	ecx, ebx
;	movzx	ebx, bl		; ebx = Y
;	shr	edx, 16
;	mov	eax, ebx
;	movzx	edx, dl		; edx = Cr
;	movzx	ecx, ch		; ecx = Cb
	movzx	ebx, byte [edi]		; ebx = Y
	movzx	ecx, byte [edi+1]	; ecx = Cb
	mov	eax, ebx
	movzx	edx, byte [edi+2]	; edx = Cr
; B = Y + color_table_1[Cb]
	add	eax, [color_table_1+ecx*4]
	mov	ebp, [color_table_2+ecx*4]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx	; negative -> 0
	add	ebp, [color_table_3+edx*4]
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	sar	ebp, 16		; tables 2 and 3 are scaled by 65536
	or	eax, ecx	; above 255 -> al = 0FFh
	mov	[edi], al
; G = Y + color_table_2[Cb] + color_table_3[Cr]
	lea	eax, [ebx+ebp]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	or	eax, ecx
	mov	[edi+1], al
; R = Y + color_table_4[Cr]
	mov	eax, ebx
	add	eax, [color_table_4+edx*4]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	or	eax, ecx
	mov	[edi+2], al
	add	edi, 3
	sub	esi, 1
	jnz	.loop
	pop	ebx
	ret
.ycck2rgb:
; conversion is needed
	mov	esi, [eax+Image.Width]
	imul	esi, [eax+Image.Height]	; esi = number of pixels
; defensive: with zero pixels the counter below would wrap around
	test	esi, esi
	jz	.ret
	push	ebx
	push	esi		; [esp] = pixels left
	mov	edi, [eax+Image.Data]
	mov	esi, edi
; N.B. Speed optimization makes sense here.
align 16
.kloop:
;	mov	ebx, [esi]
;	mov	edx, ebx
;	mov	ecx, ebx
;	movzx	ebx, bl		; ebx = Y
;	shr	edx, 16
;	mov	eax, ebx
;	movzx	edx, dl		; edx = Cr
;	movzx	ecx, ch		; ecx = Cb
	movzx	ebx, byte [esi]		; ebx = Y
	movzx	ecx, byte [esi+1]	; ecx = Cb
	mov	eax, ebx
	movzx	edx, byte [esi+2]	; edx = Cr
; B = Y + color_table_1[Cb]
	add	eax, [color_table_1+ecx*4]
	mov	ebp, [color_table_2+ecx*4]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx
	add	ebp, [color_table_3+edx*4]
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	sar	ebp, 16
	or	eax, ecx
	xor	al, 0xFF	; al = 255 - B
	mul	byte [esi+3]	; ax = (255 - B) * K
; ah = ax / 255, rounded: add the high byte to the low byte, then one half
	add	al, ah
	adc	ah, 0
	add	al, 80h
	adc	ah, 0
	mov	byte [edi], ah
; G = Y + color_table_2[Cb] + color_table_3[Cr]
	lea	eax, [ebx+ebp]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	or	eax, ecx
	xor	al, 0xFF
	mul	byte [esi+3]
	add	al, ah
	adc	ah, 0
	add	al, 80h
	adc	ah, 0
	mov	byte [edi+1], ah
; R = Y + color_table_4[Cr]
	mov	eax, ebx
	add	eax, [color_table_4+edx*4]
	cmp	eax, 80000000h
	sbb	ecx, ecx
	and	eax, ecx
	cmp	eax, 0x100
	sbb	ecx, ecx
	not	ecx
	or	eax, ecx
	xor	al, 0xFF
	mul	byte [esi+3]
	add	al, ah
	adc	ah, 0
	add	al, 80h
	adc	ah, 0
	mov	byte [edi+2], ah
	add	esi, 4
	add	edi, 4 ;3
	sub	dword [esp], 1
	jnz	.kloop
	pop	eax
	pop	ebx
; release some memory - must succeed because we decrease size
;	add	ecx, 44+1
;	mov	edx, ebx
;	push	68
;	pop	eax
;	push	20
;	pop	ebx
;	int	0x40
;	mov	ebx, eax
	ret

; Decodes one data unit, that is, 8*8 block,
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization makes sense here.
;
; In:  ebx -> jpeg.work structure, edx -> component data
;        [edx+12] -> table of 64 dword multipliers used by the first IDCT pass
;        [edx+16], [edx+20] = tables passed to get_huffman_code for DC / AC
;        [edx+48] = DC prediction (low word is updated)
;        byte [edx+51] is subtracted from every output sample
; Out: 64 bytes at jpeg.work.decoded_data; ebx and esi are preserved on the
;      normal path.
; When jpeg.work.progressive is nonzero nothing is read from the stream:
; the coefficients are taken from the pointer at [edx+20], which is then
; advanced by one block (64 words).
; If the input ends or is invalid, control leaves through .eof_popN:
; esp is reloaded from jpeg.work._esp and execution continues at
; img.decode.jpg.end. The get_huffman_code / get_bits macros are defined
; outside this part of the file.
; NOTE(review): "get_bits ...,true" appears to pop edx on exit (the stack is
; only balanced under that assumption) - confirm against the macro definition.
align 16
decode_data_unit:
; edx -> component data
	cmp	[ebx+jpeg.work.progressive], 0
	jz	@f
	mov	edi, [edx+20]
	add	dword [edx+20], 64*2
	jmp	.coeff_decoded
@@:
; clear the coefficient block
	lea	edi, [ebx+jpeg.work.dct_coeff]
	mov	ecx, 64*2/4
	xor	eax, eax
	rep	stosd
	mov	edi, zigzag+1		; edi -> zigzag entry of the next AC coefficient
	mov	ecx, [ebx+jpeg.work.huffman_bits]
; read DC coefficient
	push	ebx
	mov	eax, [edx+16]
	push	edx
	get_huffman_code 2,3
	get_bits 2,3,true
	pop	ebx
	add	eax, [edx+48]
	mov	[ebx+jpeg.work.dct_coeff], ax
	mov	[edx+48], ax
; read AC coefficients
	push	ebx
@@:
	mov	eax, [edx+20]
	push	edx
	get_huffman_code 2,3
	shr	eax, 4			; eax = run of zeroes
	and	ebx, 15			; ebx = size of the value in bits
	jz	.band
	add	edi, eax
	cmp	edi, zigzag+64
	jae	.eof_pop2		; run goes past the end of the block: bad data
	get_bits 2,3,true
	movzx	ebx, byte [edi]		; byte offset of the coefficient inside the block
	add	ebx, [esp]		; [esp] = work pointer
	mov	[jpeg.work.dct_coeff+ebx], ax
	add	edi, 1
	cmp	edi, zigzag+64
	jb	@b
	jmp	.do_idct
.band:
	pop	edx
	cmp	al, 15
	jnz	.do_idct		; end of block
	add	edi, 16			; run of 16 zeroes
	cmp	edi, zigzag+64
	jb	@b
;	jmp	.eof_pop1
.do_idct:
	pop	ebx
	lea	edi, [ebx+jpeg.work.dct_coeff]
	mov	[ebx+jpeg.work.huffman_bits], ecx
; coefficients loaded, now IDCT
.coeff_decoded:
	mov	eax, [edx+12]
	add	ebx, jpeg.work.idct_tmp_area
	push	8
.idct_loop1:
; first pass: one column per iteration, results are stored as floats
; if all coefficients of the column except the first are zero,
; the output is constant and no transform is needed
	mov	cx, word [edi+1*16]
repeat 6
	or	cx, word [edi+(%+1)*16]
end repeat
	jnz	.real_transform
	fild	word [edi]
	fmul	dword [eax]
	fstp	dword [ebx]
	mov	ecx, [ebx]
repeat 7
	mov	[ebx+%*32], ecx
end repeat
	jmp	.idct_next1
.real_transform:
; S0,...,S7 - transformed values, s0,...,s7 - sought-for values
; S0,...,S7 are dequantized;
; dequantization table elements were multiplied to [idct_pre_table],
; so S0,S1,... later denote S0/2\sqrt{2},S1*\cos{\pi/16}/2,...
; 	sqrt2 = \sqrt{2}, cos = 2\cos{\pi/8},
; 	cos_sum = -2(\cos{\pi/8}+\cos{3\pi/8}), cos_diff = 2(\cos{\pi/8}-\cos{3\pi/8})
; Now formulas:
; s0 = ((S0+S4)+(S2+S6)) + ((S1+S7)+(S3+S5))
; s7 = ((S0+S4)+(S2+S6)) - ((S1+S7)+(S3+S5))
; val0 = ((cos-1)S1-(cos+cos_sum+1)S3+(cos+cos_sum-1)S5-(cos+1)S7)
; s1 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) + val0
; s6 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) - val0
; val1 = (S1+S7-S3-S5)sqrt2 - val0
; s2 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) + val1
; s5 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) - val1
; val2 = (S1-S7)cos_diff - (S1-S3+S5-S7)cos + val1
; s3 = ((S0+S4)-(S2+S6)) - val2
; s4 = ((S0+S4)-(S2+S6)) + val2
	fild	word [edi+3*16]
	fmul	dword [eax+3*32]
	fild	word [edi+5*16]
	fmul	dword [eax+5*32]	; st0=S5,st1=S3
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1		; st0=S5-S3,st1=S5+S3
	fild	word [edi+1*16]
	fmul	dword [eax+1*32]
	fild	word [edi+7*16]
	fmul	dword [eax+7*32]	; st0=S7,st1=S1
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1		; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
	fadd	st3,st0
	fadd	st0,st0
	fsub	st0,st3		; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
	fmul	[idct_sqrt2]
	fld	st2
	fadd	st0,st2
	fmul	[idct_cos]	; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
				; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
	fxch	st2
	fmul	[idct_cos_diff]
	fsub	st0,st2		; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
	fxch	st3
	fmul	[idct_cos_sum]
	fadd	st0,st2		; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
	fsub	st0,st4		; st0=val0
	fsub	st1,st0		; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
				; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
	fxch	st2
	fstp	st0
	fadd	st2,st0		; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7

	fild	word [edi+0*16]
	fmul	dword [eax+0*32]
	fild	word [edi+4*16]
	fmul	dword [eax+4*32]	; st0=S4,st1=S0
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1		; st0=S0+S4,st1=S0-S4
	fild	word [edi+6*16]
	fmul	dword [eax+6*32]
	fild	word [edi+2*16]
	fmul	dword [eax+2*32]	; st0=S2,st1=S6
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1		; st0=S2-S6,st1=S2+S6
	fmul	[idct_sqrt2]
	fsub	st0,st1
	fsub	st3,st0
	fadd	st0,st0
	fadd	st0,st3		; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
				; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
	fxch	st1
	fsub	st2,st0
	fadd	st0,st0
	fadd	st0,st2		; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
				; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
				; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
	fsubr	st7,st0
	fadd	st0,st0
	fsub	st0,st7
	fstp	dword [ebx+0*32]
	fsubr	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fstp	dword [ebx+1*32]
	fadd	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fstp	dword [ebx+3*32]
	fsubr	st1,st0
	fadd	st0,st0
	fsub	st0,st1
	fstp	dword [ebx+2*32]
	fstp	dword [ebx+5*32]
	fstp	dword [ebx+6*32]
	fstp	dword [ebx+4*32]
	fstp	dword [ebx+7*32]
.idct_next1:
	add	ebx, 4
	add	edi, 2
	add	eax, 4
	sub	dword [esp], 1
	jnz	.idct_loop1
	pop	ecx
	sub	ebx, 8*4	; ebx -> start of idct_tmp_area again
	mov	ecx, 8
.idct_loop2:
; second pass: same transform on every row of 8 floats,
; results are stored in place as dword integers
	fld	dword [ebx+3*4]
	fld	dword [ebx+5*4]
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1		; st0=S5-S3,st1=S5+S3
	fld	dword [ebx+1*4]
	fld	dword [ebx+7*4]
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1		; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
	fadd	st3,st0
	fadd	st0,st0
	fsub	st0,st3		; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
	fmul	[idct_sqrt2]
	fld	st2
	fadd	st0,st2
	fmul	[idct_cos]	; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
				; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
	fxch	st2
	fmul	[idct_cos_diff]
	fsub	st0,st2		; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
	fxch	st3
	fmul	[idct_cos_sum]
	fadd	st0,st2		; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
	fsub	st0,st4		; st0=val0
	fsub	st1,st0		; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
				; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
	fxch	st2
	fstp	st0
	fadd	st2,st0		; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7

	fld	dword [ebx+0*4]
	fld	dword [ebx+4*4]
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1		; st0=S0+S4,st1=S0-S4
	fld	dword [ebx+6*4]
	fld	dword [ebx+2*4]
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1		; st0=S2-S6,st1=S2+S6
	fmul	[idct_sqrt2]
	fsub	st0,st1
	fsub	st3,st0
	fadd	st0,st0
	fadd	st0,st3		; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
				; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
	fxch	st1
	fsub	st2,st0
	fadd	st0,st0
	fadd	st0,st2		; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
				; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
				; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
	fsubr	st7,st0
	fadd	st0,st0
	fsub	st0,st7
	fistp	dword [ebx+0*4]
	fsubr	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fistp	dword [ebx+1*4]
	fadd	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fistp	dword [ebx+3*4]
	fsubr	st1,st0
	fadd	st0,st0
	fsub	st0,st1
	fistp	dword [ebx+2*4]
	fistp	dword [ebx+5*4]
	fistp	dword [ebx+6*4]
	fistp	dword [ebx+4*4]
	fistp	dword [ebx+7*4]

	add	ebx, 32
	sub	ecx, 1
	jnz	.idct_loop2

	sub	ebx, 32*8	; ebx -> start of idct_tmp_area again
	mov	ecx, 64
	lea	edi, [ebx - jpeg.work.idct_tmp_area + jpeg.work.decoded_data - 1]
	push	esi
.idct_loop3:
; third pass: add 80h, clamp to 0..255, subtract byte [edx+51], store as byte
	mov	eax, [ebx]
	add	ebx, 4
	add	eax, 80h
	cmp	eax, 80000000h
	sbb	esi, esi
	add	edi, 1
	and	eax, esi	; negative -> 0
	cmp	eax, 100h
	sbb	esi, esi
	not	esi
	or	eax, esi	; above 255 -> al = 0FFh
	sub	al, [edx+51]
	sub	ecx, 1
	mov	[edi], al	; mov keeps the flags of sub ecx, 1
	jnz	.idct_loop3
	pop	esi
	sub	ebx, 64*4 + jpeg.work.idct_tmp_area	; ebx -> jpeg.work again
; done
	ret

; End-of-input exits: pop N dwords into ebx; the last one popped must be
; the work pointer, which is needed to reload esp below.
.eof_pop3:
	pop	ebx
.eof_pop2:
	pop	ebx
.eof_pop1:
	pop	ebx
.eof_pop0:
; EOF or incorrect data during scanning
	mov	esp, [ebx + jpeg.work._esp]
	jmp	img.decode.jpg.end

; JPEG encoding stub: returns 0 in eax and removes 8 bytes of arguments
; from the stack (ret 8); nothing is encoded.
img.encode.jpg:
	xor	eax, eax
	ret	8

; Zigzag order table: 64 bytes, entry i is the byte offset of the i-th
; coefficient (in zigzag order) inside a block of 8*8 words.
; The first group of repeats emits the 8 anti-diagonals of the upper-left
; triangle (1+2+...+8 = 36 entries), the second one the 7 anti-diagonals of
; the lower-right triangle (7+6+...+1 = 28 entries); the direction alternates
; with the parity of the diagonal number.
zigzag:
; (x,y) -> 2*(x+y*8)
repeat 8
	.cur = %
	if .cur and 1
		repeat %
			db	2*((%-1) + (.cur-%)*8)
		end repeat
	else
		repeat %
			db	2*((.cur-%) + (%-1)*8)
		end repeat
	end if
end repeat
repeat 7
	.cur = %
	if .cur and 1
		repeat 8-%
			db	2*((%+.cur-1) + (8-%)*8)
		end repeat
	else
		repeat 8-%
			db	2*((8-%) + (%+.cur-1)*8)
		end repeat
	end if
end repeat

; Single-precision constants for the IDCT.
; idct_pre_table holds the 8 scale factors c_0..c_7; the comments inside
; decode_data_unit state that the dequantization tables are pre-multiplied
; by them. The four values after the table are the multipliers used by
; both IDCT passes of decode_data_unit.
align 4
idct_pre_table:
; c_0 = 1/(2\sqrt{2}), c_i = cos(i*\pi/16)/2
	dd	0.35355339, 0.49039264, 0.461939766, 0.41573481
	dd	0.35355339, 0.27778512, 0.19134172, 0.09754516
idct_sqrt2	dd	1.41421356	; \sqrt{2}
idct_cos	dd	1.847759065	; 2\cos{\pi/8}
idct_cos_sum	dd	-2.61312593	; -2(\cos{\pi/8} + \cos{3\pi/8})
idct_cos_diff	dd	1.08239220	; 2(\cos{\pi/8} - \cos{3\pi/8})
;---------------------------------------------------------------------