WebSVN – Kolibri OS – Path Comparison – / – /programs/develop/libraries/libs-dev/libimg/jpeg/ Rev 998 and /programs/develop/libraries/libs-dev/libimg/jpeg/ Rev 999

Regard whitespace Rev 998 → Rev 999

 /programs/develop/libraries/libs-dev/libimg/jpeg/jpeg.asm
 ,0 → 1,2231
+;;================================================================================================;;
+;;//// jpeg.asm //// (c) diamond, 2008-2009 //////////////////////////////////////////////////////;;
+;;================================================================================================;;
+;;                                                                                                ;;
+;; This file is part of Common development libraries (Libs-Dev).                                  ;;
+;;                                                                                                ;;
+;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
+;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
+;; of the License, or (at your option) any later version.                                         ;;
+;;                                                                                                ;;
+;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without  ;;
+;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  ;;
+;; Lesser General Public License for more details.                                                ;;
+;;                                                                                                ;;
+;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev.  ;;
+;; If not, see <http://www.gnu.org/licenses/>.                                                    ;;
+;;                                                                                                ;;
+;;================================================================================================;;
+include 'jpeg.inc'
+img.is.jpg:
+; Quick format probe.
+; in:  [esp+4] -> candidate data, [esp+8] = its size in bytes (callee pops both: ret 8)
+; out: eax = 1 if the first marker reported by get_marker is SOI (0xD8), eax = 0 otherwise
+; esi and ebp are saved and restored around the probe.
+        push    esi ebp
+        mov     ebp, [esp+16]           ; ebp = byte budget for get_marker
+        mov     esi, [esp+12]           ; esi = read cursor for get_marker
+        call    get_marker
+        jc      .reject                 ; no marker could be read at all
+        cmp     al, 0xD8
+        jne     .reject                 ; a marker, but not Start-Of-Image
+        mov     eax, 1
+        jmp     .done
+.reject:
+        xor     eax, eax
+.done:
+        pop     ebp esi
+        ret     8
+img.decode.jpg:
+; Decode a JPEG held in memory.
+; in:  [esp+4] -> JPEG data, [esp+8] = data size in bytes (both removed by ret 8)
+; out: eax = jpeg.work.image as left by the decoding steps (it starts out as 0),
+;      or 0 if the work area cannot be allocated; other registers restored by popad
+; While decoding: esi -> unread data, ebp = unread byte count, ebx -> jpeg.work area.
+        finit
+        pushad
+        mov     esi, [esp+20h+4]        ; esi -> JPEG data
+        mov     ebp, [esp+20h+8]        ; ebp = data size
+@@:
+; allocate area for JPEG processing
+        push    sizeof.jpeg.work
+        call    [mem.alloc]
+        test    eax, eax
+        jz      .ret
+        mov     ebx, eax
+        xor     ecx, ecx
+        mov     [ebx + jpeg.work.image], ecx
+        mov     [ebx + jpeg.work.dct_buffer], ecx
+; remember the stack level right after pushad; .end rewinds to it
+        mov     [ebx + jpeg.work._esp], esp
+; check for SOI [Start-Of-Image] marker
+        call    get_marker
+        jc      .end
+        cmp     al, 0xD8        ; SOI?
+        jz      .soi_ok
+.end:
+; general exit from the function
+; Some error exits jump here with temporaries still pushed (the SOS component
+; parser bails out to .end3 between its "push ecx" and "pop ecx"), which would
+; make popad/ret below read the wrong stack slots. Rewind to the recorded level
+; first; on balanced paths this is a no-op.
+        mov     esp, [ebx + jpeg.work._esp]
+; for progressive mode: convert loaded DCT coefficients to image
+        call    handle_progressive
+; convert full-color images to RGB
+        call    convert_to_rgb
+        push    [ebx + jpeg.work.image]
+        push    ebx
+        call    [mem.free]
+        pop     eax             ; eax = image pointer pushed before the work area was freed
+.ret:
+        mov     [esp+28], eax   ; store the result into pushad's saved-eax slot
+        popad
+        ret     8
+.soi_ok:                                ; reached once the stream is known to start with SOI
+        mov     [ebx + jpeg.work.restart_interval], ecx ; ecx is still 0 from the work-area setup: no restart interval yet
+        mov     [ebx + jpeg.work.adobe_ycck], cl        ; Adobe YCCK flag starts cleared
+; main loop: fetch and dispatch one marker at a time; the segment handlers jump back here
+.markers_loop:
+        call    get_marker
+        jc      .end                    ; no further marker: finish with whatever has been decoded
+; markers RSTn do not have parameters
+; N.B. They can not exist in this part of JPEG, but let's be liberal :)
+        cmp     al, 0xD0
+        jb      @f
+        cmp     al, 0xD8
+        jb      .markers_loop           ; 0xD0..0xD7: skip it and read the next marker
+@@:
+        cmp     al, 0xD9        ; EOI? [invalid here]
+        jz      .end
+; ok, this is marker segment
+; first word is length of the segment
+        cmp     ebp, 2
+        jb      .end
+        xor     edx, edx
+        mov     dl, [esi+1]
+        mov     dh, [esi]       ; edx = marker length, al = marker value
+        sub     ebp, edx        ; charge the whole segment now; esi still points at the length field
+        jb      .end            ; segment claims more bytes than remain
+        cmp     al, 0xDB        ; DQT?
+        jz      .dqt
+        cmp     al, 0xC4        ; DHT?
+        jz      .dht
+        cmp     al, 0xCC        ; DAC? [ignored - no arithmetic coding]
+        jz      .next_marker
+        cmp     al, 0xDD        ; DRI?
+        jz      .dri
+        cmp     al, 0xDA        ; SOS?
+        jz      .sos
+        cmp     al, 0xC0
+        jb      @f
+        cmp     al, 0xD0
+        jb      .sofn           ; remaining 0xC0..0xCF values are treated as SOFn (C4/CC were handled above)
+@@:
+        cmp     al, 0xEE        ; APP14?
+        jz      .app14
+; unrecognized marker; let's skip it and hope for the best
+.next_marker:
+        add     esi, edx        ; step over the segment (edx includes the length field)
+        jmp     .markers_loop
+.app14:
+; APP14 segment: look for the "Adobe" signature right after the length field and record the YCCK flag
+        cmp     dx, 14                  ; the bytes examined below go up to [esi+13]
+        jb      .next_marker
+        cmp     byte [esi+2], 'A'
+        jnz     .next_marker
+        cmp     dword [esi+3], 'dobe'
+        jnz     .next_marker
+        cmp     byte [esi+13], 2
+        setz    [ebx + jpeg.work.adobe_ycck]    ; flag = 1 only when that byte equals 2
+        jmp     .next_marker            ; the segment itself is skipped like any other
+.dqt:
+; DQT marker found: load quantization tables, pre-multiplied by idct_pre_table factors, as 64 dword floats each
+; length: 2 bytes for length field + 65 bytes per table
+        sub     edx, 2
+        jc      .end
+        lodsw                           ; step esi over the length field
+.dqt_loop:
+        test    edx, edx                ; edx = bytes of this segment not yet consumed
+        jz      .markers_loop
+        sub     edx, 1+64
+        jc      .end
+        lodsb                           ; al = precision / table-id byte
+; 8-bit DCT-based process shall not use a 16-bit precision quantization table.
+        test    al, 0xF0
+        jnz     .end
+        and     eax, 3                  ; eax = table index 0..3
+        mov     [ebx+jpeg.work.quant_tables_defined+eax], 1
+        shl     eax, 8                  ; 256 bytes per stored table
+        lea     edi, [ebx+eax+jpeg.work.quant_tables]
+        xor     ecx, ecx                ; ecx = index of the entry in file order
+@@:
+        xor     eax, eax
+        lodsb
+        push    eax
+        fild    dword [esp]             ; st0 = table entry as a float
+        pop     eax
+        movzx   eax, byte [zigzag+ecx]  ; presumably zigzag[] holds 2*position (byte offset into a word array) - TODO confirm against the table
+        add     eax, eax                ; eax = byte offset of the dword slot in the destination table
+        push    eax
+        and     eax, 7*4
+        fmul    dword [idct_pre_table+eax]      ; factor selected by bits 2..4 of the offset
+        pop     eax
+        push    eax
+        shr     eax, 3
+        and     eax, 7*4
+        fmul    dword [idct_pre_table+eax]      ; factor selected by bits 5..7 of the offset
+        pop     eax
+        fstp    dword [edi+eax]
+        inc     ecx
+        cmp     ecx, 64
+        jb      @b
+        jmp     .dqt_loop
+.dri:
+; DRI marker found: record the restart interval
+        cmp     edx, 4          ; length must be 4
+        jnz     .end2
+        movzx   eax, word [esi+2]       ; 16-bit value stored right after the length field
+        xchg    al, ah                  ; stored high byte first; swap into native order
+        mov     [ebx+jpeg.work.restart_interval], eax
+        jmp     .next_marker
+.dht:
+; DHT marker found: build lookup tables for every Huffman table in the segment
+        sub     edx, 2
+        jc      .end2
+        lodsw                           ; step esi over the length field
+.dht_loop:
+        test    edx, edx                ; edx = bytes of this segment not yet consumed
+        jz      .markers_loop
+        sub     edx, 17                 ; 1 class/id byte + 16 code-count bytes
+        jc      .end2
+; next Huffman table; find place for it
+        lodsb
+        mov     edi, eax
+        and     eax, 0x10               ; bit 4 of the byte ...
+        and     edi, 3                  ; ... and its low 2 bits ...
+        shr     eax, 2
+        or      edi, eax                ; ... combine into a slot number 0..7
+        mov     [ebx+jpeg.work.dc_huffman_defined+edi], 1
+;       shl     edi, 11
+        imul    edi, max_hufftable_size
+        lea     edi, [ebx+edi+jpeg.work.dc_huffman]     ; edi -> destination table
+; get table size
+        xor     eax, eax
+        push    16
+        pop     ecx
+@@:
+        add     al, [esi]               ; ax accumulates the sum of the 16 count bytes
+        adc     ah, 0
+        inc     esi
+        loop    @b
+        cmp     ax, 0x100
+        ja      .end2                   ; more than 256 symbols is rejected
+        sub     edx, eax                ; the symbol bytes also belong to this table
+        jc      .end2
+; construct Huffman tree
+        push    ebx edx                 ; ebx is used as a scratch counter below; both are restored on every exit
+        ; lea   eax, [edi+256*8]
+        ; push  eax
+        ; push  16
+        ; mov   edx, esi
+; @@:
+        ; cmp   byte [edx-1], 0
+        ; jnz   @f
+        ; dec   edx
+        ; dec   dword [esp]
+        ; jmp   @b
+; @@:
+        ; sub   edx, [esp]
+        ; lea   eax, [edi+8]
+        ; push  2
+        ; pop   ecx
+; .lenloop:
+        ; mov   bl, byte [edx]
+        ; test  bl, bl
+        ; jz    .len1done
+        ; push  eax
+        ; xor   eax, eax
+; .len1loop:
+        ; dec   ecx
+        ; js    .dhterr
+        ; cmp   edi, [esp+8]
+        ; jae   .dhterr
+        ; lodsb
+        ; stosd
+        ; dec   bl
+        ; jnz   .len1loop
+        ; pop   eax
+; .len1done:
+        ; jecxz .len2done
+        ; push  ecx
+; .len2loop:
+        ; cmp   eax, [esp+8]
+        ; jb    @f
+        ; or    eax, -1
+; @@:
+        ; cmp   edi, [esp+8]
+        ; jae   .dhterr
+        ; stosd
+        ; add   eax, 8
+        ; jnb   @f
+        ; or    eax, -1
+; @@:
+        ; loop  .len2loop
+        ; pop   ecx
+; .len2done:
+        ; add   ecx, ecx
+        ; inc   edx
+        ; dec   dword [esp]
+        ; jnz   .lenloop
+        ; pop   eax
+        ; pop   eax
+        ; sub   eax, edi
+        ; shr   eax, 2
+        ; cmp   eax, ecx
+        ; ja    @f
+        ; mov   ecx, eax
+; @@:
+        ; or    eax, -1
+        ; rep   stosd
+        ; pop   edx ebx
+        ; jmp   .dht_loop
+; .dhterr:
+        ; ;pop  eax eax eax edx ebx
+        ; add   esp, 5*4
+        lea     eax, [edi+256*2]        ; end of the first region: 256 word entries
+        push    eax
+        lea     edx, [esi-16]           ; edx -> the 16 count bytes; esi -> symbol bytes
+        mov     ah, 1                   ; ah = current code length
+        mov     ecx, 128                ; entries filled per code of this length (halved for each longer length)
+.dht_l1:
+        movzx   ebx, byte [edx]         ; ebx = number of codes with this length
+        inc     edx
+        test    ebx, ebx
+        jz      .dht_l3
+.dht_l2:
+        cmp     edi, [esp]
+        jae     .dhterr1                ; codes would overflow the 256-entry region
+        lodsb                           ; al = next symbol
+        xchg    al, ah                  ; entry word: low byte = length, high byte = symbol
+        push    ecx
+        rep     stosw
+        pop     ecx
+        xchg    al, ah
+        dec     ebx
+        jnz     .dht_l2
+.dht_l3:
+        inc     ah
+        shr     ecx, 1
+        jnz     .dht_l1                 ; 128,64,...,1: eight passes, i.e. lengths 1..8
+        push    edi                     ; [esp+4] after next push: fill position inside the first region
+        mov     edi, [esp+4]
+        push    edi                     ; [esp]: end of the current 16-word group (starts at end of first region, so a group is opened on first use)
+        mov     eax, 0x00090100         ; ah restarts at 1; NOTE(review): high word (9, group number 0) looks like the escape entry written into the first region - confirm with the decoder
+        mov     cl, 8                   ; ecx = 8 entries per code for the next length
+.dht_l4:
+        movzx   ebx, byte [edx]
+        inc     edx
+        test    ebx, ebx
+        jz      .dht_l6
+.dht_l5:
+        cmp     edi, [esp]
+        jb      @f                      ; still room in the current 16-word group
+        mov     edi, [esp+4]
+        rol     eax, 16                 ; bring the escape word into ax
+        cmp     edi, [esp+8]
+        jae     .dhterr2                ; first region already full
+        stosw                           ; write one escape entry into the first region
+        inc     ah                      ; next group number
+        mov     [esp+4], edi
+        pop     edi
+        push    edi                     ; edi = start of the new group (= old group end)
+        rol     eax, 16
+        add     dword [esp], 16*2       ; a group is 16 word entries
+@@:
+        lodsb
+        xchg    al, ah
+        push    ecx
+        rep     stosw
+        pop     ecx
+        xchg    al, ah
+        dec     ebx
+        jnz     .dht_l5
+.dht_l6:
+        inc     ah
+        shr     ecx, 1
+        jnz     .dht_l4                 ; 8,4,2,1: four more code lengths
+        push    edi
+        movzx   ebx, byte [edx]         ; ebx = weighted sum 8*n0+4*n1+2*n2+n3 of the last four count bytes
+        add     ebx, ebx
+        add     bl, [edx+1]
+        adc     bh, 0
+        add     ebx, ebx
+        add     bl, [edx+2]
+        adc     bh, 0
+        add     ebx, ebx
+        add     bl, [edx+3]
+        adc     bh, 0
+        add     ebx, 15
+        shr     ebx, 4                  ; rounded up to whole 16-entry groups
+        mov     cl, 8
+        lea     ebx, [edi+ebx*2]
+        sub     ebx, [esp+12]
+        add     ebx, 31
+        shr     ebx, 5
+        mov     edi, ebx
+        shl     edi, 5
+        add     edi, [esp+12]           ; edi = start of the third region, aligned to a 32-byte group relative to [esp+12]
+        xor     ebx, 9
+        shl     ebx, 16
+        xor     eax, ebx                ; NOTE(review): rewrites the high word of eax with the starting group number - exact encoding not verified here
+        push    edi
+.dht_l7:
+        movzx   ebx, byte [edx]
+        inc     edx
+        test    ebx, ebx
+        jz      .dht_l10
+.dht_l8:
+        cmp     edi, [esp]
+        jb      .dht_l9                 ; still room in the current third-level group
+        mov     edi, [esp+4]
+        cmp     edi, [esp+8]
+        jb      @f                      ; room for another escape entry in the current second-level group
+        mov     edi, [esp+12]
+        cmp     edi, [esp+16]
+        jae     .dhterr3                ; first region full as well
+        mov     al, 9
+        stosb                           ; escape entry in the first region: length byte 9 ...
+        rol     eax, 8
+        stosb                           ; ... followed by a group number
+        inc     eax
+        ror     eax, 8
+        mov     [esp+12], edi
+        mov     edi, [esp+8]
+        add     dword [esp+8], 16*2     ; open a fresh second-level group
+@@:
+        mov     al, 9
+        stosb                           ; escape entry in the second-level group
+        rol     eax, 16
+        stosb
+        inc     eax
+        ror     eax, 16
+        mov     [esp+4], edi
+        pop     edi
+        push    edi
+        add     dword [esp], 16*2       ; open a fresh third-level group
+.dht_l9:
+        lodsb
+        xchg    al, ah
+        push    ecx
+        rep     stosw
+        pop     ecx
+        xchg    al, ah
+        dec     ebx
+        jnz     .dht_l8
+.dht_l10:
+        inc     ah
+        shr     ecx, 1
+        jnz     .dht_l7
+        push    -1
+        pop     eax                     ; al = 0xFF filler for unused entries
+        pop     ecx
+        sub     ecx, edi
+        rep     stosb                   ; pad the rest of the current third-level group
+        pop     edi
+        pop     ecx
+        sub     ecx, edi
+        rep     stosb                   ; pad the rest of the current second-level group
+        pop     edi
+        pop     ecx
+        sub     ecx, edi
+        rep     stosb                   ; pad the rest of the first region
+        pop     edx ebx
+        jmp     .dht_loop
+.dhterr3:                               ; error exits: drop exactly the temporaries pushed so far
+        pop     eax eax
+.dhterr2:
+        pop     eax eax
+.dhterr1:
+        pop     eax
+        pop     edx ebx
+.end2:                                  ; short-range relay to the common exit
+        jmp     .end
+.sofn:
+; SOFn marker found: validate the frame header, create the image and fill in the per-frame geometry
+        cmp     [ebx+jpeg.work.image], 0
+        jnz     .end2   ; only one frame is allowed
+; only SOF0 [baseline sequential], SOF1 [extended sequential], SOF2 [progressive]
+; nobody supports other compression methods
+        cmp     al, 0xC2
+        ja      .end2
+        setz    [ebx+jpeg.work.progressive]     ; 1 exactly for marker 0xC2
+; Length must be at least 8
+        sub     edx, 8
+        jb      .end2
+; Sample precision in JFIF must be 8 bits
+        cmp     byte [esi+2], 8
+        jnz     .end2
+; Color space in JFIF is either YCbCr (color images, 3 components)
+;                        or Y (grey images, 1 component)
+        movzx   eax, byte [esi+7]
+        cmp     al, 1
+        jz      @f
+        cmp     al, 3
+        jz      @f
+; Adobe products sometimes use YCCK color space with 4 components
+        cmp     al, 4
+        jnz     .end2
+        cmp     [ebx+jpeg.work.adobe_ycck], 0
+        jz      .end2
+@@:
+        mov     edi, eax        ; edi = number of components
+        lea     eax, [eax*3]
+        sub     edx, eax
+        jnz     .end2           ; length must be exactly 8 + 3 bytes per component
+; image type: 8 bpp for grayscale JPEGs, 24 bpp for normal,
+; 32 bpp for Adobe YCCK
+        push    Image.bpp8
+        pop     eax
+        cmp     edi, 1
+        jz      @f
+        inc     eax     ; Image.bpp24 = 2
+        cmp     edi, 3
+        jz      @f
+        inc     eax     ; Image.bpp32 = 3
+@@:
+        push    eax             ; pushed ahead of time as the last argument of the img.create call below
+; get width and height
+; width must be nonzero
+; height must be nonzero - nobody supports DNL markers
+        mov     ah, [esi+3]
+        mov     al, [esi+4]     ; eax = height
+        xor     ecx, ecx
+        mov     ch, [esi+5]
+        mov     cl, [esi+6]     ; ecx = width
+; allocate memory for image
+        stdcall img.create, ecx, eax
+        test    eax, eax
+        jz      .end2
+        mov     [ebx + jpeg.work.image], eax
+; create grayscale palette if needed
+        cmp     edi, 1
+        jnz     .no_create_palette
+        push    ecx edi
+        mov     edi, [eax + Image.Palette]
+        xor     eax, eax
+        mov     ecx, 256
+@@:
+        stosd                   ; entry i = 0x00iiiiii
+        add     eax, 0x010101
+        loop    @b
+        pop     edi ecx
+.no_create_palette:
+; other image characteristics
+        mov     eax, edi
+        shl     eax, 3
+        mov     [ebx + jpeg.work.delta_x], eax  ; 8 * number of components
+        mov     [ebx + jpeg.work.pixel_size], edi
+        ;mov    eax, edi
+        imul    eax, ecx        ; NOTE(review): uses ecx as the width, so it relies on img.create leaving ecx intact - confirm
+        mov     [ebx + jpeg.work.delta_y], eax  ; 8 * components * width
+        shr     eax, 3
+        mov     [ebx + jpeg.work.line_size], eax        ; components * width
+        add     esi, 8          ; esi -> first component specification
+        mov     ecx, edi
+        lea     edi, [ebx + jpeg.work.components]
+        xor     eax, eax
+        xor     edx, edx        ; dl / dh track the largest low / high nibble seen
+.sof_parse_comp:
+        movsb   ; db ComponentIdentifier
+        lodsb
+        mov     ah, al
+        and     al, 0xF
+        jz      .end3           ; a zero sampling factor is rejected
+        shr     ah, 4
+        jz      .end3
+        stosd   ; db V, db H, db ?, db ? (will be filled later)
+        cmp     dl, al
+        ja      @f
+        mov     dl, al
+@@:
+        cmp     dh, ah
+        ja      @f
+        mov     dh, ah
+@@:
+        movsb   ; db QuantizationTableID
+        loop    .sof_parse_comp
+        mov     word [ebx + jpeg.work.max_v], dx        ; stores dl at max_v and dh in the byte after it
+        movzx   eax, dh         ; eax = largest H
+        movzx   edx, dl         ; edx = largest V
+        push    eax edx
+        shl     eax, 3
+        shl     edx, 3
+        mov     [ebx + jpeg.work.block_width], eax      ; 8 * largest H
+        mov     [ebx + jpeg.work.block_height], edx     ; 8 * largest V
+        pop     edx eax
+        push    eax edx         ; kept on the stack for the factor loop below
+        imul    eax, [ebx + jpeg.work.delta_x]
+        mov     [ebx + jpeg.work.block_delta_x], eax
+        imul    edx, [ebx + jpeg.work.delta_y]
+        mov     [ebx + jpeg.work.block_delta_y], edx
+        mov     ecx, [ebx + jpeg.work.image]
+        mov     eax, [ecx + Image.Width]
+        add     eax, [ebx + jpeg.work.block_width]
+        dec     eax
+        xor     edx, edx
+        div     [ebx + jpeg.work.block_width]
+        mov     [ebx + jpeg.work.x_num_blocks], eax     ; width / block_width, rounded up
+        mov     eax, [ecx + Image.Height]
+        add     eax, [ebx + jpeg.work.block_height]
+        dec     eax
+        xor     edx, edx
+        div     [ebx + jpeg.work.block_height]
+        mov     [ebx + jpeg.work.y_num_blocks], eax     ; height / block_height, rounded up
+        mov     ecx, [ebx + jpeg.work.pixel_size]       ; loop over all components
+        pop     edx             ; edx = largest V; largest H stays on the stack
+        lea     edi, [ebx + jpeg.work.components]
+@@:
+        mov     eax, edx
+        div     byte [edi+1]    ; VMax / V_i = VFactor_i
+        mov     byte [edi+3], al        ; db VFactor
+        pop     eax
+        push    eax
+        div     byte [edi+2]    ; HMax / H_i = HFactor_i
+        mov     byte [edi+4], al        ; db HFactor
+        add     edi, 6          ; each component record is 6 bytes
+        loop    @b
+        pop     eax             ; drop the saved largest H
+        cmp     [ebx + jpeg.work.progressive], 0
+        jz      .sof_noprogressive
+        mov     eax, [ebx + jpeg.work.x_num_blocks]
+        mul     [ebx + jpeg.work.block_width]
+        mul     [ebx + jpeg.work.y_num_blocks]
+        mul     [ebx + jpeg.work.block_height]
+        add     eax, eax        ; 2 bytes per sample of the block-padded image
+        mov     [ebx + jpeg.work.dct_buffer_size], eax  ; size of one component's coefficient area
+        mul     [ebx + jpeg.work.pixel_size]
+        push    eax
+        call    [mem.alloc]
+        test    eax, eax
+        jnz     @f
+        xchg    eax, [ebx + jpeg.work.image]    ; allocation failed: clear the image field and destroy the image
+        push    eax
+        call    img.destroy
+        jmp     .end
+@@:
+        mov     [ebx + jpeg.work.dct_buffer], eax
+.sof_noprogressive:
+        jmp     .markers_loop
+.end3:                          ; short-range relay to the common exit
+        jmp     .end
+.sos:
+; SOS marker found: build one 56-byte cur_components record per scan component, then decode the scan
+; frame must be already opened
+        cmp     [ebx + jpeg.work.image], 0
+        jz      .end3
+        cmp     edx, 6
+        jb      .end3
+; parse marker
+        movzx   eax, byte [esi+2]       ; number of components in this scan
+        test    eax, eax
+        jz      .end3           ; must be nonzero
+        cmp     al, byte [ebx + jpeg.work.pixel_size]
+        ja      .end3           ; must be <= total number of components
+;       mov     [ns], eax
+        cmp     al, 1
+        setz    [ebx + jpeg.work.not_interleaved]       ; a single-component scan is not interleaved
+        lea     ecx, [6+eax+eax]
+        cmp     edx, ecx
+        jnz     .end3           ; length must be exactly 6 + 2 bytes per component
+        mov     ecx, eax
+        lea     edi, [ebx + jpeg.work.cur_components]
+        add     esi, 3
+.sos_find_comp:
+        lodsb   ; got ComponentID, look for component info
+        push    ecx esi
+        mov     ecx, [ebx + jpeg.work.pixel_size]
+        lea     esi, [ebx + jpeg.work.components]
+        and     dword [edi+48], 0       ; DCPrediction = 0
+        and     dword [edi+52], 0       ; used here to count the component's index in the frame
+@@:
+        cmp     [esi], al
+        jz      @f
+        inc     dword [edi+52]
+        add     esi, 6
+        loop    @b
+@@:
+        mov     eax, [esi+1]
+        mov     dl, [esi+5]     ; dl = QuantizationTableID of the record
+        pop     esi ecx
+        jnz     .end3   ; bad ComponentID
+        cmp     [ebx + jpeg.work.not_interleaved], 0
+        jz      @f
+        mov     ax, 0x0101      ; non-interleaved scan: force V = H = 1
+@@:
+        stosd           ; db V, db H, db VFactor, db HFactor
+        push    ecx             ; remaining-component counter, restored by the pop ecx further down
+        xor     eax, eax
+        mov     al, byte [edi-1]        ; get HFactor
+        mul     byte [ebx+jpeg.work.pixel_size] ; number of components
+        stosd                   ; HIncrement_i = HFactor_i * sizeof(pixel)
+        mov     al, byte [edi-4-2]      ; get VFactor
+        mul     byte [ebx+jpeg.work.pixel_size] ; number of components
+        mov     ecx, [ebx+jpeg.work.image]
+        imul    eax, [ecx+Image.Width]  ; image width
+        stosd                   ; VIncrement_i = VFactor_i * sizeof(row)
+        xchg    eax, edx
+        and     eax, 3          ; eax = quantization table index
+        cmp     [ebx+jpeg.work.quant_tables_defined+eax], 0
+        jz      .end3           ; NOTE(review): taken with the ecx pushed above still on the stack
+        shl     eax, 8
+        lea     eax, [ebx+eax+jpeg.work.quant_tables]
+        stosd           ; dd QuantizationTable
+        lodsb                   ; table selector byte: high nibble for DC, low nibble for AC
+        movzx   eax, al
+        mov     edx, eax
+        shr     eax, 4
+        and     edx, 3
+        and     eax, 3
+        cmp     [ebx+jpeg.work.dc_huffman_defined+eax], 0
+        jnz     .dc_table_ok
+        cmp     [ebx+jpeg.work.progressive], 0
+        jz      .end3           ; NOTE(review): taken with the ecx pushed above still on the stack
+        xor     eax, eax        ; progressive: a missing table is recorded as a null pointer and checked per scan type later
+        jmp     .dc_table_done
+.dc_table_ok:
+;       shl     eax, 11
+        imul    eax, max_hufftable_size
+        lea     eax, [ebx+jpeg.work.dc_huffman+eax]
+.dc_table_done:
+        cmp     [ebx+jpeg.work.ac_huffman_defined+edx], 0
+        jnz     .ac_table_ok
+        cmp     [ebx+jpeg.work.progressive], 0
+        jz      .end3           ; NOTE(review): taken with the ecx pushed above still on the stack
+        xor     edx, edx
+        jmp     .ac_table_done
+.ac_table_ok:
+;       shl     edx, 11
+        imul    edx, max_hufftable_size
+        lea     edx, [ebx+jpeg.work.ac_huffman+edx]
+.ac_table_done:
+        stosd           ; dd DCTable
+        xchg    eax, edx
+        stosd           ; dd ACTable
+        mov     eax, [ecx+Image.Width]
+        movzx   ecx, byte [edi-21]      ; get HFactor
+        cdq     ; edx:eax = width (width<0x10000, so as dword it is unsigned)
+        div     ecx
+        stosd           ; dd width / HFactor_i
+        stosd                   ; second copy: the working counter that the decode loops decrement
+        xchg    eax, ecx
+        inc     eax
+        sub     eax, edx
+        stosd           ; dd HFactor_i+1 - (width % HFactor_i)
+        mov     ecx, [ebx+jpeg.work.image]
+        mov     eax, [ecx+Image.Height]
+        movzx   ecx, byte [edi-34]      ; get VFactor
+        cdq
+        div     ecx
+        stosd           ; dd height / VFactor_i
+        stosd                   ; second copy: the working counter that the decode loops decrement
+        xchg    eax, ecx
+        inc     eax
+        sub     eax, edx
+        stosd           ; dd VFactor_i+1 - (height % VFactor_i)
+        pop     ecx
+        scasd           ; dd DCPrediction
+        cmp     dword [edi], 0          ; [edi] = component index counted above
+        setnp   al                      ; al = 1 when the index's low byte has odd parity
+        ror     al, 1                   ; -> 0x80 or 0
+        mov     byte [edi-1], al        ; stored in the top byte of DCPrediction; NOTE(review): purpose not visible in this chunk
+        scasd           ; dd ComponentOffset
+        dec     ecx
+        jnz     .sos_find_comp
+        mov     [ebx+jpeg.work.cur_components_end], edi
+        lea     edi, [ebx+jpeg.work.ScanStart]
+        movsb                   ; first of the three trailing scan-header bytes -> ScanStart
+        cmp     byte [esi], 63
+        ja      .end3           ; second byte must not exceed 63
+        movsb
+        lodsb
+        push    eax
+        and     al, 0xF
+        stosb                   ; low nibble of the third byte
+        pop     eax
+        shr     al, 4
+        stosb                   ; high nibble of the third byte
+; now unpack data
+        call    init_limits
+        and     [ebx+jpeg.work.decoded_MCUs], 0
+        mov     [ebx+jpeg.work.cur_rst_marker], 7       ; so that the first restart marker expected is 0xD0
+        and     [ebx+jpeg.work.huffman_bits], 0
+        cmp     [ebx+jpeg.work.progressive], 0
+        jz      .sos_noprogressive
+; progressive mode - only decode DCT coefficients
+; initialize pointers to coefficients data
+; zero number of EOBs for AC coefficients
+; redefine HIncrement and VIncrement
+        lea     edi, [ebx+jpeg.work.cur_components]
+.coeff_init:
+        mov     eax, [ebx+jpeg.work.dct_buffer_size]
+        mul     dword [edi+52]          ; component index * per-component buffer size
+        add     eax, [ebx+jpeg.work.dct_buffer]
+        mov     [edi+12], eax           ; offset 12 now holds the coefficient write pointer
+        and     dword [edi+52], 0
+        cmp     [ebx+jpeg.work.ScanStart], 0
+        jz      .scan_dc
+        cmp     dword [edi+20], 0       ; ScanStart != 0: the ACTable pointer must be present
+        jz      .end3
+        jmp     @f
+.scan_dc:
+        cmp     dword [edi+16], 0       ; ScanStart = 0: the DCTable pointer must be present
+        jz      .end3
+@@:
+        movzx   eax, byte [edi+1]
+        shl     eax, 7
+        mov     [edi+4], eax            ; H * 128 bytes: one 64-word block per horizontal unit
+        mov     eax, [edi+28]
+        mov     cl, [edi+3]
+        cmp     cl, [edi+32]
+        sbb     eax, -7-1               ; eax += 8, or 7 when HFactor < [edi+32]
+        shr     eax, 3
+        shl     eax, 7
+        mov     [edi+8], eax            ; presumably bytes per row of blocks in the coefficient buffer - confirm
+        add     edi, 56
+        cmp     edi, [ebx+jpeg.work.cur_components_end]
+        jb      .coeff_init
+; unpack coefficients
+; N.B. Speed optimization has sense here.
+.coeff_decode_loop:
+        lea     edx, [ebx+jpeg.work.cur_components]
+.coeff_components_loop:
+        mov     edi, [edx+12]
+        movzx   ecx, byte [edx]         ; ecx = V: rows of blocks for this component
+        push    dword [edx+40]          ; save the vertical counter
+        push    edi
+.coeff_y_loop:
+        push    ecx
+        movzx   eax, byte [edx+1]       ; eax = H: blocks per row for this component
+        push    dword [edx+28]          ; save the horizontal counter
+        push    edi
+.coeff_x_loop:
+        cmp     dword [edx+40], 0
+        jl      @f
+        cmp     dword [edx+28], 0
+        jge     .realdata
+@@:
+        cmp     [ebx+jpeg.work.not_interleaved], 0      ; block lies outside the counted area
+        jnz     .norealdata
+        push    eax edi
+        lea     edi, [ebx+jpeg.work.dct_coeff]  ; decode into the dct_coeff area instead of the buffer
+        call    decode_progressive_coeff
+        pop     edi eax
+        jmp     .norealdata
+.realdata:
+        push    eax
+        call    decode_progressive_coeff
+        add     edi, 64*2               ; next block of 64 word coefficients
+        pop     eax
+.norealdata:
+        sub     dword [edx+28], 8
+        sub     eax, 1
+        jnz     .coeff_x_loop
+        pop     edi
+        pop     dword [edx+28]
+        add     edi, [edx+8]            ; next row of blocks
+        pop     ecx
+        sub     dword [edx+40], 8
+        sub     ecx, 1
+        jnz     .coeff_y_loop
+        movzx   eax, byte [edx+1]
+        shl     eax, 3
+        pop     edi
+        add     edi, [edx+4]            ; advance the write pointer by one unit's worth of blocks horizontally
+        pop     dword [edx+40]
+        sub     [edx+28], eax           ; horizontal counter -= 8 * H
+        mov     [edx+12], edi
+        add     edx, 56
+        cmp     edx, [ebx+jpeg.work.cur_components_end]
+        jnz     .coeff_components_loop
+        call    next_MCU
+        jc      .norst
+        sub     [ebx+jpeg.work.cur_x], 1
+        jnz     .coeff_decode_loop
+        call    next_line
+        lea     edx, [ebx+jpeg.work.cur_components]
+@@:
+        mov     eax, [ebx+jpeg.work.max_x]
+        imul    eax, [edx+4]
+        sub     [edx+12], eax           ; rewind the write pointer to the start of the row ...
+        movzx   eax, byte [edx]
+        imul    eax, [edx+8]
+        add     [edx+12], eax           ; ... and move it down by V rows of blocks
+        add     edx, 56
+        cmp     edx, [ebx+jpeg.work.cur_components_end]
+        jnz     @b
+        sub     [ebx+jpeg.work.cur_y], 1
+        jnz     .coeff_decode_loop
+        jmp     .markers_loop           ; scan finished: continue with the next marker
+.norst:
+.end4:                                  ; short-range relay to the common exit
+        jmp     .end3
+.sos_noprogressive:
+; normal mode - unpack JPEG image
+        mov     edi, [ebx+jpeg.work.image]
+        mov     edi, [edi+Image.Data]
+        mov     [ebx+jpeg.work.cur_out_ptr], edi        ; output starts at the beginning of the image data
+; N.B. Speed optimization has sense here.
+.decode_loop:
+        call    decode_MCU
+        call    next_MCU
+        jc      .end4                   ; next_MCU reported an error (CF=1)
+        sub     [ebx+jpeg.work.cur_x], 1
+        jnz     .decode_loop
+        call    next_line               ; row finished; next_line reloads cur_x
+        sub     [ebx+jpeg.work.cur_y], 1
+        jnz     .decode_loop
+        jmp     .markers_loop           ; scan finished: continue with the next marker
+get_marker:
+; Fetch the next marker code from the input stream.
+; in: esi -> data, ebp = number of bytes left
+; out: CF=0, al=marker value - ok; esi and ebp are advanced past the marker code
+;      CF=1 - no marker
+; Any run of 0xFF bytes in front of the marker code is skipped.
+        sub     ebp, 1
+        jc      .ret
+        lodsb
+if 1
+        cmp     al, 0xFF
+        jae     @f
+; Some encoders write a segment length field that is two less than the true
+; value (in JPEG the length of a segment is the length of its data INCLUDING
+; the length field itself). To open such files, allow 2 stray bytes
+; before the next marker.
+        cmp     ebp, 2
+        jb      .ret                    ; CF=1: not enough data left
+; Charge the two bytes consumed below against the remaining size; otherwise
+; ebp would overstate the data left by 2 for the rest of the decoding.
+        sub     ebp, 2
+        lodsb
+        lodsb
+end if
+        cmp     al, 0xFF
+        jb      .ret                    ; CF=1: byte is not 0xFF, so no marker here
+@@:
+        sub     ebp, 1
+        jc      .ret
+        lodsb
+        cmp     al, 0xFF
+        jz      @b                      ; 0xFF again: keep skipping
+        clc
+.ret:
+        ret
+align 16
+decode_MCU:                             ; decode one MCU: every data unit of every scan component, copied into the image at cur_out_ptr
+        lea     edx, [ebx+jpeg.work.cur_components]     ; edx -> current 56-byte component record
+.components_loop:
+; decode each component
+        push    [ebx+jpeg.work.cur_out_ptr]     ; restored after this component
+        movzx   ecx, byte [edx]                 ; ecx = V: rows of data units
+        push    dword [edx+40]                  ; save the vertical counter
+; we have H_i * V_i blocks of packed data, decode them
+.y_loop_1:
+        push    [ebx+jpeg.work.cur_out_ptr]
+        push    ecx
+        movzx   eax, byte [edx+1]               ; eax = H: data units per row
+        push    dword [edx+28]                  ; save the horizontal counter
+.x_loop_1:
+        push    eax
+        call    decode_data_unit
+        cmp     dword [edx+40], 0
+        jl      .nocopyloop                     ; counter negative: unit is decoded but not copied
+        cmp     dword [edx+28], 0
+        jl      .nocopyloop
+; now we have decoded block 8*8 in decoded_data
+; H_i * V_i packed blocks 8*8 make up one block (8*HMax) * (8*VMax)
+; so each pixel in packed block corresponds to HFact * VFact pixels
+        movzx   ecx, byte [edx+2]               ; ecx = VFactor: vertical replication count
+        push    esi ebp                         ; input cursor and byte count are reused as scratch below
+        mov     edi, [ebx+jpeg.work.cur_out_ptr]
+        add     edi, [edx+52]                   ; + ComponentOffset
+.y_loop_2:
+        push    ecx edi
+        cmp     ecx, [edx+44]
+        mov     ecx, [edx+40]
+        sbb     ecx, 8-1
+        sbb     eax, eax
+        and     ecx, eax
+        add     ecx, 8                          ; ecx = number of rows to copy, clipped to at most 8
+        jz      .skip_x_loop_2
+        movzx   eax, byte [edx+3]               ; eax = HFactor: horizontal replication count
+.x_loop_2:
+        push    eax ecx edi
+        cmp     eax, [edx+32]
+        mov     eax, [edx+28]
+        sbb     eax, 8-1
+        sbb     ebp, ebp
+        and     eax, ebp                        ; eax = 0, or minus the number of columns to leave out
+        mov     ebp, .copyiter_all
+        lea     esi, [ebx+jpeg.work.decoded_data]
+        sub     ebp, eax
+        sub     ebp, eax
+        sub     ebp, eax                        ; entry point moved forward 3 bytes per omitted column
+        mov     eax, [edx+4]
+        sub     eax, 1                          ; HIncrement - 1, since movsb itself advances edi by 1
+.copyloop:
+        push    esi edi
+        jmp     ebp                             ; jump into the unrolled row copy below
+.copyiter_all:
+        movsb
+repeat 7
+        add     edi, eax
+        movsb
+end repeat
+        nop                                     ; the two nops pad the sequence to 24 bytes so that "8 columns omitted" lands right after it
+        nop
+        pop     edi esi
+        add     edi, [edx+8]                    ; next output row for this component (VIncrement)
+        add     esi, 8                          ; next row of the decoded 8*8 block
+        sub     ecx, 1
+        jnz     .copyloop
+        pop     edi ecx eax
+        add     edi, [ebx+jpeg.work.pixel_size] ; next horizontal replica
+        sub     eax, 1
+        jnz     .x_loop_2
+.skip_x_loop_2:
+        pop     edi ecx
+        add     edi, [ebx+jpeg.work.line_size]  ; next vertical replica
+        sub     ecx, 1
+        jnz     .y_loop_2
+        pop     ebp esi
+.nocopyloop:
+        mov     eax, [ebx+jpeg.work.delta_x]
+        add     [ebx+jpeg.work.cur_out_ptr], eax        ; move right by delta_x for the next data unit
+        pop     eax
+        sub     dword [edx+28], 8
+        sub     eax, 1
+        jnz     .x_loop_1
+        pop     dword [edx+28]
+        pop     ecx
+        pop     eax                             ; eax = cur_out_ptr as it was at the start of this row of units
+        sub     dword [edx+40], 8
+        add     eax, [ebx+jpeg.work.delta_y]
+        mov     [ebx+jpeg.work.cur_out_ptr], eax        ; move down by delta_y for the next row of units
+        sub     ecx, 1
+        jnz     .y_loop_1
+        movzx   eax, byte [edx+1]
+        pop     dword [edx+40]
+        shl     eax, 3
+        pop     [ebx+jpeg.work.cur_out_ptr]
+        sub     dword [edx+28], eax             ; horizontal counter -= 8 * H for the next MCU
+        add     edx, 56
+        cmp     edx, [ebx+jpeg.work.cur_components_end]
+        jb      .components_loop
+        mov     eax, [ebx+jpeg.work.cur_block_dx]
+        add     [ebx+jpeg.work.cur_out_ptr], eax        ; output position of the next MCU in this row
+        ret
+align 16
+next_MCU:
+; Bookkeeping after one decoded MCU, including restart-interval handling.
+; in:  ebx -> jpeg.work, esi -> input data, ebp = bytes left
+; out: CF=0 - ok (a restart marker, if one was due, has been consumed)
+;      CF=1 - a restart marker was due but missing or wrong
+        mov     eax, [ebx+jpeg.work.restart_interval]
+        add     [ebx+jpeg.work.decoded_MCUs], 1
+        test    eax, eax
+        jz      .done                   ; no restart interval in effect
+        cmp     [ebx+jpeg.work.decoded_MCUs], eax
+        jb      .done                   ; interval not complete yet
+; interval complete: start counting again and drop buffered Huffman bits
+        and     [ebx+jpeg.work.huffman_bits], 0
+        and     [ebx+jpeg.work.decoded_MCUs], 0
+        cmp     [ebx+jpeg.work.cur_x], 1
+        jne     .expect_rst
+        cmp     [ebx+jpeg.work.cur_y], 1
+        je      .done                   ; cur_x = cur_y = 1: no marker is expected after this MCU
+.expect_rst:
+; the next two bytes must be 0xFF followed by the next RSTn code in sequence
+        sub     ebp, 2
+        js      .bad
+        cmp     byte [esi], 0xFF
+        jne     .bad
+        mov     al, [ebx+jpeg.work.cur_rst_marker]
+        inc     eax
+        and     al, 7                   ; marker numbers cycle 0..7
+        mov     [ebx+jpeg.work.cur_rst_marker], al
+        add     al, 0xD0
+        cmp     [esi+1], al
+        jne     .bad
+        add     esi, 2
+; restart handled: clear the low word of every component's DC prediction
+        lea     edx, [ebx+jpeg.work.cur_components]
+.clear_dc:
+        and     word [edx+48], 0
+        add     edx, 56
+        cmp     edx, [ebx+jpeg.work.cur_components_end]
+        jb      .clear_dc
+.done:
+        clc
+        ret
+.bad:
+        stc
+        ret
+next_line:
+; Moves the decoder state to the beginning of the next row of MCUs.
+; in:  ebx -> jpeg.work
+; out: cur_x reloaded from max_x; cur_out_ptr moved back by
+;      max_x*cur_block_dx and forward by cur_block_dy; in every 56-byte record
+;      of cur_components the dword at +28 is reloaded from +24 and the dword at
+;      +40 is decreased by 8 * (byte at +0)
+; clobbers: eax, edx, flags
+        mov     eax, [ebx+jpeg.work.max_x]
+        mov     [ebx+jpeg.work.cur_x], eax
+        mul     [ebx+jpeg.work.cur_block_dx]    ; eax = max_x * cur_block_dx, edx destroyed
+        sub     eax, [ebx+jpeg.work.cur_block_dy]
+        sub     [ebx+jpeg.work.cur_out_ptr], eax
+        lea     edx, [ebx+jpeg.work.cur_components]
+.per_component:
+        push    dword [edx+24]          ; copy +24 -> +28 without touching eax
+        pop     dword [edx+28]
+        movzx   eax, byte [edx]
+        lea     eax, [eax*8]
+        sub     [edx+40], eax
+        lea     edx, [edx+56]           ; next component record
+        cmp     edx, [ebx+jpeg.work.cur_components_end]
+        jb      .per_component
+        ret
+init_limits:
+; Initializes the per-scan iteration limits and counters.
+; in:  ebx -> jpeg.work
+; out: max_x/max_y and cur_block_dx/cur_block_dy are set; cur_x = max_x, cur_y = max_y
+; Interleaved scan (not_interleaved = 0): the limits are copied from
+;   x_num_blocks/y_num_blocks and block_delta_x/block_delta_y; no register changes.
+; Non-interleaved scan: the limits are recomputed from the first record of
+;   cur_components; eax, ecx and edx are destroyed.
+        push    [ebx+jpeg.work.x_num_blocks]
+        push    [ebx+jpeg.work.y_num_blocks]
+        pop     [ebx+jpeg.work.max_y]
+        pop     [ebx+jpeg.work.max_x]
+        push    [ebx+jpeg.work.block_delta_x]
+        push    [ebx+jpeg.work.block_delta_y]
+        pop     [ebx+jpeg.work.cur_block_dy]
+        pop     [ebx+jpeg.work.cur_block_dx]
+        cmp     [ebx+jpeg.work.not_interleaved], 0
+        je      .set_counters
+; horizontal direction:
+; max_x = (dword[+28] + 8 - CF) / 8, CF = 1 when byte[+3] is below the value at +32
+        mov     eax, dword [ebx+jpeg.work.cur_components+28]
+        movzx   ecx, byte [ebx+jpeg.work.cur_components+3]
+        cmp     cl, [ebx+jpeg.work.cur_components+32]
+        sbb     eax, -8
+        shr     eax, 3
+        mov     [ebx+jpeg.work.max_x], eax
+        imul    ecx, [ebx+jpeg.work.delta_x]
+        mov     [ebx+jpeg.work.cur_block_dx], ecx
+; vertical direction: same computation with byte[+2] and the dwords at +40/+44
+        mov     eax, dword [ebx+jpeg.work.cur_components+40]
+        movzx   edx, byte [ebx+jpeg.work.cur_components+2]
+        cmp     dl, [ebx+jpeg.work.cur_components+44]
+        sbb     eax, -8
+        shr     eax, 3
+        mov     [ebx+jpeg.work.max_y], eax
+        imul    edx, [ebx+jpeg.work.delta_y]
+        mov     [ebx+jpeg.work.cur_block_dy], edx
+.set_counters:
+        push    [ebx+jpeg.work.max_x]
+        push    [ebx+jpeg.work.max_y]
+        pop     [ebx+jpeg.work.cur_y]
+        pop     [ebx+jpeg.work.cur_x]
+        ret
+;macro get_bit
+;{
+;local .l1,.l2,.marker
+;       add     cl, cl
+;       jnz     .l1
+;       sub     ebp, 1
+;       js      decode_data_unit.eof
+;       mov     cl, [esi]
+;       cmp     cl, 0xFF
+;       jnz     .l2
+;.marker:
+;       add     esi, 1
+;       sub     ebp, 1
+;       js      decode_data_unit.eof
+;       cmp     byte [esi], 0xFF
+;       jz      .marker
+;       cmp     byte [esi], 0
+;       jnz     decode_data_unit.eof
+;.l2:
+;       sub     esi, -1
+;       adc     cl, cl
+;.l1:
+;}
+macro get_bit stack_depth       ; reads the next bit of the entropy-coded stream into CF; cl = unread bits left in ch, ch = current byte with the next bit in its MSB
+{                               ; stack_depth selects the caller's .eof_popN label used when the data ends
+local .l1,.l2,.marker
+        sub     cl, 1           ; one buffered bit fewer
+        jns     .l1             ; the buffered byte still had a bit
+        sub     ebp, 1          ; ebp = input bytes left
+        js      .eof_pop#stack_depth    ; input exhausted
+        mov     ch, [esi]       ; load the next input byte
+        cmp     ch, 0xFF
+        jnz     .l2
+.marker:                        ; the byte was 0xFF: look at what follows it
+        add     esi, 1
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        cmp     byte [esi], 0xFF        ; further 0xFF bytes are skipped
+        jz      .marker
+        cmp     byte [esi], 0   ; only a following 0x00 keeps 0xFF as a data byte
+        jnz     .eof_pop#stack_depth    ; anything else takes the eof path
+.l2:
+        add     esi, 1
+        mov     cl, 7           ; 7 bits stay buffered after the one returned now
+.l1:
+        add     ch, ch          ; shift the top bit of ch out into CF
+}
+macro get_bits stack_depth,stack_depth_p1,restore_edx   ; in: bl = number of bits to read, cl/ch = bit buffer (as in get_bit); out: eax = value, cl/ch updated
+{                               ; stack_depth_p1 = stack depth including the ebx pushed below; restore_edx eq true additionally pops edx
+local .l1,.l2,.l3,.marker2
+        movzx   eax, ch
+        mov     dl, cl          ; dl = bits currently available
+        shl     eax, 24         ; buffered bits become the top bits of eax
+        neg     cl
+        push    ebx             ; keep the requested bit count
+        add     cl, 24          ; cl = 24 - available = shift that places the next byte right below them
+.l1:
+        cmp     bl, dl
+        jbe     .l2             ; enough bits collected
+        sub     bl, dl          ; bl = bits still missing
+        sub     ebp, 1          ; ebp = input bytes left
+        js      .eof_pop#stack_depth_p1
+        mov     ch, [esi]
+        cmp     ch, 0xFF
+        jnz     .l3
+.marker2:                       ; same 0xFF handling as in get_bit
+        add     esi, 1
+        sub     ebp, 1
+        js      .eof_pop#stack_depth_p1
+        cmp     byte [esi], 0xFF
+        jz      .marker2
+        cmp     byte [esi], 0
+        jnz     .eof_pop#stack_depth_p1
+.l3:
+        movzx   edx, ch
+        add     esi, 1
+        shl     edx, cl         ; append the new byte below the bits gathered so far
+        sub     cl, 8
+        or      eax, edx
+        mov     dl, 8           ; the new byte offers 8 bits
+        jmp     .l1
+.l2:
+        mov     cl, bl          ; bits taken from the current byte
+        sub     dl, bl          ; dl = bits left in the current byte
+        shl     ch, cl          ; drop the consumed bits from ch
+        pop     ebx             ; ebx = requested bit count again
+        cmp     eax, 80000000h  ; CF = 1 when the first bit read is 0
+        rcr     eax, 1          ; put the inverted first bit on top as the sign
+        mov     cl, 31
+        sub     cl, bl
+        sar     eax, cl         ; keep sign + bl value bits, sign-extended
+        mov     cl, dl          ; cl = new count of unread bits in ch
+if restore_edx eq true
+        pop     edx
+end if
+        add     eax, 80000000h  ; this pair leaves a value whose first bit is 1 unchanged
+        adc     eax, 80000000h  ; and adds 1 otherwise, i.e. result = v - (2^bl - 1) for a leading 0
+}
+; macro get_huffman_code
+; {
+; local .l1
+        ; xor   ebx, ebx
+; .l1:
+        ; get_bit
+        ; adc   ebx, ebx
+        ; mov   eax, [eax+4*ebx]
+        ; xor   ebx, ebx
+        ; cmp   eax, -1
+        ; jz    .eof_pop
+        ; cmp   eax, 0x1000
+        ; jae   .l1
+        ; mov   ebx, eax
+; }
+macro get_huffman_code stack_depth,stack_depth_p1       ; in: eax -> Huffman lookup table, cl/ch = bit buffer (as in get_bit); out: eax = ebx = decoded byte, cl/ch updated; edx destroyed
+{                               ; table entries are byte pairs read as [base+index*2] and [base+index*2+1]; lower levels are addressed as eax + 0x200 + 32*n
+local .l1,.l2,.l3,.l4,.l5,.l6,.nomarker1,.marker1,.nomarker2,.marker2,.nomarker3,.marker3,.done
+; 1. (First level in Huffman table) Does the current Huffman code fit in 8 bits
+; and have we got enough bits?
+        movzx   ebx, ch
+        cmp     byte [eax+ebx*2], cl
+        jbe     .l1
+; 2a. No; load next byte
+        sub     ebp, 1          ; ebp = input bytes left
+        js      .eof_pop#stack_depth
+        mov     ch, [esi]
+        movzx   edx, ch
+        cmp     ch, 0xFF
+        jnz     .nomarker1
+.marker1:                       ; same 0xFF handling as in get_bit
+        add     esi, 1
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        cmp     byte [esi], 0xFF
+        jz      .marker1
+        cmp     byte [esi], 0
+        jnz     .eof_pop#stack_depth
+.nomarker1:
+        shr     edx, cl         ; align the new byte behind the cl bits already in ebx
+        add     esi, 1
+        or      ebx, edx        ; ebx = next 8 bits of the stream
+; 3a. (First level in Huffman table, >=8 bits known) Does the current Huffman code fit in 8 bits?
+        cmp     byte [eax+ebx*2], 8
+        jbe     .l2
+        jl      .eof_pop#stack_depth    ; entry byte with the top bit set (signed < 8): eof/error path
+; 4aa. No; go to next level
+        movzx   ebx, byte [eax+ebx*2+1] ; second byte of the entry selects the next-level table
+        mov     dl, ch
+        shl     ebx, 5
+        ror     edx, cl
+        lea     ebx, [eax+ebx+0x200]
+        shr     edx, 24
+        push    edx             ; keep the following bits; one extra dword on the stack from here
+        shr     edx, 4          ; upper 4 bits index the second level
+; 5aa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits
+; and have we got enough bits?
+        cmp     byte [ebx+edx*2], cl
+        jbe     .l3
+; 6aaa. No; have we got 12 bits?
+        cmp     cl, 4
+        jae     .l4
+; 7aaaa. No; load next byte
+        pop     edx             ; stack is back at stack_depth
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        mov     ch, [esi]
+        cmp     ch, 0xFF
+        jnz     .nomarker2
+.marker2:
+        add     esi, 1
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        cmp     byte [esi], 0xFF
+        jz      .marker2
+        cmp     byte [esi], 0
+        jnz     .eof_pop#stack_depth
+.nomarker2:
+        push    ecx             ; ch is shifted temporarily; ecx is restored right after
+        shr     ch, cl
+        add     esi, 1
+        or      dl, ch
+        pop     ecx
+        push    edx             ; stack at stack_depth_p1 again
+        shr     edx, 4
+; 8aaaa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits?
+        cmp     byte [ebx+edx*2], 4
+        jbe     .l5
+        jl      .eof_pop#stack_depth_p1
+; 9aaaaa. No; go to next level
+        movzx   ebx, byte [ebx+edx*2+1]
+        pop     edx
+        shl     ebx, 5
+        and     edx, 0xF        ; lower 4 bits index the third level
+        lea     ebx, [eax+ebx+0x200]
+; 10aaaaa. Get current code length and value
+        sub     cl, [ebx+edx*2]
+        movzx   eax, byte [ebx+edx*2+1]
+        neg     cl              ; cl = bits consumed from the newly loaded byte
+        shl     ch, cl
+        neg     cl
+        add     cl, 8           ; cl = bits left unread in ch
+        jmp     .done
+.l5:
+; 9aaaab. Yes; get current code length and value
+        sub     cl, [ebx+edx*2]
+        movzx   eax, byte [ebx+edx*2+1]
+        neg     cl
+        pop     edx
+        shl     ch, cl
+        neg     cl
+        add     cl, 8
+        jmp     .done
+.l4:
+; 7aaab. Yes; go to next level
+        movzx   ebx, byte [ebx+edx*2+1]
+        pop     edx
+        shl     ebx, 5
+        and     edx, 0xF
+        lea     ebx, [eax+ebx+0x200]
+; 8aaab. (Third level in Huffman table) Have we got enough bits?
+        cmp     [ebx+edx*2], cl
+        jbe     .l6
+; 9aaaba. No; load next byte
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        mov     ch, [esi]
+        cmp     ch, 0xFF
+        jnz     .nomarker3
+.marker3:
+        add     esi, 1
+        sub     ebp, 1
+        js      .eof_pop#stack_depth
+        cmp     byte [esi], 0xFF
+        jz      .marker3
+        cmp     byte [esi], 0
+        jnz     .eof_pop#stack_depth
+.nomarker3:
+        push    ecx
+        shr     ch, cl
+        add     esi, 1
+        or      dl, ch
+        pop     ecx
+; 10aaaba. Get current code length and value
+        sub     cl, [ebx+edx*2]
+        movzx   eax, byte [ebx+edx*2+1]
+        neg     cl
+        shl     ch, cl
+        neg     cl
+        add     cl, 8
+        jmp     .done
+.l3:
+; 6aab. Yes; get current code length and value
+        pop     eax             ; discard the dword pushed in step 4aa (eax is reloaded below)
+.l6:
+; 9aaabb. Yes; get current code length and value
+        sub     cl, [ebx+edx*2] ; cl = bits left unread in ch
+        movzx   eax, byte [ebx+edx*2+1]
+        xor     cl, 7           ; cl = 7 - remaining
+        shl     ch, cl
+        xor     cl, 7           ; restore the remaining-bit count
+        add     ch, ch          ; one more shift: 8 - remaining in total
+        jmp     .done
+.l2:
+; 3ab. Yes; get current code length and value
+        sub     cl, [eax+ebx*2]
+        movzx   eax, byte [eax+ebx*2+1]
+        neg     cl              ; cl = bits consumed from the newly loaded byte
+        shl     ch, cl
+        neg     cl
+        add     cl, 8
+        jmp     .done
+.l1:
+; 3b. Yes; get current code length and value
+        mov     dl, [eax+ebx*2] ; dl = code length
+        movzx   eax, byte [eax+ebx*2+1]
+        xchg    cl, dl
+        sub     dl, cl          ; dl = available bits - code length
+        shl     ch, cl          ; drop the code bits from ch
+        mov     cl, dl
+.done:
+        mov     ebx, eax        ; result is returned in both eax and ebx
+}
+; Decode DCT coefficients for one 8*8 block in progressive mode
+; from input stream, given by pointer esi and length ebp;
+; in: ebx -> jpeg.work, edx -> component record, edi -> 16-bit coefficients of the block. N.B. Speed optimization makes sense here.
+align 16
+decode_progressive_coeff:
+        mov     ecx, [ebx+jpeg.work.huffman_bits]       ; cl/ch = bit buffer used by the get_* macros
+        cmp     [ebx+jpeg.work.ScanStart], 0
+        jnz     .ac
+; DC coefficient
+        cmp     [ebx+jpeg.work.ApproxPosHigh], 0
+        jz      .dc_first
+; DC coefficient, subsequent passes
+        xor     eax, eax
+        get_bit 0
+        adc     eax, eax        ; eax = the bit just read
+        mov     [ebx+jpeg.work.huffman_bits], ecx
+        mov     cl, [ebx+jpeg.work.ApproxPosLow]
+        shl     eax, cl
+        or      [edi], ax       ; merge the bit into the DC coefficient at position ApproxPosLow
+        ret
+.dc_first:
+; DC coefficient, first pass
+        mov     eax, [edx+16]   ; Huffman table pointer stored at +16 of the component record
+        push    ebx
+        push    edx
+        get_huffman_code 2,3
+        get_bits 2,3,true       ; pops edx
+        pop     ebx
+        add     eax, [edx+48]   ; add the DC prediction
+        mov     [edx+48], ax    ; new prediction (low word only)
+        mov     [ebx+jpeg.work.huffman_bits], ecx
+        mov     cl, [ebx+jpeg.work.ApproxPosLow]
+        shl     eax, cl
+        mov     [edi], ax
+        ret
+.ac:
+; AC coefficients
+        movzx   eax, [ebx+jpeg.work.ScanStart]  ; eax = index of the current coefficient (zigzag order)
+        cmp     al, [ebx+jpeg.work.ScanEnd]
+        ja      .ret
+        cmp     dword [edx+52], 0       ; pending end-of-band run for this component?
+        jnz     .was_eob
+        push    ebx             ; stack: work pointer
+.acloop:
+        push    edx
+        push    eax
+        mov     eax, [edx+20]   ; Huffman table pointer stored at +20
+        get_huffman_code 3,4
+        pop     eax
+        test    ebx, 15         ; low nibble = number of value bits
+        jz      .band
+        push    eax ebx
+        and     ebx, 15
+        get_bits 4,5,false
+        pop     ebx
+        xchg    eax, [esp]      ; eax = coefficient index, [esp] = decoded value
+        shr     ebx, 4          ; ebx = number of zero coefficients to skip
+        mov     edx, [esp+8]    ; edx = work pointer (saved ebx)
+.zeroloop1:
+        push    eax ebx
+        movzx   eax, byte [zigzag+eax]  ; byte offset of the coefficient inside the block
+        xor     ebx, ebx
+        cmp     word [edi+eax], bx
+        jz      .zeroloop2      ; still zero: counts towards the run
+        get_bit 5               ; already nonzero: one correction bit follows
+        jnc     @f
+        push    ecx
+        mov     cl, [edx+jpeg.work.ApproxPosLow]
+        xor     ebx, ebx
+        cmp     byte [edi+eax+1], 80h   ; CF = 1 when the coefficient is non-negative
+        adc     ebx, 0
+        add     ebx, ebx
+        sub     ebx, 1          ; ebx = +1 or -1, matching the coefficient's sign
+        shl     ebx, cl
+        pop     ecx
+        add     [edi+eax], bx
+@@:
+        pop     ebx eax
+@@:
+        add     eax, 1
+        cmp     al, [edx+jpeg.work.ScanEnd]
+        ja      decode_data_unit.eof_pop3       ; run goes past the end of the band
+        jmp     .zeroloop1
+.zeroloop2:
+        pop     ebx eax
+        sub     ebx, 1
+        jns     @b              ; more zeroes to skip
+.nozero1:
+        pop     ebx             ; ebx = decoded value (0 for the 16-zero run from .band)
+        test    ebx, ebx
+        jz      @f
+        push    eax
+        movzx   eax, byte [zigzag+eax]
+        push    ecx
+        mov     cl, [edx+jpeg.work.ApproxPosLow]
+        shl     ebx, cl
+        pop     ecx
+        mov     [edi+eax], bx
+        pop     eax
+@@:
+        add     eax, 1
+        cmp     al, [edx+jpeg.work.ScanEnd]
+        pop     edx             ; edx = component record again (pop keeps the flags)
+        jbe     .acloop
+        pop     ebx
+        mov     [ebx+jpeg.work.huffman_bits], ecx
+.ret:
+        ret
+.eof_pop5:                      ; .eof_popN: discard N dwords; the last pop leaves the saved work pointer in ebx
+        pop     ebx
+.eof_pop4:
+        pop     ebx
+.eof_pop3:
+        pop     ebx
+.eof_pop2:
+        pop     ebx
+.eof_pop1:
+        pop     ebx
+.eof_pop0:
+        jmp     decode_data_unit.eof_pop0
+.band:                          ; symbol with zero value bits: upper nibble 15 = skip run, otherwise end-of-band
+        shr     ebx, 4
+        cmp     ebx, 15
+        jnz     .eob
+        mov     edx, [esp+4]    ; edx = work pointer
+        push    0               ; value 0: the run is followed by a coefficient that stays zero
+        jmp     .zeroloop1
+.eob:
+        pop     edx
+        push    eax
+        mov     eax, 1
+        test    ebx, ebx
+        jz      .eob0
+@@:
+        get_bit 2
+        adc     eax, eax        ; eax = 1 followed by the ebx bits read
+        sub     ebx, 1
+        jnz     @b
+.eob0:
+        mov     [edx+52], eax   ; run length, counting the current block
+        pop     eax
+        pop     ebx
+.was_eob:
+        sub     dword [edx+52], 1       ; this block uses up one unit of the run
+        cmp     al, [ebx+jpeg.work.ScanEnd]
+        ja      .ret2
+        push    edx
+.zeroloop3:                     ; rest of the band: only correction bits for nonzero coefficients
+        push    eax
+        movzx   eax, byte [zigzag+eax]
+        xor     edx, edx
+        cmp     word [edi+eax], dx
+        jz      @f
+        get_bit 2
+        jnc     @f
+        push    ecx
+        mov     cl, [ebx+jpeg.work.ApproxPosLow]
+        xor     edx, edx
+        cmp     byte [edi+eax+1], 80h   ; same +1/-1 correction as in .zeroloop1
+        adc     edx, 0
+        add     edx, edx
+        sub     edx, 1
+        shl     edx, cl
+        pop     ecx
+        add     [edi+eax], dx
+@@:
+        pop     eax
+        add     eax, 1
+        cmp     al, [ebx+jpeg.work.ScanEnd]
+        jbe     .zeroloop3
+        pop     edx
+.ret2:
+        mov     [ebx+jpeg.work.huffman_bits], ecx
+        ret
+handle_progressive:
+; Final stage for progressive images: runs the IDCT/unpack step over the
+; buffered DCT coefficients, one component at a time, and frees the buffer.
+; in:  ebx -> jpeg.work
+; out: nothing is returned; the buffer at jpeg.work.dct_buffer is released
+;      through mem.free (also when an undefined quantization table is hit)
+; Does nothing when jpeg.work.dct_buffer is 0.
+; clobbers: eax, ecx, edx, esi, edi, ebp, plus whatever decode_MCU, next_line
+;           and mem.free destroy
+        cmp     [ebx+jpeg.work.dct_buffer], 0
+        jnz     @f
+        ret
+@@:
+; information for all components
+        lea     esi, [ebx+jpeg.work.components]
+        xor     ebp, ebp                ; ebp = index of the current component
+        mov     ecx, [ebx+jpeg.work.pixel_size] ; ecx = components left to process
+.next_component:
+; build a single 56-byte record at cur_components describing this component
+        lea     edi, [ebx+jpeg.work.cur_components]
+        lodsb   ; ComponentID
+        lodsd
+        mov     ax, 0x0101
+        stosd   ; db V, db H, db VFactor, db HFactor
+        xor     eax, eax
+        mov     al, byte [edi-1]        ; get HFactor
+        mul     byte [ebx+jpeg.work.pixel_size] ; number of components
+        stosd                   ; HIncrement_i = HFactor_i * sizeof(pixel)
+        movzx   eax, byte [edi-4-2]     ; get VFactor
+        mul     [ebx+jpeg.work.line_size]       ; number of components * image width
+        stosd                   ; VIncrement_i = VFactor_i * sizeof(row)
+        lodsb
+        and     eax, 3          ; quantization table number (also clears the rest of eax)
+        cmp     [ebx+jpeg.work.quant_tables_defined+eax], 0
+        jz      .error          ; nothing is pushed at this point, so the stack is balanced
+        shl     eax, 8          ; 256 bytes per quantization table
+        lea     eax, [ebx+jpeg.work.quant_tables+eax]
+        stosd           ; dd QuantizationTable
+        stosd           ; dd DCTable - ignored
+        mov     eax, ebp
+        mul     [ebx+jpeg.work.dct_buffer_size]
+        add     eax, [ebx+jpeg.work.dct_buffer]
+        stosd           ; instead of dd ACTable - pointer to current DCT coefficients
+        push    ecx
+        mov     eax, [ebx+jpeg.work.image]
+        mov     eax, [eax+Image.Width]
+        movzx   ecx, byte [edi-21]      ; get HFactor
+        xor     edx, edx        ; unsigned divide: zero the high half explicitly
+                                ; instead of relying on the preceding mul
+        div     ecx
+        stosd           ; dd width / HFactor_i
+        stosd
+        xchg    eax, ecx
+        inc     eax
+        sub     eax, edx
+        stosd           ; dd HFactor_i+1 - (width % HFactor_i)
+        mov     eax, [ebx+jpeg.work.image]
+        mov     eax, [eax+Image.Height]
+        movzx   ecx, byte [edi-34]      ; get VFactor
+        xor     edx, edx        ; unsigned divide: zero-extend (cdq would sign-extend)
+        div     ecx
+        stosd           ; dd height / VFactor_i
+        stosd
+        xchg    eax, ecx
+        inc     eax
+        sub     eax, edx
+        stosd           ; dd VFactor_i+1 - (height % VFactor_i)
+        pop     ecx
+        xor     eax, eax
+        cmp     ebp, 1
+        cmc
+        rcr     eax, 1          ; eax = 0 for component 0, 80000000h for the others
+        stosd           ; dd DCPrediction
+        mov     eax, ebp
+        stosd           ; dd ComponentOffset
+        inc     ebp
+        push    ecx
+        mov     [ebx+jpeg.work.cur_components_end], edi
+        lea     edx, [edi-56]           ; edx -> the record just built
+; do IDCT and unpack
+        mov     edi, [ebx+jpeg.work.image]
+        mov     edi, [edi+Image.Data]
+        mov     [ebx+jpeg.work.cur_out_ptr], edi
+        mov     [ebx+jpeg.work.not_interleaved], 1
+        call    init_limits
+.decode_loop:
+        call    decode_MCU
+        sub     [ebx+jpeg.work.cur_x], 1
+        jnz     .decode_loop
+        call    next_line
+        sub     [ebx+jpeg.work.cur_y], 1
+        jnz     .decode_loop
+        pop     ecx
+        dec     ecx
+        jnz     .next_component
+; image unpacked, return
+.error:
+        push    [ebx+jpeg.work.dct_buffer]
+        call    [mem.free]
+        ret
+; Support for YCbCr -> RGB conversion
+; R = Y                          + 1.402 * (Cr - 128)
+; G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+; B = Y +   1.772 * (Cb - 128)
+; When converting YCbCr -> RGB, we need to do some multiplications;
+; to be faster, we precalculate the table for all 256 possible values
+; Also we approximate fractions with N/65536, this gives sufficient precision
+img.initialize.jpeg:
+;initialize_color_table:
+; 1.402 = 1 + 26345/65536, -0.71414 = -46802/65536
+; -0.34414 = -22554/65536, 1.772 = 1 + 50594/65536
+;
+; Writes 4 * 256 dwords starting at color_table_1 (color_table_2..4 are assumed
+; to follow it directly).  Each table is generated in two halves of 128
+; entries: indices 0..127 are the multiples for 0..127, indices 128..255 the
+; multiples for -128..-1, i.e. the index is the chroma value taken as a signed byte.
+; Tables 1 and 4 hold rounded integers (the running value starts at 0.5);
+; tables 2 and 3 hold 16.16 fixed-point values which the converter adds and
+; then shifts right by 16; the 0.5 rounding term for that sum lives in table 2.
+; All registers are preserved (pushad/popad).
+        pushad
+        mov     edi, color_table_1
+        mov     ecx, 128
+; 1. Cb -> 1.772*Cb
+        xor     eax, eax
+        mov     dx, 8000h       ; eax:dx = running value, starts at 0.5 for rounding
+.l1:
+        push    ecx
+@@:
+        stosd
+        add     dx, 50594
+        adc     eax, 1
+        loop    @b
+; replace the running value v by 1-v: start of the negative half, with the
+; 0.5 rounding term kept; after the second half this yields eax = 0
+        neg     dx
+        adc     eax, -1
+        neg     eax
+        pop     ecx             ; pop leaves the flags of neg eax intact
+        jnz     .l1
+; 2. Cb -> -0.34414*Cb
+        mov     ax, dx          ; eax = 8000h = 0.5 in 16.16
+.l2:
+        push    ecx
+@@:
+        stosd
+        sub     eax, 22554
+        loop    @b
+        neg     eax
+; a plain negation would turn the +0.5 rounding term into -0.5 for the
+; negative half (entries 128..255) and bias G downwards there; adding 1.0
+; restores +0.5.  The low word, used by the exit test below, is unaffected.
+        add     eax, 10000h
+        pop     ecx
+        cmp     ax, dx          ; low word is 8000h again only after the second half
+        jnz     .l2
+        xor     eax, eax
+; 3. Cr -> -0.71414*Cr
+.l3:
+        push    ecx
+@@:
+        stosd
+        sub     eax, 46802
+        loop    @b
+        neg     eax             ; becomes 0 only after the second half
+        pop     ecx
+        jnz     .l3
+; 4. Cr -> 1.402*Cr
+; here eax = 0 and dx = 8000h, the same starting state as for table 1
+.l4:
+        push    ecx
+@@:
+        stosd
+        add     dx, 26345
+        adc     eax, 1
+        loop    @b
+        neg     dx
+        adc     eax, -1
+        neg     eax
+        pop     ecx
+        jnz     .l4
+        popad
+        ret
+; this function is called at the end of image loading; in: ebx -> jpeg.work; converts the pixels of jpeg.work.image in place
+convert_to_rgb:
+; some checks
+        mov     eax, [ebx+jpeg.work.image]
+        test    eax, eax        ; image exists?
+        jz      .ret
+        cmp     byte [ebx+jpeg.work.pixel_size], 3      ; full-color image?
+        jz      .ycc2rgb
+        cmp     byte [ebx+jpeg.work.pixel_size], 4      ; four components per pixel?
+        jz      .ycck2rgb
+.ret:
+        ret
+.ycc2rgb:
+; conversion is needed
+        mov     esi, [eax+Image.Width]
+        imul    esi, [eax+Image.Height] ; esi = number of pixels; NOTE(review): the loop below assumes it is nonzero - confirm
+        mov     edi, [eax+Image.Data]
+        push    ebx
+; N.B. Speed optimization makes sense here.
+align 16
+.loop:
+;       mov     ebx, [edi]
+;       mov     edx, ebx
+;       mov     ecx, ebx
+;       movzx   ebx, bl         ; ebx = Y
+;       shr     edx, 16
+;       mov     eax, ebx
+;       movzx   edx, dl         ; edx = Cr
+;       movzx   ecx, ch         ; ecx = Cb
+        movzx   ebx, byte [edi]         ; ebx = Y
+        movzx   ecx, byte [edi+1]       ; ecx = Cb
+        mov     eax, ebx
+        movzx   edx, byte [edi+2]       ; edx = Cr
+; B = Y + color_table_1[Cb]
+        add     eax, [color_table_1+ecx*4]
+        mov     ebp, [color_table_2+ecx*4]      ; start of the G sum, interleaved with the B clamp
+        cmp     eax, 80000000h  ; clamp to 0..255: negative -> 0 ...
+        sbb     ecx, ecx
+        and     eax, ecx
+        add     ebp, [color_table_3+edx*4]
+        cmp     eax, 0x100      ; ... and above 255 -> al = 0xFF
+        sbb     ecx, ecx
+        not     ecx
+        sar     ebp, 16         ; tables 2 and 3 are 16.16 fixed point
+        or      eax, ecx
+        mov     [edi], al
+; G = Y + color_table_2[Cb] + color_table_3[Cr]
+        lea     eax, [ebx+ebp]
+        cmp     eax, 80000000h
+        sbb     ecx, ecx
+        and     eax, ecx
+        cmp     eax, 0x100
+        sbb     ecx, ecx
+        not     ecx
+        or      eax, ecx
+        mov     [edi+1], al
+; R = Y + color_table_4[Cr]
+        mov     eax, ebx
+        add     eax, [color_table_4+edx*4]
+        cmp     eax, 80000000h
+        sbb     ecx, ecx
+        and     eax, ecx
+        cmp     eax, 0x100
+        sbb     ecx, ecx
+        not     ecx
+        or      eax, ecx
+        mov     [edi+2], al
+        add     edi, 3          ; next 3-byte pixel
+        sub     esi, 1
+        jnz     .loop
+        pop     ebx
+        ret
+.ycck2rgb:
+; conversion is needed
+        mov     esi, [eax+Image.Width]
+        imul    esi, [eax+Image.Height]
+        push    ebx
+        push    esi             ; pixel counter lives on the stack
+        mov     edi, [eax+Image.Data]   ; edi = output pointer
+        mov     esi, edi        ; esi = input pointer (same buffer)
+; N.B. Speed optimization makes sense here.
+align 16
+.kloop:
+;       mov     ebx, [esi]
+;       mov     edx, ebx
+;       mov     ecx, ebx
+;       movzx   ebx, bl         ; ebx = Y
+;       shr     edx, 16
+;       mov     eax, ebx
+;       movzx   edx, dl         ; edx = Cr
+;       movzx   ecx, ch         ; ecx = Cb
+        movzx   ebx, byte [esi]         ; ebx = Y
+        movzx   ecx, byte [esi+1]       ; ecx = Cb
+        mov     eax, ebx
+        movzx   edx, byte [esi+2]       ; edx = Cr
+; B = Y + color_table_1[Cb]
+        add     eax, [color_table_1+ecx*4]
+        mov     ebp, [color_table_2+ecx*4]
+        cmp     eax, 80000000h
+        sbb     ecx, ecx
+        and     eax, ecx
+        add     ebp, [color_table_3+edx*4]
+        cmp     eax, 0x100
+        sbb     ecx, ecx
+        not     ecx
+        sar     ebp, 16
+        or      eax, ecx
+        xor     al, 0xFF        ; invert the clamped channel
+        mul     byte [esi+3]    ; ax = inverted channel * fourth byte of the pixel
+        add     al, ah          ; the next four instructions leave ah = ax/255, rounded
+        adc     ah, 0
+        add     al, 80h
+        adc     ah, 0
+        mov     byte [edi], ah
+; G = Y + color_table_2[Cb] + color_table_3[Cr]
+        lea     eax, [ebx+ebp]
+        cmp     eax, 80000000h
+        sbb     ecx, ecx
+        and     eax, ecx
+        cmp     eax, 0x100
+        sbb     ecx, ecx
+        not     ecx
+        or      eax, ecx
+        xor     al, 0xFF
+        mul     byte [esi+3]
+        add     al, ah
+        adc     ah, 0
+        add     al, 80h
+        adc     ah, 0
+        mov     byte [edi+1], ah
+; R = Y + color_table_4[Cr]
+        mov     eax, ebx
+        add     eax, [color_table_4+edx*4]
+        cmp     eax, 80000000h
+        sbb     ecx, ecx
+        and     eax, ecx
+        cmp     eax, 0x100
+        sbb     ecx, ecx
+        not     ecx
+        or      eax, ecx
+        xor     al, 0xFF
+        mul     byte [esi+3]
+        add     al, ah
+        adc     ah, 0
+        add     al, 80h
+        adc     ah, 0
+        mov     byte [edi+2], ah
+        add     esi, 4
+        add     edi, 4 ;3       ; output keeps 4 bytes per pixel; byte 3 is left untouched
+        sub     dword [esp], 1
+        jnz     .kloop
+        pop     eax
+        pop     ebx
+; release some memory - must succeed because we decrease size
+;       add     ecx, 44+1
+;       mov     edx, ebx
+;       push    68
+;       pop     eax
+;       push    20
+;       pop     ebx
+;       int     0x40
+;       mov     ebx, eax
+        ret
+; Decodes one data unit, that is, 8*8 block,
+; from input stream, given by pointer esi and length ebp
+; N.B. Speed optimization has sense here.
+align 16
+decode_data_unit:
+; edx -> component data
+        cmp     [ebx+jpeg.work.progressive], 0
+        jz      @f
+        mov     edi, [edx+20]
+        add     dword [edx+20], 64*2
+        jmp     .coeff_decoded
+@@:
+        lea     edi, [ebx+jpeg.work.dct_coeff]
+        mov     ecx, 64*2/4
+        xor     eax, eax
+        rep     stosd
+        mov     edi, zigzag+1
+        mov     ecx, [ebx+jpeg.work.huffman_bits]
+; read DC coefficient
+        push    ebx
+        mov     eax, [edx+16]
+        push    edx
+        get_huffman_code 2,3
+        get_bits 2,3,true
+        pop     ebx
+        add     eax, [edx+48]
+        mov     [ebx+jpeg.work.dct_coeff], ax
+        mov     [edx+48], ax
+; read AC coefficients
+        push    ebx
+@@:
+        mov     eax, [edx+20]
+        push    edx
+        get_huffman_code 2,3
+        shr     eax, 4
+        and     ebx, 15
+        jz      .band
+        add     edi, eax
+        cmp     edi, zigzag+64
+        jae     .eof_pop2
+        get_bits 2,3,true
+        movzx   ebx, byte [edi]
+        add     ebx, [esp]
+        mov     [jpeg.work.dct_coeff+ebx], ax
+        add     edi, 1
+        cmp     edi, zigzag+64
+        jb      @b
+        jmp     .do_idct
+.band:
+        pop     edx
+        cmp     al, 15
+        jnz     .do_idct
+        add     edi, 16
+        cmp     edi, zigzag+64
+        jb      @b
+;       jmp     .eof_pop1
+.do_idct:
+        pop     ebx
+        lea     edi, [ebx+jpeg.work.dct_coeff]
+        mov     [ebx+jpeg.work.huffman_bits], ecx
+; coefficients loaded, now IDCT
+.coeff_decoded:
+        mov     eax, [edx+12]
+        add     ebx, jpeg.work.idct_tmp_area
+        push    8
+.idct_loop1:
+        mov     cx, word [edi+1*16]
+repeat 6
+        or      cx, word [edi+(%+1)*16]
+end repeat
+        jnz     .real_transform
+        fild    word [edi]
+        fmul    dword [eax]
+        fstp    dword [ebx]
+        mov     ecx, [ebx]
+repeat 7
+        mov     [ebx+%*32], ecx
+end repeat
+        jmp     .idct_next1
+.real_transform:
+; S0,...,S7 - transformed values, s0,...,s7 - sought-for values
+; S0,...,S7 are dequantized;
+; dequantization table elements were multiplied to [idct_pre_table],
+; so S0,S1,... later denote S0/2\sqrt{2},S1*\cos{\pi/16}/2,...
+;       sqrt2 = \sqrt{2}, cos = 2\cos{\pi/8},
+;       cos_sum = -2(\cos{\pi/8}+\cos{3\pi/8}), cos_diff = 2(\cos{\pi/8}-\cos{3\pi/8})
+; Now formulas:
+; s0 = ((S0+S4)+(S2+S6)) + ((S1+S7)+(S3+S5))
+; s7 = ((S0+S4)+(S2+S6)) - ((S1+S7)+(S3+S5))
+; val0 = ((cos-1)S1-(cos+cos_sum+1)S3+(cos+cos_sum-1)S5-(cos+1)S7)
+; s1 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) + val0
+; s6 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) - val0
+; val1 = (S1+S7-S3-S5)sqrt2 - val0
+; s2 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) + val1
+; s5 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) - val1
+; val2 = (S1-S7)cos_diff - (S1-S3+S5-S7)cos + val1
+; s3 = ((S0+S4)-(S2+S6)) - val2
+; s4 = ((S0+S4)-(S2+S6)) + val2
+        fild    word [edi+3*16]
+        fmul    dword [eax+3*32]
+        fild    word [edi+5*16]
+        fmul    dword [eax+5*32]        ; st0=S5,st1=S3
+        fadd    st1,st0
+        fadd    st0,st0
+        fsub    st0,st1         ; st0=S5-S3,st1=S5+S3
+        fild    word [edi+1*16]
+        fmul    dword [eax+1*32]
+        fild    word [edi+7*16]
+        fmul    dword [eax+7*32]        ; st0=S7,st1=S1
+        fsub    st1,st0
+        fadd    st0,st0
+        fadd    st0,st1         ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
+        fadd    st3,st0
+        fadd    st0,st0
+        fsub    st0,st3         ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
+        fmul    [idct_sqrt2]
+        fld     st2
+        fadd    st0,st2
+        fmul    [idct_cos]      ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
+                                ; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
+        fxch    st2
+        fmul    [idct_cos_diff]
+        fsub    st0,st2         ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
+        fxch    st3
+        fmul    [idct_cos_sum]
+        fadd    st0,st2         ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
+        fsub    st0,st4         ; st0=val0
+        fsub    st1,st0         ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
+                                ; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
+        fxch    st2
+        fstp    st0
+        fadd    st2,st0         ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
+        fild    word [edi+0*16]
+        fmul    dword [eax+0*32]
+        fild    word [edi+4*16]
+        fmul    dword [eax+4*32]        ; st0=S4,st1=S0
+        fsub    st1,st0
+        fadd    st0,st0
+        fadd    st0,st1         ; st0=S0+S4,st1=S0-S4
+        fild    word [edi+6*16]
+        fmul    dword [eax+6*32]
+        fild    word [edi+2*16]
+        fmul    dword [eax+2*32]        ; st0=S2,st1=S6
+        fadd    st1,st0
+        fadd    st0,st0
+        fsub    st0,st1         ; st0=S2-S6,st1=S2+S6
+        fmul    [idct_sqrt2]
+        fsub    st0,st1
+        fsub    st3,st0
+        fadd    st0,st0
+        fadd    st0,st3         ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
+                                ; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
+        fxch    st1
+        fsub    st2,st0
+        fadd    st0,st0
+        fadd    st0,st2         ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
+                                ; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
+                                ; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
+        fsubr   st7,st0
+        fadd    st0,st0
+        fsub    st0,st7
+        fstp    dword [ebx+0*32]
+        fsubr   st4,st0
+        fadd    st0,st0
+        fsub    st0,st4
+        fstp    dword [ebx+1*32]
+        fadd    st4,st0
+        fadd    st0,st0
+        fsub    st0,st4
+        fstp    dword [ebx+3*32]
+        fsubr   st1,st0
+        fadd    st0,st0
+        fsub    st0,st1
+        fstp    dword [ebx+2*32]
+        fstp    dword [ebx+5*32]
+        fstp    dword [ebx+6*32]
+        fstp    dword [ebx+4*32]
+        fstp    dword [ebx+7*32]
+.idct_next1:
+        add     ebx, 4
+        add     edi, 2
+        add     eax, 4
+        sub     dword [esp], 1
+        jnz     .idct_loop1
+        pop     ecx
+        sub     ebx, 8*4
+        mov     ecx, 8
+.idct_loop2:
+        fld     dword [ebx+3*4]
+        fld     dword [ebx+5*4]
+        fadd    st1,st0
+        fadd    st0,st0
+        fsub    st0,st1         ; st0=S5-S3,st1=S5+S3
+        fld     dword [ebx+1*4]
+        fld     dword [ebx+7*4]
+        fsub    st1,st0
+        fadd    st0,st0
+        fadd    st0,st1         ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
+        fadd    st3,st0
+        fadd    st0,st0
+        fsub    st0,st3         ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
+        fmul    [idct_sqrt2]
+        fld     st2
+        fadd    st0,st2
+        fmul    [idct_cos]      ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
+                                ; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
+        fxch    st2
+        fmul    [idct_cos_diff]
+        fsub    st0,st2         ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
+        fxch    st3
+        fmul    [idct_cos_sum]
+        fadd    st0,st2         ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
+        fsub    st0,st4         ; st0=val0
+        fsub    st1,st0         ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
+                                ; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
+        fxch    st2
+        fstp    st0
+        fadd    st2,st0         ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
+        fld     dword [ebx+0*4]
+        fld     dword [ebx+4*4]
+        fsub    st1,st0
+        fadd    st0,st0
+        fadd    st0,st1         ; st0=S0+S4,st1=S0-S4
+        fld     dword [ebx+6*4]
+        fld     dword [ebx+2*4]
+        fadd    st1,st0
+        fadd    st0,st0
+        fsub    st0,st1         ; st0=S2-S6,st1=S2+S6
+        fmul    [idct_sqrt2]
+        fsub    st0,st1
+        fsub    st3,st0
+        fadd    st0,st0
+        fadd    st0,st3         ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
+                                ; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
+        fxch    st1
+        fsub    st2,st0
+        fadd    st0,st0
+        fadd    st0,st2         ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
+                                ; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
+                                ; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
+        fsubr   st7,st0
+        fadd    st0,st0
+        fsub    st0,st7
+        fistp   dword [ebx+0*4]
+        fsubr   st4,st0
+        fadd    st0,st0
+        fsub    st0,st4
+        fistp   dword [ebx+1*4]
+        fadd    st4,st0
+        fadd    st0,st0
+        fsub    st0,st4
+        fistp   dword [ebx+3*4]
+        fsubr   st1,st0
+        fadd    st0,st0
+        fsub    st0,st1
+        fistp   dword [ebx+2*4]
+        fistp   dword [ebx+5*4]
+        fistp   dword [ebx+6*4]
+        fistp   dword [ebx+4*4]
+        fistp   dword [ebx+7*4]
+        add     ebx, 32
+        sub     ecx, 1
+        jnz     .idct_loop2
+        sub     ebx, 32*8
+        mov     ecx, 64
+        lea     edi, [ebx - jpeg.work.idct_tmp_area + jpeg.work.decoded_data - 1]
+        push    esi
+.idct_loop3:
+        mov     eax, [ebx]
+        add     ebx, 4
+        add     eax, 80h
+        cmp     eax, 80000000h
+        sbb     esi, esi
+        add     edi, 1
+        and     eax, esi
+        cmp     eax, 100h
+        sbb     esi, esi
+        not     esi
+        or      eax, esi
+        sub     al, [edx+51]
+        sub     ecx, 1
+        mov     [edi], al
+        jnz     .idct_loop3
+        pop     esi
+        sub     ebx, 64*4 + jpeg.work.idct_tmp_area
+; done
+        ret
+.eof_pop3:                      ; error exit, entry for 3 extra dwords on the stack: falls through three pops
+        pop     ebx             ; drop one dword (ebx is overwritten again by the next pop)
+.eof_pop2:                      ; error exit, entry for 2 extra dwords on the stack
+        pop     ebx             ; drop one dword
+.eof_pop1:                      ; error exit, entry for 1 extra dword on the stack
+        pop     ebx             ; last pop: this value stays in ebx and is used as the jpeg.work base below
+.eof_pop0:                      ; error exit, no pops: ebx is used as-is (presumably already the jpeg.work base - confirm at jump sites)
+; EOF or incorrect data during scanning
+        mov     esp, [ebx + jpeg.work._esp]    ; reload esp from the _esp field of the work area (presumably saved at decoder entry - confirm)
+        jmp     img.decode.jpg.end             ; leave through the decoder's common end label
+img.encode.jpg:                 ; JPEG encoder entry point: stub, performs no encoding work
+        xor     eax, eax        ; eax = 0 (presumably reported to the caller as "nothing encoded" - confirm against libimg's encoder contract)
+        ret     8               ; return and remove 8 bytes of arguments from the stack (callee cleans up)
+zigzag:                         ; 64-byte table generated at assembly time: entry k = 2*(x+y*8) for the k-th position of the zigzag scan
+; (x,y) -> 2*(x+y*8); x is the stride-1 index and y the stride-8 index of an 8x8 block; factor 2 presumably = size of a word entry (cf. dct_coeff rw 64) - confirm at the use site
+repeat 8                        ; anti-diagonals x+y = 0..7: 1..8 entries each, 36 bytes in total
+        .cur = %                ; save the outer counter (1..8); inside the nested repeat, % means the inner counter
+        if .cur and 1           ; odd-numbered diagonal
+                repeat %        ; count = outer counter = number of entries on this diagonal
+                        db      2*((%-1) + (.cur-%)*8)    ; x = %-1 runs upward from 0, y = .cur-% runs downward to 0
+                end repeat
+        else                    ; even-numbered diagonal: same cells walked in the opposite direction
+                repeat %
+                        db      2*((.cur-%) + (%-1)*8)    ; x = .cur-% runs downward to 0, y = %-1 runs upward from 0
+                end repeat
+        end if
+end repeat
+repeat 7                        ; anti-diagonals x+y = 8..14: 7..1 entries each, 28 bytes in total
+        .cur = %                ; save the outer counter (1..7); this diagonal has x+y = 7+.cur
+        if .cur and 1           ; odd-numbered diagonal
+                repeat 8-%      ; count = 8 - outer counter
+                        db      2*((%+.cur-1) + (8-%)*8)    ; x = %+.cur-1 runs upward to 7, y = 8-% runs downward from 7
+                end repeat
+        else                    ; even-numbered diagonal: opposite direction
+                repeat 8-%
+                        db      2*((8-%) + (%+.cur-1)*8)    ; x = 8-% runs downward from 7, y = %+.cur-1 runs upward to 7
+                end repeat
+        end if
+end repeat
+align 4                         ; dword-align the single-precision constants that follow
+idct_pre_table:                 ; eight single-precision scale factors c_0..c_7, in index order
+; c_0 = 1/(2\sqrt{2}), c_i = cos(i*\pi/16)/2 for i = 1..7 (the literals below match these formulas to the printed precision)
+        dd      0.35355339, 0.49039264, 0.461939766, 0.41573481    ; c_0, c_1, c_2, c_3
+        dd      0.35355339, 0.27778512, 0.19134172, 0.09754516     ; c_4, c_5, c_6, c_7
+idct_sqrt2      dd      1.41421356      ; \sqrt{2}; fmul operand in .idct_loop2 (odd-part difference term and the S2-S6 term)
+idct_cos        dd      1.847759065     ; 2\cos{\pi/8}; fmul operand in .idct_loop2, scales (S1-S3+S5-S7)
+idct_cos_sum    dd      -2.61312593     ; -2(\cos{\pi/8} + \cos{3\pi/8}); note the negative sign; fmul operand in .idct_loop2, scales (S5-S3)
+idct_cos_diff   dd      1.08239220      ; 2(\cos{\pi/8} - \cos{3\pi/8}); fmul operand in .idct_loop2, scales (S1-S7)
+;---------------------------------------------------------------------

 /programs/develop/libraries/libs-dev/libimg/jpeg/jpeg.inc
 ,0 → 1,96
+;;================================================================================================;;
+;;//// jpeg.inc //// (c) diamond, 2008-2009 //////////////////////////////////////////////////////;;
+;;================================================================================================;;
+;;                                                                                                ;;
+;; This file is part of Common development libraries (Libs-Dev).                                  ;;
+;;                                                                                                ;;
+;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
+;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
+;; of the License, or (at your option) any later version.                                         ;;
+;;                                                                                                ;;
+;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without  ;;
+;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  ;;
+;; Lesser General Public License for more details.                                                ;;
+;;                                                                                                ;;
+;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev.  ;;
+;; If not, see <http://www.gnu.org/licenses/>.                                                    ;;
+;;                                                                                                ;;
+;;================================================================================================;;
+struct jpeg.work        ; working area for JPEG handling; jpeg.asm addresses fields relative to a base register, e.g. [ebx + jpeg.work._esp]
+image                   dd      ?       ; dword; presumably the pointer/handle of the image being decoded - confirm in jpeg.asm
+; progressive JPEG?
+progressive             db      ?       ; byte flag
+; one component in the scan?
+not_interleaved         db      ?       ; byte flag
+; Adobe YCCK file?
+adobe_ycck              db      ?       ; byte flag
+                        rb      1       ; unnamed reserved byte; rounds the three flags above up to 4 bytes
+; parameters for progressive scan
+ScanStart               db      ?       ; presumably the first coefficient index of the scan (JPEG Ss) - confirm
+ScanEnd                 db      ?       ; presumably the last coefficient index of the scan (JPEG Se) - confirm
+ApproxPosLow            db      ?       ; presumably successive-approximation low bit position (JPEG Al) - confirm
+ApproxPosHigh           db      ?       ; presumably successive-approximation high bit position (JPEG Ah) - confirm
+; restart interval
+restart_interval        dd      ?
+decoded_MCUs            dd      ?       ; dword counter; presumably MCUs decoded so far, used with restart_interval - confirm
+_esp                    dd      ?       ; stack pointer value; reloaded into esp by the .eof_pop0 error exit in jpeg.asm
+; components information, up to 4 components
+; db ComponentIdentifier, db V, db H, db VFactor, db HFactor, db QuantizationTable
+components              rb      4*6     ; 4 records of 6 bytes, laid out as listed in the line above
+max_v                   db      ?       ; presumably the largest V among the components - confirm
+max_h                   db      ?       ; presumably the largest H among the components - confirm
+cur_rst_marker          db      ?       ; presumably the next expected RSTn marker number - confirm
+                        db      ?       ; unnamed byte; rounds the three byte fields above up to 4 bytes
+huffman_bits            dd      ?
+block_width     dd      ?
+block_height    dd      ?
+block_delta_x   dd      ?
+block_delta_y   dd      ?
+cur_block_dx    dd      ?
+cur_block_dy    dd      ?
+x_num_blocks    dd      ?
+y_num_blocks    dd      ?
+delta_x         dd      ?
+delta_y         dd      ?
+pixel_size      dd      ?
+line_size       dd      ?
+cur_x           dd      ?
+cur_y           dd      ?
+max_x           dd      ?
+max_y           dd      ?
+cur_out_ptr     dd      ?
+dct_buffer      dd      ?
+dct_buffer_size dd      ?
+;ns                     dd      ?
+; Layout of one 56-byte cur_components record (offsets are relative to the record start):
+; +0: db V, db H, db VFactor, db HFactor, dd HIncrement, dd VIncrement,
+; +12: dd QuantizationTable, dd DCTable, dd ACTable,
+; +24: dd width/HFactor, dd width/HFactor-8k, dd HFactor+1-(width%HFactor),
+; +36: dd height/VFactor, dd height/VFactor-8m, dd VFactor+1-(height%VFactor),
+; +48: dw DCPrediction, db ?, db (0 for Y, 80h for Cb,Cr), dd ComponentOffset
+cur_components          rb      4*56    ; 4 records of 56 bytes; the byte at record offset 51 is the 0/80h value listed at +48 above
+cur_components_end      dd      ?       ; presumably the end address of the records in use for the current scan - confirm
+; Fourier coefficients
+dct_coeff               rw      64      ; 64 words, one per coefficient of an 8x8 block
+; Temporary space for IDCT
+idct_tmp_area           rd      64      ; 64 dwords (8 rows of 8); .idct_loop2 in jpeg.asm leaves 32-bit integers here via fistp
+; decoded block 8*8
+decoded_data            rb      8*8     ; 64 bytes written by .idct_loop3 in jpeg.asm, one per idct_tmp_area dword
+; up to 4 quantization tables
+quant_tables            rd      4*64    ; 4 tables of 64 dwords each
+quant_tables_defined    rb      4       ; one byte per quantization table; presumably nonzero once the table was loaded - confirm
+; Huffman tables
+dc_huffman_defined      rb      4       ; one byte per DC Huffman table; presumably a "defined" flag - confirm
+ac_huffman_defined      rb      4       ; one byte per AC Huffman table; presumably a "defined" flag - confirm
+; up to 4 DC Huffman tables
+;dc_huffman             rd      4*256*2
+; up to 4 AC Huffman tables
+;ac_huffman             rd      4*256*2
+max_hufftable_size = (256 + (9+128)*16)*2      ; = 4896 bytes reserved per Huffman table
+dc_huffman              rb      4*max_hufftable_size    ; storage for up to 4 DC Huffman tables
+ac_huffman              rb      4*max_hufftable_size    ; storage for up to 4 AC Huffman tables
+ends

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 998 → Rev 999