Subversion Repositories Kolibri OS

Rev

Rev 4060 | Blame | Compare with Previous | Last modification | View Log | Download | RSS feed

  1. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2. ;;                                                                 ;;
  3. ;; Copyright (C) KolibriOS team 2004-2013. All rights reserved.    ;;
  4. ;; Distributed under terms of the GNU General Public License       ;;
  5. ;;                                                                 ;;
  6. ;;   Written by CleverMouse                                        ;;
  7. ;;                                                                 ;;
  8. ;;         GNU GENERAL PUBLIC LICENSE                              ;;
  9. ;;          Version 2, June 1991                                   ;;
  10. ;;                                                                 ;;
  11. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  12.  
  13.  
  14. get_next_byte:
  15. ; Fetch the next character from the packet, converting it to cp866 if necessary.
  16. ; Input: esi = pointer to the next unread byte, edx = limit (end) of the data.
  17. ; Output: CF set and al = (translated) character, or CF cleared if no complete character is available.
  18.         mov     eax, [encoding]                 ; eax = index of the current encoding
  19.         jmp     [get_byte_table+eax*4]          ; tail-jump to the decoder; its ret returns to our caller
  20.  
  get_byte_cp866:
  ; cp866 is the base encoding, so the byte is handed over untranslated.
  ; In:  esi = read pointer, edx = limit of the data.
  ; Out: CF set, al = next byte and esi advanced by one;
  ;      or CF cleared (nothing else touched) when esi >= edx.
          cmp     esi, edx
          jb      .have_data
          ret                             ; no data: CF is clear after the cmp
  .have_data:
          lodsb                           ; lodsb does not touch flags, so CF=1 from the cmp survives
          ret
  27.  
  get_byte_cp1251:
  ; Read one cp1251 byte and translate it to cp866.
  ; In:  esi = read pointer, edx = limit of the data.
  ; Out: CF set and al = cp866 character (esi advanced by one),
  ;      or CF cleared when esi >= edx.
          cmp     esi, edx
          jb      .fetch
          ret                             ; no data: CF is clear after the cmp
  .fetch:
          lodsb
          cmp     al, 0x80
          jae     .translate
          ret                             ; ASCII passes through; al < 0x80 already left CF=1
  .translate:
          and     eax, 0x7F               ; eax = index into the table (byte - 0x80)
          mov     al, [cp1251_table+eax]
          stc                             ; the and cleared CF, so set it again
          ret
  40.  
  41. get_byte_utf8:
  42. ; UTF-8 decoding is slightly complicated.
  43. ; One character can occupy one or more bytes.
  44. ; A packet boundary can theoretically fall anywhere in the data,
  45. ; so this procedure keeps internal state between calls and handles
  46. ; one byte at a time, looping until a character is read or the packet is over.
  47. ; Globally, there are two distinct tasks: decode the byte sequence to a Unicode char
  48. ; and convert this Unicode char to our base encoding (that is cp866).
  49. ; 1. Check that there is data left; if not, return with CF cleared (left so by the cmp).
  50.         cmp     esi, edx
  51.         jae     .nothing
  52. ; 2. Load the byte and keep a zero-extended copy of it in ecx.
  53.         lodsb
  54.         movzx   ecx, al
  55. ; 3. Bytes in a UTF-8 sequence can be of any of three types.
  56. ; If the most significant bit is cleared, the sequence is one byte and a usual ASCII char.
  57. ; The first byte of a sequence must be 11xxxxxx, other bytes are 10yyyyyy.
  58.         and     al, 0xC0                ; keep the two top bits; SF = bit 7, PF = parity of the result
  59.         jns     .single_byte            ; 0xxxxxxx: ASCII
  60.         jp      .first_byte             ; 0xC0 has two bits set (even parity), 0x80 has only one
  61. ; 4. This byte is not the first one in a UTF-8 sequence.
  62. ; 4a. Check that a sequence was started. If not, it is an invalid byte
  63. ; and we simply ignore it.
  64.         cmp     [utf8_bytes_rest], 0
  65.         jz      get_byte_utf8
  66. ; 4b. Otherwise, it really is the next byte and it gives six more bits of the char.
  67.         mov     eax, [utf8_char]
  68.         shl     eax, 6                  ; make room for the new bits
  69.         lea     eax, [eax+ecx-0x80]     ; ecx is 0x80..0xBF here, so this adds its low six bits
  70. ; 4c. Decrement the number of bytes remaining in the sequence.
  71. ; If it goes to zero, the character is complete, so return it.
  72.         dec     [utf8_bytes_rest]
  73.         jz      .got_char
  74.         mov     [utf8_char], eax        ; not complete yet: save the partial value and read on
  75.         jmp     get_byte_utf8
  76. ; 5. If the byte is the first one in a UTF-8 sequence, count its leading 1s: that count is
  77. ; the total number of bytes in the sequence, so one less is stored as the number of bytes
  78. ; still expected; the bits after the leading 1s are the leading bits of the character.
  79. .first_byte:
  80.         mov     eax, -1
  81. @@:
  82.         inc     eax                     ; eax = (leading 1s consumed so far) - 1
  83.         add     cl, cl                  ; shift the byte left by one; SF = its new top bit
  84.         js      @b
  85.         mov     [utf8_bytes_rest], eax
  86.         xchg    eax, ecx                ; al = shifted byte, ecx = number of bytes still expected
  87.         inc     ecx                     ; cl = number of left shifts done by the loop above
  88.         shr     al, cl                  ; shift back: the leading 1s are gone, payload bits remain
  89.         mov     [utf8_char], eax        ; upper bits of eax are zero because of the movzx in step 2
  90.         jmp     get_byte_utf8
  91. ; 6. If the byte is an ASCII char, it is the character.
  92. .single_byte:
  93.         xchg    eax, ecx                ; eax = the zero-extended byte
  94. .got_char:
  95. ; We got the character, now abandon a possible sequence in progress.
  96.         and     [utf8_bytes_rest], 0
  97. ; Now the second task. The Unicode character is in eax, and now we shall convert it
  98. ; to cp866.
  99.         cmp     eax, 0x80
  100.         jb      .done                   ; ASCII is returned as is; CF is set by the cmp
  101. ; 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF, 0x401 -> 0xF0, 0x451 -> 0xF1
  102.         cmp     eax, 0x401
  103.         jz      .YO
  104.         cmp     eax, 0x451
  105.         jz      .yo
  106.         cmp     eax, 0x410
  107.         jb      .unrecognized
  108.         cmp     eax, 0x440
  109.         jb      .part1
  110.         cmp     eax, 0x450
  111.         jae     .unrecognized
  112.         sub     al, (0x40-0xE0) and 0xFF        ; al 0x40-0x4F -> 0xE0-0xEF; the borrow leaves CF set
  113.         ret
  114. .part1:
  115.         sub     al, 0x10-0x80           ; al 0x10-0x3F -> 0x80-0xAF; the borrow leaves CF set
  116. .nothing:                               ; reached from step 1 with CF cleared
  117. .done:
  118.         ret
  119. .unrecognized:                          ; no cp866 equivalent handled here: substitute '?'
  120.         mov     al, '?'
  121.         stc
  122.         ret
  123. .YO:                                    ; U+0401 -> cp866 0xF0
  124.         mov     al, 0xF0
  125.         stc
  126.         ret
  127. .yo:                                    ; U+0451 -> cp866 0xF1
  128.         mov     al, 0xF1
  129.         stc
  130.         ret
  131.  
  132.  
  133.  
  134. recode_to_cp866:                        ; target is cp866 itself, so the text is copied untranslated
  135.         rep     movsb                   ; copy ecx bytes from [esi] to [edi]
  136.         ret
  137.  
  recode_to_cp1251:
  ; Convert ecx bytes of cp866 text at esi to cp1251, storing the result at edi.
  ; Bytes below 0x80 are copied unchanged; the others are looked up in cp866_table.
  ; esi and edi end up past the processed data, ecx ends at zero, eax is clobbered.
          xor     eax, eax                ; upper bits stay zero so eax can index the table
          jecxz   .done
  .next:
          lodsb
          cmp     al, 0x80
          jae     .high
  .store:
          stosb
          loop    .next
  .done:
          ret
  .high:
          mov     al, [cp866_table-0x80+eax]      ; table starts at the entry for 0x80
          jmp     .store
  150.  
  recode_to_utf8:
  ; Convert ecx bytes of cp866 text at esi to UTF-8, storing the result at edi.
  ; Bytes below 0x80 produce one output byte; every other byte produces the
  ; two-byte sequence found in utf8_table, so the output may be up to twice
  ; as long as the input. esi/edi are advanced, ecx ends at zero, eax is clobbered.
          jecxz   .done
  .next:
          lodsb
          cmp     al, 0x80
          jae     .two_bytes
          stosb                           ; ASCII is stored as is
  .advance:
          loop    .next
  .done:
          ret
  .two_bytes:
          and     eax, 0x7F               ; eax = table index (byte - 0x80)
          mov     ax, [utf8_table+eax*2]
          stosw
          jmp     .advance
  167.  
  168. recode:                                 ; convert ecx bytes at esi from cp866 to the current encoding, output at edi
  169.         mov     eax, [encoding]         ; eax = index of the current encoding
  170.         jmp     [recode_proc+eax*4]     ; tail-jump to the converter; its ret returns to our caller
  171.  
  172.  
  173.  
  174. encoding        dd      UTF8    ; current encoding, used as an index into the two tables below (UTF8 is defined elsewhere; presumably 2, matching the table order - verify)
  175. recode_proc     dd      recode_to_cp866, recode_to_cp1251, recode_to_utf8       ; converters from cp866, one per encoding
  176. get_byte_table  dd      get_byte_cp866, get_byte_cp1251, get_byte_utf8          ; decoders to cp866, in the same order
  177.  
  178.  
  179. cp1251_table:           ; cp866 code for each cp1251 byte 0x80-0xFF, indexed by (byte - 0x80); '?' = no equivalent in the table
  180.   db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 0x80-0x8F
  181.   db '?','?','?','?','?',$F9,'?','?' , '?','?','?','?','?','?','?','?' ; 0x90-0x9F
  182.   db '?',$F6,$F7,'?',$FD,'?','?','?' , $F0,'?',$F2,'?','?','?','?',$F4 ; 0xA0-0xAF
  183.   db $F8,'?','?','?','?','?','?',$FA , $F1,$FC,$F3,'?','?','?','?',$F5 ; 0xB0-0xBF
  184.   db $80,$81,$82,$83,$84,$85,$86,$87 , $88,$89,$8A,$8B,$8C,$8D,$8E,$8F ; 0xC0-0xCF
  185.   db $90,$91,$92,$93,$94,$95,$96,$97 , $98,$99,$9A,$9B,$9C,$9D,$9E,$9F ; 0xD0-0xDF
  186.   db $A0,$A1,$A2,$A3,$A4,$A5,$A6,$A7 , $A8,$A9,$AA,$AB,$AC,$AD,$AE,$AF ; 0xE0-0xEF
  187.   db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; 0xF0-0xFF
  188.  
  189. ;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F    (low nibble of the cp1251 byte)
  190.  
  191. utf8_table:                     ; two-byte UTF-8 sequence for each cp866 byte 0x80-0xFF, one word per byte, indexed by (byte - 0x80)
  192.         times 80h dw 0x98C3     ; default placeholder: bytes C3 98 in memory, the UTF-8 encoding of U+00D8
  193.  
  194. ; 0x80-0xAF -> 0x90D0-0xBFD0 (bytes D0 90 .. D0 BF in memory, i.e. U+0410-U+043F)
  195. repeat 0x30
  196.         store byte 0xD0 at utf8_table+2*(%-1)           ; first byte of the entry (% counts from 1)
  197.         store byte 0x90+%-1 at utf8_table+2*%-1         ; second byte of the same entry
  198. end repeat
  199.  
  200. ; 0xE0-0xEF -> 0x80D1-0x8FD1 (bytes D1 80 .. D1 8F in memory, i.e. U+0440-U+044F)
  201. repeat 0x10
  202.         store byte 0xD1 at utf8_table+2*(0xE0-0x80+%-1)         ; first byte of the entry
  203.         store byte 0x80+%-1 at utf8_table+2*(0xE0-0x80+%)-1     ; second byte of the same entry
  204. end repeat
  205.  
  206. ; 0xF0 -> 0x81D0, 0xF1 -> 0x91D1 (bytes D0 81 = U+0401 and D1 91 = U+0451)
  207.         store dword 0x91D181D0 at utf8_table+2*(0xF0-0x80)      ; both adjacent entries patched with one dword
  208.  
  209. cp866_table:            ; cp1251 code for each cp866 byte 0x80-0xFF, indexed by (byte - 0x80); '?' = no equivalent in the table
  210.   db $C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7 , $C8,$C9,$CA,$CB,$CC,$CD,$CE,$CF ; 0x80-0x8F
  211.   db $D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7 , $D8,$D9,$DA,$DB,$DC,$DD,$DE,$DF ; 0x90-0x9F
  212.   db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; 0xA0-0xAF
  213.   db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 0xB0-0xBF
  214.   db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 0xC0-0xCF
  215.   db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 0xD0-0xDF
  216.   db $F0,$F1,$F2,$F3,$F4,$F5,$F6,$F7 , $F8,$F9,$FA,$FB,$FC,$FD,$FE,$FF ; 0xE0-0xEF
  217.   db $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2 , $B0,$95,$B7,'?',$B9,$A4,'?','?' ; 0xF0-0xFF
  218.  
  219. ;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F    (low nibble of the cp866 byte)
  220.  
  221.