Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2. ;;                                                                 ;;
  3. ;; Copyright (C) KolibriOS team 2004-2013. All rights reserved.    ;;
  4. ;; Distributed under terms of the GNU General Public License       ;;
  5. ;;                                                                 ;;
  6. ;;   Written by CleverMouse                                        ;;
  7. ;;                                                                 ;;
  8. ;;         GNU GENERAL PUBLIC LICENSE                              ;;
  9. ;;          Version 2, June 1991                                   ;;
  10. ;;                                                                 ;;
  11. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  12.  
  uglobal

  ; State of the UTF-8 decoder (get_byte_utf8), preserved between calls.
  utf8_bytes_rest rd 1                    ; continuation bytes still expected in the current sequence
  utf8_char       rd 1                    ; character bits collected so far from the current sequence

  endg
  19.  
  20.  
  21. ;get_next_byte:
  22. ;; Load next byte from the packet, translating to cp866 if necessary
  23. ;; At input esi = pointer to data, edx = limit of data
  24. ;; Output: CF set and the (translated) byte in al, or CF cleared if no data is left.
  25. ;        mov     eax, [encoding]
  26. ;        jmp     [get_byte_table+eax*4]
  27. ;
  28. ;get_byte_cp866:
  29. ;        cmp     esi, edx
  30. ;        jae     .nothing
  31. ;        lodsb
  32. ;.nothing:
  33. ;        ret
  34. ;
  35. ;get_byte_cp1251:
  36. ;        cmp     esi, edx
  37. ;        jae     .nothing
  38. ;        lodsb
  39. ;        cmp     al, 0x80
  40. ;        jb      @f
  41. ;        and     eax, 0x7F
  42. ;        mov     al, [cp1251_table+eax]
  43. ;@@:
  44. ;        stc
  45. ;.nothing:
  46. ;        ret
  47.  
  get_byte_utf8:
  ; Read the next character from a UTF-8 byte stream and convert it to cp866.
  ;
  ; In:   esi = pointer to data, edx = limit of data (bytes are read while esi < edx).
  ; Out:  CF set   - al = cp866 character; '?' if the character has no mapping here.
  ;       CF clear - the data ended before a complete character was read.
  ;       esi is advanced past every byte that was consumed.
  ; Uses: eax, ecx, flags.
  ; State: utf8_bytes_rest / utf8_char carry an unfinished sequence over to the
  ;       next call, because the boundary between packets can theoretically fall
  ;       anywhere inside a multi-byte character.
  ; NOTE(review): lodsb is used, so the direction flag is presumably clear on
  ;       entry - confirm against the callers.
  ;
  ; There are two distinct tasks: decode the byte sequence to a unicode char,
  ; then convert this unicode char to the base encoding (cp866).
  ;
  ; 1. Check that there is data. jae is taken with CF clear, which is exactly
  ; the "no character" result expected at .nothing.
          cmp     esi, edx
          jae     .nothing
  ; 2. Load the byte; keep an unmodified zero-extended copy in ecx.
          lodsb
          movzx   ecx, al
  ; 3. Classify the byte by its two top bits:
  ;      0xxxxxxx - SF clear                   -> single ASCII byte;
  ;      11xxxxxx - result 0xC0, even parity   -> first byte of a sequence;
  ;      10xxxxxx - result 0x80, odd parity    -> continuation byte.
          and     al, 0xC0
          jns     .single_byte
          jp      .first_byte
  ; 4. This byte is a continuation byte.
  ; 4a. If no sequence has been started, the byte is invalid: ignore it and
  ; go on with the next one.
          cmp     [utf8_bytes_rest], 0
          jz      get_byte_utf8
  ; 4b. Otherwise append its six payload bits to the character.
          mov     eax, [utf8_char]
          shl     eax, 6
          lea     eax, [eax+ecx-0x80]
  ; 4c. One byte less to wait for. When the counter reaches zero the character
  ; is complete; otherwise store the partial value and continue.
          dec     [utf8_bytes_rest]
          jz      .got_char
          mov     [utf8_char], eax
          jmp     get_byte_utf8
  ; 5. First byte of a sequence. Count the leading 1 bits: the count is the
  ; total length of the sequence, so (count - 1) continuation bytes follow.
  ; The remaining low bits are the leading bits of the character.
  .first_byte:
          mov     eax, -1
  @@:
          inc     eax
          add     cl, cl                  ; shift the next leading bit into the sign position
          js      @b
          mov     [utf8_bytes_rest], eax  ; eax = number of continuation bytes
          xchg    eax, ecx                ; al = payload bits shifted left, ecx = count
          inc     ecx
          shr     al, cl                  ; shift the payload bits back to the bottom
          mov     [utf8_char], eax
          jmp     get_byte_utf8
  ; 6. An ASCII byte is the character itself.
  .single_byte:
          xchg    eax, ecx
  .got_char:
  ; The character is in eax; abandon any sequence that was in progress.
          and     [utf8_bytes_rest], 0
  ; Second task: convert the unicode character in eax to cp866.
  ; ASCII passes through unchanged; jb is taken with CF set, as required.
          cmp     eax, 0x80
          jb      .done
  ; 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF, 0x401 -> 0xF0, 0x451 -> 0xF1
          cmp     eax, 0x401
          jz      .YO
          cmp     eax, 0x451
          jz      .yo
          cmp     eax, 0x410
          jb      .unrecognized
          cmp     eax, 0x440
          jb      .part1
          cmp     eax, 0x450
          jb      .part2
  ; 0x2500-0x259F are translated through the cp866_boxes table.
          cmp     eax, 0x25a0
          jae     .unrecognized
          sub     eax, 0x2500
          jb      .unrecognized
          mov     al, [cp866_boxes+eax]
  ; The sub above did not borrow, so CF is clear here; set it explicitly,
  ; otherwise the caller would take the decoded character for "no data".
          stc
          ret
  .part1:
  ; al is 0x10-0x3F; subtracting 0x90 wraps it to 0x80-0xAF and the borrow sets CF.
          sub     al, 0x10-0x80
  .nothing:
  .done:
          ret
  .part2:
  ; al is 0x40-0x4F; subtracting 0x60 wraps it to 0xE0-0xEF and the borrow sets CF.
          sub     al, (0x40-0xE0) and 0xFF
          ret
  .unrecognized:
          mov     al, '?'
          stc
          ret
  .YO:
          mov     al, 0xF0
          stc
          ret
  .yo:
          mov     al, 0xF1
          stc
          ret
  145.  
  146.  
  147.  
  148. ;recode_to_cp866:
  149. ;        rep     movsb
  150. ;        ret
  151. ;
  152. ;recode_to_cp1251:
  153. ;        xor     eax, eax
  154. ;        jecxz   .nothing
  155. ;  .loop:
  156. ;        lodsb
  157. ;        cmp     al,0x80
  158. ;        jb      @f
  159. ;        mov     al, [cp866_table-0x80+eax]
  160. ;    @@: stosb
  161. ;        loop    .loop
  162. ;  .nothing:
  163. ;        ret
  164.  
  recode_to_utf8:
  ; Convert a cp866 string to UTF-8.
  ; In:   esi = source, edi = destination, ecx = number of source bytes.
  ; Out:  esi and edi advanced past the processed data, ecx = 0.
  ; Uses: eax.
  ; Bytes below 0x80 are copied unchanged; every other byte is replaced by the
  ; two-byte entry of utf8_table, so the output may be up to twice as long as
  ; the input.
          jecxz   .finished
    .next_char:
          lodsb
          cmp     al, 0x80
          jae     .two_bytes
  ; ASCII: store as is.
          stosb
          loop    .next_char
          ret
    .two_bytes:
  ; Index the table with the low seven bits and store the whole word.
          and     eax, 0x7F
          mov     ax, [utf8_table+eax*2]
          stosw
          loop    .next_char
    .finished:
          ret
  181.  
  182. ;recode:
  183. ;        mov     eax, [encoding]
  184. ;        jmp     [recode_proc+eax*4]
  185.  
  186.  
  187.  
  188. ;encoding        dd      UTF8
  189. ;recode_proc     dd      recode_to_cp866, recode_to_cp1251, recode_to_utf8
  190. ;get_byte_table  dd      get_byte_cp866, get_byte_cp1251, get_byte_utf8
  191.  
  192.  
  193. ;cp1251_table:
  194. ;  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 8
  195. ;  db '?','?','?','?','?',$F9,'?','?' , '?','?','?','?','?','?','?','?' ; 9
  196. ;  db '?',$F6,$F7,'?',$FD,'?','?','?' , $F0,'?',$F2,'?','?','?','?',$F4 ; A
  197. ;  db $F8,'?','?','?','?','?','?',$FA , $F1,$FC,$F3,'?','?','?','?',$F5 ; B
  198. ;  db $80,$81,$82,$83,$84,$85,$86,$87 , $88,$89,$8A,$8B,$8C,$8D,$8E,$8F ; C
  199. ;  db $90,$91,$92,$93,$94,$95,$96,$97 , $98,$99,$9A,$9B,$9C,$9D,$9E,$9F ; D
  200. ;  db $A0,$A1,$A2,$A3,$A4,$A5,$A6,$A7 , $A8,$A9,$AA,$AB,$AC,$AD,$AE,$AF ; E
  201. ;  db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; F
  202.  
  203. ;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F
  204.  
  utf8_table:
  ; Two-byte UTF-8 sequences for cp866 codes 0x80-0xFF, one word per code,
  ; stored in the order the bytes go to the output stream.
  ; Codes without an entry of their own get the placeholder word 0x98C3
  ; (bytes C3 98).

  ; 0x80-0xAF -> D0 90 .. D0 BF
  repeat 0x30
          db      0xD0, 0x90+%-1
  end repeat

  ; 0xB0-0xDF -> placeholder
          times 0x30 dw 0x98C3

  ; 0xE0-0xEF -> D1 80 .. D1 8F
  repeat 0x10
          db      0xD1, 0x80+%-1
  end repeat

  ; 0xF0 -> D0 81, 0xF1 -> D1 91
          dd      0x91D181D0

  ; 0xF2-0xFF -> placeholder
          times 0x0E dw 0x98C3
  222.  
  223. ;cp866_table:
  224. ;  db $C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7 , $C8,$C9,$CA,$CB,$CC,$CD,$CE,$CF ; 8
  225. ;  db $D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7 , $D8,$D9,$DA,$DB,$DC,$DD,$DE,$DF ; 9
  226. ;  db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; A
  227. ;  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; B
  228. ;  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; C
  229. ;  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; D
  230. ;  db $F0,$F1,$F2,$F3,$F4,$F5,$F6,$F7 , $F8,$F9,$FA,$FB,$FC,$FD,$FE,$FF ; E
  231. ;  db $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2 , $B0,$95,$B7,'?',$B9,$A4,'?','?' ; F
  232.  
  233. ;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F
  234.  
  235.  
  236. ; Codepoints for 0xB0-0xDF, unicode offset 0x2500
  cp866_boxes:
  ; cp866 code for every unicode character 0x2500-0x259F, indexed by
  ; (character - 0x2500). Characters that have no entry below stay '?'.
  ; The entries are listed in the order of their unicode offsets.
          times 0x25A0-0x2500 db '?'

  ; U+2500-U+253C: light lines
          store byte 0xC4 at cp866_boxes+0x00     ; U+2500
          store byte 0xB3 at cp866_boxes+0x02     ; U+2502
          store byte 0xDA at cp866_boxes+0x0C     ; U+250C
          store byte 0xBF at cp866_boxes+0x10     ; U+2510
          store byte 0xC0 at cp866_boxes+0x14     ; U+2514
          store byte 0xD9 at cp866_boxes+0x18     ; U+2518
          store byte 0xC3 at cp866_boxes+0x1C     ; U+251C
          store byte 0xB4 at cp866_boxes+0x24     ; U+2524
          store byte 0xC2 at cp866_boxes+0x2C     ; U+252C
          store byte 0xC1 at cp866_boxes+0x34     ; U+2534
          store byte 0xC5 at cp866_boxes+0x3C     ; U+253C

  ; U+2550-U+256C: double and mixed single/double lines
          store byte 0xCD at cp866_boxes+0x50     ; U+2550
          store byte 0xBA at cp866_boxes+0x51     ; U+2551
          store byte 0xD5 at cp866_boxes+0x52     ; U+2552
          store byte 0xD6 at cp866_boxes+0x53     ; U+2553
          store byte 0xC9 at cp866_boxes+0x54     ; U+2554
          store byte 0xB8 at cp866_boxes+0x55     ; U+2555
          store byte 0xB7 at cp866_boxes+0x56     ; U+2556
          store byte 0xBB at cp866_boxes+0x57     ; U+2557
          store byte 0xD4 at cp866_boxes+0x58     ; U+2558
          store byte 0xD3 at cp866_boxes+0x59     ; U+2559
          store byte 0xC8 at cp866_boxes+0x5A     ; U+255A
          store byte 0xBE at cp866_boxes+0x5B     ; U+255B
          store byte 0xBD at cp866_boxes+0x5C     ; U+255C
          store byte 0xBC at cp866_boxes+0x5D     ; U+255D
          store byte 0xC6 at cp866_boxes+0x5E     ; U+255E
          store byte 0xC7 at cp866_boxes+0x5F     ; U+255F
          store byte 0xCC at cp866_boxes+0x60     ; U+2560
          store byte 0xB5 at cp866_boxes+0x61     ; U+2561
          store byte 0xB6 at cp866_boxes+0x62     ; U+2562
          store byte 0xB9 at cp866_boxes+0x63     ; U+2563
          store byte 0xD1 at cp866_boxes+0x64     ; U+2564
          store byte 0xD2 at cp866_boxes+0x65     ; U+2565
          store byte 0xCB at cp866_boxes+0x66     ; U+2566
          store byte 0xCF at cp866_boxes+0x67     ; U+2567
          store byte 0xD0 at cp866_boxes+0x68     ; U+2568
          store byte 0xCA at cp866_boxes+0x69     ; U+2569
          store byte 0xD8 at cp866_boxes+0x6A     ; U+256A
          store byte 0xD7 at cp866_boxes+0x6B     ; U+256B
          store byte 0xCE at cp866_boxes+0x6C     ; U+256C

  ; U+2580-U+2593: blocks and shades
          store byte 0xDF at cp866_boxes+0x80     ; U+2580
          store byte 0xDC at cp866_boxes+0x84     ; U+2584
          store byte 0xDB at cp866_boxes+0x88     ; U+2588
          store byte 0xDD at cp866_boxes+0x8C     ; U+258C
          store byte 0xDE at cp866_boxes+0x90     ; U+2590
          store byte 0xB0 at cp866_boxes+0x91     ; U+2591
          store byte 0xB1 at cp866_boxes+0x92     ; U+2592
          store byte 0xB2 at cp866_boxes+0x93     ; U+2593
  293.