Rev 3981 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
3545 | hidnplayr | 1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ;; ;; |
||
3 | ;; Copyright (C) KolibriOS team 2004-2013. All rights reserved. ;; |
||
4 | ;; Distributed under terms of the GNU General Public License ;; |
||
5 | ;; ;; |
||
6 | ;; ;; |
||
7 | ;; GNU GENERAL PUBLIC LICENSE ;; |
||
8 | ;; Version 2, June 1991 ;; |
||
9 | ;; ;; |
||
10 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
||
11 | |||
12 | |||
get_next_byte:
;-----------------------------------------------------------------------
; Fetch the next character of the incoming packet, converted to cp866
; when the selected encoding requires it.
;
; In:   esi = pointer to the next unread byte
;       edx = limit of the data (one past the last valid byte)
; Out:  CF = 1 -> al holds the (translated) character
;       CF = 0 -> no character is available
;
; The work is delegated to the handler selected by [encoding]; this
; routine only performs the table dispatch, so the handler returns
; straight to our caller. Note that eax holds the encoding index when
; the handler is entered.
;-----------------------------------------------------------------------
        mov     eax, dword [encoding]
        jmp     dword [get_byte_table + eax*4]
||
19 | |||
get_byte_cp866:
;-----------------------------------------------------------------------
; cp866 input needs no translation: just fetch one byte.
;
; In:   esi = read pointer, edx = limit of data
; Out:  CF = 1, al = byte, esi advanced by lodsb   (esi <  edx)
;       CF = 0, al and esi untouched               (esi >= edx)
;
; No stc/clc is needed: "cmp esi, edx" already leaves CF = 1 exactly
; when esi is below edx, and lodsb does not modify any flags.
;-----------------------------------------------------------------------
        cmp     esi, edx
        jb      .fetch                  ; data left; CF = 1 from the compare
        ret                             ; nothing left; CF = 0 from the compare
.fetch:
        lodsb
        ret
||
26 | |||
get_byte_cp1251:
;-----------------------------------------------------------------------
; Fetch one cp1251 byte and translate it to cp866.
;
; In:   esi = read pointer, edx = limit of data
; Out:  CF = 1, al = cp866 character, esi advanced by lodsb
;       CF = 0 when esi >= edx (nothing is read)
;
; Bytes below 0x80 are returned unchanged. Bytes 0x80..0xFF are looked
; up in cp1251_table, which holds one entry per byte value starting at
; 0x80.
;-----------------------------------------------------------------------
        cmp     esi, edx
        jb      .fetch
        ret                             ; CF = 0 from the compare
.fetch:
        lodsb
        cmp     al, 0x80
        jb      .ready                  ; ASCII half passes through
        and     eax, 0x7F               ; eax = table index, upper bits cleared
        mov     al, [cp1251_table+eax]
.ready:
        stc
        ret
||
39 | |||
get_byte_utf8:
;-----------------------------------------------------------------------
; Fetch one UTF-8 encoded character and translate it to cp866.
;
; In:   esi = read pointer, edx = limit of data
; Out:  CF = 1, al = cp866 character ('?' when there is no mapping)
;       CF = 0 when the data ran out before a character was completed
;
; A character may span several bytes and a packet boundary may fall in
; the middle of it, so the decoder keeps its state between calls:
;   [utf8_bytes_rest] = continuation bytes still expected
;   [utf8_char]       = bits of the code point collected so far
; Bytes are consumed one at a time until either a whole character has
; been assembled or the data is exhausted.
;
; ecx is used as scratch.
;-----------------------------------------------------------------------
.next_byte:
; -- is there anything left to read? --
        cmp     esi, edx
        jae     .out_of_data            ; CF = 0 here
; -- load the byte; keep an intact copy in ecx --
        lodsb
        movzx   ecx, al
; -- classify by the two top bits --
;    0xxxxxxx : plain ASCII          (sign flag clear)
;    11xxxxxx : first byte of a sequence (0xC0 has even parity)
;    10xxxxxx : continuation byte        (0x80 has odd parity)
        and     al, 0xC0
        jns     .ascii
        jp      .lead_byte
; -- continuation byte --
; Without a sequence in progress the byte is invalid; skip it.
        cmp     [utf8_bytes_rest], 0
        jz      .next_byte
; Otherwise append its six payload bits to the accumulated value.
        mov     eax, [utf8_char]
        shl     eax, 6
        lea     eax, [eax+ecx-0x80]
; One byte fewer to wait for; at zero the character is complete.
        dec     [utf8_bytes_rest]
        jz      .have_char
        mov     [utf8_char], eax
        jmp     .next_byte

; -- first byte of a multi-byte sequence --
; Shift the leading 1 bits out of cl while counting them. eax ends up
; as (number of leading ones - 1), i.e. the number of continuation
; bytes that must follow.
.lead_byte:
        mov     eax, -1
.count_ones:
        inc     eax
        add     cl, cl
        js      .count_ones
        mov     [utf8_bytes_rest], eax
; cl now holds the payload bits shifted left by (count + 1); move them
; back down so that they start the new code point.
        xchg    eax, ecx
        inc     ecx
        shr     al, cl
        mov     [utf8_char], eax
        jmp     .next_byte

; -- single-byte (ASCII) character: the byte is the code point --
.ascii:
        xchg    eax, ecx
.have_char:
; A complete character cancels any sequence that was in progress.
        and     [utf8_bytes_rest], 0
; Second task: convert the code point in eax to cp866.
        cmp     eax, 0x80
        jb      .return                 ; ASCII; the compare leaves CF = 1
; U+0410..U+043F -> 0x80..0xAF
; U+0440..U+044F -> 0xE0..0xEF
; U+0401         -> 0xF0
; U+0451         -> 0xF1
        cmp     eax, 0x401
        jz      .upper_io
        cmp     eax, 0x451
        jz      .lower_io
        cmp     eax, 0x410
        jb      .unknown
        cmp     eax, 0x440
        jb      .first_range
        cmp     eax, 0x450
        jae     .unknown
; al = 0x40..0x4F. Subtracting 0x60 wraps to 0xE0..0xEF and the borrow
; sets CF, so no separate stc is required.
        sub     al, 0x60
        ret
.first_range:
; al = 0x10..0x3F. Subtracting 0x90 wraps to 0x80..0xAF, again with a
; borrow that sets CF.
        sub     al, 0x90
.out_of_data:
.return:
        ret

.upper_io:
        mov     al, 0xF0
        jmp     .return_char
.lower_io:
        mov     al, 0xF1
        jmp     .return_char
.unknown:
        mov     al, '?'
.return_char:
        stc
        ret
||
130 | |||
131 | |||
132 | |||
recode_to_cp866:
;-----------------------------------------------------------------------
; Target encoding is cp866: nothing to translate, copy the bytes as is.
;
; In:   esi = source, edi = destination, ecx = number of bytes
; Out:  esi and edi advanced past the copied data, ecx = 0
;-----------------------------------------------------------------------
        rep movs byte [edi], [esi]
        ret
||
136 | |||
recode_to_cp1251:
;-----------------------------------------------------------------------
; Copy ecx bytes from [esi] to [edi], translating cp866 to cp1251.
;
; In:   esi = source, edi = destination, ecx = number of bytes
; Out:  esi and edi advanced by the byte count, ecx = 0
;       eax = last byte written (0 if ecx was 0 on entry)
;
; Bytes below 0x80 are copied unchanged; bytes 0x80..0xFF are replaced
; by their entry in cp866_table.
;-----------------------------------------------------------------------
        xor     eax, eax                ; keep the upper bits of eax zero so
                                        ; that eax can be used as an index
        jecxz   .done
.next:
        lodsb
        cmp     al, 0x80
        jae     .translate
.store:
        stosb
        loop    .next
.done:
        ret

.translate:
        ; the table starts at byte value 0x80, hence the -0x80 bias
        mov     al, [cp866_table-0x80+eax]
        jmp     .store
||
149 | |||
recode_to_utf8:
;-----------------------------------------------------------------------
; Convert ecx bytes of cp866 text at [esi] to UTF-8 at [edi].
;
; In:   esi = source, edi = destination, ecx = number of source bytes
; Out:  esi advanced by the source byte count, ecx = 0
;       edi advanced by one byte for every source byte below 0x80 and
;       by two bytes for every source byte 0x80..0xFF
;
; The two-byte sequences come from utf8_table, which holds one 16-bit
; entry per byte value starting at 0x80. The caller has to provide a
; destination large enough for the expanded text.
;-----------------------------------------------------------------------
        jecxz   .done
.next:
        lodsb
        cmp     al, 0x80
        jae     .two_bytes
        stosb                           ; ASCII is identical in UTF-8
.advance:
        loop    .next
.done:
        ret

.two_bytes:
        and     eax, 0x7F               ; eax = table index, upper bits cleared
        mov     ax, [utf8_table+eax*2]
        stosw                           ; emits both bytes of the sequence
        jmp     .advance
||
166 | |||
recode:
;-----------------------------------------------------------------------
; Convert outgoing text from cp866 to the currently selected encoding.
;
; In:   esi = source, edi = destination, ecx = number of source bytes
;
; The conversion routine is selected by [encoding] through recode_proc
; and returns straight to our caller. eax is overwritten with the
; encoding index before the routine is entered.
;-----------------------------------------------------------------------
        mov     eax, dword [encoding]
        jmp     dword [recode_proc + eax*4]
||
170 | |||
171 | |||
172 | |||
; Currently selected encoding; used as an index into both tables below.
; UTF8 is a constant defined outside this part of the file.
encoding        dd      UTF8

; Converters from cp866 to the selected encoding, indexed by [encoding].
recode_proc     dd      recode_to_cp866
                dd      recode_to_cp1251
                dd      recode_to_utf8

; Readers that fetch one character and convert it to cp866, indexed by
; [encoding]. The order must match recode_proc.
get_byte_table  dd      get_byte_cp866
                dd      get_byte_cp1251
                dd      get_byte_utf8
||
176 | |||
177 | |||
cp1251_table:
; cp1251 -> cp866 translation for byte values 0x80..0xFF.
; Entry N is the cp866 replacement for cp1251 byte 0x80+N; '?' marks a
; byte that has no replacement in this table.
; Each pair of lines covers one row of 16 byte values (x0..x7, x8..xF).

; 0x80..0x8F
        db      '?', '?', '?', '?', '?', '?', '?', '?'
        db      '?', '?', '?', '?', '?', '?', '?', '?'
; 0x90..0x9F
        db      '?', '?', '?', '?', '?', 0xF9, '?', '?'
        db      '?', '?', '?', '?', '?', '?', '?', '?'
; 0xA0..0xAF
        db      '?', 0xF6, 0xF7, '?', 0xFD, '?', '?', '?'
        db      0xF0, '?', 0xF2, '?', '?', '?', '?', 0xF4
; 0xB0..0xBF
        db      0xF8, '?', '?', '?', '?', '?', '?', 0xFA
        db      0xF1, 0xFC, 0xF3, '?', '?', '?', '?', 0xF5
; 0xC0..0xCF
        db      0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
        db      0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F
; 0xD0..0xDF
        db      0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97
        db      0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F
; 0xE0..0xEF
        db      0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7
        db      0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF
; 0xF0..0xFF
        db      0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7
        db      0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF
||
189 | |||
utf8_table:
; cp866 -> UTF-8 translation for byte values 0x80..0xFF.
; Entry N is a 16-bit word whose two bytes, in memory order, form the
; UTF-8 sequence for cp866 byte 0x80+N.

; Fill the whole table with the default placeholder (bytes C3 98) and
; then patch the entries that have a real mapping.
        times 0x80 dw 0x98C3

; 0x80..0xAF -> D0 90 .. D0 BF   (words 0x90D0..0xBFD0)
repeat 0x30
        store word 0xD0 + ((0x90 + % - 1) shl 8) at utf8_table + 2*(% - 1)
end repeat

; 0xE0..0xEF -> D1 80 .. D1 8F   (words 0x80D1..0x8FD1)
repeat 0x10
        store word 0xD1 + ((0x80 + % - 1) shl 8) at utf8_table + 2*(0xE0 - 0x80 + % - 1)
end repeat

; 0xF0 -> D0 81 (word 0x81D0), 0xF1 -> D1 91 (word 0x91D1)
        store word 0x81D0 at utf8_table + 2*(0xF0 - 0x80)
        store word 0x91D1 at utf8_table + 2*(0xF1 - 0x80)
||
207 | |||
cp866_table:
; cp866 -> cp1251 translation for byte values 0x80..0xFF.
; Entry N is the cp1251 replacement for cp866 byte 0x80+N; '?' marks a
; byte that has no replacement in this table.
; Each pair of lines covers one row of 16 byte values (x0..x7, x8..xF).

; 0x80..0x8F
        db      0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7
        db      0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF
; 0x90..0x9F
        db      0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7
        db      0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
; 0xA0..0xAF
        db      0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7
        db      0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF
; 0xB0..0xDF: all 48 bytes are unmapped
        times 0x30 db '?'
; 0xE0..0xEF
        db      0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7
        db      0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
; 0xF0..0xFF
        db      0xA8, 0xB8, 0xAA, 0xBA, 0xAF, 0xBF, 0xA1, 0xA2
        db      0xB0, 0x95, 0xB7, '?', 0xB9, 0xA4, '?', '?'
||
219 |