; Source: KolibriOS Subversion repository, revision 4060.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;                                                                 ;;
;; Copyright (C) KolibriOS team 2004-2013. All rights reserved.    ;;
;; Distributed under terms of the GNU General Public License       ;;
;;                                                                 ;;
;;   Written by CleverMouse                                        ;;
;;                                                                 ;;
;;         GNU GENERAL PUBLIC LICENSE                              ;;
;;          Version 2, June 1991                                   ;;
;;                                                                 ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
12
 
13
 
14
get_next_byte:
; Load next byte from the packet, translating to cp866 if necessary.
; Dispatches to the decoder selected by [encoding] through get_byte_table.
; In:   esi = pointer to data, edx = limit of data (first address past it)
; Out:  CF set   -> al = (translated) byte, esi advanced past consumed input
;       CF clear -> no (complete) character is available before edx
; Clobbers: eax; the selected decoder may clobber more (the UTF-8 one uses ecx).
        mov     eax, [encoding]
        jmp     [get_byte_table+eax*4]  ; tail-jump: the decoder returns to our caller
20
 
21
get_byte_cp866:
; Data is already in cp866: return the next byte unchanged.
; In:   esi = pointer to data, edx = limit of data
; Out:  CF set and al = byte if esi < edx, otherwise CF clear
; The carry flag comes straight from the comparison: esi below edx sets CF,
; and lodsb does not modify flags, so no explicit stc/clc is needed.
        cmp     esi, edx
        jae     .nothing                ; esi >= edx: CF is clear here
        lodsb
.nothing:
        ret
27
 
28
get_byte_cp1251:
; Return the next byte, translating it from cp1251 to cp866.
; In:   esi = pointer to data, edx = limit of data
; Out:  CF set and al = translated byte if esi < edx, otherwise CF clear
; Bytes below 0x80 are passed through unchanged; bytes 0x80-0xFF are looked
; up in cp1251_table, which is indexed by the low 7 bits of the byte.
        cmp     esi, edx
        jae     .nothing                ; no data: CF is clear after the jae
        lodsb
        cmp     al, 0x80
        jb      @f                      ; 0x00-0x7F needs no translation
        and     eax, 0x7F               ; eax = table index, upper bits zeroed
        mov     al, [cp1251_table+eax]
@@:
        stc                             ; report success explicitly
.nothing:
        ret
40
 
41
get_byte_utf8:
; Read one character from UTF-8 data and convert it to cp866.
; In:   esi = pointer to data, edx = limit of data
; Out:  CF set   -> al = cp866 byte ('?' for characters with no mapping)
;       CF clear -> data ended before a complete character was read
; Clobbers: eax, ecx; updates [utf8_char] and [utf8_bytes_rest]
; (both are declared outside this part of the file).
;
; UTF8 decoding is slightly complicated.
; One character can occupy one or more bytes.
; The boundary in packets theoretically can be anywhere in data,
; so this procedure keeps internal state between calls and handles
; one byte at a time, looping until character is read or packet is over.
; Globally, there are two distinct tasks: decode byte sequence to unicode char
; and convert this unicode char to our base encoding (that is cp866).
; 1. Check that there are data.
        cmp     esi, edx
        jae     .nothing                ; CF is clear on this path
; 2. Load byte.
        lodsb
        movzx   ecx, al                 ; keep the full byte in ecx
; 3. Bytes in an UTF8 sequence can be of any of three types.
; If most significant bit is cleared, sequence is one byte and usual ASCII char.
; First byte of a sequence must be 11xxxxxx, other bytes are 10yyyyyy.
; After the mask, SF is bit 7 and PF tells 0xC0 (two bits set, even parity)
; from 0x80 (one bit set, odd parity).
        and     al, 0xC0
        jns     .single_byte
        jp      .first_byte
; 4. This byte is not first in UTF8 sequence.
; 4a. Check that the sequence was started. If no, it is invalid byte
; and we simply ignore it.
        cmp     [utf8_bytes_rest], 0
        jz      get_byte_utf8
; 4b. Otherwise, it is really next byte and it gives some more bits of char.
        mov     eax, [utf8_char]
        shl     eax, 6
        lea     eax, [eax+ecx-0x80]     ; append the low 6 bits; lea keeps flags intact
; 4c. Decrement number of bytes rest in the sequence.
; If it goes to zero, character is read, so return it.
        dec     [utf8_bytes_rest]
        jz      .got_char
        mov     [utf8_char], eax
        jmp     get_byte_utf8
; 5. If the byte is first in UTF8 sequence, calculate the number of leading 1s
; - it equals total number of bytes in the sequence; some other bits rest for
; leading bits in the character.
.first_byte:
        mov     eax, -1
@@:
        inc     eax                     ; eax ends up as (leading 1s) - 1 = bytes still to come
        add     cl, cl                  ; shift the next bit of the byte into the sign position
        js      @b
        mov     [utf8_bytes_rest], eax
        xchg    eax, ecx                ; eax = left-shifted payload bits, ecx = shift count - 1
        inc     ecx
        shr     al, cl                  ; move the payload bits back down to bit 0
        mov     [utf8_char], eax
        jmp     get_byte_utf8
; 6. If the byte is ASCII char, it is the character.
.single_byte:
        xchg    eax, ecx
.got_char:
; We got the character, now abandon a possible sequence in progress.
        and     [utf8_bytes_rest], 0
; Now second task. The unicode character is in eax, and now we shall convert it
; to cp866.
        cmp     eax, 0x80
        jb      .done                   ; ASCII: jb taken means CF is set, as required
; 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF, 0x401 -> 0xF0, 0x451 -> 0xF1
        cmp     eax, 0x401
        jz      .YO
        cmp     eax, 0x451
        jz      .yo
        cmp     eax, 0x410
        jb      .unrecognized
        cmp     eax, 0x440
        jb      .part1
        cmp     eax, 0x450
        jae     .unrecognized
; al is 0x40-0x4F here; subtracting 0x60 wraps to 0xE0-0xEF and the borrow sets CF.
        sub     al, (0x40-0xE0) and 0xFF
        ret
.part1:
; al is 0x10-0x3F here; the subtraction wraps to 0x80-0xAF and the borrow sets CF.
        sub     al, 0x10-0x80
.nothing:
.done:
        ret
.unrecognized:
        mov     al, '?'
        stc
        ret
.YO:
        mov     al, 0xF0
        stc
        ret
.yo:
        mov     al, 0xF1
        stc
        ret
131
 
132
 
133
 
134
recode_to_cp866:
; Source text is already cp866, so this is a plain copy.
; In:   esi = source, edi = destination, ecx = number of bytes
; Out:  esi and edi advanced by the copied length, ecx = 0
; A zero count copies nothing.
        rep     movsb
        ret
137
 
138
recode_to_cp1251:
; Copy ecx bytes from esi to edi, translating cp866 to cp1251.
; In:   esi = source, edi = destination, ecx = number of bytes
; Out:  esi and edi advanced by ecx bytes, ecx = 0
; Clobbers: eax
; Bytes below 0x80 are copied unchanged; bytes 0x80-0xFF are replaced by
; cp866_table[byte - 0x80].
        xor     eax, eax                ; upper bits stay zero so eax can index the table
        jecxz   .nothing
  .loop:
        lodsb
        cmp     al,0x80
        jb      @f
        mov     al, [cp866_table-0x80+eax]
    @@: stosb
        loop    .loop
  .nothing:
        ret
150
 
151
recode_to_utf8:
; Convert ecx bytes of cp866 text at esi to UTF-8 at edi.
; In:   esi = source, edi = destination, ecx = number of source bytes
; Out:  esi advanced by ecx bytes, edi advanced by the bytes written, ecx = 0
; Clobbers: eax
; Bytes below 0x80 are stored as one byte. Every byte 0x80-0xFF is stored as
; the two-byte sequence from utf8_table, so the output can be up to twice the
; input size; the destination buffer must allow for that.
        jecxz   .nothing
  .loop:
        lodsb
        cmp     al, 0x80
        jb      .single_byte
        and     eax, 0x7F               ; eax = table index, upper bits zeroed
        mov     ax, [utf8_table+eax*2]
        stosw                           ; low byte of ax is written first
        loop    .loop
        ret
  .single_byte:
        stosb
        loop    .loop
  .nothing:
        ret
167
 
168
recode:
; Convert text using the routine selected by [encoding] through recode_proc.
; Registers are passed through untouched except eax, so the inputs are those
; of the recode_to_* routines: esi = source, edi = destination, ecx = count.
; Clobbers: eax, plus whatever the selected routine changes.
        mov     eax, [encoding]
        jmp     [recode_proc+eax*4]     ; tail-jump: the routine returns to our caller
171
 
172
 
173
 
174
; Currently selected encoding; used as an index into both tables below.
; UTF8 is a constant defined outside this part of the file.
encoding        dd      UTF8
; Dispatch tables indexed by [encoding]. Both must list the encodings in the
; same order: cp866, cp1251, utf8.
recode_proc     dd      recode_to_cp866, recode_to_cp1251, recode_to_utf8
get_byte_table  dd      get_byte_cp866, get_byte_cp1251, get_byte_utf8
177
 
178
 
179
cp1251_table:
; cp1251 -> cp866 translation for bytes 0x80-0xFF, used by get_byte_cp1251.
; Index = cp1251 byte - 0x80, value = cp866 byte; '?' marks bytes without
; a mapping. The row comment is the high nibble of the cp1251 byte and the
; legend under the table is the low nibble.
  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 8
  db '?','?','?','?','?',$F9,'?','?' , '?','?','?','?','?','?','?','?' ; 9
  db '?',$F6,$F7,'?',$FD,'?','?','?' , $F0,'?',$F2,'?','?','?','?',$F4 ; A
  db $F8,'?','?','?','?','?','?',$FA , $F1,$FC,$F3,'?','?','?','?',$F5 ; B
  db $80,$81,$82,$83,$84,$85,$86,$87 , $88,$89,$8A,$8B,$8C,$8D,$8E,$8F ; C
  db $90,$91,$92,$93,$94,$95,$96,$97 , $98,$99,$9A,$9B,$9C,$9D,$9E,$9F ; D
  db $A0,$A1,$A2,$A3,$A4,$A5,$A6,$A7 , $A8,$A9,$AA,$AB,$AC,$AD,$AE,$AF ; E
  db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; F

;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F
190
 
191
utf8_table:
; cp866 -> UTF-8 translation for bytes 0x80-0xFF, used by recode_to_utf8.
; Index = cp866 byte - 0x80. Each entry is one word holding a two-byte UTF-8
; sequence, first byte in the low half (the order stosw writes them).
; All 128 entries start as the placeholder; the store directives below then
; patch the mapped ranges at assembly time.
        times 80h dw 0x98C3     ; default placeholder (bytes C3 98 = U+00D8)

; 0x80-0xAF -> 0x90D0-0xBFD0
repeat 0x30
        store byte 0xD0 at utf8_table+2*(%-1)
        store byte 0x90+%-1 at utf8_table+2*%-1
end repeat

; 0xE0-0xEF -> 0x80D1-0x8FD1
repeat 0x10
        store byte 0xD1 at utf8_table+2*(0xE0-0x80+%-1)
        store byte 0x80+%-1 at utf8_table+2*(0xE0-0x80+%)-1
end repeat

; 0xF0 -> 0x81D0, 0xF1 -> 0x91D1
        store dword 0x91D181D0 at utf8_table+2*(0xF0-0x80)
208
 
209
cp866_table:
; cp866 -> cp1251 translation for bytes 0x80-0xFF, used by recode_to_cp1251.
; Index = cp866 byte - 0x80, value = cp1251 byte; '?' marks bytes without
; a mapping. The row comment is the high nibble of the cp866 byte and the
; legend under the table is the low nibble.
  db $C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7 , $C8,$C9,$CA,$CB,$CC,$CD,$CE,$CF ; 8
  db $D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7 , $D8,$D9,$DA,$DB,$DC,$DD,$DE,$DF ; 9
  db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; A
  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; B
  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; C
  db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; D
  db $F0,$F1,$F2,$F3,$F4,$F5,$F6,$F7 , $F8,$F9,$FA,$FB,$FC,$FD,$FE,$FF ; E
  db $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2 , $B0,$95,$B7,'?',$B9,$A4,'?','?' ; F

;    0   1   2   3   4   5   6   7     8   9   A   B   C   D   E   F
220