Subversion Repositories Kolibri OS

Rev

Rev 3981 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
3545 hidnplayr 1
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2
;;                                                                 ;;
3
;; Copyright (C) KolibriOS team 2004-2013. All rights reserved.    ;;
4
;; Distributed under terms of the GNU General Public License       ;;
5
;;                                                                 ;;
6
;;                                                                 ;;
7
;;         GNU GENERAL PUBLIC LICENSE                              ;;
8
;;          Version 2, June 1991                                   ;;
9
;;                                                                 ;;
10
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
11
 
12
 
13
get_next_byte:
; Fetch the next character of the packet, converted to cp866 when the
; packet is in another encoding.
; The work is done by the reader selected by [encoding]; control is passed
; to it with a jump, so the reader returns straight to our caller.
; in:  esi = pointer to data, edx = limit of data
; out: CF set   - al = (translated) byte
;      CF clear - no byte could be produced
; eax is clobbered.
        mov     eax, [encoding]
        jmp     dword [get_byte_table + eax*4]
19
 
20
get_byte_cp866:
; Reader for data that is already in cp866: the byte is returned as is.
; in:  esi = pointer to data, edx = limit of data
; out: CF set   - al = byte, esi moved past it
;      CF clear - esi >= edx, nothing was read
; The carry flag is produced by the pointer comparison itself
; (esi < edx gives CF = 1) and lodsb does not touch the flags.
        cmp     esi, edx
        jb      .fetch
        ret                             ; CF = 0: no data left
.fetch:
        lodsb
        ret                             ; CF = 1 is still there from cmp
26
 
27
get_byte_cp1251:
; Reader for cp1251 data: returns the next byte translated to cp866.
; in:  esi = pointer to data, edx = limit of data
; out: CF set   - al = translated byte, esi moved past the source byte
;      CF clear - esi >= edx, nothing was read
; Bytes below 0x80 are returned unchanged; bytes 0x80-0xFF are replaced by
; the matching entry of cp1251_table.
        cmp     esi, edx
        jae     .end_of_data            ; jae is taken with CF = 0
        lodsb
        cmp     al, 0x80
        jb      .have_byte
        and     eax, 0x7F               ; eax = index inside the upper half
        mov     al, [cp1251_table+eax]
.have_byte:
        stc                             ; report success
.end_of_data:
        ret
39
 
40
get_byte_utf8:
; Reader for UTF-8 data: returns the next character converted to cp866.
; in:  esi = pointer to data, edx = limit of data
; out: CF set   - al = character in cp866 ('?' if it has no cp866 code)
;      CF clear - the data ended before a whole character was read
; eax and ecx are clobbered.
;
; UTF8 decoding is slightly complicated.
; One character can occupy one or more bytes.
; The boundary in packets theoretically can be anywhere in data,
; so this procedure keeps internal state between calls ([utf8_char] and
; [utf8_bytes_rest]) and handles one byte at a time, looping until
; a character is read or the packet is over.
; Globally, there are two distinct tasks: decode byte sequence to unicode char
; and convert this unicode char to our base encoding (that is cp866).
; 1. Check that there are data.
        cmp     esi, edx
        jae     .nothing                ; taken with CF = 0, which is returned
; 2. Load byte.
        lodsb
        movzx   ecx, al
; 3. Bytes in an UTF8 sequence can be of any of three types.
; If most significant bit is cleared, sequence is one byte and usual ASCII char.
; First byte of a sequence must be 11xxxxxx, other bytes are 10yyyyyy.
; After the masking al is 0x00/0x40 (sign flag clear), 0xC0 (two bits set,
; even parity) or 0x80 (one bit set, odd parity).
        and     al, 0xC0
        jns     .single_byte
        jp      .first_byte
; 4. This byte is not first in UTF8 sequence.
; 4a. Check that the sequence was started. If no, it is invalid byte
; and we simply ignore it.
        cmp     [utf8_bytes_rest], 0
        jz      get_byte_utf8
; 4b. Otherwise, it is really next byte and it gives some more bits of char.
        mov     eax, [utf8_char]
        shl     eax, 6
        lea     eax, [eax+ecx-0x80]     ; append the low 6 bits of the byte
; 4c. Decrement number of bytes rest in the sequence.
; If it goes to zero, character is read, so return it.
        dec     [utf8_bytes_rest]
        jz      .got_char
        mov     [utf8_char], eax
        jmp     get_byte_utf8
; 5. If the byte is first in UTF8 sequence, calculate the number of leading 1s
; - it equals total number of bytes in the sequence; some other bits rest for
; leading bits in the character.
.first_byte:
        mov     eax, -1
.count_ones:
        inc     eax
        add     cl, cl                  ; shift left, next bit goes to the sign
        js      .count_ones
; Here eax = number of bytes that must follow, cl = first byte shifted left
; by eax+1 positions.
        mov     [utf8_bytes_rest], eax
        xchg    eax, ecx
        inc     ecx
        shr     al, cl                  ; move the payload bits back into place
        mov     [utf8_char], eax
        jmp     get_byte_utf8
; 6. If the byte is ASCII char, it is the character.
.single_byte:
        xchg    eax, ecx
.got_char:
; We got the character, now abandon a possible sequence in progress.
        and     [utf8_bytes_rest], 0
; Now second task. The unicode character is in eax, and now we shall convert it
; to cp866.
        cmp     eax, 0x80
        jb      .done                   ; ASCII; CF = 1 comes from this cmp
; Contiguous ranges: 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF.
; Everything else is looked up in .scattered_chars.
        cmp     eax, 0x410
        jb      .scattered
        cmp     eax, 0x440
        jb      .part1
        cmp     eax, 0x450
        jae     .scattered
; al = 0x40..0x4F; subtracting 0x60 wraps to 0xE0..0xEF and the borrow
; leaves CF = 1.
        sub     al, (0x40-0xE0) and 0xFF
        ret
.part1:
; al = 0x10..0x3F; subtracting 0x90 wraps to 0x80..0xAF and the borrow
; leaves CF = 1.
        sub     al, 0x10-0x80
.nothing:
.done:
        ret
.scattered:
; The character is outside the contiguous ranges: search the list of
; the remaining characters that have a cp866 code.
        mov     ecx, .scattered_chars
.lookup:
        cmp     eax, [ecx]
        jz      .found
        add     ecx, 8
        cmp     ecx, .scattered_chars_end
        jb      .lookup
; No cp866 code for this character.
        mov     al, '?'
        stc
        ret
.found:
        mov     al, [ecx+4]
        stc
        ret

; Pairs: unicode character, cp866 code.
.scattered_chars:
        dd      0x0401, 0xF0            ; Cyrillic capital IO
        dd      0x0451, 0xF1            ; Cyrillic small io
        dd      0x0404, 0xF2            ; Cyrillic capital Ukrainian IE
        dd      0x0454, 0xF3            ; Cyrillic small Ukrainian ie
        dd      0x0407, 0xF4            ; Cyrillic capital YI
        dd      0x0457, 0xF5            ; Cyrillic small yi
        dd      0x040E, 0xF6            ; Cyrillic capital short U
        dd      0x045E, 0xF7            ; Cyrillic small short u
        dd      0x00B0, 0xF8            ; degree sign
        dd      0x00B7, 0xFA            ; middle dot
        dd      0x00A4, 0xFD            ; currency sign
.scattered_chars_end:
130
 
131
 
132
 
133
recode_to_cp866:
; Recoder for the case when the target encoding is cp866: nothing has to be
; translated, the text is just copied.
; in:  esi = source, edi = destination, ecx = number of bytes
; out: esi and edi moved past the copied data, ecx = 0
        rep     movsb
        ret
136
 
137
recode_to_cp1251:
; Copy ecx bytes from [esi] to [edi], translating them to cp1251.
; Bytes below 0x80 are copied unchanged; bytes 0x80-0xFF are replaced by
; the matching entry of cp866_table.
; in:  esi = source, edi = destination, ecx = number of bytes (may be 0)
; eax is clobbered.
        xor     eax, eax                ; only al changes below, so eax = al
        jecxz   .finished
  .next_byte:
        lodsb
        cmp     al, 0x80
        jb      .store
        mov     al, [cp866_table-0x80+eax]
  .store:
        stosb
        loop    .next_byte
  .finished:
        ret
149
 
150
recode_to_utf8:
; Copy ecx bytes from [esi] to [edi], translating them to UTF-8.
; Bytes below 0x80 are stored as one byte; every byte 0x80-0xFF is replaced
; by the two-byte sequence kept in utf8_table, so the output can be up to
; twice as long as the input.
; in:  esi = source, edi = destination, ecx = number of source bytes (may be 0)
; eax is clobbered.
        jecxz   .finished
  .next_byte:
        lodsb
        cmp     al, 0x80
        jb      .ascii
        and     eax, 0x7F               ; eax = index of the table entry
        mov     ax, [utf8_table+eax*2]
        stosw
        jmp     .count
  .ascii:
        stosb
  .count:
        loop    .next_byte
  .finished:
        ret
166
 
167
recode:
; Run the recoder selected by [encoding].
; Control is passed with a jump, so the recoder sees the registers exactly
; as our caller set them and returns straight to the caller.
; eax is clobbered.
        mov     eax, [encoding]
        jmp     dword [recode_proc + eax*4]
170
 
171
 
172
 
173
; Current encoding; used as an index into both tables below.
encoding        dd      UTF8

; Procedures indexed by [encoding]: entry 0 - cp866, 1 - cp1251, 2 - utf8.
recode_proc     dd      recode_to_cp866
                dd      recode_to_cp1251
                dd      recode_to_utf8

get_byte_table  dd      get_byte_cp866
                dd      get_byte_cp1251
                dd      get_byte_utf8
176
 
177
 
178
cp1251_table:
; Translation of cp1251 bytes 0x80-0xFF to cp866; the entry for byte b is
; at cp1251_table + (b - 0x80). '?' marks bytes without a cp866 code.
        times 0x40 db '?'               ; 0x80-0xBF: filled in below
repeat 0x30
        db      0x80+%-1                ; 0xC0-0xEF -> 0x80-0xAF
end repeat
repeat 0x10
        db      0xE0+%-1                ; 0xF0-0xFF -> 0xE0-0xEF
end repeat

; The few bytes of 0x80-0xBF that do have a cp866 code.
        store byte 0xF9 at cp1251_table+(0x95-0x80)
        store byte 0xF6 at cp1251_table+(0xA1-0x80)
        store byte 0xF7 at cp1251_table+(0xA2-0x80)
        store byte 0xFD at cp1251_table+(0xA4-0x80)
        store byte 0xF0 at cp1251_table+(0xA8-0x80)
        store byte 0xF2 at cp1251_table+(0xAA-0x80)
        store byte 0xF4 at cp1251_table+(0xAF-0x80)
        store byte 0xF8 at cp1251_table+(0xB0-0x80)
        store byte 0xFA at cp1251_table+(0xB7-0x80)
        store byte 0xF1 at cp1251_table+(0xB8-0x80)
        store byte 0xFC at cp1251_table+(0xB9-0x80)
        store byte 0xF3 at cp1251_table+(0xBA-0x80)
        store byte 0xF5 at cp1251_table+(0xBF-0x80)
189
 
190
utf8_table:
; Translation of cp866 bytes 0x80-0xFF to two-byte UTF-8 sequences.
; The entry for byte b is the word at utf8_table + 2*(b - 0x80); its low
; byte is the first byte of the sequence.
        times 80h dw 0x98C3     ; default placeholder

; 0x80-0xAF -> 0x90D0-0xBFD0
repeat 0x30
        store byte 0xD0 at utf8_table+2*(%-1)
        store byte 0x90+%-1 at utf8_table+2*%-1
end repeat

; 0xE0-0xEF -> 0x80D1-0x8FD1
repeat 0x10
        store byte 0xD1 at utf8_table+2*(0xE0-0x80+%-1)
        store byte 0x80+%-1 at utf8_table+2*(0xE0-0x80+%)-1
end repeat

; 0xF0 -> 0x81D0, 0xF1 -> 0x91D1
        store dword 0x91D181D0 at utf8_table+2*(0xF0-0x80)

; The other cp866 characters whose UTF-8 form is two bytes long.
; Characters that need three bytes keep the placeholder.
        store word 0x84D0 at utf8_table+2*(0xF2-0x80)   ; U+0404
        store word 0x94D1 at utf8_table+2*(0xF3-0x80)   ; U+0454
        store word 0x87D0 at utf8_table+2*(0xF4-0x80)   ; U+0407
        store word 0x97D1 at utf8_table+2*(0xF5-0x80)   ; U+0457
        store word 0x8ED0 at utf8_table+2*(0xF6-0x80)   ; U+040E
        store word 0x9ED1 at utf8_table+2*(0xF7-0x80)   ; U+045E
        store word 0xB0C2 at utf8_table+2*(0xF8-0x80)   ; U+00B0 degree sign
        store word 0xB7C2 at utf8_table+2*(0xFA-0x80)   ; U+00B7 middle dot
        store word 0xA4C2 at utf8_table+2*(0xFD-0x80)   ; U+00A4 currency sign
207
 
208
cp866_table:
; Translation of cp866 bytes 0x80-0xFF to cp1251; the entry for byte b is
; at cp866_table + (b - 0x80). '?' marks bytes without a cp1251 code.
repeat 0x30
        db      0xC0+%-1                ; 0x80-0xAF -> 0xC0-0xEF
end repeat
        times 0x30 db '?'               ; 0xB0-0xDF
repeat 0x10
        db      0xF0+%-1                ; 0xE0-0xEF -> 0xF0-0xFF
end repeat
; 0xF0-0xFF, one by one
        db      $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2
        db      $B0,$95,$B7,'?',$B9,$A4,'?','?'
219