WebSVN – Kolibri OS – Blame – /programs/network/ircc/encodings.inc

Rev	Author	Line No.	Line
3545	hidnplayr	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		2	;; ;;
		3	;; Copyright (C) KolibriOS team 2004-2013. All rights reserved. ;;
		4	;; Distributed under terms of the GNU General Public License ;;
		5	;; ;;
		6	;; ;;
		7	;; GNU GENERAL PUBLIC LICENSE ;;
		8	;; Version 2, June 1991 ;;
		9	;; ;;
		10	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
		11
		12
		13	get_next_byte:
		14	; Load the next byte from the packet, translating it to cp866 if necessary.
		15	; Input:  esi = pointer to data, edx = limit of data.
		16	; Output: CF set and al = (translated) byte if a byte was produced,
			;         CF cleared if the data ran out first.
			; The work is done by the decoder that [encoding] selects in get_byte_table;
			; eax is overwritten by the dispatch itself.
		17	mov eax, [encoding] ; eax = index of the active encoding
		18	jmp [get_byte_table+eax*4] ; jump, not call: the decoder's ret returns to our caller
		19
		20	get_byte_cp866:
			; The data are already in cp866: return the next byte unchanged.
			; Input:  esi = pointer to data, edx = limit of data.
			; Output: CF set and al = byte if esi was below edx, CF cleared otherwise.
		21	cmp esi, edx ; CF = 1 exactly when esi < edx (unsigned)
		22	jae .nothing ; no data left: return with CF = 0 from the cmp
		23	lodsb ; al = byte at [esi]; lodsb leaves the flags alone, so CF = 1 survives
		24	.nothing:
		25	ret
		26
		27	get_byte_cp1251:
			; Return the next byte of cp1251 data translated to cp866.
			; Input:  esi = pointer to data, edx = limit of data.
			; Output: CF set and al = cp866 byte if a byte was read, CF cleared otherwise.
			; Bytes below 0x80 are returned as is; the rest are looked up in cp1251_table.
		28	cmp esi, edx
		29	jae .nothing ; no data left: return with CF = 0 from the cmp
		30	lodsb
		31	cmp al, 0x80
		32	jb @f ; lower half needs no translation
		33	and eax, 0x7F ; eax = table index 0..0x7F (also clears the upper bits of eax)
		34	mov al, [cp1251_table+eax]
		35	@@:
		36	stc ; the `and` above clears CF, so set it explicitly on both paths
		37	.nothing:
		38	ret
		39
		40	get_byte_utf8:
		41	; UTF8 decoding is slightly complicated.
		42	; One character can occupy one or more bytes.
		43	; The boundary between packets can theoretically fall anywhere in the data,
		44	; so this procedure keeps internal state between calls and handles
		45	; one byte at a time, looping until a character is read or the packet is over.
		46	; Globally, there are two distinct tasks: decode the byte sequence to a unicode char
		47	; and convert this unicode char to our base encoding (that is cp866).
			; Input:  esi = pointer to data, edx = limit of data.
			; Output: CF set and al = cp866 byte ('?' for a char with no mapping below),
			;         CF cleared if the data ran out before a character was completed.
			; State:  [utf8_bytes_rest] = continuation bytes still expected,
			;         [utf8_char] = bits of the character collected so far.
			; ecx is overwritten.
		48	; 1. Check that there are data.
		49	cmp esi, edx
		50	jae .nothing ; CF = 0 from the cmp is returned as "no byte"
		51	; 2. Load byte.
		52	lodsb
		53	movzx ecx, al ; keep the whole byte in ecx, al is about to be masked
		54	; 3. Bytes in an UTF8 sequence can be of any of three types.
		55	; If the most significant bit is cleared, the sequence is one byte and a usual ASCII char.
		56	; First byte of a sequence must be 11xxxxxx, other bytes are 10yyyyyy.
		57	and al, 0xC0 ; only the two top bits remain: SF = bit 7, PF = their parity
		58	jns .single_byte ; 0xxxxxxx
		59	jp .first_byte ; 0xC0 has even parity -> 11xxxxxx; 0x80 (odd parity) falls through
		60	; 4. This byte is not first in UTF8 sequence.
		61	; 4a. Check that the sequence was started. If not, it is an invalid byte
		62	; and we simply ignore it.
		63	cmp [utf8_bytes_rest], 0
		64	jz get_byte_utf8
		65	; 4b. Otherwise, it is really the next byte and it gives some more bits of the char.
		66	mov eax, [utf8_char]
		67	shl eax, 6
		68	lea eax, [eax+ecx-0x80] ; append the 6 payload bits (the byte without its 10 marker)
		69	; 4c. Decrement the number of bytes remaining in the sequence.
		70	; If it goes to zero, the character is read, so return it.
		71	dec [utf8_bytes_rest]
		72	jz .got_char
		73	mov [utf8_char], eax
		74	jmp get_byte_utf8
		75	; 5. If the byte is first in a UTF8 sequence, calculate the number of leading 1s
		76	; - it equals the total number of bytes in the sequence; the remaining bits are
		77	; the leading bits of the character.
		78	.first_byte:
		79	mov eax, -1
		80	@@:
		81	inc eax
		82	add cl, cl ; shift the byte left by one; SF = the next leading bit
		83	js @b
			; Here eax = (number of leading 1s) - 1 = continuation bytes to come,
			; and cl = the original byte shifted left by eax+1 bits.
		84	mov [utf8_bytes_rest], eax
		85	xchg eax, ecx
		86	inc ecx
		87	shr al, cl ; shift back by the same amount: marker bits are gone, payload stays
		88	mov [utf8_char], eax
		89	jmp get_byte_utf8
		90	; 6. If the byte is an ASCII char, it is the character.
		91	.single_byte:
		92	xchg eax, ecx ; eax = the unmasked byte saved in ecx
		93	.got_char:
		94	; We got the character, now abandon a possible sequence in progress.
		95	and [utf8_bytes_rest], 0
		96	; Now the second task. The unicode character is in eax, and now we shall convert it
		97	; to cp866.
		98	cmp eax, 0x80
		99	jb .done ; ASCII maps to itself; CF = 1 from this cmp is the "byte read" flag
		100	; 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF, 0x401 -> 0xF0, 0x451 -> 0xF1
		101	cmp eax, 0x401
		102	jz .YO
		103	cmp eax, 0x451
		104	jz .yo
		105	cmp eax, 0x410
		106	jb .unrecognized
		107	cmp eax, 0x440
		108	jb .part1
		109	cmp eax, 0x450
		110	jae .unrecognized
		111	sub al, (0x40-0xE0) and 0xFF ; al 0x40..0x4F -> 0xE0..0xEF; the borrow sets CF
		112	ret
		113	.part1:
		114	sub al, 0x10-0x80 ; al 0x10..0x3F -> 0x80..0xAF; the borrow sets CF
		115	.nothing:
		116	.done:
		117	ret
		118	.unrecognized:
		119	mov al, '?'
		120	stc
		121	ret
		122	.YO:
		123	mov al, 0xF0
		124	stc
		125	ret
		126	.yo:
		127	mov al, 0xF1
		128	stc
		129	ret
		130
		131
		132
		133	recode_to_cp866:
			; The text is already cp866: copy ecx bytes from [esi] to [edi] unchanged.
		134	rep movsb
		135	ret
		136
		137	recode_to_cp1251:
			; Convert ecx bytes of cp866 text at [esi] into cp1251 text at [edi].
			; Bytes below 80h are copied as is, the others are replaced through cp866_table.
			; The string instructions step esi and edi through the buffers;
			; eax and ecx are overwritten.
		138	xor eax, eax ; bits 8..31 of eax stay zero, so eax can index the table directly
		139	jecxz .finish ; zero length: nothing to convert
		140	.next_char:
		141	lodsb
		142	cmp al, 80h
		143	jb .store ; lower half is the same in both code pages
		144	mov al, [cp866_table-80h+eax] ; table starts at code 80h, hence the -80h bias
		145	.store: stosb
		146	loop .next_char
		147	.finish:
		148	ret
		149
		150	recode_to_utf8:
			; Convert ecx bytes of cp866 text at [esi] into UTF-8 at [edi].
			; Bytes below 80h are written as one byte; every other byte is replaced by
			; the two-byte sequence stored for it in utf8_table, so the output can be
			; up to twice as long as the input. eax and ecx are overwritten.
		151	jecxz .finish ; zero length: nothing to convert
		152	.next_char:
		153	lodsb
		154	cmp al, 80h
		155	jb .ascii
		156	and eax, 7Fh ; eax = table index 0..7Fh, upper bits cleared
		157	mov ax, [utf8_table+eax*2] ; one word per character
		158	stosw ; low byte of the word is written first
		159	loop .next_char
		160	ret
		161	.ascii:
		162	stosb
		163	loop .next_char
		164	.finish:
		165	ret
		166
		167	recode:
			; Convert outgoing text from cp866 to the encoding selected by [encoding]
			; by jumping to the matching recode_to_* routine from recode_proc.
			; Those routines take esi = source, edi = destination, ecx = byte count.
			; eax is overwritten by the dispatch itself.
		168	mov eax, [encoding] ; eax = index of the active encoding
		169	jmp [recode_proc+eax*4] ; jump, not call: the routine's ret returns to our caller
		170
		171
		172
			; Currently selected encoding, used as a dword index into both tables below.
			; The UTF8 constant is defined outside this file; judging by the table order
			; the indices are presumably 0 = cp866, 1 = cp1251, 2 = utf8 - confirm there.
		173	encoding dd UTF8
			; Per-encoding handlers; both tables must list the encodings in the same order.
		174	recode_proc dd recode_to_cp866, recode_to_cp1251, recode_to_utf8
		175	get_byte_table dd get_byte_cp866, get_byte_cp1251, get_byte_utf8
		176
		177
		178	cp1251_table:
			; cp1251 -> cp866 translation for the bytes 0x80..0xFF; get_byte_cp1251
			; indexes it with (byte and 0x7F). One row per high nibble (named in the
			; row comment), one column per low nibble. '?' marks a byte with no mapping here.
		179	db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 8
		180	db '?','?','?','?','?',$F9,'?','?' , '?','?','?','?','?','?','?','?' ; 9
		181	db '?',$F6,$F7,'?',$FD,'?','?','?' , $F0,'?',$F2,'?','?','?','?',$F4 ; A
		182	db $F8,'?','?','?','?','?','?',$FA , $F1,$FC,$F3,'?','?','?','?',$F5 ; B
		183	db $80,$81,$82,$83,$84,$85,$86,$87 , $88,$89,$8A,$8B,$8C,$8D,$8E,$8F ; C
		184	db $90,$91,$92,$93,$94,$95,$96,$97 , $98,$99,$9A,$9B,$9C,$9D,$9E,$9F ; D
		185	db $A0,$A1,$A2,$A3,$A4,$A5,$A6,$A7 , $A8,$A9,$AA,$AB,$AC,$AD,$AE,$AF ; E
		186	db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; F
		187
		188	; 0 1 2 3 4 5 6 7 8 9 A B C D E F
		189
		190	utf8_table:
			; cp866 -> UTF-8 for the bytes 0x80..0xFF: one word per byte, indexed by
			; (byte and 0x7F) in recode_to_utf8. Each word holds a two-byte UTF-8 sequence
			; with the lead byte in the low half, so stosw writes it in stream order.
			; The table is first filled with a placeholder and then patched at assembly
			; time with `store`; `%` is the 1-based counter of the enclosing `repeat`.
		191	times 80h dw 0x98C3 ; default placeholder (bytes C3 98 = U+00D8)
		192
		193	; cp866 0x80-0xAF -> words 0x90D0-0xBFD0 (bytes D0 90 .. D0 BF = U+0410..U+043F)
		194	repeat 0x30
		195	store byte 0xD0 at utf8_table+2*(%-1) ; lead byte of entry %-1
		196	store byte 0x90+%-1 at utf8_table+2*%-1 ; second byte of the same entry
		197	end repeat
		198
		199	; cp866 0xE0-0xEF -> words 0x80D1-0x8FD1 (bytes D1 80 .. D1 8F = U+0440..U+044F)
		200	repeat 0x10
		201	store byte 0xD1 at utf8_table+2*(0xE0-0x80+%-1) ; lead byte
		202	store byte 0x80+%-1 at utf8_table+2*(0xE0-0x80+%)-1 ; second byte
		203	end repeat
		204
		205	; 0xF0 -> 0x81D0, 0xF1 -> 0x91D1 (bytes D0 81 = U+0401, D1 91 = U+0451)
		206	store dword 0x91D181D0 at utf8_table+2*(0xF0-0x80) ; both entries in one dword
		207
		208	cp866_table:
			; cp866 -> cp1251 translation for the bytes 0x80..0xFF; recode_to_cp1251
			; reads it as [cp866_table-0x80+byte]. One row per high nibble (named in the
			; row comment), one column per low nibble. '?' marks a byte with no mapping here.
		209	db $C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7 , $C8,$C9,$CA,$CB,$CC,$CD,$CE,$CF ; 8
		210	db $D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7 , $D8,$D9,$DA,$DB,$DC,$DD,$DE,$DF ; 9
		211	db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; A
		212	db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; B
		213	db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; C
		214	db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; D
		215	db $F0,$F1,$F2,$F3,$F4,$F5,$F6,$F7 , $F8,$F9,$FA,$FB,$FC,$FD,$FE,$FF ; E
		216	db $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2 , $B0,$95,$B7,'?',$B9,$A4,'?','?' ; F
		217
		218	; 0 1 2 3 4 5 6 7 8 9 A B C D E F
		219

Subversion Repositories Kolibri OS

(root)/programs/network/ircc/encodings.inc – Rev 4060