WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/cairo/src/cairo-unicode.c

Rev	Author	Line No.	Line
4349	Serge	1	/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
		2	/* cairo - a vector graphics library with display and print output
		3	*
		4	* The code in this file is derived from GLib's gutf8.c and
		5	* ultimately from libunicode. It is relicensed under the
		6	* dual LGPL/MPL with permission of the original authors.
		7	*
		8	* Copyright © 1999 Tom Tromey
		9	* Copyright © 2005 Red Hat, Inc
		10	*
		11	* This library is free software; you can redistribute it and/or
		12	* modify it either under the terms of the GNU Lesser General Public
		13	* License version 2.1 as published by the Free Software Foundation
		14	* (the "LGPL") or, at your option, under the terms of the Mozilla
		15	* Public License Version 1.1 (the "MPL"). If you do not alter this
		16	* notice, a recipient may use your version of this file under either
		17	* the MPL or the LGPL.
		18	*
		19	* You should have received a copy of the LGPL along with this library
		20	* in the file COPYING-LGPL-2.1; if not, write to the Free Software
		21	* Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
		22	* You should have received a copy of the MPL along with this library
		23	* in the file COPYING-MPL-1.1
		24	*
		25	* The contents of this file are subject to the Mozilla Public License
		26	* Version 1.1 (the "License"); you may not use this file except in
		27	* compliance with the License. You may obtain a copy of the License at
		28	* http://www.mozilla.org/MPL/
		29	*
		30	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
		31	* OF ANY KIND, either express or implied. See the LGPL or the MPL for
		32	* the specific language governing rights and limitations.
		33	*
		34	* The Original Code is the cairo graphics library.
		35	*
		36	* The Initial Developer of the Original Code is Tom Tromey.
		37	* and Red Hat, Inc.
		38	*
		39	* Contributor(s):
		40	* Owen Taylor
		41	*/
		42
		43	#include "cairoint.h"
		44	#include "cairo-error-private.h"
		45
		/*
		 * UTF8_COMPUTE:
		 * @Char: lead byte of a UTF-8 sequence (evaluated more than once, so
		 *        it must be free of side effects)
		 * @Mask: lvalue receiving the mask that selects the payload bits of
		 *        the lead byte
		 * @Len:  lvalue receiving the sequence length in bytes (1..6), or -1
		 *        when @Char cannot start a sequence
		 *
		 * @Mask is left untouched when @Len is set to -1.
		 *
		 * The body is wrapped in do { } while (0) so the macro expands to
		 * exactly one statement and stays correct inside an unbraced
		 * if/else; invoke it with a trailing semicolon.
		 */
		#define UTF8_COMPUTE(Char, Mask, Len) \
		    do { \
		        if ((Char) < 128) { \
		            (Len) = 1; \
		            (Mask) = 0x7f; \
		        } else if (((Char) & 0xe0) == 0xc0) { \
		            (Len) = 2; \
		            (Mask) = 0x1f; \
		        } else if (((Char) & 0xf0) == 0xe0) { \
		            (Len) = 3; \
		            (Mask) = 0x0f; \
		        } else if (((Char) & 0xf8) == 0xf0) { \
		            (Len) = 4; \
		            (Mask) = 0x07; \
		        } else if (((Char) & 0xfc) == 0xf8) { \
		            (Len) = 5; \
		            (Mask) = 0x03; \
		        } else if (((Char) & 0xfe) == 0xfc) { \
		            (Len) = 6; \
		            (Mask) = 0x01; \
		        } else { \
		            (Len) = -1; \
		        } \
		    } while (0)
		79
		/*
		 * UTF8_LENGTH:
		 * @Char: a character value (evaluated more than once)
		 *
		 * Number of bytes (1..6) that the original UTF-8 scheme needs to
		 * encode @Char.  Values of 0x4000000 and above all map to 6.
		 */
		#define UTF8_LENGTH(Char) \
		    ((Char) < 0x80      ? 1 : \
		     (Char) < 0x800     ? 2 : \
		     (Char) < 0x10000   ? 3 : \
		     (Char) < 0x200000  ? 4 : \
		     (Char) < 0x4000000 ? 5 : 6)
		86
		/*
		 * UTF8_GET:
		 * @Result: lvalue receiving the decoded character, or -1 if one of
		 *          the continuation bytes is not of the form 10xxxxxx
		 * @Chars:  pointer to the first byte of the sequence
		 * @Count:  integer lvalue used as the loop counter
		 * @Mask:   payload mask for the lead byte (see UTF8_COMPUTE)
		 * @Len:    total sequence length in bytes (see UTF8_COMPUTE)
		 *
		 * Reads @Len bytes starting at @Chars; decoding stops at the first
		 * malformed continuation byte.  No overlong or range checking is
		 * done here.
		 *
		 * Wrapped in do { } while (0) so the macro is a single statement;
		 * invoke it with a trailing semicolon.
		 */
		#define UTF8_GET(Result, Chars, Count, Mask, Len) \
		    do { \
		        (Result) = (Chars)[0] & (Mask); \
		        for ((Count) = 1; (Count) < (Len); ++(Count)) { \
		            if (((Chars)[(Count)] & 0xc0) != 0x80) { \
		                (Result) = -1; \
		                break; \
		            } \
		            (Result) <<= 6; \
		            (Result) |= ((Chars)[(Count)] & 0x3f); \
		        } \
		    } while (0)
		99
		/*
		 * UNICODE_VALID:
		 * @Char: a character value (evaluated more than once)
		 *
		 * Non-zero when @Char is below 0x110000, is not in the surrogate
		 * range 0xD800..0xDFFF, is not in 0xFDD0..0xFDEF, and does not have
		 * 0xFFFE or 0xFFFF as its low 16 bits.
		 */
		#define UNICODE_VALID(Char) \
		    ((Char) < 0x110000 && \
		     (((Char) & 0xFFFFF800) != 0xD800) && \
		     ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
		     ((Char) & 0xFFFE) != 0xFFFE)
		105
		/*
		 * Sequence length implied by each possible lead byte.  Bytes that
		 * cannot start a sequence (0x80..0xbf, 0xfe, 0xff) map to 1 so that
		 * stepping through a string always makes progress.
		 */
		static const char utf8_skip_data[256] = {
		    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0xa0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0xb0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
		    /* 0xc0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		    /* 0xd0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		    /* 0xe0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
		    /* 0xf0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
		};

		/*
		 * UTF8_NEXT_CHAR:
		 * @p: pointer to the lead byte of a UTF-8 sequence (evaluated twice)
		 *
		 * Yields @p advanced past the sequence, based solely on the lead
		 * byte.  The table must be indexed by the byte *pointed to*, read
		 * as unsigned so that bytes >= 0x80 do not produce a negative index.
		 */
		#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])
		118
		/* Decodes the UTF-8 sequence starting at @p into a Unicode character.
		 *
		 * Returns (uint32_t)-1 when the lead byte cannot start a sequence or
		 * when a continuation byte is not of the form 10xxxxxx.  Nothing else
		 * is checked (no overlong or range validation), so for input that
		 * has not been validated beforehand the result is not meaningful.
		 **/
		static uint32_t
		_utf8_get_char (const unsigned char *p)
		{
		    const unsigned char lead = p[0];
		    uint32_t wc;
		    int n_bytes, k;

		    /* Classify the lead byte: sequence length and payload bits. */
		    if (lead < 0x80) {
		        n_bytes = 1;
		        wc = lead & 0x7f;
		    } else if ((lead & 0xe0) == 0xc0) {
		        n_bytes = 2;
		        wc = lead & 0x1f;
		    } else if ((lead & 0xf0) == 0xe0) {
		        n_bytes = 3;
		        wc = lead & 0x0f;
		    } else if ((lead & 0xf8) == 0xf0) {
		        n_bytes = 4;
		        wc = lead & 0x07;
		    } else if ((lead & 0xfc) == 0xf8) {
		        n_bytes = 5;
		        wc = lead & 0x03;
		    } else if ((lead & 0xfe) == 0xfc) {
		        n_bytes = 6;
		        wc = lead & 0x01;
		    } else {
		        return (uint32_t) -1;
		    }

		    /* Fold in six payload bits per continuation byte. */
		    for (k = 1; k < n_bytes; k++) {
		        if ((p[k] & 0xc0) != 0x80)
		            return (uint32_t) -1;
		        wc = (wc << 6) | (uint32_t) (p[k] & 0x3f);
		    }

		    return wc;
		}
		137
		/* Like _utf8_get_char, but takes a maximum length and validates the
		 * encoding.
		 *
		 * @p:       pointer to the lead byte; it is always read
		 * @max_len: number of bytes available at @p, or a negative value if
		 *           the input is nul-terminated
		 *
		 * Returns the decoded character, or
		 *   (uint32_t)-1 for a malformed sequence (bad lead byte, bad
		 *                continuation byte, or overlong encoding),
		 *   (uint32_t)-2 for an incomplete trailing character, i.e. the
		 *                sequence needs more than @max_len bytes or a nul
		 *                byte appears where a continuation byte is expected.
		 */
		static uint32_t
		_utf8_get_char_extended (const unsigned char *p,
		                         long max_len)
		{
		    int i, len;
		    uint32_t wc = *p;
		    uint32_t min_wc;	/* smallest value that needs @len bytes */

		    if (wc < 0x80) {
		        return wc;
		    } else if (wc < 0xc0) {
		        return (uint32_t) -1;	/* stray continuation byte */
		    } else if (wc < 0xe0) {
		        len = 2;
		        wc &= 0x1f;
		        min_wc = 0x80;
		    } else if (wc < 0xf0) {
		        len = 3;
		        wc &= 0x0f;
		        min_wc = 0x800;
		    } else if (wc < 0xf8) {
		        len = 4;
		        wc &= 0x07;
		        min_wc = 0x10000;
		    } else if (wc < 0xfc) {
		        len = 5;
		        wc &= 0x03;
		        min_wc = 0x200000;
		    } else if (wc < 0xfe) {
		        len = 6;
		        wc &= 0x01;
		        min_wc = 0x4000000;
		    } else {
		        return (uint32_t) -1;	/* 0xfe and 0xff never occur in UTF-8 */
		    }

		    if (max_len >= 0 && len > max_len) {
		        /* Truncated sequence: still report garbage in the bytes we do
		         * have as invalid rather than merely incomplete. */
		        for (i = 1; i < max_len; i++) {
		            if ((p[i] & 0xc0) != 0x80)
		                return (uint32_t) -1;
		        }
		        return (uint32_t) -2;
		    }

		    for (i = 1; i < len; ++i) {
		        uint32_t ch = p[i];

		        if ((ch & 0xc0) != 0x80) {
		            /* A nul here means the string ended mid-character. */
		            return ch ? (uint32_t) -1 : (uint32_t) -2;
		        }

		        wc <<= 6;
		        wc |= (ch & 0x3f);
		    }

		    /* Reject overlong forms: the value must actually require @len
		     * bytes.  A @len-byte sequence can never hold a value that needs
		     * more than @len bytes, so a lower-bound test is sufficient. */
		    if (wc < min_wc)
		        return (uint32_t) -1;

		    return wc;
		}
		198
		/**
		 * _cairo_utf8_get_char_validated:
		 * @p: a UTF-8 string
		 * @unicode: location to store one Unicode character, or %NULL
		 *
		 * Decodes the first character of @p and reports how many bytes it
		 * occupies.
		 *
		 * The string is expected to have been validated already; only the
		 * shape of the bytes is inspected here.  If the lead byte cannot
		 * start a sequence, (uint32_t)-1 is stored and 1 is returned.  If a
		 * continuation byte is malformed, (uint32_t)-1 is stored and the
		 * length implied by the lead byte is returned.
		 *
		 * Returns: the number of bytes forming the character returned.
		 **/
		int
		_cairo_utf8_get_char_validated (const char *p,
		                                uint32_t   *unicode)
		{
		    const unsigned char *bytes = (const unsigned char *) p;
		    const unsigned char lead = bytes[0];
		    uint32_t wc;
		    int n_bytes, k;

		    /* Classify the lead byte: sequence length and payload bits. */
		    if (lead < 0x80) {
		        n_bytes = 1;
		        wc = lead & 0x7f;
		    } else if ((lead & 0xe0) == 0xc0) {
		        n_bytes = 2;
		        wc = lead & 0x1f;
		    } else if ((lead & 0xf0) == 0xe0) {
		        n_bytes = 3;
		        wc = lead & 0x0f;
		    } else if ((lead & 0xf8) == 0xf0) {
		        n_bytes = 4;
		        wc = lead & 0x07;
		    } else if ((lead & 0xfc) == 0xf8) {
		        n_bytes = 5;
		        wc = lead & 0x03;
		    } else if ((lead & 0xfe) == 0xfc) {
		        n_bytes = 6;
		        wc = lead & 0x01;
		    } else {
		        if (unicode != NULL)
		            *unicode = (uint32_t) -1;
		        return 1;
		    }

		    /* Fold in six payload bits per continuation byte. */
		    for (k = 1; k < n_bytes; k++) {
		        if ((bytes[k] & 0xc0) != 0x80) {
		            wc = (uint32_t) -1;
		            break;
		        }
		        wc = (wc << 6) | (uint32_t) (bytes[k] & 0x3f);
		    }

		    if (unicode != NULL)
		        *unicode = wc;

		    return n_bytes;
		}
		232
		233	/**
		234	* _cairo_utf8_to_ucs4:
		235	* @str: an UTF-8 string
		236	* @len: length of @str in bytes, or -1 if it is nul-terminated.
		237	* If @len is supplied and the string has an embedded nul
		238	* byte, only the portion before the nul byte is converted.
		239	* @result: location to store a pointer to a newly allocated UTF-32
		240	* string (always native endian), or %NULL. Free with free(). A 0
		241	* word will be written after the last character.
		242	* @items_written: location to store number of 32-bit words
		243	* written. (Not including the trailing 0)
		244	*
		245	* Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
		246	* with 1 32-bit word per character. The string is validated to
		247	* consist entirely of valid Unicode characters.
		248	*
		249	* Return value: %CAIRO_STATUS_SUCCESS if the entire string was
		250	* successfully converted. %CAIRO_STATUS_INVALID_STRING if an
		251	* invalid sequence was found.
		252	**/
		253	cairo_status_t
		254	_cairo_utf8_to_ucs4 (const char *str,
		255	int len,
		256	uint32_t **result,
		257	int *items_written)
		258	{
		259	uint32_t *str32 = NULL;
		260	int n_chars, i;
		261	const unsigned char *in;
		262	const unsigned char * const ustr = (const unsigned char *) str;
		263
		264	in = ustr;
		265	n_chars = 0;
		266	while ((len < 0 \|\| ustr + len - in > 0) && *in)
		267	{
		268	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
		269	if (wc & 0x80000000 \|\| !UNICODE_VALID (wc))
		270	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
		271
		272	n_chars++;
		273	if (n_chars == INT_MAX)
		274	return _cairo_error (CAIRO_STATUS_INVALID_STRING);
		275
		276	in = UTF8_NEXT_CHAR (in);
		277	}
		278
		279	if (result) {
		280	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
		281	if (!str32)
		282	return _cairo_error (CAIRO_STATUS_NO_MEMORY);
		283
		284	in = ustr;
		285	for (i=0; i < n_chars; i++) {
		286	str32[i] = _utf8_get_char (in);
		287	in = UTF8_NEXT_CHAR (in);
		288	}
		289	str32[i] = 0;
		290
		291	*result = str32;
		292	}
		293
		294	if (items_written)
		295	*items_written = n_chars;
		296
		297	return CAIRO_STATUS_SUCCESS;
		298	}
		299
		/**
		 * _cairo_ucs4_to_utf8:
		 * @unicode: a UCS-4 character
		 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
		 *   space available. Or %NULL.
		 *
		 * Encodes @unicode as UTF-8. When @utf8 is %NULL only the encoded
		 * length is computed. No nul terminator is written.
		 *
		 * Only the magnitude of @unicode is checked: every value below
		 * 0x200000 is encoded, including surrogates and values above
		 * 0x10FFFF.
		 *
		 * Return value: Number of bytes in the utf8 string or 0 if @unicode
		 *   is 0x200000 or larger.
		 **/
		int
		_cairo_ucs4_to_utf8 (uint32_t unicode,
		                     char    *utf8)
		{
		    int bytes;
		    char *p;

		    if (unicode < 0x80) {
		        if (utf8)
		            *utf8 = unicode;
		        return 1;
		    } else if (unicode < 0x800) {
		        bytes = 2;
		    } else if (unicode < 0x10000) {
		        bytes = 3;
		    } else if (unicode < 0x200000) {
		        bytes = 4;
		    } else {
		        return 0;
		    }

		    if (!utf8)
		        return bytes;

		    /* Fill the buffer back to front, six payload bits per byte, each
		     * tagged as a continuation byte (10xxxxxx). */
		    p = utf8 + bytes;
		    while (p > utf8) {
		        *--p = 0x80 | (unicode & 0x3f);
		        unicode >>= 6;
		    }

		    /* Turn the first byte into a lead byte: the low 8 bits of the
		     * shifted constant are 0xc0, 0xe0 or 0xf0 for 2, 3 or 4 bytes. */
		    *p |= 0xf0 << (4 - bytes);

		    return bytes;
		}
		344
		#if CAIRO_HAS_UTF8_TO_UTF16
		/**
		 * _cairo_utf8_to_utf16:
		 * @str: an UTF-8 string
		 * @len: length of @str in bytes, or -1 if it is nul-terminated.
		 *   If @len is supplied and the string has an embedded nul
		 *   byte, only the portion before the nul byte is converted.
		 * @result: location to store a pointer to a newly allocated UTF-16
		 *   string (always native endian). Free with free(). A 0
		 *   word will be written after the last character.
		 * @items_written: location to store number of 16-bit words
		 *   written (not including the trailing 0), or %NULL.
		 *
		 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
		 * where characters are represented either as a single 16-bit word, or
		 * as a pair of 16-bit "surrogates". The string is validated to
		 * consist entirely of valid Unicode characters.
		 *
		 * Unlike @items_written, @result is written unconditionally and must
		 * not be %NULL.
		 *
		 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
		 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
		 *   invalid sequence was found (or the output would need INT_MAX - 1
		 *   or more words). %CAIRO_STATUS_NO_MEMORY if the output buffer
		 *   could not be allocated.
		 **/
		cairo_status_t
		_cairo_utf8_to_utf16 (const char *str,
		                      int         len,
		                      uint16_t  **result,
		                      int        *items_written)
		{
		    uint16_t *str16 = NULL;
		    int n16, i;
		    const unsigned char *in;
		    const unsigned char * const ustr = (const unsigned char *) str;

		    /* First pass: validate and count the 16-bit words needed. */
		    in = ustr;
		    n16 = 0;
		    while ((len < 0 || in - ustr < len) && *in) {
		        /* Bytes left in the buffer, or negative for "nul-terminated".
		         * Computed on integers so that no pointer before @str is ever
		         * formed when @len is negative. */
		        long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
		        uint32_t wc = _utf8_get_char_extended (in, remaining);

		        /* The top bit is set only for the -1 / -2 error returns. */
		        if ((wc & 0x80000000) || !UNICODE_VALID (wc))
		            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

		        /* Characters outside the BMP need a surrogate pair. */
		        if (wc < 0x10000)
		            n16 += 1;
		        else
		            n16 += 2;

		        /* Stop before the next "+= 2" or the final "+ 1" could
		         * overflow. */
		        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
		            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

		        in = UTF8_NEXT_CHAR (in);
		    }

		    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
		    if (!str16)
		        return _cairo_error (CAIRO_STATUS_NO_MEMORY);

		    /* Second pass: decode and emit UTF-16. */
		    in = ustr;
		    for (i = 0; i < n16;) {
		        uint32_t wc = _utf8_get_char (in);

		        if (wc < 0x10000) {
		            str16[i++] = wc;
		        } else {
		            /* High surrogate carries the top 10 bits, low surrogate
		             * the bottom 10 bits of (wc - 0x10000). */
		            str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
		            str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
		        }

		        in = UTF8_NEXT_CHAR (in);
		    }

		    str16[i] = 0;

		    *result = str16;
		    if (items_written)
		        *items_written = n16;

		    return CAIRO_STATUS_SUCCESS;
		}
		#endif

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/cairo/src/cairo-unicode.c – Rev 8031