Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */ |
2 | /* cairo - a vector graphics library with display and print output |
||
3 | * |
||
4 | * The code in this file is derived from GLib's gutf8.c and |
||
5 | * ultimately from libunicode. It is relicensed under the |
||
6 | * dual LGPL/MPL with permission of the original authors. |
||
7 | * |
||
8 | * Copyright © 1999 Tom Tromey |
||
9 | * Copyright © 2005 Red Hat, Inc |
||
10 | * |
||
11 | * This library is free software; you can redistribute it and/or |
||
12 | * modify it either under the terms of the GNU Lesser General Public |
||
13 | * License version 2.1 as published by the Free Software Foundation |
||
14 | * (the "LGPL") or, at your option, under the terms of the Mozilla |
||
15 | * Public License Version 1.1 (the "MPL"). If you do not alter this |
||
16 | * notice, a recipient may use your version of this file under either |
||
17 | * the MPL or the LGPL. |
||
18 | * |
||
19 | * You should have received a copy of the LGPL along with this library |
||
20 | * in the file COPYING-LGPL-2.1; if not, write to the Free Software |
||
21 | * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA |
||
22 | * You should have received a copy of the MPL along with this library |
||
23 | * in the file COPYING-MPL-1.1 |
||
24 | * |
||
25 | * The contents of this file are subject to the Mozilla Public License |
||
26 | * Version 1.1 (the "License"); you may not use this file except in |
||
27 | * compliance with the License. You may obtain a copy of the License at |
||
28 | * http://www.mozilla.org/MPL/ |
||
29 | * |
||
30 | * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY |
||
31 | * OF ANY KIND, either express or implied. See the LGPL or the MPL for |
||
32 | * the specific language governing rights and limitations. |
||
33 | * |
||
34 | * The Original Code is the cairo graphics library. |
||
35 | * |
||
36 | * The Initial Developer of the Original Code is Tom Tromey. |
||
37 | * and Red Hat, Inc. |
||
38 | * |
||
39 | * Contributor(s): |
||
40 | * Owen Taylor |
||
41 | */ |
||
42 | |||
43 | #include "cairoint.h" |
||
44 | #include "cairo-error-private.h" |
||
45 | |||
/* Classify the lead byte Char of a UTF-8 sequence.
 *
 * On completion Len holds the total number of bytes (1..6) announced by
 * the lead byte and Mask selects the payload bits that the lead byte
 * itself carries.  If Char cannot start a sequence (a continuation byte
 * 0x80..0xbf, or 0xfe/0xff) Len is set to -1 and Mask is left untouched.
 *
 * Char is evaluated several times, so it must be free of side effects.
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)               \
    do {                                            \
        if ((Char) < 128) {                         \
            (Len) = 1;                              \
            (Mask) = 0x7f;                          \
        } else if (((Char) & 0xe0) == 0xc0) {       \
            (Len) = 2;                              \
            (Mask) = 0x1f;                          \
        } else if (((Char) & 0xf0) == 0xe0) {       \
            (Len) = 3;                              \
            (Mask) = 0x0f;                          \
        } else if (((Char) & 0xf8) == 0xf0) {       \
            (Len) = 4;                              \
            (Mask) = 0x07;                          \
        } else if (((Char) & 0xfc) == 0xf8) {       \
            (Len) = 5;                              \
            (Mask) = 0x03;                          \
        } else if (((Char) & 0xfe) == 0xfc) {       \
            (Len) = 6;                              \
            (Mask) = 0x01;                          \
        } else {                                    \
            (Len) = -1;                             \
        }                                           \
    } while (0)
||
79 | |||
/* Number of bytes needed to encode the code point Char in UTF-8,
 * using the original 1..6 byte scheme.  Char is evaluated more than
 * once, so it must not have side effects.
 */
#define UTF8_LENGTH(Char)               \
    ((Char) < 0x80      ? 1 :           \
     (Char) < 0x800     ? 2 :           \
     (Char) < 0x10000   ? 3 :           \
     (Char) < 0x200000  ? 4 :           \
     (Char) < 0x4000000 ? 5 : 6)
||
86 | |||
/* Assemble a code point from the Len bytes starting at Chars.
 *
 * Result receives the first byte masked with Mask, then the low six
 * bits of each following byte shifted in.  Every following byte must
 * have the form 10xxxxxx; if one does not, Result is set to -1 and
 * decoding stops with Count holding the index of the offending byte.
 * Count is used as the loop index and is clobbered.
 *
 * The arguments are evaluated several times and must be free of side
 * effects.  The body is wrapped in do { } while (0) so the macro is a
 * single statement; the `break' below only leaves the inner for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
    do {                                                        \
        (Result) = (Chars)[0] & (Mask);                         \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {         \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {            \
                (Result) = -1;                                  \
                break;                                          \
            }                                                   \
            (Result) <<= 6;                                     \
            (Result) |= ((Chars)[(Count)] & 0x3f);              \
        }                                                       \
    } while (0)
||
99 | |||
/* True when Char (a 32-bit unsigned code point) is below 0x110000 and
 * is neither a surrogate (0xD800..0xDFFF), nor in the 0xFDD0..0xFDEF
 * block, nor a value whose low 16 bits are 0xFFFE or 0xFFFF.
 * Char is evaluated several times and must not have side effects.
 */
#define UNICODE_VALID(Char)                             \
    ((Char) < 0x110000 &&                               \
     !((Char) >= 0xD800 && (Char) <= 0xDFFF) &&         \
     !((Char) >= 0xFDD0 && (Char) <= 0xFDEF) &&         \
     ((Char) & 0xFFFE) != 0xFFFE)
||
105 | |||
/* Distance in bytes from the start of one UTF-8 sequence to the start
 * of the next, indexed by the first byte of the sequence.  Bytes that
 * cannot start a sequence (0x80..0xbf, 0xfe, 0xff) map to 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 - 0x0f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 - 0x1f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 - 0x2f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 - 0x3f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 - 0x4f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 - 0x5f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 - 0x6f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 - 0x7f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0x8f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 - 0x9f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 - 0xaf */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 - 0xbf */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 - 0xcf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 - 0xdf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 - 0xef */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 - 0xff */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the sequence following the one that starts at p. */
#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])
||
118 | |||
/* Decode the UTF-8 sequence starting at @p into a code point.
 *
 * The length of the sequence is taken from the lead byte.  Returns
 * (uint32_t)-1 if the lead byte cannot start a sequence or if one of
 * the following bytes is not of the form 10xxxxxx.  No check is made
 * for overlong encodings or for the validity of the resulting value,
 * so callers should only pass input that has already been validated.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t code;
    int n_bytes, k;

    /* Plain ASCII: the byte is the code point. */
    if (lead < 0x80)
        return lead;

    if ((lead & 0xe0) == 0xc0) {
        n_bytes = 2;
        code = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        n_bytes = 3;
        code = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        n_bytes = 4;
        code = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        n_bytes = 5;
        code = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        n_bytes = 6;
        code = lead & 0x01;
    } else {
        /* Stray continuation byte, or 0xfe / 0xff. */
        return (uint32_t) -1;
    }

    /* Each continuation byte contributes its low six bits. */
    for (k = 1; k < n_bytes; k++) {
        const unsigned char cont = p[k];

        if ((cont & 0xc0) != 0x80)
            return (uint32_t) -1;

        code = (code << 6) | (cont & 0x3f);
    }

    return code;
}
||
137 | |||
/* Decode one UTF-8 sequence starting at @p, reading at most @max_len
 * bytes (a negative @max_len means "no limit").
 *
 * Returns the decoded code point, or one of two sentinels:
 *   (uint32_t)-1  the bytes are malformed or form an overlong encoding;
 *   (uint32_t)-2  the sequence is well formed so far but is cut short,
 *                 either by @max_len or by a nul byte.
 *
 * The decoded value itself is not checked for Unicode validity.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    /* Smallest code point that genuinely needs a sequence of the given
     * length; a decoded value below this is an overlong encoding. */
    static const uint32_t shortest[7] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char lead = p[0];
    uint32_t code;
    int n_bytes, k;

    if (lead < 0x80)
        return lead;

    /* Continuation bytes and 0xfe / 0xff cannot start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return (uint32_t) -1;

    if (lead < 0xe0) {
        n_bytes = 2;
        code = lead & 0x1f;
    } else if (lead < 0xf0) {
        n_bytes = 3;
        code = lead & 0x0f;
    } else if (lead < 0xf8) {
        n_bytes = 4;
        code = lead & 0x07;
    } else if (lead < 0xfc) {
        n_bytes = 5;
        code = lead & 0x03;
    } else {
        n_bytes = 6;
        code = lead & 0x01;
    }

    if (max_len >= 0 && n_bytes > max_len) {
        /* Not enough input for the whole sequence: report "malformed"
         * if the bytes we do have are bad, "incomplete" otherwise. */
        for (k = 1; k < max_len; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return (uint32_t) -1;
        }
        return (uint32_t) -2;
    }

    for (k = 1; k < n_bytes; k++) {
        const unsigned char cont = p[k];

        if ((cont & 0xc0) != 0x80) {
            /* A nul here means the string ended mid-sequence. */
            return cont != 0 ? (uint32_t) -1 : (uint32_t) -2;
        }

        code = (code << 6) | (cont & 0x3f);
    }

    if (code < shortest[n_bytes])
        return (uint32_t) -1;

    return code;
}
||
198 | |||
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a UTF-8 string that has already been
 * validated, and reports how many bytes it occupies.
 *
 * No validation is performed here beyond looking at the bit patterns:
 * if the first byte cannot start a sequence, (uint32_t)-1 is stored
 * and 1 is returned; if a following byte is not a continuation byte,
 * (uint32_t)-1 is stored and the length announced by the first byte is
 * still returned.  Do not use this without validating the string first.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char *bytes = (const unsigned char *) p;
    const unsigned char lead = bytes[0];
    uint32_t code;
    int n_bytes, k;

    if (lead < 0x80) {
        n_bytes = 1;
        code = lead;
    } else if ((lead & 0xe0) == 0xc0) {
        n_bytes = 2;
        code = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        n_bytes = 3;
        code = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        n_bytes = 4;
        code = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        n_bytes = 5;
        code = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        n_bytes = 6;
        code = lead & 0x01;
    } else {
        /* Not a lead byte: consume just this one byte. */
        if (unicode)
            *unicode = (uint32_t) -1;
        return 1;
    }

    for (k = 1; k < n_bytes; k++) {
        if ((bytes[k] & 0xc0) != 0x80) {
            code = (uint32_t) -1;
            break;
        }
        code = (code << 6) | (bytes[k] & 0x3f);
    }

    if (unicode)
        *unicode = code;

    return n_bytes;
}
||
232 | |||
233 | /** |
||
234 | * _cairo_utf8_to_ucs4: |
||
235 | * @str: an UTF-8 string |
||
236 | * @len: length of @str in bytes, or -1 if it is nul-terminated. |
||
237 | * If @len is supplied and the string has an embedded nul |
||
238 | * byte, only the portion before the nul byte is converted. |
||
239 | * @result: location to store a pointer to a newly allocated UTF-32 |
||
240 | * string (always native endian), or %NULL. Free with free(). A 0 |
||
241 | * word will be written after the last character. |
||
242 | * @items_written: location to store number of 32-bit words |
||
243 | * written. (Not including the trailing 0) |
||
244 | * |
||
245 | * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode |
||
246 | * with 1 32-bit word per character. The string is validated to |
||
247 | * consist entirely of valid Unicode characters. |
||
248 | * |
||
249 | * Return value: %CAIRO_STATUS_SUCCESS if the entire string was |
||
250 | * successfully converted. %CAIRO_STATUS_INVALID_STRING if an |
||
251 | * invalid sequence was found. |
||
252 | **/ |
||
253 | cairo_status_t |
||
254 | _cairo_utf8_to_ucs4 (const char *str, |
||
255 | int len, |
||
256 | uint32_t **result, |
||
257 | int *items_written) |
||
258 | { |
||
259 | uint32_t *str32 = NULL; |
||
260 | int n_chars, i; |
||
261 | const unsigned char *in; |
||
262 | const unsigned char * const ustr = (const unsigned char *) str; |
||
263 | |||
264 | in = ustr; |
||
265 | n_chars = 0; |
||
266 | while ((len < 0 || ustr + len - in > 0) && *in) |
||
267 | { |
||
268 | uint32_t wc = _utf8_get_char_extended (in, ustr + len - in); |
||
269 | if (wc & 0x80000000 || !UNICODE_VALID (wc)) |
||
270 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
||
271 | |||
272 | n_chars++; |
||
273 | if (n_chars == INT_MAX) |
||
274 | return _cairo_error (CAIRO_STATUS_INVALID_STRING); |
||
275 | |||
276 | in = UTF8_NEXT_CHAR (in); |
||
277 | } |
||
278 | |||
279 | if (result) { |
||
280 | str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t)); |
||
281 | if (!str32) |
||
282 | return _cairo_error (CAIRO_STATUS_NO_MEMORY); |
||
283 | |||
284 | in = ustr; |
||
285 | for (i=0; i < n_chars; i++) { |
||
286 | str32[i] = _utf8_get_char (in); |
||
287 | in = UTF8_NEXT_CHAR (in); |
||
288 | } |
||
289 | str32[i] = 0; |
||
290 | |||
291 | *result = str32; |
||
292 | } |
||
293 | |||
294 | if (items_written) |
||
295 | *items_written = n_chars; |
||
296 | |||
297 | return CAIRO_STATUS_SUCCESS; |
||
298 | } |
||
299 | |||
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 *   space available. Or %NULL.
 *
 * Encodes @unicode as a UTF-8 sequence of one to four bytes. When
 * @utf8 is %NULL nothing is written and only the length is computed.
 * No terminating nul byte is written. Values of 0x200000 and above are
 * rejected; no other validity check is applied to @unicode.
 *
 * Return value: Number of bytes in the utf8 string or 0 if an invalid
 *   unicode character
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
                     char     *utf8)
{
    int bytes, k;

    if (unicode >= 0x200000)
        return 0;

    if (unicode < 0x80) {
        if (utf8)
            utf8[0] = unicode;
        return 1;
    }

    bytes = unicode < 0x800 ? 2 : unicode < 0x10000 ? 3 : 4;

    if (!utf8)
        return bytes;

    /* Fill from the last byte backwards, six payload bits at a time. */
    for (k = bytes - 1; k >= 0; k--) {
        utf8[k] = 0x80 | (unicode & 0x3f);
        unicode >>= 6;
    }

    /* Turn the first byte into a lead byte: 0xc0, 0xe0 or 0xf0. */
    utf8[0] |= 0xf0 << (4 - bytes);

    return bytes;
}
||
344 | |||
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian), or %NULL. Free with free(). A 0
 *   word will be written after the last character.
 * @items_written: location to store number of 16-bit words
 *   written, or %NULL. (Not including the trailing 0)
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * The conversion runs in two passes: the first validates the input and
 * counts the 16-bit words needed, the second (only when @result is
 * non-%NULL) encodes into the freshly allocated buffer. On failure
 * neither @result nor @items_written is written.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found. %CAIRO_STATUS_NO_MEMORY if the
 *   output buffer could not be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    uint16_t *str16 = NULL;
    int n16, i;
    const unsigned char *in;
    const unsigned char * const ustr = (const unsigned char *) str;

    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
        /* Bytes left in the buffer, or -1 for "nul-terminated".  This is
         * derived from offsets rather than from `ustr + len', because
         * forming that pointer with a negative len is undefined. */
        long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
        uint32_t wc = _utf8_get_char_extended (in, remaining);

        /* The decoder's error sentinels have the top bit set. */
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Code points from 0x10000 up need a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* Stop before a further "+= 2" or the "+ 1" below could
         * overflow the int counter. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    if (result) {
        str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
        if (!str16)
            return _cairo_error (CAIRO_STATUS_NO_MEMORY);

        /* Input was validated above, so the unchecked decoder is safe. */
        in = ustr;
        for (i = 0; i < n16;) {
            uint32_t wc = _utf8_get_char (in);

            if (wc < 0x10000) {
                str16[i++] = wc;
            } else {
                str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
                str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
            }

            in = UTF8_NEXT_CHAR (in);
        }

        str16[i] = 0;

        *result = str16;
    }

    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif