Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2
/* cairo - a vector graphics library with display and print output
3
 *
4
 * The code in this file is derived from GLib's gutf8.c and
5
 *   ultimately from libunicode. It is relicensed under the
6
 *   dual LGPL/MPL with permission of the original authors.
7
 *
8
 * Copyright © 1999 Tom Tromey
9
 * Copyright © 2005 Red Hat, Inc
10
 *
11
 * This library is free software; you can redistribute it and/or
12
 * modify it either under the terms of the GNU Lesser General Public
13
 * License version 2.1 as published by the Free Software Foundation
14
 * (the "LGPL") or, at your option, under the terms of the Mozilla
15
 * Public License Version 1.1 (the "MPL"). If you do not alter this
16
 * notice, a recipient may use your version of this file under either
17
 * the MPL or the LGPL.
18
 *
19
 * You should have received a copy of the LGPL along with this library
20
 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22
 * You should have received a copy of the MPL along with this library
23
 * in the file COPYING-MPL-1.1
24
 *
25
 * The contents of this file are subject to the Mozilla Public License
26
 * Version 1.1 (the "License"); you may not use this file except in
27
 * compliance with the License. You may obtain a copy of the License at
28
 * http://www.mozilla.org/MPL/
29
 *
30
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31
 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32
 * the specific language governing rights and limitations.
33
 *
34
 * The Original Code is the cairo graphics library.
35
 *
36
 * The Initial Developer of the Original Code is Tom Tromey.
37
 *  and Red Hat, Inc.
38
 *
39
 * Contributor(s):
40
 *	Owen Taylor 
41
 */
42
 
43
#include "cairoint.h"
44
#include "cairo-error-private.h"
45
 
46
/* Classifies @Char, the first byte of a UTF-8 sequence.
 *
 * On completion @Len holds the total number of bytes in the sequence
 * that @Char introduces (1 to 6) and @Mask holds the bit mask that
 * selects the payload bits of that first byte. If @Char cannot start
 * a sequence (a continuation byte 0x80-0xbf, or 0xfe/0xff), @Len is
 * set to -1 and @Mask is left untouched.
 *
 * @Char is evaluated several times, so it must not have side effects,
 * and it must not be a negative (signed char) value. @Mask and @Len
 * must be modifiable lvalues.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as
 * one statement, e.g. as the body of an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  do									      \
    {									      \
      if ((Char) < 128)							      \
	{								      \
	  (Len) = 1;							      \
	  (Mask) = 0x7f;						      \
	}								      \
      else if (((Char) & 0xe0) == 0xc0)					      \
	{								      \
	  (Len) = 2;							      \
	  (Mask) = 0x1f;						      \
	}								      \
      else if (((Char) & 0xf0) == 0xe0)					      \
	{								      \
	  (Len) = 3;							      \
	  (Mask) = 0x0f;						      \
	}								      \
      else if (((Char) & 0xf8) == 0xf0)					      \
	{								      \
	  (Len) = 4;							      \
	  (Mask) = 0x07;						      \
	}								      \
      else if (((Char) & 0xfc) == 0xf8)					      \
	{								      \
	  (Len) = 5;							      \
	  (Mask) = 0x03;						      \
	}								      \
      else if (((Char) & 0xfe) == 0xfc)					      \
	{								      \
	  (Len) = 6;							      \
	  (Mask) = 0x01;						      \
	}								      \
      else								      \
	(Len) = -1;							      \
    }									      \
  while (0)
79
 
80
/* Number of bytes (1 to 6) that the original, pre-RFC 3629 UTF-8
 * scheme needs to encode the code point @Char. @Char is evaluated
 * more than once and must not have side effects.
 */
#define UTF8_LENGTH(Char)			\
    ((Char) < 0x80      ? 1 :			\
     (Char) < 0x800     ? 2 :			\
     (Char) < 0x10000   ? 3 :			\
     (Char) < 0x200000  ? 4 :			\
     (Char) < 0x4000000 ? 5 : 6)
86
 
87
/* Assembles the code point of the @Len byte UTF-8 sequence at @Chars.
 *
 * @Mask selects the payload bits of the first byte (as produced by
 * UTF8_COMPUTE); every following byte must be a continuation byte
 * (10xxxxxx) and contributes its low six bits. @Count is a scratch
 * integer lvalue used as the byte index.
 *
 * If a byte that should be a continuation byte is not one, @Result is
 * set to -1 and decoding stops with @Count left at the index of the
 * offending byte. Otherwise @Count equals @Len (or 1 when @Len <= 1)
 * on completion.
 *
 * The expansion is wrapped in do { } while (0) so that the assignment
 * and the loop always execute together as one statement.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
99
 
100
/* Non-zero when @Char is an acceptable Unicode code point, i.e. it is
 *   - below 0x110000,
 *   - not a UTF-16 surrogate (0xD800 - 0xDFFF),
 *   - not in the 0xFDD0 - 0xFDEF range, and
 *   - its low 16 bits are neither 0xFFFE nor 0xFFFF.
 * @Char is evaluated more than once and must not have side effects.
 */
#define UNICODE_VALID(Char)				\
    ((Char) < 0x110000 &&				\
     ((Char) & 0xFFFFF800) != 0xD800 &&			\
     !((Char) >= 0xFDD0 && (Char) <= 0xFDEF) &&		\
     ((Char) & 0xFFFE) != 0xFFFE)
105
 
106
/* Indexed by the first byte of a UTF-8 sequence; gives the number of
 * bytes to step over to reach the next sequence. Bytes that cannot
 * start a sequence (continuation bytes, 0xfe and 0xff) map to 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 - 0x7f: single-byte (ASCII) */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0xbf: continuation bytes */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 - 0xdf: two-byte sequences */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 - 0xef: three-byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 - 0xf7: four, 0xf8 - 0xfb: five, 0xfc - 0xfd: six,
     * 0xfe - 0xff: never valid */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the sequence that follows the one starting at @p. */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *)(p)])
118
 
119
/* Decodes the UTF-8 sequence starting at @p into a Unicode code point.
 *
 * Intended for text that has already been validated: the only checks
 * made are that the first byte can start a sequence and that the
 * following bytes are continuation bytes. If either check fails,
 * (uint32_t)-1 is returned. Overlong encodings and out-of-range code
 * points are not detected.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    int payload_mask = 0;
    int seq_len, pos;
    uint32_t code_point;

    UTF8_COMPUTE (lead, payload_mask, seq_len);
    if (seq_len == -1)
	return (uint32_t) -1;

    UTF8_GET (code_point, p, pos, payload_mask, seq_len);
    return code_point;
}
137
 
138
/* Validating variant of _utf8_get_char.
 *
 * @p:       start of a UTF-8 sequence; p[0] is always read.
 * @max_len: number of bytes available at @p, or a negative value if
 *           the text is nul-terminated.
 *
 * Returns the decoded code point, or
 *   (uint32_t)-1 if the sequence is malformed (bad first byte, bad
 *                continuation byte, or an overlong encoding),
 *   (uint32_t)-2 if the sequence is cut short, either by @max_len or
 *                by a nul byte where a continuation byte was expected.
 * Both error values have the top bit set, which no decoded value has.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
			 long		      max_len)
{
    const uint32_t malformed = (uint32_t) -1;
    const uint32_t truncated = (uint32_t) -2;
    uint32_t code_point = p[0];
    int seq_len, pos;

    if (code_point < 0x80)
	return code_point;

    /* Continuation bytes and 0xfe/0xff can never start a sequence. */
    if (code_point < 0xc0 || code_point >= 0xfe)
	return malformed;

    if (code_point < 0xe0) {
	seq_len = 2;
	code_point &= 0x1f;
    } else if (code_point < 0xf0) {
	seq_len = 3;
	code_point &= 0x0f;
    } else if (code_point < 0xf8) {
	seq_len = 4;
	code_point &= 0x07;
    } else if (code_point < 0xfc) {
	seq_len = 5;
	code_point &= 0x03;
    } else {
	seq_len = 6;
	code_point &= 0x01;
    }

    if (max_len >= 0 && seq_len > max_len) {
	/* The sequence runs past the end of the buffer. Report it as
	 * truncated only if the bytes that are present look right. */
	for (pos = 1; pos < max_len; pos++) {
	    if ((p[pos] & 0xc0) != 0x80)
		return malformed;
	}
	return truncated;
    }

    for (pos = 1; pos < seq_len; pos++) {
	const uint32_t byte = p[pos];

	/* A nul here means the string ended mid-sequence. */
	if ((byte & 0xc0) != 0x80)
	    return byte ? malformed : truncated;

	code_point = (code_point << 6) | (byte & 0x3f);
    }

    /* Reject overlong encodings. */
    if (UTF8_LENGTH (code_point) != seq_len)
	return malformed;

    return code_point;
}
198
 
199
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of @p and reports how many bytes it
 * occupies. The string is expected to have been validated already;
 * no overlong or range checks are made here.
 *
 * If the first byte cannot start a UTF-8 sequence, (uint32_t)-1 is
 * stored in @unicode and 1 is returned. If a continuation byte is
 * malformed, (uint32_t)-1 is stored but the length implied by the
 * first byte is still returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
				uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) *p;
    int payload_mask = 0;
    int n_bytes, pos;
    uint32_t code_point;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes == -1) {
	/* Not a valid first byte: consume just this one byte. */
	code_point = (uint32_t) -1;
	n_bytes = 1;
    } else {
	UTF8_GET (code_point, p, pos, payload_mask, n_bytes);
    }

    if (unicode)
	*unicode = code_point;

    return n_bytes;
}
232
 
233
/**
234
 * _cairo_utf8_to_ucs4:
235
 * @str: an UTF-8 string
236
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
237
 *   If @len is supplied and the string has an embedded nul
238
 *   byte, only the portion before the nul byte is converted.
239
 * @result: location to store a pointer to a newly allocated UTF-32
240
 *   string (always native endian), or %NULL. Free with free(). A 0
241
 *   word will be written after the last character.
242
 * @items_written: location to store number of 32-bit words
243
 *   written. (Not including the trailing 0)
244
 *
245
 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
246
 * with 1 32-bit word per character. The string is validated to
247
 * consist entirely of valid Unicode characters.
248
 *
249
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
250
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
251
 *   invalid sequence was found.
252
 **/
253
cairo_status_t
254
_cairo_utf8_to_ucs4 (const char *str,
255
		     int	 len,
256
		     uint32_t  **result,
257
		     int	*items_written)
258
{
259
    uint32_t *str32 = NULL;
260
    int n_chars, i;
261
    const unsigned char *in;
262
    const unsigned char * const ustr = (const unsigned char *) str;
263
 
264
    in = ustr;
265
    n_chars = 0;
266
    while ((len < 0 || ustr + len - in > 0) && *in)
267
    {
268
	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
269
	if (wc & 0x80000000 || !UNICODE_VALID (wc))
270
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
271
 
272
	n_chars++;
273
	if (n_chars == INT_MAX)
274
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
275
 
276
	in = UTF8_NEXT_CHAR (in);
277
    }
278
 
279
    if (result) {
280
	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
281
	if (!str32)
282
	    return _cairo_error (CAIRO_STATUS_NO_MEMORY);
283
 
284
	in = ustr;
285
	for (i=0; i < n_chars; i++) {
286
	    str32[i] = _utf8_get_char (in);
287
	    in = UTF8_NEXT_CHAR (in);
288
	}
289
	str32[i] = 0;
290
 
291
	*result = str32;
292
    }
293
 
294
    if (items_written)
295
	*items_written = n_chars;
296
 
297
    return CAIRO_STATUS_SUCCESS;
298
}
299
 
300
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 * space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. When @utf8 is %NULL nothing is written
 * and only the encoded length is computed. No terminating nul byte is
 * written. Any value below 0x200000 is encoded; surrogates and other
 * invalid code points in that range are not rejected.
 *
 * Return value: Number of bytes in the utf8 string (1 to 4), or 0 if
 * @unicode is 0x200000 or larger.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
		     char     *utf8)
{
    unsigned int lead_marker;
    int n_bytes, idx;

    if (unicode < 0x80) {
	if (utf8 != NULL)
	    utf8[0] = unicode;
	return 1;
    }

    if (unicode < 0x800) {
	n_bytes = 2;
	lead_marker = 0xc0;
    } else if (unicode < 0x10000) {
	n_bytes = 3;
	lead_marker = 0xe0;
    } else if (unicode < 0x200000) {
	n_bytes = 4;
	lead_marker = 0xf0;
    } else {
	return 0;
    }

    if (utf8 == NULL)
	return n_bytes;

    /* Fill in the continuation bytes from the last one backwards,
     * six payload bits at a time. */
    for (idx = n_bytes - 1; idx > 0; idx--) {
	utf8[idx] = 0x80 | (unicode & 0x3f);
	unicode >>= 6;
    }

    /* What is left of @unicode fits in the first byte's payload. */
    utf8[0] = lead_marker | unicode;

    return n_bytes;
}
344
 
345
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character. Must not be
 *   %NULL; it is written unconditionally on success.
 * @items_written: location to store number of 16-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found. %CAIRO_STATUS_NO_MEMORY if the
 *   output buffer could not be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
		      int	  len,
		      uint16_t **result,
		      int	*items_written)
{
    uint16_t *str16 = NULL;
    int n16, i;
    const unsigned char *in;
    const unsigned char * const ustr = (const unsigned char *) str;

    /* First pass: validate and count the 16-bit words needed. */
    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
	/* Bytes left in a counted string, -1 for a nul-terminated one.
	 * This is computed with integers: forming the pointer
	 * ustr + len would be undefined behaviour when len is
	 * negative. */
	long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
	uint32_t wc = _utf8_get_char_extended (in, remaining);

	/* The top bit is set only on the decoder's error returns. */
	if (wc & 0x80000000 || !UNICODE_VALID (wc))
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	/* Code points outside the BMP need a surrogate pair. */
	if (wc < 0x10000)
	    n16 += 1;
	else
	    n16 += 2;

	/* Stop before a later "+= 2" or the "n16 + 1" below could
	 * overflow. */
	if (n16 == INT_MAX - 1 || n16 == INT_MAX)
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
	return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Second pass: the text is known to be valid, so the unchecked
     * decoder can be used. */
    in = ustr;
    for (i = 0; i < n16;) {
	uint32_t wc = _utf8_get_char (in);

	if (wc < 0x10000) {
	    str16[i++] = wc;
	} else {
	    /* High surrogate carries the top 10 bits, low surrogate
	     * the bottom 10 bits of (wc - 0x10000). */
	    str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
	    str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
	}

	in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
	*items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif