Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
3584 sourcerer 1
/*
2
 * This file is part of LibParserUtils.
3
 * Licensed under the MIT License,
4
 *                http://www.opensource.org/licenses/mit-license.php
5
 * Copyright 2008 John-Mark Bell 
6
 */
7
 
8
#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include <parserutils/charset/mibenum.h>

#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"

#include "charset/codecs/ext8_tables.h"
19
 
20
static struct {
21
	uint16_t mib;
22
	const char *name;
23
	size_t len;
24
	uint32_t *table;
25
} known_charsets[] = {
26
	{ 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
27
	{ 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
28
	{ 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
29
	{ 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
30
	{ 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
31
	{ 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
32
	{ 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
33
	{ 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
34
	{ 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
35
};
36
 
37
/**
38
 * Windows charset codec
39
 */
40
typedef struct charset_ext8_codec {
41
	parserutils_charset_codec base;	/**< Base class */
42
 
43
	uint32_t *table;		/**< Mapping table for 0x80-0xFF */
44
 
45
#define READ_BUFSIZE (8)
46
	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
47
						 * output sequences (decode)
48
						 * (host-endian) */
49
	size_t read_len;		/**< Character length of read_buf */
50
 
51
#define WRITE_BUFSIZE (8)
52
	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
53
						 * output sequences (encode)
54
						 * (host-endian) */
55
	size_t write_len;		/**< Character length of write_buf */
56
 
57
} charset_ext8_codec;
58
 
59
static bool charset_ext8_codec_handles_charset(const char *charset);
60
static parserutils_error charset_ext8_codec_create(const char *charset,
61
		parserutils_alloc alloc, void *pw,
62
		parserutils_charset_codec **codec);
63
static parserutils_error charset_ext8_codec_destroy(
64
		parserutils_charset_codec *codec);
65
static parserutils_error charset_ext8_codec_encode(
66
		parserutils_charset_codec *codec,
67
		const uint8_t **source, size_t *sourcelen,
68
		uint8_t **dest, size_t *destlen);
69
static parserutils_error charset_ext8_codec_decode(
70
		parserutils_charset_codec *codec,
71
		const uint8_t **source, size_t *sourcelen,
72
		uint8_t **dest, size_t *destlen);
73
static parserutils_error charset_ext8_codec_reset(
74
		parserutils_charset_codec *codec);
75
static inline parserutils_error charset_ext8_codec_read_char(
76
		charset_ext8_codec *c,
77
		const uint8_t **source, size_t *sourcelen,
78
		uint8_t **dest, size_t *destlen);
79
static inline parserutils_error charset_ext8_codec_output_decoded_char(
80
		charset_ext8_codec *c,
81
		uint32_t ucs4, uint8_t **dest, size_t *destlen);
82
static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
83
		uint32_t ucs4, uint8_t **s, size_t *len);
84
static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
85
		const uint8_t *s, size_t len, uint32_t *ucs4);
86
 
87
/**
88
 * Determine whether this codec handles a specific charset
89
 *
90
 * \param charset  Charset to test
91
 * \return true if handleable, false otherwise
92
 */
93
bool charset_ext8_codec_handles_charset(const char *charset)
94
{
95
	uint32_t i;
96
	uint16_t match = parserutils_charset_mibenum_from_name(charset,
97
			strlen(charset));
98
 
99
	if (known_charsets[0].mib == 0) {
100
		for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
101
			known_charsets[i].mib =
102
				parserutils_charset_mibenum_from_name(
103
						known_charsets[i].name,
104
						known_charsets[i].len);
105
		}
106
	}
107
 
108
	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
109
		if (known_charsets[i].mib == match)
110
			return true;
111
	}
112
 
113
	return false;
114
}
115
 
116
/**
117
 * Create an extended 8bit codec
118
 *
119
 * \param charset  The charset to read from / write to
120
 * \param alloc    Memory (de)allocation function
121
 * \param pw       Pointer to client-specific private data (may be NULL)
122
 * \param codec    Pointer to location to receive codec
123
 * \return PARSERUTILS_OK on success,
124
 *         PARSERUTILS_BADPARM on bad parameters,
125
 *         PARSERUTILS_NOMEM on memory exhausion
126
 */
127
parserutils_error charset_ext8_codec_create(const char *charset,
128
		parserutils_alloc alloc, void *pw,
129
		parserutils_charset_codec **codec)
130
{
131
	uint32_t i;
132
	charset_ext8_codec *c;
133
	uint16_t match = parserutils_charset_mibenum_from_name(
134
			charset, strlen(charset));
135
	uint32_t *table = NULL;
136
 
137
	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
138
		if (known_charsets[i].mib == match) {
139
			table = known_charsets[i].table;
140
			break;
141
		}
142
	}
143
 
144
	assert(table != NULL);
145
 
146
	c = alloc(NULL, sizeof(charset_ext8_codec), pw);
147
	if (c == NULL)
148
		return PARSERUTILS_NOMEM;
149
 
150
	c->table = table;
151
 
152
	c->read_buf[0] = 0;
153
	c->read_len = 0;
154
 
155
	c->write_buf[0] = 0;
156
	c->write_len = 0;
157
 
158
	/* Finally, populate vtable */
159
	c->base.handler.destroy = charset_ext8_codec_destroy;
160
	c->base.handler.encode = charset_ext8_codec_encode;
161
	c->base.handler.decode = charset_ext8_codec_decode;
162
	c->base.handler.reset = charset_ext8_codec_reset;
163
 
164
	*codec = (parserutils_charset_codec *) c;
165
 
166
	return PARSERUTILS_OK;
167
}
168
 
169
/**
170
 * Destroy an extended 8bit codec
171
 *
172
 * \param codec  The codec to destroy
173
 * \return PARSERUTILS_OK on success, appropriate error otherwise
174
 */
175
parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
176
{
177
	UNUSED(codec);
178
 
179
	return PARSERUTILS_OK;
180
}
181
 
182
/**
183
 * Encode a chunk of UCS-4 (big endian) data into extended 8bit
184
 *
185
 * \param codec      The codec to use
186
 * \param source     Pointer to pointer to source data
187
 * \param sourcelen  Pointer to length (in bytes) of source data
188
 * \param dest       Pointer to pointer to output buffer
189
 * \param destlen    Pointer to length (in bytes) of output buffer
190
 * \return PARSERUTILS_OK          on success,
191
 *         PARSERUTILS_NOMEM       if output buffer is too small,
192
 *         PARSERUTILS_INVALID     if a character cannot be represented and the
193
 *                                 codec's error handling mode is set to STRICT,
194
 *
195
 * On exit, ::source will point immediately _after_ the last input character
196
 * read. Any remaining output for the character will be buffered by the
197
 * codec for writing on the next call.
198
 *
199
 * Note that, if failure occurs whilst attempting to write any output
200
 * buffered by the last call, then ::source and ::sourcelen will remain
201
 * unchanged (as nothing more has been read).
202
 *
203
 * ::sourcelen will be reduced appropriately on exit.
204
 *
205
 * ::dest will point immediately _after_ the last character written.
206
 *
207
 * ::destlen will be reduced appropriately on exit.
208
 */
209
parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
210
		const uint8_t **source, size_t *sourcelen,
211
		uint8_t **dest, size_t *destlen)
212
{
213
	charset_ext8_codec *c = (charset_ext8_codec *) codec;
214
	uint32_t ucs4;
215
	uint32_t *towrite;
216
	size_t towritelen;
217
	parserutils_error error;
218
 
219
	/* Process any outstanding characters from the previous call */
220
	if (c->write_len > 0) {
221
		uint32_t *pwrite = c->write_buf;
222
 
223
		while (c->write_len > 0) {
224
			error = charset_ext8_from_ucs4(c, pwrite[0],
225
					dest, destlen);
226
			if (error != PARSERUTILS_OK) {
227
				uint32_t len;
228
				assert(error == PARSERUTILS_NOMEM);
229
 
230
				for (len = 0; len < c->write_len; len++) {
231
					c->write_buf[len] = pwrite[len];
232
				}
233
 
234
				return error;
235
			}
236
 
237
			pwrite++;
238
			c->write_len--;
239
		}
240
	}
241
 
242
	/* Now process the characters for this call */
243
	while (*sourcelen > 0) {
244
		ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
245
		towrite = &ucs4;
246
		towritelen = 1;
247
 
248
		/* Output current characters */
249
		while (towritelen > 0) {
250
			error = charset_ext8_from_ucs4(c, towrite[0], dest,
251
					destlen);
252
			if (error != PARSERUTILS_OK) {
253
				uint32_t len;
254
				if (error != PARSERUTILS_NOMEM) {
255
					return error;
256
				}
257
 
258
				/* Insufficient output space */
259
				if (towritelen >= WRITE_BUFSIZE)
260
					abort();
261
 
262
				c->write_len = towritelen;
263
 
264
				/* Copy pending chars to save area, for
265
				 * processing next call. */
266
				for (len = 0; len < towritelen; len++)
267
					c->write_buf[len] = towrite[len];
268
 
269
				/* Claim character we've just buffered,
270
				 * so it's not reprocessed */
271
				*source += 4;
272
				*sourcelen -= 4;
273
 
274
				return PARSERUTILS_NOMEM;
275
			}
276
 
277
			towrite++;
278
			towritelen--;
279
		}
280
 
281
		*source += 4;
282
		*sourcelen -= 4;
283
	}
284
 
285
	return PARSERUTILS_OK;
286
}
287
 
288
/**
289
 * Decode a chunk of extended 8bit data into UCS-4 (big endian)
290
 *
291
 * \param codec      The codec to use
292
 * \param source     Pointer to pointer to source data
293
 * \param sourcelen  Pointer to length (in bytes) of source data
294
 * \param dest       Pointer to pointer to output buffer
295
 * \param destlen    Pointer to length (in bytes) of output buffer
296
 * \return PARSERUTILS_OK          on success,
297
 *         PARSERUTILS_NOMEM       if output buffer is too small,
298
 *         PARSERUTILS_INVALID     if a character cannot be represented and the
299
 *                                 codec's error handling mode is set to STRICT,
300
 *
301
 * On exit, ::source will point immediately _after_ the last input character
302
 * read, if the result is _OK or _NOMEM. Any remaining output for the
303
 * character will be buffered by the codec for writing on the next call.
304
 *
305
 * In the case of the result being _INVALID, ::source will point _at_ the
306
 * last input character read; nothing will be written or buffered for the
307
 * failed character. It is up to the client to fix the cause of the failure
308
 * and retry the decoding process.
309
 *
310
 * Note that, if failure occurs whilst attempting to write any output
311
 * buffered by the last call, then ::source and ::sourcelen will remain
312
 * unchanged (as nothing more has been read).
313
 *
314
 * If STRICT error handling is configured and an illegal sequence is split
315
 * over two calls, then _INVALID will be returned from the second call,
316
 * but ::source will point mid-way through the invalid sequence (i.e. it
317
 * will be unmodified over the second call). In addition, the internal
318
 * incomplete-sequence buffer will be emptied, such that subsequent calls
319
 * will progress, rather than re-evaluating the same invalid sequence.
320
 *
321
 * ::sourcelen will be reduced appropriately on exit.
322
 *
323
 * ::dest will point immediately _after_ the last character written.
324
 *
325
 * ::destlen will be reduced appropriately on exit.
326
 *
327
 * Call this with a source length of 0 to flush the output buffer.
328
 */
329
parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
330
		const uint8_t **source, size_t *sourcelen,
331
		uint8_t **dest, size_t *destlen)
332
{
333
	charset_ext8_codec *c = (charset_ext8_codec *) codec;
334
	parserutils_error error;
335
 
336
	if (c->read_len > 0) {
337
		/* Output left over from last decode */
338
		uint32_t *pread = c->read_buf;
339
 
340
		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
341
			*((uint32_t *) (void *) *dest) =
342
					endian_host_to_big(pread[0]);
343
 
344
			*dest += 4;
345
			*destlen -= 4;
346
 
347
			pread++;
348
			c->read_len--;
349
		}
350
 
351
		if (*destlen < c->read_len * 4) {
352
			/* Ran out of output buffer */
353
			size_t i;
354
 
355
			/* Shuffle remaining output down */
356
			for (i = 0; i < c->read_len; i++)
357
				c->read_buf[i] = pread[i];
358
 
359
			return PARSERUTILS_NOMEM;
360
		}
361
	}
362
 
363
	/* Finally, the "normal" case; process all outstanding characters */
364
	while (*sourcelen > 0) {
365
		error = charset_ext8_codec_read_char(c,
366
				source, sourcelen, dest, destlen);
367
		if (error != PARSERUTILS_OK) {
368
			return error;
369
		}
370
	}
371
 
372
	return PARSERUTILS_OK;
373
}
374
 
375
/**
376
 * Clear an extended 8bit codec's encoding state
377
 *
378
 * \param codec  The codec to reset
379
 * \return PARSERUTILS_OK on success, appropriate error otherwise
380
 */
381
parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
382
{
383
	charset_ext8_codec *c = (charset_ext8_codec *) codec;
384
 
385
	c->read_buf[0] = 0;
386
	c->read_len = 0;
387
 
388
	c->write_buf[0] = 0;
389
	c->write_len = 0;
390
 
391
	return PARSERUTILS_OK;
392
}
393
 
394
 
395
/**
396
 * Read a character from the extended 8bit to UCS-4 (big endian)
397
 *
398
 * \param c          The codec
399
 * \param source     Pointer to pointer to source buffer (updated on exit)
400
 * \param sourcelen  Pointer to length of source buffer (updated on exit)
401
 * \param dest       Pointer to pointer to output buffer (updated on exit)
402
 * \param destlen    Pointer to length of output buffer (updated on exit)
403
 * \return PARSERUTILS_OK on success,
404
 *         PARSERUTILS_NOMEM       if output buffer is too small,
405
 *         PARSERUTILS_INVALID     if a character cannot be represented and the
406
 *                                 codec's error handling mode is set to STRICT,
407
 *
408
 * On exit, ::source will point immediately _after_ the last input character
409
 * read, if the result is _OK or _NOMEM. Any remaining output for the
410
 * character will be buffered by the codec for writing on the next call.
411
 *
412
 * In the case of the result being _INVALID, ::source will point _at_ the
413
 * last input character read; nothing will be written or buffered for the
414
 * failed character. It is up to the client to fix the cause of the failure
415
 * and retry the decoding process.
416
 *
417
 * ::sourcelen will be reduced appropriately on exit.
418
 *
419
 * ::dest will point immediately _after_ the last character written.
420
 *
421
 * ::destlen will be reduced appropriately on exit.
422
 */
423
parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
424
		const uint8_t **source, size_t *sourcelen,
425
		uint8_t **dest, size_t *destlen)
426
{
427
	uint32_t ucs4;
428
	parserutils_error error;
429
 
430
	/* Convert a single character */
431
	error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
432
	if (error == PARSERUTILS_OK) {
433
		/* Read a character */
434
		error = charset_ext8_codec_output_decoded_char(c,
435
				ucs4, dest, destlen);
436
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
437
			/* output succeeded; update source pointers */
438
			*source += 1;
439
			*sourcelen -= 1;
440
		}
441
 
442
		return error;
443
	} else if (error == PARSERUTILS_NEEDDATA) {
444
		/* Can only happen if sourcelen == 0 */
445
		return error;
446
	} else if (error == PARSERUTILS_INVALID) {
447
		/* Illegal input sequence */
448
 
449
		/* Strict errormode; simply flag invalid character */
450
		if (c->base.errormode ==
451
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
452
			return PARSERUTILS_INVALID;
453
		}
454
 
455
		/* output U+FFFD and continue processing. */
456
		error = charset_ext8_codec_output_decoded_char(c,
457
				0xFFFD, dest, destlen);
458
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
459
			/* output succeeded; update source pointers */
460
			*source += 1;
461
			*sourcelen -= 1;
462
		}
463
 
464
		return error;
465
	}
466
 
467
	return PARSERUTILS_OK;
468
}
469
 
470
/**
471
 * Output a UCS-4 character (big endian)
472
 *
473
 * \param c        Codec to use
474
 * \param ucs4     UCS-4 character (host endian)
475
 * \param dest     Pointer to pointer to output buffer
476
 * \param destlen  Pointer to output buffer length
477
 * \return PARSERUTILS_OK          on success,
478
 *         PARSERUTILS_NOMEM       if output buffer is too small,
479
 */
480
parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
481
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
482
{
483
	if (*destlen < 4) {
484
		/* Run out of output buffer */
485
		c->read_len = 1;
486
		c->read_buf[0] = ucs4;
487
 
488
		return PARSERUTILS_NOMEM;
489
	}
490
 
491
	*((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
492
	*dest += 4;
493
	*destlen -= 4;
494
 
495
	return PARSERUTILS_OK;
496
}
497
 
498
/**
499
 * Convert a UCS4 (host endian) character to extended 8bit
500
 *
501
 * \param c     The codec instance
502
 * \param ucs4  The UCS4 character to convert
503
 * \param s     Pointer to pointer to destination buffer
504
 * \param len   Pointer to destination buffer length
505
 * \return PARSERUTILS_OK on success,
506
 *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
507
 *         PARSERUTILS_INVALID if the character cannot be represented
508
 *
509
 * _INVALID will only be returned if the codec's conversion mode is STRICT.
510
 * Otherwise, '?' will be output.
511
 *
512
 * On successful conversion, *s and *len will be updated.
513
 */
514
parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
515
		uint32_t ucs4, uint8_t **s, size_t *len)
516
{
517
	uint8_t out = 0;
518
 
519
	if (*len < 1)
520
		return PARSERUTILS_NOMEM;
521
 
522
	if (ucs4 < 0x80) {
523
		/* ASCII */
524
		out = ucs4;
525
	} else {
526
		uint32_t i;
527
 
528
		for (i = 0; i < 128; i++) {
529
			if (ucs4 == c->table[i])
530
				break;
531
		}
532
 
533
		if (i == 128) {
534
			if (c->base.errormode ==
535
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
536
				return PARSERUTILS_INVALID;
537
			else
538
				out = '?';
539
		} else {
540
			out = 0x80 + i;
541
		}
542
	}
543
 
544
	*(*s) = out;
545
	(*s)++;
546
	(*len)--;
547
 
548
	return PARSERUTILS_OK;
549
}
550
 
551
/**
552
 * Convert an extended 8bit character to UCS4 (host endian)
553
 *
554
 * \param c     The codec instance
555
 * \param s     Pointer to source buffer
556
 * \param len   Source buffer length
557
 * \param ucs4  Pointer to destination buffer
558
 * \return PARSERUTILS_OK on success,
559
 *         PARSERUTILS_NEEDDATA if there's insufficient input data
560
 *         PARSERUTILS_INVALID if the character cannot be represented
561
 */
562
parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
563
		const uint8_t *s, size_t len, uint32_t *ucs4)
564
{
565
	uint32_t out;
566
 
567
	if (len < 1)
568
		return PARSERUTILS_NEEDDATA;
569
 
570
	if (*s < 0x80) {
571
		out = *s;
572
	} else {
573
		if (c->table[*s - 0x80] == 0xFFFF)
574
			return PARSERUTILS_INVALID;
575
 
576
		out = c->table[*s - 0x80];
577
	}
578
 
579
	*ucs4 = out;
580
 
581
	return PARSERUTILS_OK;
582
}
583
 
584
const parserutils_charset_handler charset_ext8_codec_handler = {
585
	charset_ext8_codec_handles_charset,
586
	charset_ext8_codec_create
587
};
588