WebSVN – Kolibri OS – Blame – /programs/network/netsurf/libparserutils/src/charset/codecs/codec_ext8.c

Rev	Author	Line No.	Line
3584	sourcerer	1	/*
		2	* This file is part of LibParserUtils.
		3	* Licensed under the MIT License,
		4	* http://www.opensource.org/licenses/mit-license.php
		5	* Copyright 2008 John-Mark Bell
		6	*/
		7
		8	#include
		9	#include
		10	#include
		11
		12	#include
		13
		14	#include "charset/codecs/codec_impl.h"
		15	#include "utils/endian.h"
		16	#include "utils/utils.h"
		17
		18	#include "charset/codecs/ext8_tables.h"
		19
		20	static struct {
		21	uint16_t mib;
		22	const char *name;
		23	size_t len;
		24	uint32_t *table;
		25	} known_charsets[] = {
		26	{ 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
		27	{ 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
		28	{ 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
		29	{ 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
		30	{ 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
		31	{ 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
		32	{ 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
		33	{ 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
		34	{ 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
		35	};
		36
		37	/**
		38	* Windows charset codec
		39	*/
		40	typedef struct charset_ext8_codec {
		41	parserutils_charset_codec base; /*< Base class /
		42
		43	uint32_t table; /< Mapping table for 0x80-0xFF /
		44
		45	#define READ_BUFSIZE (8)
		46	uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
		47	* output sequences (decode)
		48	* (host-endian) */
		49	size_t read_len; /*< Character length of read_buf /
		50
		51	#define WRITE_BUFSIZE (8)
		52	uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
		53	* output sequences (encode)
		54	* (host-endian) */
		55	size_t write_len; /*< Character length of write_buf /
		56
		57	} charset_ext8_codec;
		58
		59	static bool charset_ext8_codec_handles_charset(const char *charset);
		60	static parserutils_error charset_ext8_codec_create(const char *charset,
		61	parserutils_alloc alloc, void *pw,
		62	parserutils_charset_codec **codec);
		63	static parserutils_error charset_ext8_codec_destroy(
		64	parserutils_charset_codec *codec);
		65	static parserutils_error charset_ext8_codec_encode(
		66	parserutils_charset_codec *codec,
		67	const uint8_t *source, size_t sourcelen,
		68	uint8_t *dest, size_t destlen);
		69	static parserutils_error charset_ext8_codec_decode(
		70	parserutils_charset_codec *codec,
		71	const uint8_t *source, size_t sourcelen,
		72	uint8_t *dest, size_t destlen);
		73	static parserutils_error charset_ext8_codec_reset(
		74	parserutils_charset_codec *codec);
		75	static inline parserutils_error charset_ext8_codec_read_char(
		76	charset_ext8_codec *c,
		77	const uint8_t *source, size_t sourcelen,
		78	uint8_t *dest, size_t destlen);
		79	static inline parserutils_error charset_ext8_codec_output_decoded_char(
		80	charset_ext8_codec *c,
		81	uint32_t ucs4, uint8_t *dest, size_t destlen);
		82	static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
		83	uint32_t ucs4, uint8_t *s, size_t len);
		84	static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
		85	const uint8_t s, size_t len, uint32_t ucs4);
		86
		87	/**
		88	* Determine whether this codec handles a specific charset
		89	*
		90	* \param charset Charset to test
		91	* \return true if handleable, false otherwise
		92	*/
		93	bool charset_ext8_codec_handles_charset(const char *charset)
		94	{
		95	uint32_t i;
		96	uint16_t match = parserutils_charset_mibenum_from_name(charset,
		97	strlen(charset));
		98
		99	if (known_charsets[0].mib == 0) {
		100	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
		101	known_charsets[i].mib =
		102	parserutils_charset_mibenum_from_name(
		103	known_charsets[i].name,
		104	known_charsets[i].len);
		105	}
		106	}
		107
		108	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
		109	if (known_charsets[i].mib == match)
		110	return true;
		111	}
		112
		113	return false;
		114	}
		115
		116	/**
		117	* Create an extended 8bit codec
		118	*
		119	* \param charset The charset to read from / write to
		120	* \param alloc Memory (de)allocation function
		121	* \param pw Pointer to client-specific private data (may be NULL)
		122	* \param codec Pointer to location to receive codec
		123	* \return PARSERUTILS_OK on success,
		124	* PARSERUTILS_BADPARM on bad parameters,
		125	* PARSERUTILS_NOMEM on memory exhausion
		126	*/
		127	parserutils_error charset_ext8_codec_create(const char *charset,
		128	parserutils_alloc alloc, void *pw,
		129	parserutils_charset_codec **codec)
		130	{
		131	uint32_t i;
		132	charset_ext8_codec *c;
		133	uint16_t match = parserutils_charset_mibenum_from_name(
		134	charset, strlen(charset));
		135	uint32_t *table = NULL;
		136
		137	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
		138	if (known_charsets[i].mib == match) {
		139	table = known_charsets[i].table;
		140	break;
		141	}
		142	}
		143
		144	assert(table != NULL);
		145
		146	c = alloc(NULL, sizeof(charset_ext8_codec), pw);
		147	if (c == NULL)
		148	return PARSERUTILS_NOMEM;
		149
		150	c->table = table;
		151
		152	c->read_buf[0] = 0;
		153	c->read_len = 0;
		154
		155	c->write_buf[0] = 0;
		156	c->write_len = 0;
		157
		158	/* Finally, populate vtable */
		159	c->base.handler.destroy = charset_ext8_codec_destroy;
		160	c->base.handler.encode = charset_ext8_codec_encode;
		161	c->base.handler.decode = charset_ext8_codec_decode;
		162	c->base.handler.reset = charset_ext8_codec_reset;
		163
		164	codec = (parserutils_charset_codec ) c;
		165
		166	return PARSERUTILS_OK;
		167	}
		168
		169	/**
		170	* Destroy an extended 8bit codec
		171	*
		172	* \param codec The codec to destroy
		173	* \return PARSERUTILS_OK on success, appropriate error otherwise
		174	*/
		175	parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
		176	{
		177	UNUSED(codec);
		178
		179	return PARSERUTILS_OK;
		180	}
		181
		182	/**
		183	* Encode a chunk of UCS-4 (big endian) data into extended 8bit
		184	*
		185	* \param codec The codec to use
		186	* \param source Pointer to pointer to source data
		187	* \param sourcelen Pointer to length (in bytes) of source data
		188	* \param dest Pointer to pointer to output buffer
		189	* \param destlen Pointer to length (in bytes) of output buffer
		190	* \return PARSERUTILS_OK on success,
		191	* PARSERUTILS_NOMEM if output buffer is too small,
		192	* PARSERUTILS_INVALID if a character cannot be represented and the
		193	* codec's error handling mode is set to STRICT,
		194	*
		195	* On exit, ::source will point immediately _after_ the last input character
		196	* read. Any remaining output for the character will be buffered by the
		197	* codec for writing on the next call.
		198	*
		199	* Note that, if failure occurs whilst attempting to write any output
		200	* buffered by the last call, then ::source and ::sourcelen will remain
		201	* unchanged (as nothing more has been read).
		202	*
		203	* ::sourcelen will be reduced appropriately on exit.
		204	*
		205	* ::dest will point immediately _after_ the last character written.
		206	*
		207	* ::destlen will be reduced appropriately on exit.
		208	*/
		209	parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
		210	const uint8_t *source, size_t sourcelen,
		211	uint8_t *dest, size_t destlen)
		212	{
		213	charset_ext8_codec c = (charset_ext8_codec ) codec;
		214	uint32_t ucs4;
		215	uint32_t *towrite;
		216	size_t towritelen;
		217	parserutils_error error;
		218
		219	/* Process any outstanding characters from the previous call */
		220	if (c->write_len > 0) {
		221	uint32_t *pwrite = c->write_buf;
		222
		223	while (c->write_len > 0) {
		224	error = charset_ext8_from_ucs4(c, pwrite[0],
		225	dest, destlen);
		226	if (error != PARSERUTILS_OK) {
		227	uint32_t len;
		228	assert(error == PARSERUTILS_NOMEM);
		229
		230	for (len = 0; len < c->write_len; len++) {
		231	c->write_buf[len] = pwrite[len];
		232	}
		233
		234	return error;
		235	}
		236
		237	pwrite++;
		238	c->write_len--;
		239	}
		240	}
		241
		242	/* Now process the characters for this call */
		243	while (*sourcelen > 0) {
		244	ucs4 = endian_big_to_host(((uint32_t ) (void ) source));
		245	towrite = &ucs4;
		246	towritelen = 1;
		247
		248	/* Output current characters */
		249	while (towritelen > 0) {
		250	error = charset_ext8_from_ucs4(c, towrite[0], dest,
		251	destlen);
		252	if (error != PARSERUTILS_OK) {
		253	uint32_t len;
		254	if (error != PARSERUTILS_NOMEM) {
		255	return error;
		256	}
		257
		258	/* Insufficient output space */
		259	if (towritelen >= WRITE_BUFSIZE)
		260	abort();
		261
		262	c->write_len = towritelen;
		263
		264	/* Copy pending chars to save area, for
		265	* processing next call. */
		266	for (len = 0; len < towritelen; len++)
		267	c->write_buf[len] = towrite[len];
		268
		269	/* Claim character we've just buffered,
		270	* so it's not reprocessed */
		271	*source += 4;
		272	*sourcelen -= 4;
		273
		274	return PARSERUTILS_NOMEM;
		275	}
		276
		277	towrite++;
		278	towritelen--;
		279	}
		280
		281	*source += 4;
		282	*sourcelen -= 4;
		283	}
		284
		285	return PARSERUTILS_OK;
		286	}
		287
		288	/**
		289	* Decode a chunk of extended 8bit data into UCS-4 (big endian)
		290	*
		291	* \param codec The codec to use
		292	* \param source Pointer to pointer to source data
		293	* \param sourcelen Pointer to length (in bytes) of source data
		294	* \param dest Pointer to pointer to output buffer
		295	* \param destlen Pointer to length (in bytes) of output buffer
		296	* \return PARSERUTILS_OK on success,
		297	* PARSERUTILS_NOMEM if output buffer is too small,
		298	* PARSERUTILS_INVALID if a character cannot be represented and the
		299	* codec's error handling mode is set to STRICT,
		300	*
		301	* On exit, ::source will point immediately _after_ the last input character
		302	* read, if the result is _OK or _NOMEM. Any remaining output for the
		303	* character will be buffered by the codec for writing on the next call.
		304	*
		305	* In the case of the result being _INVALID, ::source will point _at_ the
		306	* last input character read; nothing will be written or buffered for the
		307	* failed character. It is up to the client to fix the cause of the failure
		308	* and retry the decoding process.
		309	*
		310	* Note that, if failure occurs whilst attempting to write any output
		311	* buffered by the last call, then ::source and ::sourcelen will remain
		312	* unchanged (as nothing more has been read).
		313	*
		314	* If STRICT error handling is configured and an illegal sequence is split
		315	* over two calls, then _INVALID will be returned from the second call,
		316	* but ::source will point mid-way through the invalid sequence (i.e. it
		317	* will be unmodified over the second call). In addition, the internal
		318	* incomplete-sequence buffer will be emptied, such that subsequent calls
		319	* will progress, rather than re-evaluating the same invalid sequence.
		320	*
		321	* ::sourcelen will be reduced appropriately on exit.
		322	*
		323	* ::dest will point immediately _after_ the last character written.
		324	*
		325	* ::destlen will be reduced appropriately on exit.
		326	*
		327	* Call this with a source length of 0 to flush the output buffer.
		328	*/
		329	parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
		330	const uint8_t *source, size_t sourcelen,
		331	uint8_t *dest, size_t destlen)
		332	{
		333	charset_ext8_codec c = (charset_ext8_codec ) codec;
		334	parserutils_error error;
		335
		336	if (c->read_len > 0) {
		337	/* Output left over from last decode */
		338	uint32_t *pread = c->read_buf;
		339
		340	while (c->read_len > 0 && destlen >= c->read_len 4) {
		341	((uint32_t ) (void ) dest) =
		342	endian_host_to_big(pread[0]);
		343
		344	*dest += 4;
		345	*destlen -= 4;
		346
		347	pread++;
		348	c->read_len--;
		349	}
		350
		351	if (destlen < c->read_len 4) {
		352	/* Ran out of output buffer */
		353	size_t i;
		354
		355	/* Shuffle remaining output down */
		356	for (i = 0; i < c->read_len; i++)
		357	c->read_buf[i] = pread[i];
		358
		359	return PARSERUTILS_NOMEM;
		360	}
		361	}
		362
		363	/* Finally, the "normal" case; process all outstanding characters */
		364	while (*sourcelen > 0) {
		365	error = charset_ext8_codec_read_char(c,
		366	source, sourcelen, dest, destlen);
		367	if (error != PARSERUTILS_OK) {
		368	return error;
		369	}
		370	}
		371
		372	return PARSERUTILS_OK;
		373	}
		374
		375	/**
		376	* Clear an extended 8bit codec's encoding state
		377	*
		378	* \param codec The codec to reset
		379	* \return PARSERUTILS_OK on success, appropriate error otherwise
		380	*/
		381	parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
		382	{
		383	charset_ext8_codec c = (charset_ext8_codec ) codec;
		384
		385	c->read_buf[0] = 0;
		386	c->read_len = 0;
		387
		388	c->write_buf[0] = 0;
		389	c->write_len = 0;
		390
		391	return PARSERUTILS_OK;
		392	}
		393
		394
		395	/**
		396	* Read a character from the extended 8bit to UCS-4 (big endian)
		397	*
		398	* \param c The codec
		399	* \param source Pointer to pointer to source buffer (updated on exit)
		400	* \param sourcelen Pointer to length of source buffer (updated on exit)
		401	* \param dest Pointer to pointer to output buffer (updated on exit)
		402	* \param destlen Pointer to length of output buffer (updated on exit)
		403	* \return PARSERUTILS_OK on success,
		404	* PARSERUTILS_NOMEM if output buffer is too small,
		405	* PARSERUTILS_INVALID if a character cannot be represented and the
		406	* codec's error handling mode is set to STRICT,
		407	*
		408	* On exit, ::source will point immediately _after_ the last input character
		409	* read, if the result is _OK or _NOMEM. Any remaining output for the
		410	* character will be buffered by the codec for writing on the next call.
		411	*
		412	* In the case of the result being _INVALID, ::source will point _at_ the
		413	* last input character read; nothing will be written or buffered for the
		414	* failed character. It is up to the client to fix the cause of the failure
		415	* and retry the decoding process.
		416	*
		417	* ::sourcelen will be reduced appropriately on exit.
		418	*
		419	* ::dest will point immediately _after_ the last character written.
		420	*
		421	* ::destlen will be reduced appropriately on exit.
		422	*/
		423	parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
		424	const uint8_t *source, size_t sourcelen,
		425	uint8_t *dest, size_t destlen)
		426	{
		427	uint32_t ucs4;
		428	parserutils_error error;
		429
		430	/* Convert a single character */
		431	error = charset_ext8_to_ucs4(c, source, sourcelen, &ucs4);
		432	if (error == PARSERUTILS_OK) {
		433	/* Read a character */
		434	error = charset_ext8_codec_output_decoded_char(c,
		435	ucs4, dest, destlen);
		436	if (error == PARSERUTILS_OK \|\| error == PARSERUTILS_NOMEM) {
		437	/* output succeeded; update source pointers */
		438	*source += 1;
		439	*sourcelen -= 1;
		440	}
		441
		442	return error;
		443	} else if (error == PARSERUTILS_NEEDDATA) {
		444	/* Can only happen if sourcelen == 0 */
		445	return error;
		446	} else if (error == PARSERUTILS_INVALID) {
		447	/* Illegal input sequence */
		448
		449	/* Strict errormode; simply flag invalid character */
		450	if (c->base.errormode ==
		451	PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
		452	return PARSERUTILS_INVALID;
		453	}
		454
		455	/* output U+FFFD and continue processing. */
		456	error = charset_ext8_codec_output_decoded_char(c,
		457	0xFFFD, dest, destlen);
		458	if (error == PARSERUTILS_OK \|\| error == PARSERUTILS_NOMEM) {
		459	/* output succeeded; update source pointers */
		460	*source += 1;
		461	*sourcelen -= 1;
		462	}
		463
		464	return error;
		465	}
		466
		467	return PARSERUTILS_OK;
		468	}
		469
		470	/**
		471	* Output a UCS-4 character (big endian)
		472	*
		473	* \param c Codec to use
		474	* \param ucs4 UCS-4 character (host endian)
		475	* \param dest Pointer to pointer to output buffer
		476	* \param destlen Pointer to output buffer length
		477	* \return PARSERUTILS_OK on success,
		478	* PARSERUTILS_NOMEM if output buffer is too small,
		479	*/
		480	parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
		481	uint32_t ucs4, uint8_t *dest, size_t destlen)
		482	{
		483	if (*destlen < 4) {
		484	/* Run out of output buffer */
		485	c->read_len = 1;
		486	c->read_buf[0] = ucs4;
		487
		488	return PARSERUTILS_NOMEM;
		489	}
		490
		491	((uint32_t ) (void ) dest) = endian_host_to_big(ucs4);
		492	*dest += 4;
		493	*destlen -= 4;
		494
		495	return PARSERUTILS_OK;
		496	}
		497
		498	/**
		499	* Convert a UCS4 (host endian) character to extended 8bit
		500	*
		501	* \param c The codec instance
		502	* \param ucs4 The UCS4 character to convert
		503	* \param s Pointer to pointer to destination buffer
		504	* \param len Pointer to destination buffer length
		505	* \return PARSERUTILS_OK on success,
		506	* PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
		507	* PARSERUTILS_INVALID if the character cannot be represented
		508	*
		509	* _INVALID will only be returned if the codec's conversion mode is STRICT.
		510	* Otherwise, '?' will be output.
		511	*
		512	* On successful conversion, s and len will be updated.
		513	*/
		514	parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
		515	uint32_t ucs4, uint8_t *s, size_t len)
		516	{
		517	uint8_t out = 0;
		518
		519	if (*len < 1)
		520	return PARSERUTILS_NOMEM;
		521
		522	if (ucs4 < 0x80) {
		523	/* ASCII */
		524	out = ucs4;
		525	} else {
		526	uint32_t i;
		527
		528	for (i = 0; i < 128; i++) {
		529	if (ucs4 == c->table[i])
		530	break;
		531	}
		532
		533	if (i == 128) {
		534	if (c->base.errormode ==
		535	PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
		536	return PARSERUTILS_INVALID;
		537	else
		538	out = '?';
		539	} else {
		540	out = 0x80 + i;
		541	}
		542	}
		543
		544	(s) = out;
		545	(*s)++;
		546	(*len)--;
		547
		548	return PARSERUTILS_OK;
		549	}
		550
		551	/**
		552	* Convert an extended 8bit character to UCS4 (host endian)
		553	*
		554	* \param c The codec instance
		555	* \param s Pointer to source buffer
		556	* \param len Source buffer length
		557	* \param ucs4 Pointer to destination buffer
		558	* \return PARSERUTILS_OK on success,
		559	* PARSERUTILS_NEEDDATA if there's insufficient input data
		560	* PARSERUTILS_INVALID if the character cannot be represented
		561	*/
		562	parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
		563	const uint8_t s, size_t len, uint32_t ucs4)
		564	{
		565	uint32_t out;
		566
		567	if (len < 1)
		568	return PARSERUTILS_NEEDDATA;
		569
		570	if (*s < 0x80) {
		571	out = *s;
		572	} else {
		573	if (c->table[*s - 0x80] == 0xFFFF)
		574	return PARSERUTILS_INVALID;
		575
		576	out = c->table[*s - 0x80];
		577	}
		578
		579	*ucs4 = out;
		580
		581	return PARSERUTILS_OK;
		582	}
		583
		584	const parserutils_charset_handler charset_ext8_codec_handler = {
		585	charset_ext8_codec_handles_charset,
		586	charset_ext8_codec_create
		587	};
		588

Subversion Repositories Kolibri OS

(root)/programs/network/netsurf/libparserutils/src/charset/codecs/codec_ext8.c – Rev 3584