/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/input/inputstream.h>
#include "input/filter.h"
#include "utils/utils.h"
/**
* Private input stream definition
*
* The public API hands out pointers to the embedded `public` member and the
* functions in this file cast them back to this type, which is why `public`
* has to stay the first field.
*/
typedef struct parserutils_inputstream_private {
parserutils_inputstream public; /**< Public part (UTF-8 buffer, cursor,
* EOF flag). Must be first */
parserutils_buffer *raw; /**< Undecoded bytes in the document charset,
* filled by append and drained by refill */
bool done_first_chunk; /**< Set once charset detection and BOM
* stripping have run; charset changes are
* rejected afterwards */
uint16_t mibenum; /**< MIB enum for charset, or 0 if not yet known */
uint32_t encsrc; /**< Charset source; 0 is the lowest priority */
parserutils_filter *input; /**< Charset conversion filter (raw -> UTF-8) */
parserutils_charset_detect_func csdetect; /**< Charset detection func.,
* or NULL if none supplied */
parserutils_alloc alloc; /**< Memory (de)allocation function */
void *pw; /**< Client private data, passed to alloc */
} parserutils_inputstream_private;
/* Forward declarations of the file-local helpers defined further down:
 * refill_buffer converts pending raw data into the UTF-8 buffer, and
 * strip_bom removes a leading byte order mark from a buffer. */
static inline parserutils_error parserutils_inputstream_refill_buffer(
parserutils_inputstream_private *stream);
static inline parserutils_error parserutils_inputstream_strip_bom(
uint16_t *mibenum, parserutils_buffer *buffer);
/**
 * Create an input stream
 *
 * \param enc       Document charset, or NULL to autodetect
 * \param encsrc    Value for encoding source, if specified, or 0
 * \param csdetect  Charset detection function, or NULL
 * \param alloc     Memory (de)allocation function
 * \param pw        Pointer to client-specific private data (may be NULL)
 * \param stream    Pointer to location to receive stream instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion,
 *         PARSERUTILS_BADENCODING on unsupported encoding
 *
 * The value 0 is defined as being the lowest priority encoding source
 * (i.e. the default fallback encoding). Beyond this, no further
 * interpretation is made upon the encoding source.
 *
 * When \a enc is NULL the supplied \a encsrc is ignored and the stream
 * starts with no charset (mibenum 0, source 0).
 *
 * On any failure everything allocated by this call is released again and
 * \a *stream is left untouched.
 */
parserutils_error parserutils_inputstream_create(const char *enc,
		uint32_t encsrc, parserutils_charset_detect_func csdetect,
		parserutils_alloc alloc, void *pw,
		parserutils_inputstream **stream)
{
	parserutils_inputstream_private *s;
	parserutils_error error;

	if (alloc == NULL || stream == NULL)
		return PARSERUTILS_BADPARM;

	s = alloc(NULL, sizeof(*s), pw);
	if (s == NULL)
		return PARSERUTILS_NOMEM;

	error = parserutils_buffer_create(alloc, pw, &s->raw);
	if (error != PARSERUTILS_OK)
		goto cleanup_stream;

	error = parserutils_buffer_create(alloc, pw, &s->public.utf8);
	if (error != PARSERUTILS_OK)
		goto cleanup_raw;

	s->public.cursor = 0;
	s->public.had_eof = false;
	s->done_first_chunk = false;

	/* The filter starts out as UTF-8; it is re-targeted below if the
	 * caller named a charset, and again on the first refill. */
	error = parserutils__filter_create("UTF-8", alloc, pw, &s->input);
	if (error != PARSERUTILS_OK)
		goto cleanup_utf8;

	if (enc != NULL) {
		parserutils_filter_optparams params;

		s->mibenum = parserutils_charset_mibenum_from_name(enc,
				strlen(enc));
		if (s->mibenum == 0) {
			/* Unknown charset name: unwind everything rather
			 * than leaking the filter, buffers and stream. */
			error = PARSERUTILS_BADENCODING;
			goto cleanup_filter;
		}

		params.encoding.name = enc;

		error = parserutils__filter_setopt(s->input,
				PARSERUTILS_FILTER_SET_ENCODING,
				&params);
		if (error != PARSERUTILS_OK)
			goto cleanup_filter;

		s->encsrc = encsrc;
	} else {
		s->mibenum = 0;
		s->encsrc = 0;
	}

	s->csdetect = csdetect;
	s->alloc = alloc;
	s->pw = pw;

	*stream = (parserutils_inputstream *) s;

	return PARSERUTILS_OK;

	/* Failure paths: release resources in reverse order of creation. */
cleanup_filter:
	parserutils__filter_destroy(s->input);
cleanup_utf8:
	parserutils_buffer_destroy(s->public.utf8);
cleanup_raw:
	parserutils_buffer_destroy(s->raw);
cleanup_stream:
	alloc(s, 0, pw);

	return error;
}
/**
 * Destroy an input stream
 *
 * Tears down the conversion filter and both buffers, then hands the stream
 * object itself back to the allocator it was created with.
 *
 * \param stream  Input stream to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stream is NULL
 */
parserutils_error parserutils_inputstream_destroy(
		parserutils_inputstream *stream)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	parserutils__filter_destroy(priv->input);
	parserutils_buffer_destroy(priv->public.utf8);
	parserutils_buffer_destroy(priv->raw);

	/* Size 0 asks the client allocator to release the block */
	priv->alloc(priv, 0, priv->pw);

	return PARSERUTILS_OK;
}
/**
 * Append data to an input stream
 *
 * Data is queued in the raw buffer; it is not decoded here. Passing a NULL
 * \a data pointer records end-of-input on the stream instead (in which case
 * \a len is ignored).
 *
 * \param stream  Input stream to append data to
 * \param data    Data to append (in document charset), or NULL to flag EOF
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stream is NULL,
 *         otherwise whatever the raw buffer append reports
 */
parserutils_error parserutils_inputstream_append(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv =
			(parserutils_inputstream_private *) stream;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	if (data != NULL)
		return parserutils_buffer_append(priv->raw, data, len);

	/* No data: the caller is signalling end of input */
	priv->public.had_eof = true;

	return PARSERUTILS_OK;
}
/**
 * Insert data into stream at current location
 *
 * The bytes go straight into the decoded (UTF-8) buffer at the cursor
 * position, bypassing the raw buffer and the charset filter.
 *
 * \param stream  Input stream to insert into
 * \param data    Data to insert (UTF-8 encoded)
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stream or \a data is NULL,
 *         otherwise whatever the buffer insert reports
 */
parserutils_error parserutils_inputstream_insert(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv;
	parserutils_buffer *decoded;
	size_t where;

	if (stream == NULL || data == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;
	decoded = priv->public.utf8;
	where = priv->public.cursor;

	return parserutils_buffer_insert(decoded, where, data, len);
}
#define IS_ASCII(x) (((x) & 0x80) == 0)
/**
 * Look at the character in the stream that starts at
 * offset bytes from the cursor (slow version)
 *
 * This path always tries to pull more data out of the raw buffer before
 * reading; if the raw buffer is empty there is nothing to convert and the
 * call reports end-of-input or need-more-data straight away.
 *
 * \param stream  Stream to look in
 * \param offset  Byte offset of start of character
 * \param ptr     Pointer to location to receive pointer to character data
 * \param length  Pointer to location to receive character length (in bytes)
 * \return PARSERUTILS_OK on success,
 *                    _NEEDDATA on reaching the end of available input,
 *                    _EOF on reaching the end of all input,
 *                    _BADENCODING if the input cannot be decoded,
 *                    _NOMEM on memory exhaustion,
 *                    _BADPARM if bad parameters are passed.
 *
 * The returned pointer refers into the stream's own UTF-8 buffer. Once the
 * cursor has been advanced past the character (via
 * parserutils_inputstream_advance) the pointer must no longer be
 * dereferenced; doing so is a bug in the caller.
 */
parserutils_error parserutils_inputstream_peek_slow(
		parserutils_inputstream *stream,
		size_t offset, const uint8_t **ptr, size_t *length)
{
	parserutils_inputstream_private *priv =
			(parserutils_inputstream_private *) stream;
	parserutils_error status;
	const uint8_t *ch;
	size_t pos;
	size_t nbytes;

	if (stream == NULL || ptr == NULL || length == NULL)
		return PARSERUTILS_BADPARM;

	/* Nothing left to convert: either we're finished or must wait */
	if (priv->raw->length == 0) {
		if (priv->public.had_eof)
			return PARSERUTILS_EOF;

		return PARSERUTILS_NEEDDATA;
	}

	/* Convert pending raw bytes into the UTF-8 buffer */
	status = parserutils_inputstream_refill_buffer(priv);
	if (status != PARSERUTILS_OK)
		return status;

	/* Computed only now: the refill moves the cursor and may move the
	 * buffer's storage. A successful refill can still yield no output. */
	pos = priv->public.cursor + offset;
	if (pos == priv->public.utf8->length)
		return PARSERUTILS_NEEDDATA;

	ch = priv->public.utf8->data + pos;

	if (IS_ASCII(*ch)) {
		/* Single-byte character, no need to consult the decoder */
		nbytes = 1;
	} else {
		status = parserutils_charset_utf8_char_byte_length(ch,
				&nbytes);

		if (status == PARSERUTILS_NEEDDATA) {
			return priv->public.had_eof ? PARSERUTILS_EOF
						    : PARSERUTILS_NEEDDATA;
		}

		if (status != PARSERUTILS_OK)
			return status;
	}

	*length = nbytes;
	*ptr = ch;

	return PARSERUTILS_OK;
}
#undef IS_ASCII
/**
 * Read the source charset of the input stream
 *
 * A stream whose encoding source is 0 (the lowest-priority, fallback
 * source) is always reported as "UTF-8"; otherwise the name is looked up
 * from the stream's MIB enum.
 *
 * \param stream  Input stream to query
 * \param source  Pointer to location to receive charset source identifier
 * \return Pointer to charset name (constant; do not free),
 *         or NULL if \a stream or \a source is NULL
 */
const char *parserutils_inputstream_read_charset(
		parserutils_inputstream *stream, uint32_t *source)
{
	const parserutils_inputstream_private *priv;
	uint32_t origin;

	if (stream == NULL || source == NULL)
		return NULL;

	priv = (const parserutils_inputstream_private *) stream;
	origin = priv->encsrc;

	*source = origin;

	return (origin == 0)
			? "UTF-8"
			: parserutils_charset_mibenum_to_name(priv->mibenum);
}
/**
 * Change the source charset of the input stream
 *
 * The stream's recorded charset and source are only updated once the
 * conversion filter has accepted the new encoding, so a failed call leaves
 * the previous settings in place.
 *
 * \param stream  Input stream to modify
 * \param enc     Charset name
 * \param source  Charset source identifier
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on invalid parameters,
 *         PARSERUTILS_INVALID if called after data has been read from stream,
 *         PARSERUTILS_BADENCODING if the encoding is unsupported,
 *         PARSERUTILS_NOMEM on memory exhaustion.
 */
parserutils_error parserutils_inputstream_change_charset(
		parserutils_inputstream *stream,
		const char *enc, uint32_t source)
{
	parserutils_inputstream_private *s =
			(parserutils_inputstream_private *) stream;
	parserutils_filter_optparams params;
	uint16_t temp;
	parserutils_error error;

	if (stream == NULL || enc == NULL)
		return PARSERUTILS_BADPARM;

	/* Once the first chunk has been decoded the charset is fixed */
	if (s->done_first_chunk)
		return PARSERUTILS_INVALID;

	temp = parserutils_charset_mibenum_from_name(enc, strlen(enc));
	if (temp == 0)
		return PARSERUTILS_BADENCODING;

	/* Ensure filter is using the correct encoding */
	params.encoding.name = enc;

	error = parserutils__filter_setopt(s->input,
			PARSERUTILS_FILTER_SET_ENCODING,
			&params);
	if (error != PARSERUTILS_OK)
		return error;

	/* Finally, replace the current settings */
	s->mibenum = temp;
	s->encsrc = source;

	return PARSERUTILS_OK;
}
/******************************************************************************
******************************************************************************/
/**
 * Refill the UTF-8 buffer from the raw buffer
 *
 * On the first call for a stream this also settles the input charset:
 * the optional detection callback is consulted, UTF-8 is used if nothing
 * else is known, any BOM is stripped from the raw data and the conversion
 * filter is pointed at the resulting charset.
 *
 * Unread UTF-8 data (from the cursor onwards) is moved to the start of the
 * buffer and the cursor is reset to 0, so pointers previously obtained into
 * the UTF-8 buffer are invalidated by this call.
 *
 * \param stream  The inputstream to operate on
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADENCODING if no charset could be established,
 *         otherwise the error reported by charset detection, BOM
 *         stripping, the conversion filter or buffer management
 */
parserutils_error parserutils_inputstream_refill_buffer(
		parserutils_inputstream_private *stream)
{
	const uint8_t *raw;
	uint8_t *utf8;
	size_t raw_length, utf8_space;
	parserutils_error error;

	/* If this is the first chunk of data, we must detect the charset and
	 * strip the BOM, if one exists */
	if (stream->done_first_chunk == false) {
		parserutils_filter_optparams params;

		/* If there is a charset detection routine, give it an
		 * opportunity to override any charset specified when the
		 * inputstream was created */
		if (stream->csdetect != NULL) {
			error = stream->csdetect(stream->raw->data,
					stream->raw->length,
					&stream->mibenum, &stream->encsrc);
			if (error != PARSERUTILS_OK) {
				if (error != PARSERUTILS_NEEDDATA ||
						stream->public.had_eof == false)
					return error;

				/* We don't have enough data to detect the
				 * input encoding, but we're not going to get
				 * any more as we've been notified of EOF.
				 * Therefore, leave the encoding alone
				 * so that any charset specified when the
				 * inputstream was created will be preserved.
				 * If there was no charset specified, then
				 * we'll default to UTF-8, below */
			}
		}

		/* Default to UTF-8 if there is still no encoding information
		 * We'll do this if there was no encoding specified up-front
		 * and:
		 *    1) there was no charset detection routine
		 * or 2) there was insufficient data for the charset
		 *       detection routine to detect an encoding
		 */
		if (stream->mibenum == 0) {
			stream->mibenum =
				parserutils_charset_mibenum_from_name("UTF-8",
					SLEN("UTF-8"));
			stream->encsrc = 0;
		}

		/* Still nothing means even the UTF-8 lookup failed; there is
		 * no charset to decode with, so give up cleanly. */
		if (stream->mibenum == 0)
			return PARSERUTILS_BADENCODING;

		/* Strip any BOM, and update encoding as appropriate. This
		 * must run unconditionally: it also assigns `error`, which
		 * may otherwise be unset or hold a stale NEEDDATA. */
		error = parserutils_inputstream_strip_bom(&stream->mibenum,
				stream->raw);
		if (error != PARSERUTILS_OK)
			return error;

		/* Ensure filter is using the correct encoding */
		params.encoding.name =
			parserutils_charset_mibenum_to_name(stream->mibenum);

		error = parserutils__filter_setopt(stream->input,
				PARSERUTILS_FILTER_SET_ENCODING,
				&params);
		if (error != PARSERUTILS_OK)
			return error;

		stream->done_first_chunk = true;
	}

	/* Work out how to perform the buffer fill */
	if (stream->public.cursor == stream->public.utf8->length) {
		/* Cursor's at the end, so simply reuse the entire buffer */
		utf8 = stream->public.utf8->data;
		utf8_space = stream->public.utf8->allocated;
	} else {
		/* Cursor's not at the end, so shift data after cursor to the
		 * bottom of the buffer. If the buffer's still over half full,
		 * extend it. Source and destination overlap, hence memmove. */
		memmove(stream->public.utf8->data,
			stream->public.utf8->data + stream->public.cursor,
			stream->public.utf8->length - stream->public.cursor);

		stream->public.utf8->length -= stream->public.cursor;

		/* The unread data now starts at offset 0. Reset the cursor
		 * right away so that an early error return below does not
		 * leave it pointing at the pre-shift position. */
		stream->public.cursor = 0;

		if (stream->public.utf8->length >
				stream->public.utf8->allocated / 2) {
			error = parserutils_buffer_grow(stream->public.utf8);
			if (error != PARSERUTILS_OK)
				return error;
		}

		/* Taken after any grow, as growing may move the storage */
		utf8 = stream->public.utf8->data + stream->public.utf8->length;
		utf8_space = stream->public.utf8->allocated -
				stream->public.utf8->length;
	}

	raw = stream->raw->data;
	raw_length = stream->raw->length;

	/* Try to fill utf8 buffer from the raw data */
	error = parserutils__filter_process_chunk(stream->input,
			&raw, &raw_length, &utf8, &utf8_space);
	/* _NOMEM implies that there's more input to read than available space
	 * in the utf8 buffer. That's fine, so we'll ignore that error. */
	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
		return error;

	/* Remove the raw data we've processed from the raw buffer;
	 * raw_length now holds the number of bytes left unconsumed */
	error = parserutils_buffer_discard(stream->raw, 0,
			stream->raw->length - raw_length);
	if (error != PARSERUTILS_OK)
		return error;

	/* Fix up the utf8 buffer information: whatever output space the
	 * filter did not use is the gap between length and allocation */
	stream->public.utf8->length =
			stream->public.utf8->allocated - utf8_space;

	/* Finally, fix up the cursor */
	stream->public.cursor = 0;

	return PARSERUTILS_OK;
}
/**
 * Strip a BOM from a buffer in the given encoding
 *
 * Only a BOM matching the stated charset is removed. For the
 * endian-neutral names "UTF-16" and "UTF-32" the charset is narrowed to
 * the big-endian variant by default, or to the little-endian variant when
 * a little-endian BOM is found.
 *
 * \param mibenum  Pointer to the character set of the buffer, updated on exit
 * \param buffer   The buffer to process
 * \return PARSERUTILS_OK if there was no BOM to remove, otherwise the
 *         result of discarding the BOM bytes from the buffer
 */
parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum,
		parserutils_buffer *buffer)
{
	/* MIB enums for the Unicode charsets, looked up once and cached */
	static uint16_t utf8;
	static uint16_t utf16;
	static uint16_t utf16be;
	static uint16_t utf16le;
	static uint16_t utf32;
	static uint16_t utf32be;
	static uint16_t utf32le;

	const uint16_t charset = *mibenum;
	size_t bom_len = 0;

	if (utf8 == 0) {
		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
				SLEN("UTF-8"));
		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
				SLEN("UTF-16"));
		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
				SLEN("UTF-16BE"));
		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
				SLEN("UTF-16LE"));
		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
				SLEN("UTF-32"));
		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
				SLEN("UTF-32BE"));
		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
				SLEN("UTF-32LE"));
	}

#define UTF32_BOM_LEN (4)
#define UTF16_BOM_LEN (2)
#define UTF8_BOM_LEN (3)

/* Each test checks the buffer length before touching any byte */
#define HAS_BOM2(b0, b1) \
	(buffer->length >= UTF16_BOM_LEN && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1))
#define HAS_BOM3(b0, b1, b2) \
	(buffer->length >= UTF8_BOM_LEN && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1) && \
	 buffer->data[2] == (b2))
#define HAS_BOM4(b0, b1, b2, b3) \
	(buffer->length >= UTF32_BOM_LEN && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1) && \
	 buffer->data[2] == (b2) && buffer->data[3] == (b3))

	if (charset == utf8) {
		if (HAS_BOM3(0xEF, 0xBB, 0xBF))
			bom_len = UTF8_BOM_LEN;
	} else if (charset == utf16be) {
		if (HAS_BOM2(0xFE, 0xFF))
			bom_len = UTF16_BOM_LEN;
	} else if (charset == utf16le) {
		if (HAS_BOM2(0xFF, 0xFE))
			bom_len = UTF16_BOM_LEN;
	} else if (charset == utf16) {
		/* Endianness unspecified: big-endian unless the BOM says
		 * otherwise */
		*mibenum = utf16be;

		if (HAS_BOM2(0xFE, 0xFF)) {
			bom_len = UTF16_BOM_LEN;
		} else if (HAS_BOM2(0xFF, 0xFE)) {
			*mibenum = utf16le;
			bom_len = UTF16_BOM_LEN;
		}
	} else if (charset == utf32be) {
		if (HAS_BOM4(0x00, 0x00, 0xFE, 0xFF))
			bom_len = UTF32_BOM_LEN;
	} else if (charset == utf32le) {
		if (HAS_BOM4(0xFF, 0xFE, 0x00, 0x00))
			bom_len = UTF32_BOM_LEN;
	} else if (charset == utf32) {
		/* Endianness unspecified: big-endian unless the BOM says
		 * otherwise */
		*mibenum = utf32be;

		if (HAS_BOM4(0x00, 0x00, 0xFE, 0xFF)) {
			bom_len = UTF32_BOM_LEN;
		} else if (HAS_BOM4(0xFF, 0xFE, 0x00, 0x00)) {
			*mibenum = utf32le;
			bom_len = UTF32_BOM_LEN;
		}
	}

#undef HAS_BOM4
#undef HAS_BOM3
#undef HAS_BOM2
#undef UTF8_BOM_LEN
#undef UTF16_BOM_LEN
#undef UTF32_BOM_LEN

	if (bom_len == 0)
		return PARSERUTILS_OK;

	return parserutils_buffer_discard(buffer, 0, bom_len);
}