WebSVN – Kolibri OS – /programs/network/netsurf/libcss/src/charset/detect.c

/*
* This file is part of LibCSS.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
#include <stdbool.h>
#include <string.h>
#include <parserutils/charset/mibenum.h>
#include "charset/detect.h"
#include "utils/utils.h"
/* Forward declarations of the file-local detection helpers defined below.
 * Each takes the raw data buffer and its length, and reports the charset it
 * found through its final pointer parameter. */
static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
size_t len, uint16_t *mibenum);
static parserutils_error try_utf32_charset(const uint8_t *data,
size_t len, uint16_t *result);
static parserutils_error try_utf16_charset(const uint8_t *data,
size_t len, uint16_t *result);
static parserutils_error try_ascii_compatible_charset(const uint8_t *data,
size_t len, uint16_t *result);
/**
 * Work out which charset a chunk of stylesheet data is encoded in
 *
 * \param data     Pointer to buffer containing data
 * \param len      Buffer length, in bytes
 * \param mibenum  Pointer to location holding the current MIB enum
 * \param source   Pointer to location holding the current charset source
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if any pointer argument is NULL,
 *         otherwise the error reported by the BOM / @charset scan
 *
 * ::mibenum and ::source are written only when a charset is found in the
 * data itself, or when the UTF-8 fallback is applied. They are left alone
 * when the charset was dictated by the client, or when no charset was found
 * in the data and ::source is anything other than CSS_CHARSET_DEFAULT.
 *
 * CSS 2.1 $4.4
 */
parserutils_error css__charset_extract(const uint8_t *data, size_t len,
		uint16_t *mibenum, uint32_t *source)
{
	uint16_t detected = 0;
	parserutils_error status;

	if (data == NULL || mibenum == NULL || source == NULL)
		return PARSERUTILS_BADPARM;

	/* A charset dictated by the client is final: nothing to detect */
	if (*source == CSS_CHARSET_DICTATED)
		return PARSERUTILS_OK;

	/* Scan the data for a BOM and/or an @charset rule */
	status = css_charset_read_bom_or_charset(data, len, &detected);
	if (status != PARSERUTILS_OK)
		return status;

	if (detected != 0) {
		/* The document told us its own charset */
		*mibenum = detected;
		*source = CSS_CHARSET_DOCUMENT;
	} else if (*source == CSS_CHARSET_DEFAULT) {
		/* Nothing in the data, and no charset from the linking
		 * mechanism or referring document: fall back to UTF-8 */
		*mibenum = parserutils_charset_mibenum_from_name("UTF-8",
				SLEN("UTF-8"));
		*source = CSS_CHARSET_DEFAULT;
	}

	/* Otherwise the charset already held by the caller stands */
	return PARSERUTILS_OK;
}
/**
 * Probe the start of a buffer for a UTF Byte Order Mark and, if there is
 * none, for an @charset rule
 *
 * \param data     Pointer to buffer containing data
 * \param len      Buffer length, in bytes
 * \param mibenum  Pointer to location to receive MIB enum
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::data is NULL,
 *         PARSERUTILS_NEEDDATA if fewer than 4 bytes are available
 *
 * ::mibenum is left unwritten if no BOM or UTF-16/32 @charset rule is found
 * and the ASCII-compatible probe reports an error; that error is not
 * propagated.
 */
parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
		size_t len, uint16_t *mibenum)
{
	/* Recognised BOMs, in match order. The four-byte UTF-32LE signature
	 * must be tried before UTF-16LE, whose signature is its prefix. */
	static const struct {
		uint8_t sig[4];
		size_t sig_len;
		const char *name;
	} boms[] = {
		{ { 0x00, 0x00, 0xFE, 0xFF }, 4, "UTF-32BE" },
		{ { 0xFF, 0xFE, 0x00, 0x00 }, 4, "UTF-32LE" },
		{ { 0xFE, 0xFF, 0x00, 0x00 }, 2, "UTF-16BE" },
		{ { 0xFF, 0xFE, 0x00, 0x00 }, 2, "UTF-16LE" },
		{ { 0xEF, 0xBB, 0xBF, 0x00 }, 3, "UTF-8" }
	};
	parserutils_error status;
	uint16_t found = 0;
	size_t i;

	if (data == NULL)
		return PARSERUTILS_BADPARM;

	/* The longest BOM is 4 bytes, so insist on that much data */
	if (len < 4)
		return PARSERUTILS_NEEDDATA;

	for (i = 0; i < sizeof(boms) / sizeof(boms[0]); i++) {
		if (memcmp(data, boms[i].sig, boms[i].sig_len) == 0) {
			found = parserutils_charset_mibenum_from_name(
					boms[i].name, strlen(boms[i].name));
			break;
		}
	}

	/* A BOM takes precedence over any @charset rule. The spec says an
	 * @charset accompanying a BOM should agree with it, but UAs (and
	 * apparently the w3c CSS validator) simply trust the BOM and ignore
	 * the rule, so we do likewise. */
	if (found != 0) {
		*mibenum = found;
		return PARSERUTILS_OK;
	}

	/* No BOM: look for an @charset rule, widest encoding first. A probe
	 * that errors or finds nothing hands over to the next one. */
	status = try_utf32_charset(data, len, &found);
	if (status != PARSERUTILS_OK || found == 0)
		status = try_utf16_charset(data, len, &found);

	if (status != PARSERUTILS_OK || found == 0) {
		status = try_ascii_compatible_charset(data, len, &found);
		if (status != PARSERUTILS_OK) {
			/* Failure of the last probe is not an error here;
			 * the output location is simply left untouched */
			return PARSERUTILS_OK;
		}
	}

	*mibenum = found;

	return PARSERUTILS_OK;
}
/**
 * Decode a single UTF-32 code unit from a byte buffer
 *
 * \param p           Pointer to four readable bytes
 * \param big_endian  True to decode big-endian, false for little-endian
 * \return The decoded 32-bit value
 */
static uint32_t utf32_charset_read_unit(const uint8_t *p, bool big_endian)
{
	/* Widen every byte before shifting: a uint8_t promotes to int, and
	 * shifting a value >= 0x80 left by 24 overflows a 32-bit int, which
	 * is undefined behaviour. */
	if (big_endian) {
		return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) |
				((uint32_t) p[2] << 8) | (uint32_t) p[3];
	}

	return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
			((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

/**
 * Look for an @charset rule at the start of BOM-less UTF-32 data
 *
 * \param data    Pointer to buffer containing data
 * \param len     Buffer length, in bytes
 * \param result  Pointer to location to receive MIB enum; receives 0 if no
 *                UTF-32 @charset rule naming a UTF-32 charset was recognised
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NEEDDATA if the buffer is too short, or ends before
 *         the closing "; of the rule is seen
 *
 * The byte order is taken from the encoding of the leading '@charset "'
 * itself. The declared name must be UTF-32, or the UTF-32 variant with the
 * matching byte order, and the rule must be terminated by ";.
 */
static parserutils_error try_utf32_charset(const uint8_t *data,
		size_t len, uint16_t *result)
{
#define CHARSET_BE "\0\0\0@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\""
#define CHARSET_LE "@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\"\0\0\0"
	const uint8_t *limit = data + len;
	const uint8_t *end;
	const char *full_name;
	uint16_t charset = 0;
	char buf[8];
	size_t name_len = 0;	/* Characters in the name, stored or not */
	bool big_endian;
	bool terminated = false;
	bool ascii_only = true;

	if (len <= SLEN(CHARSET_LE))
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming UTF-32 source data */
	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
		big_endian = false;
		end = data + SLEN(CHARSET_LE);
	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
		big_endian = true;
		end = data + SLEN(CHARSET_BE);
	} else {
		/* Not a UTF-32 @charset rule at all */
		*result = 0;
		return PARSERUTILS_OK;
	}

	full_name = big_endian ? "UTF-32BE" : "UTF-32LE";

	/* Look for "; at end of charset declaration */
	while (end < limit - 4) {
		uint32_t c = utf32_charset_read_unit(end, big_endian);

		/* Bail if non-ASCII */
		if (c > 0x007f) {
			ascii_only = false;
			break;
		}

		/* Reached the end? */
		if (c == '"' && end < limit - 8 &&
				utf32_charset_read_unit(end + 4,
						big_endian) == ';') {
			terminated = true;
			break;
		}

		/* Store the uppercased character while there is room. Keep
		 * counting regardless, so an over-long name is never mistaken
		 * for its first sizeof(buf) characters. */
		if (name_len < sizeof(buf)) {
			if ('a' <= c && c <= 'z')
				buf[name_len] = (char) (c & ~0x20);
			else
				buf[name_len] = (char) c;
		}
		name_len++;

		end += 4;
	}

	if (!terminated && ascii_only) {
		/* Ran out of input. Testing the flag rather than the exact
		 * stop position also catches lengths that are not a
		 * multiple of the code unit size. */
		return PARSERUTILS_NEEDDATA;
	}

	/* Ensure we have a properly terminated name that looks like UTF-32
	 * in the byte order we detected. A name cut short by a non-ASCII
	 * unit never qualifies. */
	if (terminated &&
			((name_len == SLEN("UTF-32LE") &&
			memcmp(buf, full_name, name_len) == 0) ||
			(name_len == SLEN("UTF-32") &&
			memcmp(buf, "UTF-32", name_len) == 0))) {
		/* Convert to MIB enum */
		charset = parserutils_charset_mibenum_from_name(
				full_name, SLEN("UTF-32LE"));
	}
#undef CHARSET_LE
#undef CHARSET_BE

	*result = charset;

	return PARSERUTILS_OK;
}
/**
 * Decode a single UTF-16 code unit from a byte buffer
 *
 * \param p           Pointer to two readable bytes
 * \param big_endian  True to decode big-endian, false for little-endian
 * \return The decoded 16-bit value, widened to 32 bits
 */
static uint32_t utf16_charset_read_unit(const uint8_t *p, bool big_endian)
{
	if (big_endian)
		return ((uint32_t) p[0] << 8) | (uint32_t) p[1];

	return (uint32_t) p[0] | ((uint32_t) p[1] << 8);
}

/**
 * Look for an @charset rule at the start of BOM-less UTF-16 data
 *
 * \param data    Pointer to buffer containing data
 * \param len     Buffer length, in bytes
 * \param result  Pointer to location to receive MIB enum; receives 0 if no
 *                UTF-16 @charset rule naming a UTF-16 charset was recognised
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NEEDDATA if the buffer is too short, or ends before
 *         the closing "; of the rule is seen
 *
 * The byte order is taken from the encoding of the leading '@charset "'
 * itself. The declared name must be UTF-16, or the UTF-16 variant with the
 * matching byte order, and the rule must be terminated by ";.
 */
static parserutils_error try_utf16_charset(const uint8_t *data,
		size_t len, uint16_t *result)
{
#define CHARSET_BE "\0@\0c\0h\0a\0r\0s\0e\0t\0 \0\""
#define CHARSET_LE "@\0c\0h\0a\0r\0s\0e\0t\0 \0\"\0"
	const uint8_t *limit = data + len;
	const uint8_t *end;
	const char *full_name;
	uint16_t charset = 0;
	char buf[8];
	size_t name_len = 0;	/* Characters in the name, stored or not */
	bool big_endian;
	bool terminated = false;
	bool ascii_only = true;

	if (len <= SLEN(CHARSET_LE))
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming UTF-16 source data */
	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
		big_endian = false;
		end = data + SLEN(CHARSET_LE);
	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
		big_endian = true;
		end = data + SLEN(CHARSET_BE);
	} else {
		/* Not a UTF-16 @charset rule at all */
		*result = 0;
		return PARSERUTILS_OK;
	}

	full_name = big_endian ? "UTF-16BE" : "UTF-16LE";

	/* Look for "; at end of charset declaration */
	while (end < limit - 2) {
		uint32_t c = utf16_charset_read_unit(end, big_endian);

		/* Bail if non-ASCII */
		if (c > 0x007f) {
			ascii_only = false;
			break;
		}

		/* Reached the end? */
		if (c == '"' && end < limit - 4 &&
				utf16_charset_read_unit(end + 2,
						big_endian) == ';') {
			terminated = true;
			break;
		}

		/* Store the uppercased character while there is room. Keep
		 * counting regardless, so an over-long name is never mistaken
		 * for its first sizeof(buf) characters. */
		if (name_len < sizeof(buf)) {
			if ('a' <= c && c <= 'z')
				buf[name_len] = (char) (c & ~0x20);
			else
				buf[name_len] = (char) c;
		}
		name_len++;

		end += 2;
	}

	if (!terminated && ascii_only) {
		/* Ran out of input. Testing the flag rather than the exact
		 * stop position also catches odd buffer lengths, where the
		 * scan steps straight over data + len - 2. */
		return PARSERUTILS_NEEDDATA;
	}

	/* Ensure we have a properly terminated name that looks like UTF-16
	 * in the byte order we detected. A name cut short by a non-ASCII
	 * unit never qualifies. */
	if (terminated &&
			((name_len == SLEN("UTF-16LE") &&
			memcmp(buf, full_name, name_len) == 0) ||
			(name_len == SLEN("UTF-16") &&
			memcmp(buf, "UTF-16", name_len) == 0))) {
		/* Convert to MIB enum */
		charset = parserutils_charset_mibenum_from_name(
				full_name, SLEN("UTF-16LE"));
	}
#undef CHARSET_LE
#undef CHARSET_BE

	*result = charset;

	return PARSERUTILS_OK;
}
/**
 * Look for an @charset rule at the start of data in an ASCII-compatible
 * encoding
 *
 * \param data    Pointer to buffer containing data
 * \param len     Buffer length, in bytes
 * \param result  Pointer to location to receive MIB enum; receives 0 if
 *                there is no @charset rule, or if the declared charset is
 *                unrecognised or is a UTF-16/UTF-32 variant
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NEEDDATA if the buffer is too short, or ends before
 *         the closing "; of the rule is seen
 */
parserutils_error try_ascii_compatible_charset(const uint8_t *data, size_t len,
		uint16_t *result)
{
	static const char prefix[] = "@charset \"";
	/* Charsets that cannot be what an ASCII scan just read */
	static const char * const wide_names[] = {
		"UTF-32", "UTF-32LE", "UTF-32BE",
		"UTF-16", "UTF-16LE", "UTF-16BE"
	};
	const size_t prefix_len = sizeof(prefix) - 1;
	uint16_t found = 0;

	if (len <= prefix_len)
		return PARSERUTILS_NEEDDATA;

	/* Look for @charset, assuming ASCII-compatible source data */
	if (memcmp(data, prefix, prefix_len) == 0) {
		const uint8_t *name = data + prefix_len;
		const uint8_t *limit = data + len;
		const uint8_t *quote = name;
		size_t i;

		/* Find the first quote that is immediately followed by ';' */
		while (quote < limit) {
			quote = memchr(quote, '"', (size_t) (limit - quote));
			if (quote == NULL) {
				/* Ran out of input */
				return PARSERUTILS_NEEDDATA;
			}

			if (quote + 1 < limit && quote[1] == ';')
				break;

			quote++;
		}

		if (quote == limit) {
			/* Ran out of input */
			return PARSERUTILS_NEEDDATA;
		}

		/* Convert to MIB enum */
		found = parserutils_charset_mibenum_from_name(
				(const char *) name, (size_t) (quote - name));

		/* A rule we managed to read with an ASCII parser cannot
		 * truthfully declare a non-ASCII-compatible charset, so any
		 * such declaration is ignored. */
		for (i = 0; i < sizeof(wide_names) / sizeof(wide_names[0]);
				i++) {
			if (found == parserutils_charset_mibenum_from_name(
					wide_names[i],
					strlen(wide_names[i]))) {
				found = 0;
				break;
			}
		}
	}

	*result = found;

	return PARSERUTILS_OK;
}

Subversion Repositories Kolibri OS

(root)/programs/network/netsurf/libcss/src/charset/detect.c – Rev 3584