WebSVN – Kolibri OS – Path Comparison – / – /programs/network/netsurf/libcss/src/charset/ Rev 3583 and /programs/network/netsurf/libcss/src/charset/ Rev 3584

Regard whitespace Rev 3583 → Rev 3584

/programs/network/netsurf/libcss/src/charset/Makefile
0,0 → 1,5

# Build settings for the LibCSS charset-detection sources in this directory.
# NOTE(review): OUTFILE and OBJS are presumably consumed by the included
# Makefile_for_o_lib -- confirm against that file.
OUTFILE = libo.o
OBJS = detect.o
# Directory holding the KolibriOS headers. The default is the path that was
# previously hard-coded; override it from the environment or the command
# line (make KOLIBRI_INCLUDE=/path/to/kolibri/include) on other machines.
KOLIBRI_INCLUDE ?= /home/sourcerer/kos_src/newenginek/kolibri/include
CFLAGS += -I ../../include/ -I ../../../ -I ../ -I $(KOLIBRI_INCLUDE)
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libcss/src/charset/detect.c
 0,0 → 1,440
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <stdbool.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include "charset/detect.h"
+#include "utils/utils.h"
+/* Forward declarations of the file-local charset detection helpers */
+static parserutils_error css_charset_read_bom_or_charset(
+                const uint8_t *data, size_t len, uint16_t *mibenum);
+static parserutils_error try_utf32_charset(
+                const uint8_t *data, size_t len, uint16_t *result);
+static parserutils_error try_utf16_charset(
+                const uint8_t *data, size_t len, uint16_t *result);
+static parserutils_error try_ascii_compatible_charset(
+                const uint8_t *data, size_t len, uint16_t *result);
+/**
+ * Determine the charset of a stylesheet from the start of its data
+ *
+ * \param data     Pointer to buffer containing the stylesheet data
+ * \param len      Number of bytes available in ::data
+ * \param mibenum  Pointer to location containing current MIB enum
+ * \param source   Pointer to location containing current charset source
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL,
+ *         otherwise the error reported while inspecting the data
+ *
+ * ::mibenum and ::source are updated on exit when a charset is found in
+ * the data, or when the UTF-8 fallback is applied. They are left alone
+ * when the charset was dictated by the client.
+ *
+ * CSS 2.1 $4.4
+ */
+parserutils_error css__charset_extract(const uint8_t *data, size_t len,
+                uint16_t *mibenum, uint32_t *source)
+{
+        parserutils_error status;
+        uint16_t detected = 0;
+        if (data == NULL || mibenum == NULL || source == NULL)
+                return PARSERUTILS_BADPARM;
+        /* A client-dictated charset is final; there is nothing to detect */
+        if (*source == CSS_CHARSET_DICTATED)
+                return PARSERUTILS_OK;
+        /* Inspect the data itself for a BOM and/or an @charset rule */
+        status = css_charset_read_bom_or_charset(data, len, &detected);
+        if (status != PARSERUTILS_OK)
+                return status;
+        if (detected != 0) {
+                /* The document declares its own charset */
+                *mibenum = detected;
+                *source = CSS_CHARSET_DOCUMENT;
+        } else if (*source == CSS_CHARSET_DEFAULT) {
+                /* Nothing found anywhere, so use the default fallback */
+                *mibenum = parserutils_charset_mibenum_from_name("UTF-8",
+                                SLEN("UTF-8"));
+                *source = CSS_CHARSET_DEFAULT;
+        }
+        /* In the remaining case a charset was already supplied by the
+         * linking mechanism or referring document, and it is kept as is */
+        return PARSERUTILS_OK;
+}
+/**
+ * Examine the first bytes of a buffer for a UTF Byte Order Mark and,
+ * failing that, for an @charset rule
+ *
+ * \param data     Pointer to buffer containing data
+ * \param len      Buffer length
+ * \param mibenum  Pointer to location to receive MIB enum
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::data is NULL,
+ *         PARSERUTILS_NEEDDATA if fewer than 4 bytes are available
+ *
+ * ::mibenum is only written when a detection step completes; it is left
+ * untouched if the final ASCII-compatible scan reports an error.
+ */
+parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
+                size_t len, uint16_t *mibenum)
+{
+        parserutils_error status;
+        uint16_t found = 0;
+        const char *bom_name = NULL;
+        size_t bom_name_len = 0;
+        if (data == NULL)
+                return PARSERUTILS_BADPARM;
+        /* The longest BOM is 4 bytes, so insist on at least that much */
+        if (len < 4)
+                return PARSERUTILS_NEEDDATA;
+        /* Identify a BOM; the 4-byte UTF-32 marks are tested before the
+         * 2-byte UTF-16 marks because FF FE is a prefix of FF FE 00 00 */
+        if (memcmp(data, "\x00\x00\xFE\xFF", 4) == 0) {
+                bom_name = "UTF-32BE";
+                bom_name_len = SLEN("UTF-32BE");
+        } else if (memcmp(data, "\xFF\xFE\x00\x00", 4) == 0) {
+                bom_name = "UTF-32LE";
+                bom_name_len = SLEN("UTF-32LE");
+        } else if (memcmp(data, "\xFE\xFF", 2) == 0) {
+                bom_name = "UTF-16BE";
+                bom_name_len = SLEN("UTF-16BE");
+        } else if (memcmp(data, "\xFF\xFE", 2) == 0) {
+                bom_name = "UTF-16LE";
+                bom_name_len = SLEN("UTF-16LE");
+        } else if (memcmp(data, "\xEF\xBB\xBF", 3) == 0) {
+                bom_name = "UTF-8";
+                bom_name_len = SLEN("UTF-8");
+        }
+        if (bom_name != NULL) {
+                found = parserutils_charset_mibenum_from_name(bom_name,
+                                bom_name_len);
+        }
+        /* A BOM takes precedence over @charset.
+         * The spec says an @charset accompanying a BOM should agree with
+         * it, but UAs differ and none appear to follow the spec: in
+         * practice they trust the BOM and ignore the @charset rule. The
+         * w3c CSS validator appears to behave the same way (at the least,
+         * it does not complain about a mismatch).
+         */
+        if (found != 0) {
+                *mibenum = found;
+                return PARSERUTILS_OK;
+        }
+        /* No BOM: look for @charset encoded as UTF-32, then UTF-16 */
+        status = try_utf32_charset(data, len, &found);
+        if (status == PARSERUTILS_OK && found != 0) {
+                *mibenum = found;
+                return PARSERUTILS_OK;
+        }
+        status = try_utf16_charset(data, len, &found);
+        if (status == PARSERUTILS_OK && found != 0) {
+                *mibenum = found;
+                return PARSERUTILS_OK;
+        }
+        /* Last resort: an ASCII-compatible @charset rule. A failure here
+         * is not propagated; the caller simply sees no charset. */
+        status = try_ascii_compatible_charset(data, len, &found);
+        if (status == PARSERUTILS_OK)
+                *mibenum = found;
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode one UTF-32 code unit from four consecutive bytes
+ *
+ * \param p           Pointer to four readable bytes
+ * \param big_endian  True to decode as big-endian, false as little-endian
+ * \return The decoded 32-bit value
+ */
+static uint32_t utf32_read_unit(const uint8_t *p, bool big_endian)
+{
+        /* Widen every byte before shifting: a uint8_t promotes to signed
+         * int, and shifting a byte >= 0x80 left by 24 bits overflows it,
+         * which is undefined behaviour. */
+        const uint32_t b0 = p[0], b1 = p[1], b2 = p[2], b3 = p[3];
+        if (big_endian)
+                return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+        return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+}
+/**
+ * Look for an @charset rule, assuming UTF-32 source data
+ *
+ * \param data    Pointer to buffer containing data
+ * \param len     Buffer length
+ * \param result  Pointer to location to receive MIB enum, or 0 if the
+ *                data holds no usable UTF-32 @charset rule
+ * \return PARSERUTILS_OK if the data could be examined,
+ *         PARSERUTILS_NEEDDATA if the buffer is too short or ends before
+ *         the closing "; of the rule (::result is not written then)
+ *
+ * The declared name must be UTF-32 or the endian-specific variant that
+ * matches the byte order the rule itself is written in.
+ */
+static parserutils_error try_utf32_charset(const uint8_t *data,
+                size_t len, uint16_t *result)
+{
+        uint16_t charset = 0;
+        bool big_endian;
+        const char *name;       /* Endian-specific name for this byte order */
+        const uint8_t *end;     /* Scan cursor, starts just after the quote */
+        const uint8_t *limit;
+        char buf[8];            /* Uppercased name; excess input is dropped */
+        char *ptr = buf;
+        size_t name_len;
+#define CHARSET_BE "\0\0\0@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\""
+#define CHARSET_LE "@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\"\0\0\0"
+        if (len <= SLEN(CHARSET_LE))
+                return PARSERUTILS_NEEDDATA;
+        /* Look for @charset, assuming UTF-32 source data */
+        if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
+                big_endian = false;
+                name = "UTF-32LE";
+                end = data + SLEN(CHARSET_LE);
+        } else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
+                big_endian = true;
+                name = "UTF-32BE";
+                end = data + SLEN(CHARSET_BE);
+        } else {
+                /* No @charset rule in either byte order */
+                *result = 0;
+                return PARSERUTILS_OK;
+        }
+#undef CHARSET_LE
+#undef CHARSET_BE
+        /* Look for "; at end of charset declaration */
+        limit = data + len - 4;
+        for (; end < limit; end += 4) {
+                uint32_t c = utf32_read_unit(end, big_endian);
+                /* Bail if non-ASCII */
+                if (c > 0x007f)
+                        break;
+                /* Reached the end? Only peek at the next unit if it lies
+                 * wholly inside the buffer. */
+                if (c == '"' && end < limit - 4 &&
+                                utf32_read_unit(end + 4, big_endian) == ';')
+                        break;
+                /* Append to buf, if there's space */
+                if ((size_t) (ptr - buf) < sizeof(buf)) {
+                        /* Uppercase */
+                        if ('a' <= c && c <= 'z')
+                                *ptr++ = (char) (c & ~0x20u);
+                        else
+                                *ptr++ = (char) c;
+                }
+        }
+        /* Ran out of input. The cursor advances 4 bytes at a time, so for
+         * a length that is not a multiple of 4 it steps past the limit
+         * rather than landing on it; test with >= so that a truncated
+         * declaration is never mistaken for a complete one. */
+        if (end >= limit)
+                return PARSERUTILS_NEEDDATA;
+        /* Ensure we have something that looks like UTF-32, optionally
+         * with the suffix matching the detected byte order */
+        name_len = (size_t) (ptr - buf);
+        if ((name_len == strlen(name) &&
+                        memcmp(buf, name, name_len) == 0) ||
+                        (name_len == SLEN("UTF-32") &&
+                        memcmp(buf, "UTF-32", name_len) == 0)) {
+                /* Convert to MIB enum */
+                charset = parserutils_charset_mibenum_from_name(
+                                name, strlen(name));
+        }
+        *result = charset;
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode one UTF-16 code unit from two consecutive bytes
+ *
+ * \param p           Pointer to two readable bytes
+ * \param big_endian  True to decode as big-endian, false as little-endian
+ * \return The decoded 16-bit value, widened to 32 bits
+ */
+static uint32_t utf16_read_unit(const uint8_t *p, bool big_endian)
+{
+        const uint32_t b0 = p[0], b1 = p[1];
+        if (big_endian)
+                return (b0 << 8) | b1;
+        return b0 | (b1 << 8);
+}
+/**
+ * Look for an @charset rule, assuming UTF-16 source data
+ *
+ * \param data    Pointer to buffer containing data
+ * \param len     Buffer length
+ * \param result  Pointer to location to receive MIB enum, or 0 if the
+ *                data holds no usable UTF-16 @charset rule
+ * \return PARSERUTILS_OK if the data could be examined,
+ *         PARSERUTILS_NEEDDATA if the buffer is too short or ends before
+ *         the closing "; of the rule (::result is not written then)
+ *
+ * The declared name must be UTF-16 or the endian-specific variant that
+ * matches the byte order the rule itself is written in.
+ */
+static parserutils_error try_utf16_charset(const uint8_t *data,
+                size_t len, uint16_t *result)
+{
+        uint16_t charset = 0;
+        bool big_endian;
+        const char *name;       /* Endian-specific name for this byte order */
+        const uint8_t *end;     /* Scan cursor, starts just after the quote */
+        const uint8_t *limit;
+        char buf[8];            /* Uppercased name; excess input is dropped */
+        char *ptr = buf;
+        size_t name_len;
+#define CHARSET_BE "\0@\0c\0h\0a\0r\0s\0e\0t\0 \0\""
+#define CHARSET_LE "@\0c\0h\0a\0r\0s\0e\0t\0 \0\"\0"
+        if (len <= SLEN(CHARSET_LE))
+                return PARSERUTILS_NEEDDATA;
+        /* Look for @charset, assuming UTF-16 source data */
+        if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
+                big_endian = false;
+                name = "UTF-16LE";
+                end = data + SLEN(CHARSET_LE);
+        } else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
+                big_endian = true;
+                name = "UTF-16BE";
+                end = data + SLEN(CHARSET_BE);
+        } else {
+                /* No @charset rule in either byte order */
+                *result = 0;
+                return PARSERUTILS_OK;
+        }
+#undef CHARSET_LE
+#undef CHARSET_BE
+        /* Look for "; at end of charset declaration */
+        limit = data + len - 2;
+        for (; end < limit; end += 2) {
+                uint32_t c = utf16_read_unit(end, big_endian);
+                /* Bail if non-ASCII */
+                if (c > 0x007f)
+                        break;
+                /* Reached the end? Only peek at the next unit if it lies
+                 * wholly inside the buffer. */
+                if (c == '"' && end < limit - 2 &&
+                                utf16_read_unit(end + 2, big_endian) == ';')
+                        break;
+                /* Append to buf, if there's space */
+                if ((size_t) (ptr - buf) < sizeof(buf)) {
+                        /* Uppercase */
+                        if ('a' <= c && c <= 'z')
+                                *ptr++ = (char) (c & ~0x20u);
+                        else
+                                *ptr++ = (char) c;
+                }
+        }
+        /* Ran out of input. The cursor advances 2 bytes at a time, so for
+         * an odd length it steps past the limit rather than landing on
+         * it; test with >= so that a truncated declaration is never
+         * mistaken for a complete one. */
+        if (end >= limit)
+                return PARSERUTILS_NEEDDATA;
+        /* Ensure we have something that looks like UTF-16, optionally
+         * with the suffix matching the detected byte order */
+        name_len = (size_t) (ptr - buf);
+        if ((name_len == strlen(name) &&
+                        memcmp(buf, name, name_len) == 0) ||
+                        (name_len == SLEN("UTF-16") &&
+                        memcmp(buf, "UTF-16", name_len) == 0)) {
+                /* Convert to MIB enum */
+                charset = parserutils_charset_mibenum_from_name(
+                                name, strlen(name));
+        }
+        *result = charset;
+        return PARSERUTILS_OK;
+}
+/**
+ * Look for an @charset rule, assuming ASCII-compatible source data
+ *
+ * \param data    Pointer to buffer containing data
+ * \param len     Buffer length
+ * \param result  Pointer to location to receive MIB enum, or 0 if no
+ *                acceptable @charset rule starts the data
+ * \return PARSERUTILS_OK if the data could be examined,
+ *         PARSERUTILS_NEEDDATA if the buffer is too short or ends before
+ *         the closing "; of the rule (::result is not written then)
+ */
+parserutils_error try_ascii_compatible_charset(const uint8_t *data, size_t len,
+                uint16_t *result)
+{
+        /* Encodings that an ASCII scan cannot legitimately have found */
+        static const char * const incompatible[] = {
+                "UTF-32", "UTF-32LE", "UTF-32BE",
+                "UTF-16", "UTF-16LE", "UTF-16BE"
+        };
+        uint16_t found = 0;
+#define CHARSET "@charset \""
+        if (len <= SLEN(CHARSET))
+                return PARSERUTILS_NEEDDATA;
+        /* The rule must be the very first thing in the data */
+        if (memcmp(data, CHARSET, SLEN(CHARSET)) == 0) {
+                const uint8_t *name = data + SLEN(CHARSET);
+                const uint8_t *limit = data + len;
+                const uint8_t *cursor = name;
+                size_t i;
+                /* Advance to the closing quote that is followed by ';' */
+                while (cursor < limit) {
+                        if (cursor[0] == '"' && cursor < limit - 1 &&
+                                        cursor[1] == ';')
+                                break;
+                        cursor++;
+                }
+                if (cursor == limit) {
+                        /* The declaration is not terminated in this data */
+                        return PARSERUTILS_NEEDDATA;
+                }
+                /* Convert to MIB enum */
+                found = parserutils_charset_mibenum_from_name(
+                                (const char *) name, cursor - name);
+                /* The rule was read with an ASCII parser, so a claim to be
+                 * in a non-ASCII-compatible charset must be ignored. */
+                for (i = 0; i < sizeof(incompatible) /
+                                sizeof(incompatible[0]); i++) {
+                        if (found == parserutils_charset_mibenum_from_name(
+                                        incompatible[i],
+                                        strlen(incompatible[i]))) {
+                                found = 0;
+                                break;
+                        }
+                }
+        }
+#undef CHARSET
+        *result = found;
+        return PARSERUTILS_OK;
+}

 /programs/network/netsurf/libcss/src/charset/detect.h
 0,0 → 1,24
+/*
+ * This file is part of LibCSS.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef css_charset_detect_h_
+#define css_charset_detect_h_
+#include <inttypes.h>
+#include <libcss/errors.h>
+#include <libcss/functypes.h>
+#include <libcss/types.h>
+#include <parserutils/errors.h>
+/**
+ * Extract a charset from a chunk of data
+ *
+ * \param data     Pointer to buffer containing data
+ * \param len      Buffer length
+ * \param mibenum  Pointer to location containing current MIB enum
+ * \param source   Pointer to location containing current charset source
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL,
+ *         PARSERUTILS_NEEDDATA if fewer than 4 bytes of data are supplied
+ *
+ * If ::source is CSS_CHARSET_DICTATED on entry, nothing is changed.
+ * If a BOM or @charset rule is found in the data, ::mibenum receives its
+ * MIB enum and ::source becomes CSS_CHARSET_DOCUMENT. Otherwise, if
+ * ::source is CSS_CHARSET_DEFAULT, ::mibenum is set to the MIB enum for
+ * UTF-8; any other ::source leaves both values untouched.
+ */
+parserutils_error css__charset_extract(const uint8_t *data, size_t len,
+                uint16_t *mibenum, uint32_t *source);
+#endif

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 3583 → Rev 3584