Subversion Repositories Kolibri OS

Rev

Rev 3584 | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
  3.  *
  4.  * This file is part of NetSurf, http://www.netsurf-browser.org/
  5.  *
  6.  * NetSurf is free software; you can redistribute it and/or modify
  7.  * it under the terms of the GNU General Public License as published by
  8.  * the Free Software Foundation; version 2 of the License.
  9.  *
  10.  * NetSurf is distributed in the hope that it will be useful,
  11.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13.  * GNU General Public License for more details.
  14.  *
  15.  * You should have received a copy of the GNU General Public License
  16.  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17.  */
  18.  
  19. /** \file
  20.  * UTF-8 manipulation functions (implementation).
  21.  */
  22.  
  23. #include <assert.h>
  24. #include <errno.h>
  25. #include <stdlib.h>
  26. #include <string.h>
  27. #include <strings.h>
  28. #include <iconv.h>
  29.  
/* Fixed-width integer names declared locally instead of being taken from
 * <stdint.h>.
 * NOTE(review): presumably a workaround for the KolibriOS toolchain --
 * confirm. The widths implied by the names only hold if char, short and
 * int are 8, 16 and 32 bits on the target; verify against the target ABI. */
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;

typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
  37.  
  38. #include <parserutils/charset/utf8.h>
  39.  
  40. #include "utils/config.h"
  41. #include "utils/log.h"
  42. #include "utils/utf8.h"
  43.  
/* Forward declaration of the shared conversion routine: utf8_to_enc() and
 * utf8_from_enc() call it before its definition further down this file. */
static utf8_convert_ret utf8_convert(const char *string, size_t len,
                const char *from, const char *to, char **result);
  46.  
  47. /**
  48.  * Convert a UTF-8 multibyte sequence into a single UCS4 character
  49.  *
  50.  * Encoding of UCS values outside the UTF-16 plane has been removed from
  51.  * RFC3629. This function conforms to RFC2279, however.
  52.  *
  53.  * \param s_in  The sequence to process
  54.  * \param l  Length of sequence
  55.  * \return   UCS4 character
  56.  */
  57. uint32_t utf8_to_ucs4(const char *s_in, size_t l)
  58. {
  59.         uint32_t ucs4;
  60.         size_t len;
  61.         parserutils_error perror;
  62.  
  63.         perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
  64.                         &ucs4, &len);
  65.         if (perror != PARSERUTILS_OK)
  66.                 ucs4 = 0xfffd;
  67.  
  68.         return ucs4;
  69. }
  70.  
  71. /**
  72.  * Convert a single UCS4 character into a UTF-8 multibyte sequence
  73.  *
  74.  * Encoding of UCS values outside the UTF-16 plane has been removed from
  75.  * RFC3629. This function conforms to RFC2279, however.
  76.  *
  77.  * \param c  The character to process (0 <= c <= 0x7FFFFFFF)
  78.  * \param s  Pointer to 6 byte long output buffer
  79.  * \return   Length of multibyte sequence
  80.  */
  81. size_t utf8_from_ucs4(uint32_t c, char *s)
  82. {
  83.         uint8_t *in = (uint8_t *) s;
  84.         size_t len = 6;
  85.         parserutils_error perror;
  86.  
  87.         perror = parserutils_charset_utf8_from_ucs4(c, &in, &len);
  88.         if (perror != PARSERUTILS_OK) {
  89.                 s[0] = 0xef;
  90.                 s[1] = 0xbf;
  91.                 s[2] = 0xbd;
  92.                 return 3;
  93.         }
  94.  
  95.         return 6 - len;
  96. }
  97.  
  98. /**
  99.  * Calculate the length (in characters) of a NULL-terminated UTF-8 string
  100.  *
  101.  * \param s  The string
  102.  * \return   Length of string
  103.  */
  104. size_t utf8_length(const char *s)
  105. {
  106.         return utf8_bounded_length(s, strlen(s));
  107. }
  108.  
  109. /**
  110.  * Calculated the length (in characters) of a bounded UTF-8 string
  111.  *
  112.  * \param s  The string
  113.  * \param l  Maximum length of input (in bytes)
  114.  * \return Length of string, in characters
  115.  */
  116. size_t utf8_bounded_length(const char *s, size_t l)
  117. {
  118.         size_t len;
  119.         parserutils_error perror;
  120.  
  121.         perror = parserutils_charset_utf8_length((const uint8_t *) s, l, &len);
  122.         if (perror != PARSERUTILS_OK)
  123.                 return 0;
  124.  
  125.         return len;
  126. }
  127.  
  128. /**
  129.  * Calculate the length (in bytes) of a UTF-8 character
  130.  *
  131.  * \param s  Pointer to start of character
  132.  * \return Length of character, in bytes
  133.  */
  134. size_t utf8_char_byte_length(const char *s)
  135. {
  136.         size_t len;
  137.         parserutils_error perror;
  138.  
  139.         perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
  140.                         &len);
  141.         assert(perror == PARSERUTILS_OK);
  142.  
  143.         return len;
  144. }
  145.  
  146. /**
  147.  * Find previous legal UTF-8 char in string
  148.  *
  149.  * \param s  The string
  150.  * \param o  Offset in the string to start at
  151.  * \return Offset of first byte of previous legal character
  152.  */
  153. size_t utf8_prev(const char *s, size_t o)
  154. {
  155.         uint32_t prev;
  156.         parserutils_error perror;
  157.  
  158.         perror = parserutils_charset_utf8_prev((const uint8_t *) s, o, &prev);
  159.         assert(perror == PARSERUTILS_OK);
  160.  
  161.         return prev;
  162. }
  163.  
  164. /**
  165.  * Find next legal UTF-8 char in string
  166.  *
  167.  * \param s  The string
  168.  * \param l  Maximum offset in string
  169.  * \param o  Offset in the string to start at
  170.  * \return Offset of first byte of next legal character
  171.  */
  172. size_t utf8_next(const char *s, size_t l, size_t o)
  173. {
  174.         uint32_t next;
  175.         parserutils_error perror;
  176.  
  177.         perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
  178.                         &next);
  179.         assert(perror == PARSERUTILS_OK);
  180.  
  181.         return next;
  182. }
  183.  
/* Cache of previous iconv conversion descriptor used by utf8_convert.
 *
 * As a file-scope static object it starts out zero-filled: both names
 * empty and cd equal to zero. utf8_finalise() and utf8_convert() test cd
 * against zero before closing it, so zero serves as "nothing cached". */
static struct {
        char from[32];  /**< Encoding name to convert from */
        char to[32];    /**< Encoding name to convert to */
        iconv_t cd;     /**< Iconv conversion descriptor */
} last_cd;
  190.  
  191. /**
  192.  * Finalise the UTF-8 library
  193.  */
  194. void utf8_finalise(void)
  195. {
  196.         if (last_cd.cd != 0)
  197.                 iconv_close(last_cd.cd);
  198.  
  199.         /* paranoia follows */
  200.         last_cd.from[0] = '\0';
  201.         last_cd.to[0] = '\0';
  202.         last_cd.cd = 0;
  203. }
  204.  
  205. /**
  206.  * Convert a UTF8 string into the named encoding
  207.  *
  208.  * \param string  The NULL-terminated string to convert
  209.  * \param encname The encoding name (suitable for passing to iconv)
  210.  * \param len     Length of input string to consider (in bytes), or 0
  211.  * \param result  Pointer to location to store result (allocated on heap)
  212.  * \return Appropriate utf8_convert_ret value
  213.  */
  214. utf8_convert_ret utf8_to_enc(const char *string, const char *encname,
  215.                 size_t len, char **result)
  216. {
  217.         return utf8_convert(string, len, "UTF-8", encname, result);
  218. }
  219.  
  220. /**
  221.  * Convert a string in the named encoding into a UTF-8 string
  222.  *
  223.  * \param string  The NULL-terminated string to convert
  224.  * \param encname The encoding name (suitable for passing to iconv)
  225.  * \param len     Length of input string to consider (in bytes), or 0
  226.  * \param result  Pointer to location to store result (allocated on heap)
  227.  * \return Appropriate utf8_convert_ret value
  228.  */
  229. utf8_convert_ret utf8_from_enc(const char *string, const char *encname,
  230.                 size_t len, char **result)
  231. {
  232.         return utf8_convert(string, len, encname, "UTF-8", result);
  233. }
  234.  
  235. /**
  236.  * Convert a string from one encoding to another
  237.  *
  238.  * \param string  The NULL-terminated string to convert
  239.  * \param len     Length of input string to consider (in bytes), or 0
  240.  * \param from    The encoding name to convert from
  241.  * \param to      The encoding name to convert to
  242.  * \param result  Pointer to location in which to store result
  243.  * \return Appropriate utf8_convert_ret value
  244.  */
  245. utf8_convert_ret utf8_convert(const char *string, size_t len,
  246.                 const char *from, const char *to, char **result)
  247. {
  248.         iconv_t cd;
  249.         char *temp, *out, *in;
  250.         size_t slen, rlen;
  251.  
  252.         assert(string && from && to && result);
  253.  
  254.         if (string[0] == '\0') {
  255.                 /* On AmigaOS, iconv() returns an error if we pass an
  256.                  * empty string.  This prevents iconv() being called as
  257.                  * there is no conversion necessary anyway. */
  258.                 *result = strdup("");
  259.                 if (!(*result)) {
  260.                         *result = NULL;
  261.                         return UTF8_CONVERT_NOMEM;
  262.                 }
  263.  
  264.                 return UTF8_CONVERT_OK;
  265.         }
  266.  
  267.         if (strcasecmp(from, to) == 0) {
  268.                 /* conversion from an encoding to itself == strdup */
  269.                 slen = len ? len : strlen(string);
  270.                 *(result) = strndup(string, slen);
  271.                 if (!(*result)) {
  272.                         *(result) = NULL;
  273.                         return UTF8_CONVERT_NOMEM;
  274.                 }
  275.  
  276.                 return UTF8_CONVERT_OK;
  277.         }
  278.  
  279.         in = (char *)string;
  280.  
  281.         /* we cache the last used conversion descriptor,
  282.          * so check if we're trying to use it here */
  283.         if (strncasecmp(last_cd.from, from, sizeof(last_cd.from)) == 0 &&
  284.                         strncasecmp(last_cd.to, to, sizeof(last_cd.to)) == 0) {
  285.                 cd = last_cd.cd;
  286.         }
  287.         else {
  288.                 /* no match, so create a new cd */
  289.                 cd = iconv_open(to, from);
  290.                 if (cd == (iconv_t)-1) {
  291.                         if (errno == EINVAL)
  292.                                 return UTF8_CONVERT_BADENC;
  293.                         /* default to no memory */
  294.                         return UTF8_CONVERT_NOMEM;
  295.                 }
  296.  
  297.                 /* close the last cd - we don't care if this fails */
  298.                 if (last_cd.cd)
  299.                         iconv_close(last_cd.cd);
  300.  
  301.                 /* and copy the to/from/cd data into last_cd */
  302.                 strncpy(last_cd.from, from, sizeof(last_cd.from));
  303.                 strncpy(last_cd.to, to, sizeof(last_cd.to));
  304.                 last_cd.cd = cd;
  305.         }
  306.  
  307.         slen = len ? len : strlen(string);
  308.         /* Worst case = ASCII -> UCS4, so allocate an output buffer
  309.          * 4 times larger than the input buffer, and add 4 bytes at
  310.          * the end for the NULL terminator
  311.          */
  312.         rlen = slen * 4 + 4;
  313.  
  314.         temp = out = malloc(rlen);
  315.         if (!out)
  316.                 return UTF8_CONVERT_NOMEM;
  317.  
  318.         /* perform conversion */
  319.         if (iconv(cd, (void *) &in, &slen, &out, &rlen) == (size_t)-1) {
  320.                 free(temp);
  321.                 /* clear the cached conversion descriptor as it's invalid */
  322.                 if (last_cd.cd)
  323.                         iconv_close(last_cd.cd);
  324.                 last_cd.from[0] = '\0';
  325.                 last_cd.to[0] = '\0';
  326.                 last_cd.cd = 0;
  327.                 /** \todo handle the various cases properly
  328.                  * There are 3 possible error cases:
  329.                  * a) Insufficiently large output buffer
  330.                  * b) Invalid input byte sequence
  331.                  * c) Incomplete input sequence */
  332.                 return UTF8_CONVERT_NOMEM;
  333.         }
  334.  
  335.         *(result) = realloc(temp, out - temp + 4);
  336.         if (!(*result)) {
  337.                 free(temp);
  338.                 *(result) = NULL; /* for sanity's sake */
  339.                 return UTF8_CONVERT_NOMEM;
  340.         }
  341.  
  342.         /* NULL terminate - needs 4 characters as we may have
  343.          * converted to UTF-32 */
  344.         memset((*result) + (out - temp), 0, 4);
  345.  
  346.         return UTF8_CONVERT_OK;
  347. }
  348.  
  349. static utf8_convert_ret utf8_convert_html_chunk(iconv_t cd,
  350.                 const char *chunk, size_t inlen,
  351.                 char **out, size_t *outlen)
  352. {
  353.         size_t ret, esclen;
  354.         uint32_t ucs4;
  355.         char *pescape, escape[11];
  356.  
  357.         while (inlen > 0) {
  358.                 ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
  359.                 if (ret != (size_t) -1)
  360.                         break;
  361.  
  362.                 if (errno != EILSEQ)
  363.                         return UTF8_CONVERT_NOMEM;
  364.  
  365.                 ucs4 = utf8_to_ucs4(chunk, inlen);
  366.                 esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4);
  367.                 pescape = escape;
  368.                 ret = iconv(cd, (void *) &pescape, &esclen,
  369.                                 (void *) out, outlen);
  370.                 if (ret == (size_t) -1)
  371.                         return UTF8_CONVERT_NOMEM;
  372.  
  373.                 esclen = utf8_next(chunk, inlen, 0);
  374.                 chunk += esclen;
  375.                 inlen -= esclen;
  376.         }
  377.  
  378.         return UTF8_CONVERT_OK;
  379. }
  380.  
  381. /**
  382.  * Convert a UTF-8 encoded string into a string of the given encoding,
  383.  * applying HTML escape sequences where necessary.
  384.  *
  385.  * \param string   String to convert (NUL-terminated)
  386.  * \param encname  Name of encoding to convert to
  387.  * \param len      Length, in bytes, of the input string, or 0
  388.  * \param result   Pointer to location to receive result
  389.  * \return Appropriate utf8_convert_ret value
  390.  */
  391. utf8_convert_ret utf8_to_html(const char *string, const char *encname,
  392.                 size_t len, char **result)
  393. {
  394.         iconv_t cd;
  395.         const char *in;
  396.         char *out, *origout;
  397.         size_t off, prev_off, inlen, outlen, origoutlen, esclen;
  398.         utf8_convert_ret ret;
  399.         char *pescape, escape[11];
  400.  
  401.         if (len == 0)
  402.                 len = strlen(string);
  403.  
  404.         cd = iconv_open(encname, "UTF-8");
  405.         if (cd == (iconv_t) -1) {
  406.                 if (errno == EINVAL)
  407.                         return UTF8_CONVERT_BADENC;
  408.                 /* default to no memory */
  409.                 return UTF8_CONVERT_NOMEM;
  410.         }
  411.  
  412.         /* Worst case is ASCII -> UCS4, with all characters escaped:
  413.          * "&#xYYYYYY;", thus each input character may become a string
  414.          * of 10 UCS4 characters, each 4 bytes in length */
  415.         origoutlen = outlen = len * 10 * 4;
  416.         origout = out = malloc(outlen);
  417.         if (out == NULL) {
  418.                 iconv_close(cd);
  419.                 return UTF8_CONVERT_NOMEM;
  420.         }
  421.  
  422.         /* Process input in chunks between characters we must escape */
  423.         prev_off = off = 0;
  424.         while (off < len) {
  425.                 /* Must escape '&', '<', and '>' */
  426.                 if (string[off] == '&' || string[off] == '<' ||
  427.                                 string[off] == '>') {
  428.                         if (off - prev_off > 0) {
  429.                                 /* Emit chunk */
  430.                                 in = string + prev_off;
  431.                                 inlen = off - prev_off;
  432.                                 ret = utf8_convert_html_chunk(cd, in, inlen,
  433.                                                 &out, &outlen);
  434.                                 if (ret != UTF8_CONVERT_OK) {
  435.                                         free(origout);
  436.                                         iconv_close(cd);
  437.                                         return ret;
  438.                                 }
  439.                         }
  440.  
  441.                         /* Emit mandatory escape */
  442.                         esclen = snprintf(escape, sizeof(escape),
  443.                                         "&#x%06x;", string[off]);
  444.                         pescape = escape;
  445.                         ret = utf8_convert_html_chunk(cd, pescape, esclen,
  446.                                         &out, &outlen);
  447.                         if (ret != UTF8_CONVERT_OK) {
  448.                                 free(origout);
  449.                                 iconv_close(cd);
  450.                                 return ret;
  451.                         }
  452.  
  453.                         prev_off = off = utf8_next(string, len, off);
  454.                 } else {
  455.                         off = utf8_next(string, len, off);
  456.                 }
  457.         }
  458.  
  459.         /* Process final chunk */
  460.         if (prev_off < len) {
  461.                 in = string + prev_off;
  462.                 inlen = len - prev_off;
  463.                 ret = utf8_convert_html_chunk(cd, in, inlen, &out, &outlen);
  464.                 if (ret != UTF8_CONVERT_OK) {
  465.                         free(origout);
  466.                         iconv_close(cd);
  467.                         return ret;
  468.                 }
  469.         }
  470.  
  471.         iconv_close(cd);
  472.  
  473.         /* Shrink-wrap */
  474.         *result = realloc(origout, origoutlen - outlen + 4);
  475.         if (*result == NULL) {
  476.                 free(origout);
  477.                 return UTF8_CONVERT_NOMEM;
  478.         }
  479.         memset(*result + (origoutlen - outlen), 0, 4);
  480.  
  481.         return UTF8_CONVERT_OK;
  482. }
  483.  
  484.  
  485.