Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. #include <stdlib.h>
  9. #include <string.h>
  10.  
  11. #include <parserutils/charset/mibenum.h>
  12.  
  13. #include "charset/codecs/codec_impl.h"
  14. #include "charset/encodings/utf8impl.h"
  15. #include "utils/endian.h"
  16. #include "utils/utils.h"
  17.  
  18. /**
  19.  * UTF-8 charset codec
  20.  */
  21. typedef struct charset_utf8_codec {
  22.         parserutils_charset_codec base; /**< Base class */
  23.  
  24. #define INVAL_BUFSIZE (32)
  25.         uint8_t inval_buf[INVAL_BUFSIZE];       /**< Buffer for fixing up
  26.                                                  * incomplete input
  27.                                                  * sequences */
  28.         size_t inval_len;               /*< Byte length of inval_buf **/
  29.  
  30. #define READ_BUFSIZE (8)
  31.         uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
  32.                                                  * output sequences (decode)
  33.                                                  * (host-endian) */
  34.         size_t read_len;                /**< Character length of read_buf */
  35.  
  36. #define WRITE_BUFSIZE (8)
  37.         uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
  38.                                                  * output sequences (encode)
  39.                                                  * (host-endian) */
  40.         size_t write_len;               /**< Character length of write_buf */
  41.  
  42. } charset_utf8_codec;
  43.  
  44. static bool charset_utf8_codec_handles_charset(const char *charset);
  45. static parserutils_error charset_utf8_codec_create(const char *charset,
  46.                 parserutils_alloc alloc, void *pw,
  47.                 parserutils_charset_codec **codec);
  48. static parserutils_error charset_utf8_codec_destroy(
  49.                 parserutils_charset_codec *codec);
  50. static parserutils_error charset_utf8_codec_encode(
  51.                 parserutils_charset_codec *codec,
  52.                 const uint8_t **source, size_t *sourcelen,
  53.                 uint8_t **dest, size_t *destlen);
  54. static parserutils_error charset_utf8_codec_decode(
  55.                 parserutils_charset_codec *codec,
  56.                 const uint8_t **source, size_t *sourcelen,
  57.                 uint8_t **dest, size_t *destlen);
  58. static parserutils_error charset_utf8_codec_reset(
  59.                 parserutils_charset_codec *codec);
  60. static inline parserutils_error charset_utf8_codec_read_char(
  61.                 charset_utf8_codec *c,
  62.                 const uint8_t **source, size_t *sourcelen,
  63.                 uint8_t **dest, size_t *destlen);
  64. static inline parserutils_error charset_utf8_codec_output_decoded_char(
  65.                 charset_utf8_codec *c,
  66.                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
  67.  
  68. /**
  69.  * Determine whether this codec handles a specific charset
  70.  *
  71.  * \param charset  Charset to test
  72.  * \return true if handleable, false otherwise
  73.  */
  74. bool charset_utf8_codec_handles_charset(const char *charset)
  75. {
  76.         return parserutils_charset_mibenum_from_name(charset,
  77.                                 strlen(charset)) ==
  78.                         parserutils_charset_mibenum_from_name("UTF-8",
  79.                                 SLEN("UTF-8"));
  80. }
  81.  
  82. /**
  83.  * Create a UTF-8 codec
  84.  *
  85.  * \param charset  The charset to read from / write to
  86.  * \param alloc    Memory (de)allocation function
  87.  * \param pw       Pointer to client-specific private data (may be NULL)
  88.  * \param codec    Pointer to location to receive codec
  89.  * \return PARSERUTILS_OK on success,
  90.  *         PARSERUTILS_BADPARM on bad parameters,
  91.  *         PARSERUTILS_NOMEM on memory exhausion
  92.  */
  93. parserutils_error charset_utf8_codec_create(const char *charset,
  94.                 parserutils_alloc alloc, void *pw,
  95.                 parserutils_charset_codec **codec)
  96. {
  97.         charset_utf8_codec *c;
  98.  
  99.         UNUSED(charset);
  100.  
  101.         c = alloc(NULL, sizeof(charset_utf8_codec), pw);
  102.         if (c == NULL)
  103.                 return PARSERUTILS_NOMEM;
  104.  
  105.         c->inval_buf[0] = '\0';
  106.         c->inval_len = 0;
  107.  
  108.         c->read_buf[0] = 0;
  109.         c->read_len = 0;
  110.  
  111.         c->write_buf[0] = 0;
  112.         c->write_len = 0;
  113.  
  114.         /* Finally, populate vtable */
  115.         c->base.handler.destroy = charset_utf8_codec_destroy;
  116.         c->base.handler.encode = charset_utf8_codec_encode;
  117.         c->base.handler.decode = charset_utf8_codec_decode;
  118.         c->base.handler.reset = charset_utf8_codec_reset;
  119.  
  120.         *codec = (parserutils_charset_codec *) c;
  121.  
  122.         return PARSERUTILS_OK;
  123. }
  124.  
  125. /**
  126.  * Destroy a UTF-8 codec
  127.  *
  128.  * \param codec  The codec to destroy
  129.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  130.  */
  131. parserutils_error charset_utf8_codec_destroy (parserutils_charset_codec *codec)
  132. {
  133.         UNUSED(codec);
  134.  
  135.         return PARSERUTILS_OK;
  136. }
  137.  
  138. /**
  139.  * Encode a chunk of UCS-4 (big endian) data into UTF-8
  140.  *
  141.  * \param codec      The codec to use
  142.  * \param source     Pointer to pointer to source data
  143.  * \param sourcelen  Pointer to length (in bytes) of source data
  144.  * \param dest       Pointer to pointer to output buffer
  145.  * \param destlen    Pointer to length (in bytes) of output buffer
  146.  * \return PARSERUTILS_OK          on success,
  147.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  148.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  149.  *                            codec's error handling mode is set to STRICT,
  150.  *
  151.  * On exit, ::source will point immediately _after_ the last input character
  152.  * read. Any remaining output for the character will be buffered by the
  153.  * codec for writing on the next call.
  154.  *
  155.  * Note that, if failure occurs whilst attempting to write any output
  156.  * buffered by the last call, then ::source and ::sourcelen will remain
  157.  * unchanged (as nothing more has been read).
  158.  *
  159.  * ::sourcelen will be reduced appropriately on exit.
  160.  *
  161.  * ::dest will point immediately _after_ the last character written.
  162.  *
  163.  * ::destlen will be reduced appropriately on exit.
  164.  */
  165. parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
  166.                 const uint8_t **source, size_t *sourcelen,
  167.                 uint8_t **dest, size_t *destlen)
  168. {
  169.         charset_utf8_codec *c = (charset_utf8_codec *) codec;
  170.         uint32_t ucs4;
  171.         uint32_t *towrite;
  172.         size_t towritelen;
  173.         parserutils_error error;
  174.  
  175.         /* Process any outstanding characters from the previous call */
  176.         if (c->write_len > 0) {
  177.                 uint32_t *pwrite = c->write_buf;
  178.  
  179.                 while (c->write_len > 0) {
  180.                         UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
  181.                         if (error != PARSERUTILS_OK) {
  182.                                 uint32_t len;
  183.                                 if (error != PARSERUTILS_NOMEM)
  184.                                         abort();
  185.  
  186.                                 /* Insufficient output buffer space */
  187.                                 for (len = 0; len < c->write_len; len++) {
  188.                                         c->write_buf[len] = pwrite[len];
  189.                                 }
  190.  
  191.                                 return PARSERUTILS_NOMEM;
  192.                         }
  193.  
  194.                         pwrite++;
  195.                         c->write_len--;
  196.                 }
  197.         }
  198.  
  199.         /* Now process the characters for this call */
  200.         while (*sourcelen > 0) {
  201.                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
  202.                 towrite = &ucs4;
  203.                 towritelen = 1;
  204.  
  205.                 /* Output current characters */
  206.                 while (towritelen > 0) {
  207.                         UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
  208.                         if (error != PARSERUTILS_OK) {
  209.                                 uint32_t len;
  210.                                 if (error != PARSERUTILS_NOMEM)
  211.                                         abort();
  212.  
  213.                                 /* Insufficient output space */
  214.                                 if (towritelen >= WRITE_BUFSIZE)
  215.                                         abort();
  216.  
  217.                                 c->write_len = towritelen;
  218.  
  219.                                 /* Copy pending chars to save area, for
  220.                                  * processing next call. */
  221.                                 for (len = 0; len < towritelen; len++)
  222.                                         c->write_buf[len] = towrite[len];
  223.  
  224.                                 /* Claim character we've just buffered,
  225.                                  * so it's not reprocessed */
  226.                                 *source += 4;
  227.                                 *sourcelen -= 4;
  228.  
  229.                                 return PARSERUTILS_NOMEM;
  230.                         }
  231.  
  232.                         towrite++;
  233.                         towritelen--;
  234.                 }
  235.  
  236.                 *source += 4;
  237.                 *sourcelen -= 4;
  238.         }
  239.  
  240.         return PARSERUTILS_OK;
  241. }
  242.  
  243. /**
  244.  * Decode a chunk of UTF-8 data into UCS-4 (big endian)
  245.  *
  246.  * \param codec      The codec to use
  247.  * \param source     Pointer to pointer to source data
  248.  * \param sourcelen  Pointer to length (in bytes) of source data
  249.  * \param dest       Pointer to pointer to output buffer
  250.  * \param destlen    Pointer to length (in bytes) of output buffer
  251.  * \return PARSERUTILS_OK          on success,
  252.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  253.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  254.  *                            codec's error handling mode is set to STRICT,
  255.  *
  256.  * On exit, ::source will point immediately _after_ the last input character
  257.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  258.  * character will be buffered by the codec for writing on the next call.
  259.  *
  260.  * In the case of the result being _INVALID, ::source will point _at_ the
  261.  * last input character read; nothing will be written or buffered for the
  262.  * failed character. It is up to the client to fix the cause of the failure
  263.  * and retry the decoding process.
  264.  *
  265.  * Note that, if failure occurs whilst attempting to write any output
  266.  * buffered by the last call, then ::source and ::sourcelen will remain
  267.  * unchanged (as nothing more has been read).
  268.  *
  269.  * If STRICT error handling is configured and an illegal sequence is split
  270.  * over two calls, then _INVALID will be returned from the second call,
  271.  * but ::source will point mid-way through the invalid sequence (i.e. it
  272.  * will be unmodified over the second call). In addition, the internal
  273.  * incomplete-sequence buffer will be emptied, such that subsequent calls
  274.  * will progress, rather than re-evaluating the same invalid sequence.
  275.  *
  276.  * ::sourcelen will be reduced appropriately on exit.
  277.  *
  278.  * ::dest will point immediately _after_ the last character written.
  279.  *
  280.  * ::destlen will be reduced appropriately on exit.
  281.  *
  282.  * Call this with a source length of 0 to flush the output buffer.
  283.  */
  284. parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
  285.                 const uint8_t **source, size_t *sourcelen,
  286.                 uint8_t **dest, size_t *destlen)
  287. {
  288.         charset_utf8_codec *c = (charset_utf8_codec *) codec;
  289.         parserutils_error error;
  290.  
  291.         if (c->read_len > 0) {
  292.                 /* Output left over from last decode */
  293.                 uint32_t *pread = c->read_buf;
  294.  
  295.                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
  296.                         *((uint32_t *) (void *) *dest) =
  297.                                         endian_host_to_big(pread[0]);
  298.  
  299.                         *dest += 4;
  300.                         *destlen -= 4;
  301.  
  302.                         pread++;
  303.                         c->read_len--;
  304.                 }
  305.  
  306.                 if (*destlen < c->read_len * 4) {
  307.                         /* Ran out of output buffer */
  308.                         size_t i;
  309.  
  310.                         /* Shuffle remaining output down */
  311.                         for (i = 0; i < c->read_len; i++)
  312.                                 c->read_buf[i] = pread[i];
  313.  
  314.                         return PARSERUTILS_NOMEM;
  315.                 }
  316.         }
  317.  
  318.         if (c->inval_len > 0) {
  319.                 /* The last decode ended in an incomplete sequence.
  320.                  * Fill up inval_buf with data from the start of the
  321.                  * new chunk and process it. */
  322.                 uint8_t *in = c->inval_buf;
  323.                 size_t ol = c->inval_len;
  324.                 size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
  325.                 size_t orig_l = l;
  326.  
  327.                 memcpy(c->inval_buf + ol, *source, l);
  328.  
  329.                 l += c->inval_len;
  330.  
  331.                 error = charset_utf8_codec_read_char(c,
  332.                                 (const uint8_t **) &in, &l, dest, destlen);
  333.                 if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
  334.                         return error;
  335.                 }
  336.  
  337.                 /* And now, fix up source pointers */
  338.                 *source += max((signed) (orig_l - l), 0);
  339.                 *sourcelen -= max((signed) (orig_l - l), 0);
  340.  
  341.                 /* Failed to resolve an incomplete character and
  342.                  * ran out of buffer space. No recovery strategy
  343.                  * possible, so explode everywhere. */
  344.                 if ((orig_l + ol) - l == 0)
  345.                         abort();
  346.  
  347.                 /* Report memory exhaustion case from above */
  348.                 if (error != PARSERUTILS_OK)
  349.                         return error;
  350.         }
  351.  
  352.         /* Finally, the "normal" case; process all outstanding characters */
  353.         while (*sourcelen > 0) {
  354.                 error = charset_utf8_codec_read_char(c,
  355.                                 source, sourcelen, dest, destlen);
  356.                 if (error != PARSERUTILS_OK) {
  357.                         return error;
  358.                 }
  359.         }
  360.  
  361.         return PARSERUTILS_OK;
  362. }
  363.  
  364. /**
  365.  * Clear a UTF-8 codec's encoding state
  366.  *
  367.  * \param codec  The codec to reset
  368.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  369.  */
  370. parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
  371. {
  372.         charset_utf8_codec *c = (charset_utf8_codec *) codec;
  373.  
  374.         c->inval_buf[0] = '\0';
  375.         c->inval_len = 0;
  376.  
  377.         c->read_buf[0] = 0;
  378.         c->read_len = 0;
  379.  
  380.         c->write_buf[0] = 0;
  381.         c->write_len = 0;
  382.  
  383.         return PARSERUTILS_OK;
  384. }
  385.  
  386.  
  387. /**
  388.  * Read a character from the UTF-8 to UCS-4 (big endian)
  389.  *
  390.  * \param c          The codec
  391.  * \param source     Pointer to pointer to source buffer (updated on exit)
  392.  * \param sourcelen  Pointer to length of source buffer (updated on exit)
  393.  * \param dest       Pointer to pointer to output buffer (updated on exit)
  394.  * \param destlen    Pointer to length of output buffer (updated on exit)
  395.  * \return PARSERUTILS_OK on success,
  396.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  397.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  398.  *                            codec's error handling mode is set to STRICT,
  399.  *
  400.  * On exit, ::source will point immediately _after_ the last input character
  401.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  402.  * character will be buffered by the codec for writing on the next call.
  403.  *
  404.  * In the case of the result being _INVALID, ::source will point _at_ the
  405.  * last input character read; nothing will be written or buffered for the
  406.  * failed character. It is up to the client to fix the cause of the failure
  407.  * and retry the decoding process.
  408.  *
  409.  * ::sourcelen will be reduced appropriately on exit.
  410.  *
  411.  * ::dest will point immediately _after_ the last character written.
  412.  *
  413.  * ::destlen will be reduced appropriately on exit.
  414.  */
  415. parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
  416.                 const uint8_t **source, size_t *sourcelen,
  417.                 uint8_t **dest, size_t *destlen)
  418. {
  419.         uint32_t ucs4;
  420.         size_t sucs4;
  421.         parserutils_error error;
  422.  
  423.         /* Convert a single character */
  424.         {
  425.                 const uint8_t *src = *source;
  426.                 size_t srclen = *sourcelen;
  427.                 uint32_t *uptr = &ucs4;
  428.                 size_t *usptr = &sucs4;
  429.                 UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
  430.         }
  431.         if (error == PARSERUTILS_OK) {
  432.                 /* Read a character */
  433.                 error = charset_utf8_codec_output_decoded_char(c,
  434.                                 ucs4, dest, destlen);
  435.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  436.                         /* output succeeded; update source pointers */
  437.                         *source += sucs4;
  438.                         *sourcelen -= sucs4;
  439.                 }
  440.  
  441.                 /* Clear inval buffer */
  442.                 c->inval_buf[0] = '\0';
  443.                 c->inval_len = 0;
  444.  
  445.                 return error;
  446.         } else if (error == PARSERUTILS_NEEDDATA) {
  447.                 /* Incomplete input sequence */
  448.                 if (*sourcelen > INVAL_BUFSIZE)
  449.                         abort();
  450.  
  451.                 memmove(c->inval_buf, *source, *sourcelen);
  452.                 c->inval_buf[*sourcelen] = '\0';
  453.                 c->inval_len = *sourcelen;
  454.  
  455.                 *source += *sourcelen;
  456.                 *sourcelen = 0;
  457.  
  458.                 return PARSERUTILS_OK;
  459.         } else if (error == PARSERUTILS_INVALID) {
  460.                 /* Illegal input sequence */
  461.                 uint32_t nextchar;
  462.  
  463.                 /* Strict errormode; simply flag invalid character */
  464.                 if (c->base.errormode ==
  465.                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
  466.                         /* Clear inval buffer */
  467.                         c->inval_buf[0] = '\0';
  468.                         c->inval_len = 0;
  469.  
  470.                         return PARSERUTILS_INVALID;
  471.                 }
  472.  
  473.                 /* Find next valid UTF-8 sequence.
  474.                  * We're processing client-provided data, so let's
  475.                  * be paranoid about its validity. */
  476.                 {
  477.                         const uint8_t *src = *source;
  478.                         size_t srclen = *sourcelen;
  479.                         uint32_t off = 0;
  480.                         uint32_t *ncptr = &nextchar;
  481.  
  482.                         UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
  483.                 }
  484.                 if (error != PARSERUTILS_OK) {
  485.                         if (error == PARSERUTILS_NEEDDATA) {
  486.                                 /* Need more data to be sure */
  487.                                 if (*sourcelen > INVAL_BUFSIZE)
  488.                                         abort();
  489.  
  490.                                 memmove(c->inval_buf, *source, *sourcelen);
  491.                                 c->inval_buf[*sourcelen] = '\0';
  492.                                 c->inval_len = *sourcelen;
  493.  
  494.                                 *source += *sourcelen;
  495.                                 *sourcelen = 0;
  496.  
  497.                                 nextchar = 0;
  498.                         } else {
  499.                                 return error;
  500.                         }
  501.                 }
  502.  
  503.                 /* Clear inval buffer */
  504.                 c->inval_buf[0] = '\0';
  505.                 c->inval_len = 0;
  506.  
  507.                 /* output U+FFFD and continue processing. */
  508.                 error = charset_utf8_codec_output_decoded_char(c,
  509.                                 0xFFFD, dest, destlen);
  510.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  511.                         /* output succeeded; update source pointers */
  512.                         *source += nextchar;
  513.                         *sourcelen -= nextchar;
  514.                 }
  515.  
  516.                 return error;
  517.         }
  518.  
  519.         return PARSERUTILS_OK;
  520. }
  521.  
  522. /**
  523.  * Output a UCS-4 character (big endian)
  524.  *
  525.  * \param c        Codec to use
  526.  * \param ucs4     UCS-4 character (host endian)
  527.  * \param dest     Pointer to pointer to output buffer
  528.  * \param destlen  Pointer to output buffer length
  529.  * \return PARSERUTILS_OK          on success,
  530.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  531.  */
  532. parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
  533.                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
  534. {
  535.         if (*destlen < 4) {
  536.                 /* Run out of output buffer */
  537.                 c->read_len = 1;
  538.                 c->read_buf[0] = ucs4;
  539.  
  540.                 return PARSERUTILS_NOMEM;
  541.         }
  542.  
  543.         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
  544.         *dest += 4;
  545.         *destlen -= 4;
  546.  
  547.         return PARSERUTILS_OK;
  548. }
  549.  
  550.  
/**
 * Handler descriptor for the UTF-8 codec.
 *
 * The initialiser is positional: it supplies, in this order, the function
 * that tests whether a charset name is handled and the function that
 * constructs a codec instance.
 */
const parserutils_charset_handler charset_utf8_codec_handler = {
        charset_utf8_codec_handles_charset,
        charset_utf8_codec_create
};
  555.  
  556.