Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. #include <assert.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11.  
  12. #include <parserutils/charset/mibenum.h>
  13.  
  14. #include "charset/codecs/codec_impl.h"
  15. #include "utils/endian.h"
  16. #include "utils/utils.h"
  17.  
  18. /**
  19.  * US-ASCII charset codec
  20.  */
  21. typedef struct charset_ascii_codec {
  22.         parserutils_charset_codec base; /**< Base class */
  23.  
  24. #define READ_BUFSIZE (8)
  25.         uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
  26.                                                  * output sequences (decode)
  27.                                                  * (host-endian) */
  28.         size_t read_len;                /**< Character length of read_buf */
  29.  
  30. #define WRITE_BUFSIZE (8)
  31.         uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
  32.                                                  * output sequences (encode)
  33.                                                  * (host-endian) */
  34.         size_t write_len;               /**< Character length of write_buf */
  35.  
  36. } charset_ascii_codec;
  37.  
  38. static bool charset_ascii_codec_handles_charset(const char *charset);
  39. static parserutils_error charset_ascii_codec_create(
  40.                 const char *charset, parserutils_alloc alloc, void *pw,
  41.                 parserutils_charset_codec **codec);
  42. static parserutils_error charset_ascii_codec_destroy(
  43.                 parserutils_charset_codec *codec);
  44. static parserutils_error charset_ascii_codec_encode(
  45.                 parserutils_charset_codec *codec,
  46.                 const uint8_t **source, size_t *sourcelen,
  47.                 uint8_t **dest, size_t *destlen);
  48. static parserutils_error charset_ascii_codec_decode(
  49.                 parserutils_charset_codec *codec,
  50.                 const uint8_t **source, size_t *sourcelen,
  51.                 uint8_t **dest, size_t *destlen);
  52. static parserutils_error charset_ascii_codec_reset(
  53.                 parserutils_charset_codec *codec);
  54. static inline parserutils_error charset_ascii_codec_read_char(
  55.                 charset_ascii_codec *c,
  56.                 const uint8_t **source, size_t *sourcelen,
  57.                 uint8_t **dest, size_t *destlen);
  58. static inline parserutils_error charset_ascii_codec_output_decoded_char(
  59.                 charset_ascii_codec *c,
  60.                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
  61. static inline parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
  62.                 uint32_t ucs4, uint8_t **s, size_t *len);
  63. static inline parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
  64.                 const uint8_t *s, size_t len, uint32_t *ucs4);
  65.  
  66. /**
  67.  * Determine whether this codec handles a specific charset
  68.  *
  69.  * \param charset  Charset to test
  70.  * \return true if handleable, false otherwise
  71.  */
  72. bool charset_ascii_codec_handles_charset(const char *charset)
  73. {
  74.         static uint16_t ascii;
  75.         uint16_t match = parserutils_charset_mibenum_from_name(charset,
  76.                         strlen(charset));
  77.  
  78.         if (ascii == 0) {
  79.                 ascii = parserutils_charset_mibenum_from_name(
  80.                                 "US-ASCII", SLEN("US-ASCII"));
  81.         }
  82.  
  83.         if (ascii != 0 && ascii == match)
  84.                 return true;
  85.  
  86.         return false;
  87. }
  88.  
  89. /**
  90.  * Create a US-ASCII codec
  91.  *
  92.  * \param charset  The charset to read from / write to
  93.  * \param alloc    Memory (de)allocation function
  94.  * \param pw       Pointer to client-specific private data (may be NULL)
  95.  * \param codec    Pointer to location to receive codec
  96.  * \return PARSERUTILS_OK on success,
  97.  *         PARSERUTILS_BADPARM on bad parameters,
  98.  *         PARSERUTILS_NOMEM on memory exhausion
  99.  */
  100. parserutils_error charset_ascii_codec_create(const char *charset,
  101.                 parserutils_alloc alloc, void *pw,
  102.                 parserutils_charset_codec **codec)
  103. {
  104.         charset_ascii_codec *c;
  105.  
  106.         UNUSED(charset);
  107.  
  108.         c = alloc(NULL, sizeof(charset_ascii_codec), pw);
  109.         if (c == NULL)
  110.                 return PARSERUTILS_NOMEM;
  111.  
  112.         c->read_buf[0] = 0;
  113.         c->read_len = 0;
  114.  
  115.         c->write_buf[0] = 0;
  116.         c->write_len = 0;
  117.  
  118.         /* Finally, populate vtable */
  119.         c->base.handler.destroy = charset_ascii_codec_destroy;
  120.         c->base.handler.encode = charset_ascii_codec_encode;
  121.         c->base.handler.decode = charset_ascii_codec_decode;
  122.         c->base.handler.reset = charset_ascii_codec_reset;
  123.  
  124.         *codec = (parserutils_charset_codec *) c;
  125.  
  126.         return PARSERUTILS_OK;
  127. }
  128.  
  129. /**
  130.  * Destroy a US-ASCII codec
  131.  *
  132.  * \param codec  The codec to destroy
  133.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  134.  */
  135. parserutils_error charset_ascii_codec_destroy (parserutils_charset_codec *codec)
  136. {
  137.         UNUSED(codec);
  138.  
  139.         return PARSERUTILS_OK;
  140. }
  141.  
  142. /**
  143.  * Encode a chunk of UCS-4 (big endian) data into US-ASCII
  144.  *
  145.  * \param codec      The codec to use
  146.  * \param source     Pointer to pointer to source data
  147.  * \param sourcelen  Pointer to length (in bytes) of source data
  148.  * \param dest       Pointer to pointer to output buffer
  149.  * \param destlen    Pointer to length (in bytes) of output buffer
  150.  * \return PARSERUTILS_OK          on success,
  151.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  152.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  153.  *                                 codec's error handling mode is set to STRICT,
  154.  *
  155.  * On exit, ::source will point immediately _after_ the last input character
  156.  * read. Any remaining output for the character will be buffered by the
  157.  * codec for writing on the next call.
  158.  *
  159.  * Note that, if failure occurs whilst attempting to write any output
  160.  * buffered by the last call, then ::source and ::sourcelen will remain
  161.  * unchanged (as nothing more has been read).
  162.  *
  163.  * ::sourcelen will be reduced appropriately on exit.
  164.  *
  165.  * ::dest will point immediately _after_ the last character written.
  166.  *
  167.  * ::destlen will be reduced appropriately on exit.
  168.  */
  169. parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec,
  170.                 const uint8_t **source, size_t *sourcelen,
  171.                 uint8_t **dest, size_t *destlen)
  172. {
  173.         charset_ascii_codec *c = (charset_ascii_codec *) codec;
  174.         uint32_t ucs4;
  175.         uint32_t *towrite;
  176.         size_t towritelen;
  177.         parserutils_error error;
  178.  
  179.         /* Process any outstanding characters from the previous call */
  180.         if (c->write_len > 0) {
  181.                 uint32_t *pwrite = c->write_buf;
  182.  
  183.                 while (c->write_len > 0) {
  184.                         error = charset_ascii_from_ucs4(c, pwrite[0],
  185.                                         dest, destlen);
  186.                         if (error != PARSERUTILS_OK) {
  187.                                 uint32_t len;
  188.                                 assert(error == PARSERUTILS_NOMEM);
  189.  
  190.                                 for (len = 0; len < c->write_len; len++) {
  191.                                         c->write_buf[len] = pwrite[len];
  192.                                 }
  193.  
  194.                                 return error;
  195.                         }
  196.  
  197.                         pwrite++;
  198.                         c->write_len--;
  199.                 }
  200.         }
  201.  
  202.         /* Now process the characters for this call */
  203.         while (*sourcelen > 0) {
  204.                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
  205.                 towrite = &ucs4;
  206.                 towritelen = 1;
  207.  
  208.                 /* Output current characters */
  209.                 while (towritelen > 0) {
  210.                         error = charset_ascii_from_ucs4(c, towrite[0], dest,
  211.                                         destlen);
  212.                         if (error != PARSERUTILS_OK) {
  213.                                 uint32_t len;
  214.                                 if (error != PARSERUTILS_NOMEM) {
  215.                                         return error;
  216.                                 }
  217.  
  218.                                 /* Insufficient output space */
  219.                                 if (towritelen >= WRITE_BUFSIZE)
  220.                                         abort();
  221.  
  222.                                 c->write_len = towritelen;
  223.  
  224.                                 /* Copy pending chars to save area, for
  225.                                  * processing next call. */
  226.                                 for (len = 0; len < towritelen; len++)
  227.                                         c->write_buf[len] = towrite[len];
  228.  
  229.                                 /* Claim character we've just buffered,
  230.                                  * so it's not reprocessed */
  231.                                 *source += 4;
  232.                                 *sourcelen -= 4;
  233.  
  234.                                 return PARSERUTILS_NOMEM;
  235.                         }
  236.  
  237.                         towrite++;
  238.                         towritelen--;
  239.                 }
  240.  
  241.                 *source += 4;
  242.                 *sourcelen -= 4;
  243.         }
  244.  
  245.         return PARSERUTILS_OK;
  246. }
  247.  
  248. /**
  249.  * Decode a chunk of US-ASCII data into UCS-4 (big endian)
  250.  *
  251.  * \param codec      The codec to use
  252.  * \param source     Pointer to pointer to source data
  253.  * \param sourcelen  Pointer to length (in bytes) of source data
  254.  * \param dest       Pointer to pointer to output buffer
  255.  * \param destlen    Pointer to length (in bytes) of output buffer
  256.  * \return PARSERUTILS_OK          on success,
  257.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  258.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  259.  *                                 codec's error handling mode is set to STRICT,
  260.  *
  261.  * On exit, ::source will point immediately _after_ the last input character
  262.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  263.  * character will be buffered by the codec for writing on the next call.
  264.  *
  265.  * In the case of the result being _INVALID, ::source will point _at_ the
  266.  * last input character read; nothing will be written or buffered for the
  267.  * failed character. It is up to the client to fix the cause of the failure
  268.  * and retry the decoding process.
  269.  *
  270.  * Note that, if failure occurs whilst attempting to write any output
  271.  * buffered by the last call, then ::source and ::sourcelen will remain
  272.  * unchanged (as nothing more has been read).
  273.  *
  274.  * If STRICT error handling is configured and an illegal sequence is split
  275.  * over two calls, then _INVALID will be returned from the second call,
  276.  * but ::source will point mid-way through the invalid sequence (i.e. it
  277.  * will be unmodified over the second call). In addition, the internal
  278.  * incomplete-sequence buffer will be emptied, such that subsequent calls
  279.  * will progress, rather than re-evaluating the same invalid sequence.
  280.  *
  281.  * ::sourcelen will be reduced appropriately on exit.
  282.  *
  283.  * ::dest will point immediately _after_ the last character written.
  284.  *
  285.  * ::destlen will be reduced appropriately on exit.
  286.  *
  287.  * Call this with a source length of 0 to flush the output buffer.
  288.  */
  289. parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec,
  290.                 const uint8_t **source, size_t *sourcelen,
  291.                 uint8_t **dest, size_t *destlen)
  292. {
  293.         charset_ascii_codec *c = (charset_ascii_codec *) codec;
  294.         parserutils_error error;
  295.  
  296.         if (c->read_len > 0) {
  297.                 /* Output left over from last decode */
  298.                 uint32_t *pread = c->read_buf;
  299.  
  300.                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
  301.                         *((uint32_t *) (void *) *dest) =
  302.                                         endian_host_to_big(pread[0]);
  303.  
  304.                         *dest += 4;
  305.                         *destlen -= 4;
  306.  
  307.                         pread++;
  308.                         c->read_len--;
  309.                 }
  310.  
  311.                 if (*destlen < c->read_len * 4) {
  312.                         /* Ran out of output buffer */
  313.                         size_t i;
  314.  
  315.                         /* Shuffle remaining output down */
  316.                         for (i = 0; i < c->read_len; i++)
  317.                                 c->read_buf[i] = pread[i];
  318.  
  319.                         return PARSERUTILS_NOMEM;
  320.                 }
  321.         }
  322.  
  323.         /* Finally, the "normal" case; process all outstanding characters */
  324.         while (*sourcelen > 0) {
  325.                 error = charset_ascii_codec_read_char(c,
  326.                                 source, sourcelen, dest, destlen);
  327.                 if (error != PARSERUTILS_OK) {
  328.                         return error;
  329.                 }
  330.         }
  331.  
  332.         return PARSERUTILS_OK;
  333. }
  334.  
  335. /**
  336.  * Clear a US-ASCII codec's encoding state
  337.  *
  338.  * \param codec  The codec to reset
  339.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  340.  */
  341. parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
  342. {
  343.         charset_ascii_codec *c = (charset_ascii_codec *) codec;
  344.  
  345.         c->read_buf[0] = 0;
  346.         c->read_len = 0;
  347.  
  348.         c->write_buf[0] = 0;
  349.         c->write_len = 0;
  350.  
  351.         return PARSERUTILS_OK;
  352. }
  353.  
  354.  
  355. /**
  356.  * Read a character from US-ASCII to UCS-4 (big endian)
  357.  *
  358.  * \param c          The codec
  359.  * \param source     Pointer to pointer to source buffer (updated on exit)
  360.  * \param sourcelen  Pointer to length of source buffer (updated on exit)
  361.  * \param dest       Pointer to pointer to output buffer (updated on exit)
  362.  * \param destlen    Pointer to length of output buffer (updated on exit)
  363.  * \return PARSERUTILS_OK on success,
  364.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  365.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  366.  *                                 codec's error handling mode is set to STRICT,
  367.  *
  368.  * On exit, ::source will point immediately _after_ the last input character
  369.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  370.  * character will be buffered by the codec for writing on the next call.
  371.  *
  372.  * In the case of the result being _INVALID, ::source will point _at_ the
  373.  * last input character read; nothing will be written or buffered for the
  374.  * failed character. It is up to the client to fix the cause of the failure
  375.  * and retry the decoding process.
  376.  *
  377.  * ::sourcelen will be reduced appropriately on exit.
  378.  *
  379.  * ::dest will point immediately _after_ the last character written.
  380.  *
  381.  * ::destlen will be reduced appropriately on exit.
  382.  */
  383. parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c,
  384.                 const uint8_t **source, size_t *sourcelen,
  385.                 uint8_t **dest, size_t *destlen)
  386. {
  387.         uint32_t ucs4;
  388.         parserutils_error error;
  389.  
  390.         /* Convert a single character */
  391.         error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4);
  392.         if (error == PARSERUTILS_OK) {
  393.                 /* Read a character */
  394.                 error = charset_ascii_codec_output_decoded_char(c,
  395.                                 ucs4, dest, destlen);
  396.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  397.                         /* output succeeded; update source pointers */
  398.                         *source += 1;
  399.                         *sourcelen -= 1;
  400.                 }
  401.  
  402.                 return error;
  403.         } else if (error == PARSERUTILS_NEEDDATA) {
  404.                 /* Can only happen if sourcelen == 0 */
  405.                 return error;
  406.         } else if (error == PARSERUTILS_INVALID) {
  407.                 /* Illegal input sequence */
  408.  
  409.                 /* Strict errormode; simply flag invalid character */
  410.                 if (c->base.errormode ==
  411.                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
  412.                         return PARSERUTILS_INVALID;
  413.                 }
  414.  
  415.                 /* output U+FFFD and continue processing. */
  416.                 error = charset_ascii_codec_output_decoded_char(c,
  417.                                 0xFFFD, dest, destlen);
  418.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  419.                         /* output succeeded; update source pointers */
  420.                         *source += 1;
  421.                         *sourcelen -= 1;
  422.                 }
  423.  
  424.                 return error;
  425.         }
  426.  
  427.         return PARSERUTILS_OK;
  428. }
  429.  
  430. /**
  431.  * Output a UCS-4 character (big endian)
  432.  *
  433.  * \param c        Codec to use
  434.  * \param ucs4     UCS-4 character (host endian)
  435.  * \param dest     Pointer to pointer to output buffer
  436.  * \param destlen  Pointer to output buffer length
  437.  * \return PARSERUTILS_OK          on success,
  438.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  439.  */
  440. parserutils_error charset_ascii_codec_output_decoded_char(
  441.                 charset_ascii_codec *c,
  442.                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
  443. {
  444.         if (*destlen < 4) {
  445.                 /* Run out of output buffer */
  446.                 c->read_len = 1;
  447.                 c->read_buf[0] = ucs4;
  448.  
  449.                 return PARSERUTILS_NOMEM;
  450.         }
  451.  
  452.         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
  453.         *dest += 4;
  454.         *destlen -= 4;
  455.  
  456.         return PARSERUTILS_OK;
  457. }
  458.  
  459. /**
  460.  * Convert a UCS4 (host endian) character to US-ASCII
  461.  *
  462.  * \param c     The codec instance
  463.  * \param ucs4  The UCS4 character to convert
  464.  * \param s     Pointer to pointer to destination buffer
  465.  * \param len   Pointer to destination buffer length
  466.  * \return PARSERUTILS_OK on success,
  467.  *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
  468.  *         PARSERUTILS_INVALID if the character cannot be represented
  469.  *
  470.  * _INVALID will only be returned if the codec's conversion mode is STRICT.
  471.  * Otherwise, '?' will be output.
  472.  *
  473.  * On successful conversion, *s and *len will be updated.
  474.  */
  475. parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
  476.                 uint32_t ucs4, uint8_t **s, size_t *len)
  477. {
  478.         uint8_t out = 0;
  479.  
  480.         if (*len < 1)
  481.                 return PARSERUTILS_NOMEM;
  482.  
  483.         if (ucs4 < 0x80) {
  484.                 /* ASCII */
  485.                 out = ucs4;
  486.         } else {
  487.                 if (c->base.errormode == PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
  488.                         return PARSERUTILS_INVALID;
  489.                 else
  490.                         out = '?';
  491.         }
  492.  
  493.         *(*s) = out;
  494.         (*s)++;
  495.         (*len)--;
  496.  
  497.         return PARSERUTILS_OK;
  498. }
  499.  
  500. /**
  501.  * Convert a US-ASCII character to UCS4 (host endian)
  502.  *
  503.  * \param c     The codec instance
  504.  * \param s     Pointer to source buffer
  505.  * \param len   Source buffer length
  506.  * \param ucs4  Pointer to destination buffer
  507.  * \return PARSERUTILS_OK on success,
  508.  *         PARSERUTILS_NEEDDATA if there's insufficient input data
  509.  *         PARSERUTILS_INVALID if the character cannot be represented
  510.  */
  511. parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
  512.                 const uint8_t *s, size_t len, uint32_t *ucs4)
  513. {
  514.         uint32_t out;
  515.  
  516.         UNUSED(c);
  517.  
  518.         if (len < 1)
  519.                 return PARSERUTILS_NEEDDATA;
  520.  
  521.         if (*s < 0x80) {
  522.                 out = *s;
  523.         } else {
  524.                 return PARSERUTILS_INVALID;
  525.         }
  526.  
  527.         *ucs4 = out;
  528.  
  529.         return PARSERUTILS_OK;
  530. }
  531.  
  532. const parserutils_charset_handler charset_ascii_codec_handler = {
  533.         charset_ascii_codec_handles_charset,
  534.         charset_ascii_codec_create
  535. };
  536.  
  537.