Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. #include <assert.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11.  
  12. #include <parserutils/charset/mibenum.h>
  13.  
  14. #include "charset/codecs/codec_impl.h"
  15. #include "utils/endian.h"
  16. #include "utils/utils.h"
  17.  
  18. #include "charset/codecs/ext8_tables.h"
  19.  
  20. static struct {
  21.         uint16_t mib;
  22.         const char *name;
  23.         size_t len;
  24.         uint32_t *table;
  25. } known_charsets[] = {
  26.         { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
  27.         { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
  28.         { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
  29.         { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
  30.         { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
  31.         { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
  32.         { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
  33.         { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
  34.         { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
  35. };
  36.  
  37. /**
  38.  * Windows charset codec
  39.  */
  40. typedef struct charset_ext8_codec {
  41.         parserutils_charset_codec base; /**< Base class */
  42.  
  43.         uint32_t *table;                /**< Mapping table for 0x80-0xFF */
  44.  
  45. #define READ_BUFSIZE (8)
  46.         uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
  47.                                                  * output sequences (decode)
  48.                                                  * (host-endian) */
  49.         size_t read_len;                /**< Character length of read_buf */
  50.  
  51. #define WRITE_BUFSIZE (8)
  52.         uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
  53.                                                  * output sequences (encode)
  54.                                                  * (host-endian) */
  55.         size_t write_len;               /**< Character length of write_buf */
  56.  
  57. } charset_ext8_codec;
  58.  
  59. static bool charset_ext8_codec_handles_charset(const char *charset);
  60. static parserutils_error charset_ext8_codec_create(const char *charset,
  61.                 parserutils_alloc alloc, void *pw,
  62.                 parserutils_charset_codec **codec);
  63. static parserutils_error charset_ext8_codec_destroy(
  64.                 parserutils_charset_codec *codec);
  65. static parserutils_error charset_ext8_codec_encode(
  66.                 parserutils_charset_codec *codec,
  67.                 const uint8_t **source, size_t *sourcelen,
  68.                 uint8_t **dest, size_t *destlen);
  69. static parserutils_error charset_ext8_codec_decode(
  70.                 parserutils_charset_codec *codec,
  71.                 const uint8_t **source, size_t *sourcelen,
  72.                 uint8_t **dest, size_t *destlen);
  73. static parserutils_error charset_ext8_codec_reset(
  74.                 parserutils_charset_codec *codec);
  75. static inline parserutils_error charset_ext8_codec_read_char(
  76.                 charset_ext8_codec *c,
  77.                 const uint8_t **source, size_t *sourcelen,
  78.                 uint8_t **dest, size_t *destlen);
  79. static inline parserutils_error charset_ext8_codec_output_decoded_char(
  80.                 charset_ext8_codec *c,
  81.                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
  82. static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
  83.                 uint32_t ucs4, uint8_t **s, size_t *len);
  84. static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
  85.                 const uint8_t *s, size_t len, uint32_t *ucs4);
  86.  
  87. /**
  88.  * Determine whether this codec handles a specific charset
  89.  *
  90.  * \param charset  Charset to test
  91.  * \return true if handleable, false otherwise
  92.  */
  93. bool charset_ext8_codec_handles_charset(const char *charset)
  94. {
  95.         uint32_t i;
  96.         uint16_t match = parserutils_charset_mibenum_from_name(charset,
  97.                         strlen(charset));
  98.  
  99.         if (known_charsets[0].mib == 0) {
  100.                 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  101.                         known_charsets[i].mib =
  102.                                 parserutils_charset_mibenum_from_name(
  103.                                                 known_charsets[i].name,
  104.                                                 known_charsets[i].len);
  105.                 }
  106.         }
  107.  
  108.         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  109.                 if (known_charsets[i].mib == match)
  110.                         return true;
  111.         }
  112.  
  113.         return false;
  114. }
  115.  
  116. /**
  117.  * Create an extended 8bit codec
  118.  *
  119.  * \param charset  The charset to read from / write to
  120.  * \param alloc    Memory (de)allocation function
  121.  * \param pw       Pointer to client-specific private data (may be NULL)
  122.  * \param codec    Pointer to location to receive codec
  123.  * \return PARSERUTILS_OK on success,
  124.  *         PARSERUTILS_BADPARM on bad parameters,
  125.  *         PARSERUTILS_NOMEM on memory exhausion
  126.  */
  127. parserutils_error charset_ext8_codec_create(const char *charset,
  128.                 parserutils_alloc alloc, void *pw,
  129.                 parserutils_charset_codec **codec)
  130. {
  131.         uint32_t i;
  132.         charset_ext8_codec *c;
  133.         uint16_t match = parserutils_charset_mibenum_from_name(
  134.                         charset, strlen(charset));
  135.         uint32_t *table = NULL;
  136.  
  137.         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  138.                 if (known_charsets[i].mib == match) {
  139.                         table = known_charsets[i].table;
  140.                         break;
  141.                 }
  142.         }
  143.  
  144.         assert(table != NULL);
  145.  
  146.         c = alloc(NULL, sizeof(charset_ext8_codec), pw);
  147.         if (c == NULL)
  148.                 return PARSERUTILS_NOMEM;
  149.  
  150.         c->table = table;
  151.  
  152.         c->read_buf[0] = 0;
  153.         c->read_len = 0;
  154.  
  155.         c->write_buf[0] = 0;
  156.         c->write_len = 0;
  157.  
  158.         /* Finally, populate vtable */
  159.         c->base.handler.destroy = charset_ext8_codec_destroy;
  160.         c->base.handler.encode = charset_ext8_codec_encode;
  161.         c->base.handler.decode = charset_ext8_codec_decode;
  162.         c->base.handler.reset = charset_ext8_codec_reset;
  163.  
  164.         *codec = (parserutils_charset_codec *) c;
  165.  
  166.         return PARSERUTILS_OK;
  167. }
  168.  
  169. /**
  170.  * Destroy an extended 8bit codec
  171.  *
  172.  * \param codec  The codec to destroy
  173.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  174.  */
  175. parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
  176. {
  177.         UNUSED(codec);
  178.  
  179.         return PARSERUTILS_OK;
  180. }
  181.  
  182. /**
  183.  * Encode a chunk of UCS-4 (big endian) data into extended 8bit
  184.  *
  185.  * \param codec      The codec to use
  186.  * \param source     Pointer to pointer to source data
  187.  * \param sourcelen  Pointer to length (in bytes) of source data
  188.  * \param dest       Pointer to pointer to output buffer
  189.  * \param destlen    Pointer to length (in bytes) of output buffer
  190.  * \return PARSERUTILS_OK          on success,
  191.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  192.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  193.  *                                 codec's error handling mode is set to STRICT,
  194.  *
  195.  * On exit, ::source will point immediately _after_ the last input character
  196.  * read. Any remaining output for the character will be buffered by the
  197.  * codec for writing on the next call.
  198.  *
  199.  * Note that, if failure occurs whilst attempting to write any output
  200.  * buffered by the last call, then ::source and ::sourcelen will remain
  201.  * unchanged (as nothing more has been read).
  202.  *
  203.  * ::sourcelen will be reduced appropriately on exit.
  204.  *
  205.  * ::dest will point immediately _after_ the last character written.
  206.  *
  207.  * ::destlen will be reduced appropriately on exit.
  208.  */
  209. parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
  210.                 const uint8_t **source, size_t *sourcelen,
  211.                 uint8_t **dest, size_t *destlen)
  212. {
  213.         charset_ext8_codec *c = (charset_ext8_codec *) codec;
  214.         uint32_t ucs4;
  215.         uint32_t *towrite;
  216.         size_t towritelen;
  217.         parserutils_error error;
  218.  
  219.         /* Process any outstanding characters from the previous call */
  220.         if (c->write_len > 0) {
  221.                 uint32_t *pwrite = c->write_buf;
  222.  
  223.                 while (c->write_len > 0) {
  224.                         error = charset_ext8_from_ucs4(c, pwrite[0],
  225.                                         dest, destlen);
  226.                         if (error != PARSERUTILS_OK) {
  227.                                 uint32_t len;
  228.                                 assert(error == PARSERUTILS_NOMEM);
  229.  
  230.                                 for (len = 0; len < c->write_len; len++) {
  231.                                         c->write_buf[len] = pwrite[len];
  232.                                 }
  233.  
  234.                                 return error;
  235.                         }
  236.  
  237.                         pwrite++;
  238.                         c->write_len--;
  239.                 }
  240.         }
  241.  
  242.         /* Now process the characters for this call */
  243.         while (*sourcelen > 0) {
  244.                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
  245.                 towrite = &ucs4;
  246.                 towritelen = 1;
  247.  
  248.                 /* Output current characters */
  249.                 while (towritelen > 0) {
  250.                         error = charset_ext8_from_ucs4(c, towrite[0], dest,
  251.                                         destlen);
  252.                         if (error != PARSERUTILS_OK) {
  253.                                 uint32_t len;
  254.                                 if (error != PARSERUTILS_NOMEM) {
  255.                                         return error;
  256.                                 }
  257.  
  258.                                 /* Insufficient output space */
  259.                                 if (towritelen >= WRITE_BUFSIZE)
  260.                                         abort();
  261.  
  262.                                 c->write_len = towritelen;
  263.  
  264.                                 /* Copy pending chars to save area, for
  265.                                  * processing next call. */
  266.                                 for (len = 0; len < towritelen; len++)
  267.                                         c->write_buf[len] = towrite[len];
  268.  
  269.                                 /* Claim character we've just buffered,
  270.                                  * so it's not reprocessed */
  271.                                 *source += 4;
  272.                                 *sourcelen -= 4;
  273.  
  274.                                 return PARSERUTILS_NOMEM;
  275.                         }
  276.  
  277.                         towrite++;
  278.                         towritelen--;
  279.                 }
  280.  
  281.                 *source += 4;
  282.                 *sourcelen -= 4;
  283.         }
  284.  
  285.         return PARSERUTILS_OK;
  286. }
  287.  
  288. /**
  289.  * Decode a chunk of extended 8bit data into UCS-4 (big endian)
  290.  *
  291.  * \param codec      The codec to use
  292.  * \param source     Pointer to pointer to source data
  293.  * \param sourcelen  Pointer to length (in bytes) of source data
  294.  * \param dest       Pointer to pointer to output buffer
  295.  * \param destlen    Pointer to length (in bytes) of output buffer
  296.  * \return PARSERUTILS_OK          on success,
  297.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  298.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  299.  *                                 codec's error handling mode is set to STRICT,
  300.  *
  301.  * On exit, ::source will point immediately _after_ the last input character
  302.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  303.  * character will be buffered by the codec for writing on the next call.
  304.  *
  305.  * In the case of the result being _INVALID, ::source will point _at_ the
  306.  * last input character read; nothing will be written or buffered for the
  307.  * failed character. It is up to the client to fix the cause of the failure
  308.  * and retry the decoding process.
  309.  *
  310.  * Note that, if failure occurs whilst attempting to write any output
  311.  * buffered by the last call, then ::source and ::sourcelen will remain
  312.  * unchanged (as nothing more has been read).
  313.  *
  314.  * If STRICT error handling is configured and an illegal sequence is split
  315.  * over two calls, then _INVALID will be returned from the second call,
  316.  * but ::source will point mid-way through the invalid sequence (i.e. it
  317.  * will be unmodified over the second call). In addition, the internal
  318.  * incomplete-sequence buffer will be emptied, such that subsequent calls
  319.  * will progress, rather than re-evaluating the same invalid sequence.
  320.  *
  321.  * ::sourcelen will be reduced appropriately on exit.
  322.  *
  323.  * ::dest will point immediately _after_ the last character written.
  324.  *
  325.  * ::destlen will be reduced appropriately on exit.
  326.  *
  327.  * Call this with a source length of 0 to flush the output buffer.
  328.  */
  329. parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
  330.                 const uint8_t **source, size_t *sourcelen,
  331.                 uint8_t **dest, size_t *destlen)
  332. {
  333.         charset_ext8_codec *c = (charset_ext8_codec *) codec;
  334.         parserutils_error error;
  335.  
  336.         if (c->read_len > 0) {
  337.                 /* Output left over from last decode */
  338.                 uint32_t *pread = c->read_buf;
  339.  
  340.                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
  341.                         *((uint32_t *) (void *) *dest) =
  342.                                         endian_host_to_big(pread[0]);
  343.  
  344.                         *dest += 4;
  345.                         *destlen -= 4;
  346.  
  347.                         pread++;
  348.                         c->read_len--;
  349.                 }
  350.  
  351.                 if (*destlen < c->read_len * 4) {
  352.                         /* Ran out of output buffer */
  353.                         size_t i;
  354.  
  355.                         /* Shuffle remaining output down */
  356.                         for (i = 0; i < c->read_len; i++)
  357.                                 c->read_buf[i] = pread[i];
  358.  
  359.                         return PARSERUTILS_NOMEM;
  360.                 }
  361.         }
  362.  
  363.         /* Finally, the "normal" case; process all outstanding characters */
  364.         while (*sourcelen > 0) {
  365.                 error = charset_ext8_codec_read_char(c,
  366.                                 source, sourcelen, dest, destlen);
  367.                 if (error != PARSERUTILS_OK) {
  368.                         return error;
  369.                 }
  370.         }
  371.  
  372.         return PARSERUTILS_OK;
  373. }
  374.  
  375. /**
  376.  * Clear an extended 8bit codec's encoding state
  377.  *
  378.  * \param codec  The codec to reset
  379.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  380.  */
  381. parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
  382. {
  383.         charset_ext8_codec *c = (charset_ext8_codec *) codec;
  384.  
  385.         c->read_buf[0] = 0;
  386.         c->read_len = 0;
  387.  
  388.         c->write_buf[0] = 0;
  389.         c->write_len = 0;
  390.  
  391.         return PARSERUTILS_OK;
  392. }
  393.  
  394.  
  395. /**
  396.  * Read a character from the extended 8bit to UCS-4 (big endian)
  397.  *
  398.  * \param c          The codec
  399.  * \param source     Pointer to pointer to source buffer (updated on exit)
  400.  * \param sourcelen  Pointer to length of source buffer (updated on exit)
  401.  * \param dest       Pointer to pointer to output buffer (updated on exit)
  402.  * \param destlen    Pointer to length of output buffer (updated on exit)
  403.  * \return PARSERUTILS_OK on success,
  404.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  405.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  406.  *                                 codec's error handling mode is set to STRICT,
  407.  *
  408.  * On exit, ::source will point immediately _after_ the last input character
  409.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  410.  * character will be buffered by the codec for writing on the next call.
  411.  *
  412.  * In the case of the result being _INVALID, ::source will point _at_ the
  413.  * last input character read; nothing will be written or buffered for the
  414.  * failed character. It is up to the client to fix the cause of the failure
  415.  * and retry the decoding process.
  416.  *
  417.  * ::sourcelen will be reduced appropriately on exit.
  418.  *
  419.  * ::dest will point immediately _after_ the last character written.
  420.  *
  421.  * ::destlen will be reduced appropriately on exit.
  422.  */
  423. parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
  424.                 const uint8_t **source, size_t *sourcelen,
  425.                 uint8_t **dest, size_t *destlen)
  426. {
  427.         uint32_t ucs4;
  428.         parserutils_error error;
  429.  
  430.         /* Convert a single character */
  431.         error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
  432.         if (error == PARSERUTILS_OK) {
  433.                 /* Read a character */
  434.                 error = charset_ext8_codec_output_decoded_char(c,
  435.                                 ucs4, dest, destlen);
  436.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  437.                         /* output succeeded; update source pointers */
  438.                         *source += 1;
  439.                         *sourcelen -= 1;
  440.                 }
  441.  
  442.                 return error;
  443.         } else if (error == PARSERUTILS_NEEDDATA) {
  444.                 /* Can only happen if sourcelen == 0 */
  445.                 return error;
  446.         } else if (error == PARSERUTILS_INVALID) {
  447.                 /* Illegal input sequence */
  448.  
  449.                 /* Strict errormode; simply flag invalid character */
  450.                 if (c->base.errormode ==
  451.                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
  452.                         return PARSERUTILS_INVALID;
  453.                 }
  454.  
  455.                 /* output U+FFFD and continue processing. */
  456.                 error = charset_ext8_codec_output_decoded_char(c,
  457.                                 0xFFFD, dest, destlen);
  458.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  459.                         /* output succeeded; update source pointers */
  460.                         *source += 1;
  461.                         *sourcelen -= 1;
  462.                 }
  463.  
  464.                 return error;
  465.         }
  466.  
  467.         return PARSERUTILS_OK;
  468. }
  469.  
  470. /**
  471.  * Output a UCS-4 character (big endian)
  472.  *
  473.  * \param c        Codec to use
  474.  * \param ucs4     UCS-4 character (host endian)
  475.  * \param dest     Pointer to pointer to output buffer
  476.  * \param destlen  Pointer to output buffer length
  477.  * \return PARSERUTILS_OK          on success,
  478.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  479.  */
  480. parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
  481.                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
  482. {
  483.         if (*destlen < 4) {
  484.                 /* Run out of output buffer */
  485.                 c->read_len = 1;
  486.                 c->read_buf[0] = ucs4;
  487.  
  488.                 return PARSERUTILS_NOMEM;
  489.         }
  490.  
  491.         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
  492.         *dest += 4;
  493.         *destlen -= 4;
  494.  
  495.         return PARSERUTILS_OK;
  496. }
  497.  
  498. /**
  499.  * Convert a UCS4 (host endian) character to extended 8bit
  500.  *
  501.  * \param c     The codec instance
  502.  * \param ucs4  The UCS4 character to convert
  503.  * \param s     Pointer to pointer to destination buffer
  504.  * \param len   Pointer to destination buffer length
  505.  * \return PARSERUTILS_OK on success,
  506.  *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
  507.  *         PARSERUTILS_INVALID if the character cannot be represented
  508.  *
  509.  * _INVALID will only be returned if the codec's conversion mode is STRICT.
  510.  * Otherwise, '?' will be output.
  511.  *
  512.  * On successful conversion, *s and *len will be updated.
  513.  */
  514. parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
  515.                 uint32_t ucs4, uint8_t **s, size_t *len)
  516. {
  517.         uint8_t out = 0;
  518.  
  519.         if (*len < 1)
  520.                 return PARSERUTILS_NOMEM;
  521.  
  522.         if (ucs4 < 0x80) {
  523.                 /* ASCII */
  524.                 out = ucs4;
  525.         } else {
  526.                 uint32_t i;
  527.  
  528.                 for (i = 0; i < 128; i++) {
  529.                         if (ucs4 == c->table[i])
  530.                                 break;
  531.                 }
  532.  
  533.                 if (i == 128) {
  534.                         if (c->base.errormode ==
  535.                                         PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
  536.                                 return PARSERUTILS_INVALID;
  537.                         else
  538.                                 out = '?';
  539.                 } else {
  540.                         out = 0x80 + i;
  541.                 }
  542.         }
  543.  
  544.         *(*s) = out;
  545.         (*s)++;
  546.         (*len)--;
  547.  
  548.         return PARSERUTILS_OK;
  549. }
  550.  
  551. /**
  552.  * Convert an extended 8bit character to UCS4 (host endian)
  553.  *
  554.  * \param c     The codec instance
  555.  * \param s     Pointer to source buffer
  556.  * \param len   Source buffer length
  557.  * \param ucs4  Pointer to destination buffer
  558.  * \return PARSERUTILS_OK on success,
  559.  *         PARSERUTILS_NEEDDATA if there's insufficient input data
  560.  *         PARSERUTILS_INVALID if the character cannot be represented
  561.  */
  562. parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
  563.                 const uint8_t *s, size_t len, uint32_t *ucs4)
  564. {
  565.         uint32_t out;
  566.  
  567.         if (len < 1)
  568.                 return PARSERUTILS_NEEDDATA;
  569.  
  570.         if (*s < 0x80) {
  571.                 out = *s;
  572.         } else {
  573.                 if (c->table[*s - 0x80] == 0xFFFF)
  574.                         return PARSERUTILS_INVALID;
  575.  
  576.                 out = c->table[*s - 0x80];
  577.         }
  578.  
  579.         *ucs4 = out;
  580.  
  581.         return PARSERUTILS_OK;
  582. }
  583.  
  584. const parserutils_charset_handler charset_ext8_codec_handler = {
  585.         charset_ext8_codec_handles_charset,
  586.         charset_ext8_codec_create
  587. };
  588.  
  589.