Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. #include <assert.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11.  
  12. #include <parserutils/charset/mibenum.h>
  13.  
  14. #include "charset/codecs/codec_impl.h"
  15. #include "utils/endian.h"
  16. #include "utils/utils.h"
  17.  
  18. #include "charset/codecs/8859_tables.h"
  19.  
  20. static struct {
  21.         uint16_t mib;
  22.         const char *name;
  23.         size_t len;
  24.         uint32_t *table;
  25. } known_charsets[] = {
  26.         { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
  27.         { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
  28.         { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
  29.         { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
  30.         { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
  31.         { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
  32.         { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
  33.         { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
  34.         { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
  35.         { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
  36.         { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
  37.         { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
  38.         { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
  39.         { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
  40.         { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
  41. };
  42.  
  43. /**
  44.  * ISO-8859-n charset codec
  45.  */
  46. typedef struct charset_8859_codec {
  47.         parserutils_charset_codec base; /**< Base class */
  48.  
  49.         uint32_t *table;                /**< Mapping table for 0xA0-0xFF */
  50.  
  51. #define READ_BUFSIZE (8)
  52.         uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
  53.                                                  * output sequences (decode)
  54.                                                  * (host-endian) */
  55.         size_t read_len;                /**< Character length of read_buf */
  56.  
  57. #define WRITE_BUFSIZE (8)
  58.         uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
  59.                                                  * output sequences (encode)
  60.                                                  * (host-endian) */
  61.         size_t write_len;               /**< Character length of write_buf */
  62.  
  63. } charset_8859_codec;
  64.  
  65. static bool charset_8859_codec_handles_charset(const char *charset);
  66. static parserutils_error charset_8859_codec_create(const char *charset,
  67.                 parserutils_alloc alloc, void *pw,
  68.                 parserutils_charset_codec **codec);
  69. static parserutils_error charset_8859_codec_destroy(
  70.                 parserutils_charset_codec *codec);
  71. static parserutils_error charset_8859_codec_encode(
  72.                 parserutils_charset_codec *codec,
  73.                 const uint8_t **source, size_t *sourcelen,
  74.                 uint8_t **dest, size_t *destlen);
  75. static parserutils_error charset_8859_codec_decode(
  76.                 parserutils_charset_codec *codec,
  77.                 const uint8_t **source, size_t *sourcelen,
  78.                 uint8_t **dest, size_t *destlen);
  79. static parserutils_error charset_8859_codec_reset(
  80.                 parserutils_charset_codec *codec);
  81. static inline parserutils_error charset_8859_codec_read_char(
  82.                 charset_8859_codec *c,
  83.                 const uint8_t **source, size_t *sourcelen,
  84.                 uint8_t **dest, size_t *destlen);
  85. static inline parserutils_error charset_8859_codec_output_decoded_char(
  86.                 charset_8859_codec *c,
  87.                 uint32_t ucs4, uint8_t **dest, size_t *destlen);
  88. static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
  89.                 uint32_t ucs4, uint8_t **s, size_t *len);
  90. static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
  91.                 const uint8_t *s, size_t len, uint32_t *ucs4);
  92.  
  93. /**
  94.  * Determine whether this codec handles a specific charset
  95.  *
  96.  * \param charset  Charset to test
  97.  * \return true if handleable, false otherwise
  98.  */
  99. bool charset_8859_codec_handles_charset(const char *charset)
  100. {
  101.         uint32_t i;
  102.         uint16_t match = parserutils_charset_mibenum_from_name(charset,
  103.                         strlen(charset));
  104.  
  105.         if (known_charsets[0].mib == 0) {
  106.                 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  107.                         known_charsets[i].mib =
  108.                                 parserutils_charset_mibenum_from_name(
  109.                                                 known_charsets[i].name,
  110.                                                 known_charsets[i].len);
  111.                 }
  112.         }
  113.  
  114.         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  115.                 if (known_charsets[i].mib == match)
  116.                         return true;
  117.         }
  118.  
  119.         return false;
  120. }
  121.  
  122. /**
  123.  * Create an ISO-8859-n codec
  124.  *
  125.  * \param charset  The charset to read from / write to
  126.  * \param alloc    Memory (de)allocation function
  127.  * \param pw       Pointer to client-specific private data (may be NULL)
  128.  * \param codec    Pointer to location to receive codec
  129.  * \return PARSERUTILS_OK on success,
  130.  *         PARSERUTILS_BADPARM on bad parameters,
  131.  *         PARSERUTILS_NOMEM on memory exhausion
  132.  */
  133. parserutils_error charset_8859_codec_create(const char *charset,
  134.                 parserutils_alloc alloc, void *pw,
  135.                 parserutils_charset_codec **codec)
  136. {
  137.         uint32_t i;
  138.         charset_8859_codec *c;
  139.         uint16_t match = parserutils_charset_mibenum_from_name(
  140.                         charset, strlen(charset));
  141.         uint32_t *table = NULL;
  142.  
  143.         for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
  144.                 if (known_charsets[i].mib == match) {
  145.                         table = known_charsets[i].table;
  146.                         break;
  147.                 }
  148.         }
  149.  
  150.         assert(table != NULL);
  151.  
  152.         c = alloc(NULL, sizeof(charset_8859_codec), pw);
  153.         if (c == NULL)
  154.                 return PARSERUTILS_NOMEM;
  155.  
  156.         c->table = table;
  157.  
  158.         c->read_buf[0] = 0;
  159.         c->read_len = 0;
  160.  
  161.         c->write_buf[0] = 0;
  162.         c->write_len = 0;
  163.  
  164.         /* Finally, populate vtable */
  165.         c->base.handler.destroy = charset_8859_codec_destroy;
  166.         c->base.handler.encode = charset_8859_codec_encode;
  167.         c->base.handler.decode = charset_8859_codec_decode;
  168.         c->base.handler.reset = charset_8859_codec_reset;
  169.  
  170.         *codec = (parserutils_charset_codec *) c;
  171.  
  172.         return PARSERUTILS_OK;
  173. }
  174.  
  175. /**
  176.  * Destroy an ISO-8859-n codec
  177.  *
  178.  * \param codec  The codec to destroy
  179.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  180.  */
  181. parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
  182. {
  183.         UNUSED(codec);
  184.  
  185.         return PARSERUTILS_OK;
  186. }
  187.  
  188. /**
  189.  * Encode a chunk of UCS-4 (big endian) data into ISO-8859-n
  190.  *
  191.  * \param codec      The codec to use
  192.  * \param source     Pointer to pointer to source data
  193.  * \param sourcelen  Pointer to length (in bytes) of source data
  194.  * \param dest       Pointer to pointer to output buffer
  195.  * \param destlen    Pointer to length (in bytes) of output buffer
  196.  * \return PARSERUTILS_OK          on success,
  197.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  198.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  199.  *                                 codec's error handling mode is set to STRICT,
  200.  *
  201.  * On exit, ::source will point immediately _after_ the last input character
  202.  * read. Any remaining output for the character will be buffered by the
  203.  * codec for writing on the next call.
  204.  *
  205.  * Note that, if failure occurs whilst attempting to write any output
  206.  * buffered by the last call, then ::source and ::sourcelen will remain
  207.  * unchanged (as nothing more has been read).
  208.  *
  209.  * ::sourcelen will be reduced appropriately on exit.
  210.  *
  211.  * ::dest will point immediately _after_ the last character written.
  212.  *
  213.  * ::destlen will be reduced appropriately on exit.
  214.  */
  215. parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec,
  216.                 const uint8_t **source, size_t *sourcelen,
  217.                 uint8_t **dest, size_t *destlen)
  218. {
  219.         charset_8859_codec *c = (charset_8859_codec *) codec;
  220.         uint32_t ucs4;
  221.         uint32_t *towrite;
  222.         size_t towritelen;
  223.         parserutils_error error;
  224.  
  225.         /* Process any outstanding characters from the previous call */
  226.         if (c->write_len > 0) {
  227.                 uint32_t *pwrite = c->write_buf;
  228.  
  229.                 while (c->write_len > 0) {
  230.                         error = charset_8859_from_ucs4(c, pwrite[0],
  231.                                         dest, destlen);
  232.                         if (error != PARSERUTILS_OK) {
  233.                                 uint32_t len;
  234.                                 assert(error == PARSERUTILS_NOMEM);
  235.  
  236.                                 for (len = 0; len < c->write_len; len++) {
  237.                                         c->write_buf[len] = pwrite[len];
  238.                                 }
  239.  
  240.                                 return error;
  241.                         }
  242.  
  243.                         pwrite++;
  244.                         c->write_len--;
  245.                 }
  246.         }
  247.  
  248.         /* Now process the characters for this call */
  249.         while (*sourcelen > 0) {
  250.                 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
  251.                 towrite = &ucs4;
  252.                 towritelen = 1;
  253.  
  254.                 /* Output current characters */
  255.                 while (towritelen > 0) {
  256.                         error = charset_8859_from_ucs4(c, towrite[0], dest,
  257.                                         destlen);
  258.                         if (error != PARSERUTILS_OK) {
  259.                                 uint32_t len;
  260.                                 if (error != PARSERUTILS_NOMEM) {
  261.                                         return error;
  262.                                 }
  263.  
  264.                                 /* Insufficient output space */
  265.                                 if (towritelen >= WRITE_BUFSIZE)
  266.                                         abort();
  267.  
  268.                                 c->write_len = towritelen;
  269.  
  270.                                 /* Copy pending chars to save area, for
  271.                                  * processing next call. */
  272.                                 for (len = 0; len < towritelen; len++)
  273.                                         c->write_buf[len] = towrite[len];
  274.  
  275.                                 /* Claim character we've just buffered,
  276.                                  * so it's not reprocessed */
  277.                                 *source += 4;
  278.                                 *sourcelen -= 4;
  279.  
  280.                                 return PARSERUTILS_NOMEM;
  281.                         }
  282.  
  283.                         towrite++;
  284.                         towritelen--;
  285.                 }
  286.  
  287.                 *source += 4;
  288.                 *sourcelen -= 4;
  289.         }
  290.  
  291.         return PARSERUTILS_OK;
  292. }
  293.  
  294. /**
  295.  * Decode a chunk of ISO-8859-n data into UCS-4 (big endian)
  296.  *
  297.  * \param codec      The codec to use
  298.  * \param source     Pointer to pointer to source data
  299.  * \param sourcelen  Pointer to length (in bytes) of source data
  300.  * \param dest       Pointer to pointer to output buffer
  301.  * \param destlen    Pointer to length (in bytes) of output buffer
  302.  * \return PARSERUTILS_OK          on success,
  303.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  304.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  305.  *                                 codec's error handling mode is set to STRICT,
  306.  *
  307.  * On exit, ::source will point immediately _after_ the last input character
  308.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  309.  * character will be buffered by the codec for writing on the next call.
  310.  *
  311.  * In the case of the result being _INVALID, ::source will point _at_ the
  312.  * last input character read; nothing will be written or buffered for the
  313.  * failed character. It is up to the client to fix the cause of the failure
  314.  * and retry the decoding process.
  315.  *
  316.  * Note that, if failure occurs whilst attempting to write any output
  317.  * buffered by the last call, then ::source and ::sourcelen will remain
  318.  * unchanged (as nothing more has been read).
  319.  *
  320.  * If STRICT error handling is configured and an illegal sequence is split
  321.  * over two calls, then _INVALID will be returned from the second call,
  322.  * but ::source will point mid-way through the invalid sequence (i.e. it
  323.  * will be unmodified over the second call). In addition, the internal
  324.  * incomplete-sequence buffer will be emptied, such that subsequent calls
  325.  * will progress, rather than re-evaluating the same invalid sequence.
  326.  *
  327.  * ::sourcelen will be reduced appropriately on exit.
  328.  *
  329.  * ::dest will point immediately _after_ the last character written.
  330.  *
  331.  * ::destlen will be reduced appropriately on exit.
  332.  *
  333.  * Call this with a source length of 0 to flush the output buffer.
  334.  */
  335. parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec,
  336.                 const uint8_t **source, size_t *sourcelen,
  337.                 uint8_t **dest, size_t *destlen)
  338. {
  339.         charset_8859_codec *c = (charset_8859_codec *) codec;
  340.         parserutils_error error;
  341.  
  342.         if (c->read_len > 0) {
  343.                 /* Output left over from last decode */
  344.                 uint32_t *pread = c->read_buf;
  345.  
  346.                 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
  347.                         *((uint32_t *) (void *) *dest) =
  348.                                         endian_host_to_big(pread[0]);
  349.  
  350.                         *dest += 4;
  351.                         *destlen -= 4;
  352.  
  353.                         pread++;
  354.                         c->read_len--;
  355.                 }
  356.  
  357.                 if (*destlen < c->read_len * 4) {
  358.                         /* Ran out of output buffer */
  359.                         size_t i;
  360.  
  361.                         /* Shuffle remaining output down */
  362.                         for (i = 0; i < c->read_len; i++)
  363.                                 c->read_buf[i] = pread[i];
  364.  
  365.                         return PARSERUTILS_NOMEM;
  366.                 }
  367.         }
  368.  
  369.         /* Finally, the "normal" case; process all outstanding characters */
  370.         while (*sourcelen > 0) {
  371.                 error = charset_8859_codec_read_char(c,
  372.                                 source, sourcelen, dest, destlen);
  373.                 if (error != PARSERUTILS_OK) {
  374.                         return error;
  375.                 }
  376.         }
  377.  
  378.         return PARSERUTILS_OK;
  379. }
  380.  
  381. /**
  382.  * Clear an ISO-8859-n codec's encoding state
  383.  *
  384.  * \param codec  The codec to reset
  385.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  386.  */
  387. parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
  388. {
  389.         charset_8859_codec *c = (charset_8859_codec *) codec;
  390.  
  391.         c->read_buf[0] = 0;
  392.         c->read_len = 0;
  393.  
  394.         c->write_buf[0] = 0;
  395.         c->write_len = 0;
  396.  
  397.         return PARSERUTILS_OK;
  398. }
  399.  
  400.  
  401. /**
  402.  * Read a character from the ISO-8859-n to UCS-4 (big endian)
  403.  *
  404.  * \param c          The codec
  405.  * \param source     Pointer to pointer to source buffer (updated on exit)
  406.  * \param sourcelen  Pointer to length of source buffer (updated on exit)
  407.  * \param dest       Pointer to pointer to output buffer (updated on exit)
  408.  * \param destlen    Pointer to length of output buffer (updated on exit)
  409.  * \return PARSERUTILS_OK on success,
  410.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  411.  *         PARSERUTILS_INVALID     if a character cannot be represented and the
  412.  *                                 codec's error handling mode is set to STRICT,
  413.  *
  414.  * On exit, ::source will point immediately _after_ the last input character
  415.  * read, if the result is _OK or _NOMEM. Any remaining output for the
  416.  * character will be buffered by the codec for writing on the next call.
  417.  *
  418.  * In the case of the result being _INVALID, ::source will point _at_ the
  419.  * last input character read; nothing will be written or buffered for the
  420.  * failed character. It is up to the client to fix the cause of the failure
  421.  * and retry the decoding process.
  422.  *
  423.  * ::sourcelen will be reduced appropriately on exit.
  424.  *
  425.  * ::dest will point immediately _after_ the last character written.
  426.  *
  427.  * ::destlen will be reduced appropriately on exit.
  428.  */
  429. parserutils_error charset_8859_codec_read_char(charset_8859_codec *c,
  430.                 const uint8_t **source, size_t *sourcelen,
  431.                 uint8_t **dest, size_t *destlen)
  432. {
  433.         uint32_t ucs4;
  434.         parserutils_error error;
  435.  
  436.         /* Convert a single character */
  437.         error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
  438.         if (error == PARSERUTILS_OK) {
  439.                 /* Read a character */
  440.                 error = charset_8859_codec_output_decoded_char(c,
  441.                                 ucs4, dest, destlen);
  442.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  443.                         /* output succeeded; update source pointers */
  444.                         *source += 1;
  445.                         *sourcelen -= 1;
  446.                 }
  447.  
  448.                 return error;
  449.         } else if (error == PARSERUTILS_NEEDDATA) {
  450.                 /* Can only happen if sourcelen == 0 */
  451.                 return error;
  452.         } else if (error == PARSERUTILS_INVALID) {
  453.                 /* Illegal input sequence */
  454.  
  455.                 /* Strict errormode; simply flag invalid character */
  456.                 if (c->base.errormode ==
  457.                                 PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
  458.                         return PARSERUTILS_INVALID;
  459.                 }
  460.  
  461.                 /* output U+FFFD and continue processing. */
  462.                 error = charset_8859_codec_output_decoded_char(c,
  463.                                 0xFFFD, dest, destlen);
  464.                 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
  465.                         /* output succeeded; update source pointers */
  466.                         *source += 1;
  467.                         *sourcelen -= 1;
  468.                 }
  469.  
  470.                 return error;
  471.         }
  472.  
  473.         return PARSERUTILS_OK;
  474. }
  475.  
  476. /**
  477.  * Output a UCS-4 character (big endian)
  478.  *
  479.  * \param c        Codec to use
  480.  * \param ucs4     UCS-4 character (host endian)
  481.  * \param dest     Pointer to pointer to output buffer
  482.  * \param destlen  Pointer to output buffer length
  483.  * \return PARSERUTILS_OK          on success,
  484.  *         PARSERUTILS_NOMEM       if output buffer is too small,
  485.  */
  486. parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c,
  487.                 uint32_t ucs4, uint8_t **dest, size_t *destlen)
  488. {
  489.         if (*destlen < 4) {
  490.                 /* Run out of output buffer */
  491.                 c->read_len = 1;
  492.                 c->read_buf[0] = ucs4;
  493.  
  494.                 return PARSERUTILS_NOMEM;
  495.         }
  496.  
  497.         *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
  498.         *dest += 4;
  499.         *destlen -= 4;
  500.  
  501.         return PARSERUTILS_OK;
  502. }
  503.  
  504. /**
  505.  * Convert a UCS4 (host endian) character to ISO-8859-n
  506.  *
  507.  * \param c     The codec instance
  508.  * \param ucs4  The UCS4 character to convert
  509.  * \param s     Pointer to pointer to destination buffer
  510.  * \param len   Pointer to destination buffer length
  511.  * \return PARSERUTILS_OK on success,
  512.  *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
  513.  *         PARSERUTILS_INVALID if the character cannot be represented
  514.  *
  515.  * _INVALID will only be returned if the codec's conversion mode is STRICT.
  516.  * Otherwise, '?' will be output.
  517.  *
  518.  * On successful conversion, *s and *len will be updated.
  519.  */
  520. parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
  521.                 uint32_t ucs4, uint8_t **s, size_t *len)
  522. {
  523.         uint8_t out = 0;
  524.  
  525.         if (*len < 1)
  526.                 return PARSERUTILS_NOMEM;
  527.  
  528.         if (ucs4 < 0x80) {
  529.                 /* ASCII */
  530.                 out = ucs4;
  531.         } else {
  532.                 uint32_t i;
  533.  
  534.                 for (i = 0; i < 96; i++) {
  535.                         if (ucs4 == c->table[i])
  536.                                 break;
  537.                 }
  538.  
  539.                 if (i == 96) {
  540.                         if (c->base.errormode ==
  541.                                         PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
  542.                                 return PARSERUTILS_INVALID;
  543.                         else
  544.                                 out = '?';
  545.                 } else {
  546.                         out = 0xA0 + i;
  547.                 }
  548.         }
  549.  
  550.         *(*s) = out;
  551.         (*s)++;
  552.         (*len)--;
  553.  
  554.         return PARSERUTILS_OK;
  555. }
  556.  
  557. /**
  558.  * Convert an ISO-8859-n character to UCS4 (host endian)
  559.  *
  560.  * \param c     The codec instance
  561.  * \param s     Pointer to source buffer
  562.  * \param len   Source buffer length
  563.  * \param ucs4  Pointer to destination buffer
  564.  * \return PARSERUTILS_OK on success,
  565.  *         PARSERUTILS_NEEDDATA if there's insufficient input data
  566.  *         PARSERUTILS_INVALID if the character cannot be represented
  567.  */
  568. parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
  569.                 const uint8_t *s, size_t len, uint32_t *ucs4)
  570. {
  571.         uint32_t out;
  572.  
  573.         if (len < 1)
  574.                 return PARSERUTILS_NEEDDATA;
  575.  
  576.         if (*s < 0x80) {
  577.                 out = *s;
  578.         } else if (*s >= 0xA0) {
  579.                 if (c->table[*s - 0xA0] == 0xFFFF)
  580.                         return PARSERUTILS_INVALID;
  581.  
  582.                 out = c->table[*s - 0xA0];
  583.         } else {
  584.                 return PARSERUTILS_INVALID;
  585.         }
  586.  
  587.         *ucs4 = out;
  588.  
  589.         return PARSERUTILS_OK;
  590. }
  591.  
  592. const parserutils_charset_handler charset_8859_codec_handler = {
  593.         charset_8859_codec_handles_charset,
  594.         charset_8859_codec_create
  595. };
  596.  
  597.