Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. /** \file
  9.  * UTF-8 manipulation functions (implementation).
  10.  */
  11. #include <stdint.h>
  12. #include <stdbool.h>
  13. #include <stdlib.h>
  14. #include <string.h>
  15.  
  16. #include <parserutils/charset/utf8.h>
  17. #include "charset/encodings/utf8impl.h"
  18.  
  19. /** Number of continuation bytes for a given start byte */
  20. const uint8_t numContinuations[256] = {
  21.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  22.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  23.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  24.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  25.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  26.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  27.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  28.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  29.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  30.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  31.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  32.         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  33.         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34.         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35.         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  36.         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
  37. };
  38.  
  39. /**
  40.  * Convert a UTF-8 multibyte sequence into a single UCS-4 character
  41.  *
  42.  * Encoding of UCS values outside the UTF-16 plane has been removed from
  43.  * RFC3629. This function conforms to RFC2279, however.
  44.  *
  45.  * \param s     The sequence to process
  46.  * \param len   Length of sequence
  47.  * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
  48.  * \param clen  Pointer to location to receive byte length of UTF-8 sequence
  49.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  50.  */
  51. parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
  52.                 uint32_t *ucs4, size_t *clen)
  53. {
  54.         parserutils_error error;
  55.  
  56.         UTF8_TO_UCS4(s, len, ucs4, clen, error);
  57.  
  58.         return error;
  59. }
  60.  
  61. /**
  62.  * Convert a single UCS-4 character into a UTF-8 multibyte sequence
  63.  *
  64.  * Encoding of UCS values outside the UTF-16 plane has been removed from
  65.  * RFC3629. This function conforms to RFC2279, however.
  66.  *
  67.  * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
  68.  * \param s     Pointer to pointer to output buffer, updated on exit
  69.  * \param len   Pointer to length, in bytes, of output buffer, updated on exit
  70.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  71.  */
  72. parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
  73.                 uint8_t **s, size_t *len)
  74. {
  75.         parserutils_error error;
  76.  
  77.         UTF8_FROM_UCS4(ucs4, s, len, error);
  78.  
  79.         return error;
  80. }
  81.  
  82. /**
  83.  * Calculate the length (in characters) of a bounded UTF-8 string
  84.  *
  85.  * \param s    The string
  86.  * \param max  Maximum length
  87.  * \param len  Pointer to location to receive length of string
  88.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  89.  */
  90. parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
  91.                 size_t *len)
  92. {
  93.         parserutils_error error;
  94.  
  95.         UTF8_LENGTH(s, max, len, error);
  96.  
  97.         return error;
  98. }
  99.  
  100. /**
  101.  * Calculate the length (in bytes) of a UTF-8 character
  102.  *
  103.  * \param s    Pointer to start of character
  104.  * \param len  Pointer to location to receive length
  105.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  106.  */
  107. parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
  108.                 size_t *len)
  109. {
  110.         parserutils_error error;
  111.  
  112.         UTF8_CHAR_BYTE_LENGTH(s, len, error);
  113.  
  114.         return error;
  115. }
  116.  
  117. /**
  118.  * Find previous legal UTF-8 char in string
  119.  *
  120.  * \param s        The string
  121.  * \param off      Offset in the string to start at
  122.  * \param prevoff  Pointer to location to receive offset of first byte of
  123.  *                 previous legal character
  124.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  125.  */
  126. parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
  127.                 uint32_t *prevoff)
  128. {
  129.         parserutils_error error;
  130.  
  131.         UTF8_PREV(s, off, prevoff, error);
  132.  
  133.         return error;
  134. }
  135.  
  136. /**
  137.  * Find next legal UTF-8 char in string
  138.  *
  139.  * \param s        The string (assumed valid)
  140.  * \param len      Maximum offset in string
  141.  * \param off      Offset in the string to start at
  142.  * \param nextoff  Pointer to location to receive offset of first byte of
  143.  *                 next legal character
  144.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  145.  */
  146. parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
  147.                 uint32_t off, uint32_t *nextoff)
  148. {
  149.         parserutils_error error;
  150.  
  151.         UTF8_NEXT(s, len, off, nextoff, error);
  152.  
  153.         return error;
  154. }
  155.  
  156. /**
  157.  * Find next legal UTF-8 char in string
  158.  *
  159.  * \param s        The string (assumed to be of dubious validity)
  160.  * \param len      Maximum offset in string
  161.  * \param off      Offset in the string to start at
  162.  * \param nextoff  Pointer to location to receive offset of first byte of
  163.  *                 next legal character
  164.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  165.  */
  166. parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
  167.                 uint32_t len, uint32_t off, uint32_t *nextoff)
  168. {
  169.         parserutils_error error;
  170.  
  171.         UTF8_NEXT_PARANOID(s, len, off, nextoff, error);
  172.  
  173.         return error;
  174. }
  175.  
  176.