Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * This file is part of LibParserUtils.
  3.  * Licensed under the MIT License,
  4.  *                http://www.opensource.org/licenses/mit-license.php
  5.  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
  6.  */
  7.  
  8. /** \file
  9.  * UTF-16 manipulation functions (implementation).
  10.  */
  11.  
  12. #include <stdbool.h>
  13. #include <stdlib.h>
  14. #include <string.h>
  15. #include <stdint.h>
  16.  
  17. #include <parserutils/charset/utf16.h>
  18.  
  19. /**
  20.  * Convert a UTF-16 sequence into a single UCS-4 character
  21.  *
  22.  * \param s     The sequence to process
  23.  * \param len   Length of sequence in bytes
  24.  * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
  25.  * \param clen  Pointer to location to receive byte length of UTF-16 sequence
  26.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  27.  */
  28. parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
  29.                 size_t len, uint32_t *ucs4, size_t *clen)
  30. {
  31.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  32.  
  33.         if (s == NULL || ucs4 == NULL || clen == NULL)
  34.                 return PARSERUTILS_BADPARM;
  35.  
  36.         if (len < 2)
  37.                 return PARSERUTILS_NEEDDATA;
  38.  
  39.         if (*ss < 0xD800 || *ss > 0xDFFF) {
  40.                 *ucs4 = *ss;
  41.                 *clen = 2;
  42.         } else if (0xD800 <= *ss && *ss <= 0xDBFF) {
  43.                 /* High-surrogate code unit.  */
  44.                 if (len < 4)
  45.                         return PARSERUTILS_NEEDDATA;
  46.  
  47.                 if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
  48.                         /* We have a valid surrogate pair.  */
  49.                         *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
  50.                                 + (1<<16);
  51.                         *clen = 4;
  52.                 } else {
  53.                         return PARSERUTILS_INVALID;
  54.                 }
  55.         } else {
  56.                 /* Low-surrogate code unit.  */
  57.                 return PARSERUTILS_INVALID;
  58.         }
  59.  
  60.         return PARSERUTILS_OK;
  61. }
  62.  
  63. /**
  64.  * Convert a single UCS-4 character into a UTF-16 sequence
  65.  *
  66.  * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
  67.  * \param s     Pointer to 4 byte long output buffer
  68.  * \param len   Pointer to location to receive length of multibyte sequence
  69.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  70.  */
  71. parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
  72.                 size_t *len)
  73. {
  74.         uint16_t *ss = (uint16_t *) (void *) s;
  75.         uint32_t l = 0;
  76.  
  77.         if (s == NULL || len == NULL)
  78.                 return PARSERUTILS_BADPARM;
  79.         else if (ucs4 < 0x10000) {
  80.                 *ss = (uint16_t) ucs4;
  81.                 l = 2;
  82.         } else if (ucs4 < 0x110000) {
  83.                 ss[0] = 0xD800 | (((ucs4 >> 16) & 0x1f) - 1) | (ucs4 >> 10);
  84.                 ss[1] = 0xDC00 | (ucs4 & 0x3ff);
  85.                 l = 4;
  86.         } else {
  87.                 return PARSERUTILS_INVALID;
  88.         }
  89.  
  90.         *len = l;
  91.  
  92.         return PARSERUTILS_OK;
  93. }
  94.  
  95. /**
  96.  * Calculate the length (in characters) of a bounded UTF-16 string
  97.  *
  98.  * \param s    The string
  99.  * \param max  Maximum length
  100.  * \param len  Pointer to location to receive length of string
  101.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  102.  */
  103. parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
  104.                 size_t *len)
  105. {
  106.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  107.         const uint16_t *end = (const uint16_t *) (const void *) (s + max);
  108.         int l = 0;
  109.  
  110.         if (s == NULL || len == NULL)
  111.                 return PARSERUTILS_BADPARM;
  112.  
  113.         while (ss < end) {
  114.                 if (*ss < 0xD800 || 0xDFFF < *ss)
  115.                         ss++;
  116.                 else
  117.                         ss += 2;
  118.  
  119.                 l++;
  120.         }
  121.  
  122.         *len = l;
  123.  
  124.         return PARSERUTILS_OK;
  125. }
  126.  
  127. /**
  128.  * Calculate the length (in bytes) of a UTF-16 character
  129.  *
  130.  * \param s    Pointer to start of character
  131.  * \param len  Pointer to location to receive length
  132.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  133.  */
  134. parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
  135.                 size_t *len)
  136. {
  137.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  138.  
  139.         if (s == NULL || len == NULL)
  140.                 return PARSERUTILS_BADPARM;
  141.  
  142.         if (*ss < 0xD800 || 0xDFFF < *ss)
  143.                 *len = 2;
  144.         else
  145.                 *len = 4;
  146.  
  147.         return PARSERUTILS_OK;
  148. }
  149.  
  150. /**
  151.  * Find previous legal UTF-16 char in string
  152.  *
  153.  * \param s        The string
  154.  * \param off      Offset in the string to start at
  155.  * \param prevoff  Pointer to location to receive offset of first byte of
  156.  *                 previous legal character
  157.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  158.  */
  159. parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
  160.                 uint32_t *prevoff)
  161. {
  162.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  163.  
  164.         if (s == NULL || prevoff == NULL)
  165.                 return PARSERUTILS_BADPARM;
  166.  
  167.         if (off < 2)
  168.                 *prevoff = 0;
  169.         else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
  170.                 *prevoff = off - 2;
  171.         else
  172.                 *prevoff = (off < 4) ? 0 : off - 4;
  173.  
  174.         return PARSERUTILS_OK;
  175. }
  176.  
  177. /**
  178.  * Find next legal UTF-16 char in string
  179.  *
  180.  * \param s        The string (assumed valid)
  181.  * \param len      Maximum offset in string
  182.  * \param off      Offset in the string to start at
  183.  * \param nextoff  Pointer to location to receive offset of first byte of
  184.  *                 next legal character
  185.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  186.  */
  187. parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
  188.                 uint32_t off, uint32_t *nextoff)
  189. {
  190.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  191.  
  192.         if (s == NULL || off >= len || nextoff == NULL)
  193.                 return PARSERUTILS_BADPARM;
  194.  
  195.         if (len - off < 4)
  196.                 *nextoff = len;
  197.         else if (ss[1] < 0xD800 || ss[1] > 0xDBFF)
  198.                 *nextoff = off + 2;
  199.         else
  200.                 *nextoff = (len - off < 6) ? len : off + 4;
  201.  
  202.         return PARSERUTILS_OK;
  203. }
  204.  
  205. /**
  206.  * Find next legal UTF-16 char in string
  207.  *
  208.  * \param s        The string (assumed to be of dubious validity)
  209.  * \param len      Maximum offset in string
  210.  * \param off      Offset in the string to start at
  211.  * \param nextoff  Pointer to location to receive offset of first byte of
  212.  *                 next legal character
  213.  * \return PARSERUTILS_OK on success, appropriate error otherwise
  214.  */
  215. parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
  216.                 uint32_t len, uint32_t off, uint32_t *nextoff)
  217. {
  218.         const uint16_t *ss = (const uint16_t *) (const void *) s;
  219.  
  220.         if (s == NULL || off >= len || nextoff == NULL)
  221.                 return PARSERUTILS_BADPARM;
  222.  
  223.         while (1) {
  224.                 if (len - off < 4) {
  225.                         return PARSERUTILS_NEEDDATA;
  226.                 } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
  227.                         *nextoff = off + 2;
  228.                         break;
  229.                 } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
  230.                         if (len - off < 6)
  231.                                 return PARSERUTILS_NEEDDATA;
  232.  
  233.                         if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
  234.                                 *nextoff = off + 4;
  235.                                 break;
  236.                         } else {
  237.                                 ss++;
  238.                                 off += 2;
  239.                         }
  240.                 }
  241.         }
  242.  
  243.         return PARSERUTILS_OK;
  244. }
  245.  
  246.