WebSVN – Kolibri OS – Path Comparison – / – /programs/network/netsurf/libparserutils/ Rev 3583 and /programs/network/netsurf/libparserutils/ Rev 3584

Regard whitespace Rev 3583 → Rev 3584

/programs/network/netsurf/libparserutils/.gitignore
0,0 → 1,3
build-*
Makefile.config.override

 /programs/network/netsurf/libparserutils/COPYING
 ,0 → 1,19
+Copyright (C) 2007-8 J-M Bell
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+  * The above copyright notice and this permission notice shall be included in
+    all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

 /programs/network/netsurf/libparserutils/Makefile
 ,0 → 1,52
+# Component settings
+COMPONENT := parserutils
+COMPONENT_VERSION := 0.1.1
+# Default to a static library
+COMPONENT_TYPE ?= lib-static
+# Setup the tooling
+PREFIX ?= /opt/netsurf
+NSSHARED ?= $(PREFIX)/share/netsurf-buildsystem
+include $(NSSHARED)/makefiles/Makefile.tools
+TESTRUNNER := $(PERL) $(NSTESTTOOLS)/testrunner.pl
+# Toolchain flags
+WARNFLAGS := -Wall -W -Wundef -Wpointer-arith -Wcast-align \
+        -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+        -Wmissing-declarations -Wnested-externs -pedantic
+# BeOS/Haiku standard library headers create warnings.
+ifneq ($(TARGET),beos)
+  WARNFLAGS := $(WARNFLAGS) -Werror
+endif
+CFLAGS := -D_BSD_SOURCE -I$(CURDIR)/include/ \
+        -I$(CURDIR)/src $(WARNFLAGS) $(CFLAGS)
+ifneq ($(GCCVER),2)
+  CFLAGS := $(CFLAGS) -std=c99
+else
+  # __inline__ is a GCCism
+  CFLAGS := $(CFLAGS) -Dinline="__inline__"
+endif
+include $(NSBUILD)/Makefile.top
+# Extra installation rules
+Is := include/parserutils
+I := /include/parserutils
+INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/errors.h;$(Is)/functypes.h;$(Is)/parserutils.h;$(Is)/types.h
+Is := include/parserutils/charset
+I := /include/parserutils/charset
+INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/codec.h;$(Is)/mibenum.h;$(Is)/utf16.h;$(Is)/utf8.h
+Is := include/parserutils/input
+I := /include/parserutils/input
+INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/inputstream.h
+Is := include/parserutils/utils
+I := /include/parserutils/utils
+INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/buffer.h;$(Is)/stack.h;$(Is)/vector.h
+INSTALL_ITEMS := $(INSTALL_ITEMS) /lib/pkgconfig:lib$(COMPONENT).pc.in
+INSTALL_ITEMS := $(INSTALL_ITEMS) /lib:$(OUTPUT)

/programs/network/netsurf/libparserutils/Makefile.config
0,0 → 1,7
# Configuration Makefile fragment

# Disable use of iconv in the input filter
# CFLAGS := $(CFLAGS) -DWITHOUT_ICONV_FILTER

# Cater for local configuration changes
-include Makefile.config.override

 /programs/network/netsurf/libparserutils/README
 ,0 → 1,123
+LibParserUtils -- a utility library for parser building
+=======================================================
+Overview
+--------
+  LibParserUtils provides various pieces of functionality that are useful
+  when writing parsers. These are:
+  + A number of character set convertors
+  + Mapping of character set names to/from MIB enum values
+  + UTF-8 and UTF-16 (host endian) support functions
+  + Various simple data structures (resizeable buffer, stack, vector)
+  + A UTF-8 input stream
+Requirements
+------------
+  LibParserUtils requires the following tools:
+    + A C99 capable C compiler
+    + GNU make or compatible
+    + Perl (for the testcases)
+    + Pkg-config (for the testcases)
+    + doxygen (for the API documentation)
+  For enhanced charset support, LibParserUtils requires an iconv()
+  implementation. If you don't have an implementation of iconv(),
+  this requirement may be disabled: see the "Disabling iconv()
+  support" section, below.
+Compilation
+-----------
+  The exact type of build may be configured by passing parameters to make.
+  Common usage is described below.
+  For a static library:
+                $ make
+  For a shared library:
+                $ make COMPONENT_TYPE=lib-shared
+  For a static library with debug enabled:
+                $ make BUILD=debug
+  To cross-compile a static library:
+                $ make TARGET=<target-platform>
+Verification
+------------
+  The library's functionality may be verified, thus:
+                $ make test
+  If you wish to see test coverage statistics, run:
+                $ make coverage
+  Then open build/coverage/index.html in a web browser.
+  In both cases, ensure that the same parameters to make are passed as when
+  building the library.
+(Un)installation
+----------------
+  To install the library:
+                $ make install
+  Ensure that the same parameters to make are passed as when building the
+  library.
+  To specify the installation prefix:
+                $ make install PREFIX=/path/to/prefix
+  To specify a staging directory for packaging:
+                $ make install DESTDIR=/path/to/directory
+  Items will be installed to $(DESTDIR)$(PREFIX)/
+  To uninstall:
+                $ make uninstall
+API documentation
+-----------------
+  Use doxygen to auto-generate API documentation, thus:
+                $ make docs
+  Then open build/docs/html/index.html in a web browser.
+  The test driver code in test/ may also provide some useful pointers.
+Disabling iconv() support
+-------------------------
+  Without iconv() support enabled, libparserutils only supports the
+  following character sets:
+    + UTF-16 (platform-native endian)
+    + UTF-8
+    + ISO-8859-n
+    + Windows-125n
+    + US-ASCII
+  To disable iconv() support in libparserutils, do the following:
+                $ echo "CFLAGS += -DWITHOUT_ICONV_FILTER" \
+                                >Makefile.config.override
+  Then build libparserutils as normal.

 /programs/network/netsurf/libparserutils/build/Aliases
 ,0 → 1,303
+# > Unicode:Files.Aliases
+# Mapping of character set encoding names to their canonical form
+#
+# Lines starting with a '#' are comments, blank lines are ignored.
+#
+# Based on http://www.iana.org/assignments/character-sets and
+# http://www.iana.org/assignments/ianacharset-mib
+#
+# Canonical Form        MIBenum         Aliases...
+#
+US-ASCII                3               iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII
+ISO-10646-UTF-1         27              csISO10646UTF1
+ISO_646.basic:1983      28              ref csISO646basic1983
+INVARIANT               29              csINVARIANT
+ISO_646.irv:1983        30              iso-ir-2 irv csISO2IntlRefVersion
+BS_4730                 20              iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom
+NATS-SEFI               31              iso-ir-8-1 csNATSSEFI
+NATS-SEFI-ADD           32              iso-ir-8-2 csNATSSEFIADD
+NATS-DANO               33              iso-ir-9-1 csNATSDANO
+NATS-DANO-ADD           34              iso-ir-9-2 csNATSDANOADD
+SEN_850200_B            35              iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish
+SEN_850200_C            21              iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames
+KS_C_5601-1987          36              iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987
+ISO-2022-KR             37              csISO2022KR
+EUC-KR                  38              csEUCKR EUCKR
+ISO-2022-JP             39              csISO2022JP
+ISO-2022-JP-2           40              csISO2022JP2
+ISO-2022-CN             104
+ISO-2022-CN-EXT         105
+JIS_C6220-1969-jp       41              JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp
+JIS_C6220-1969-ro       42              iso-ir-14 jp ISO646-JP csISO14JISC6220ro
+IT                      22              iso-ir-15 ISO646-IT csISO15Italian
+PT                      43              iso-ir-16 ISO646-PT csISO16Portuguese
+ES                      23              iso-ir-17 ISO646-ES csISO17Spanish
+greek7-old              44              iso-ir-18 csISO18Greek7Old
+latin-greek             45              iso-ir-19 csISO19LatinGreek
+DIN_66003               24              iso-ir-21 de ISO646-DE csISO21German
+NF_Z_62-010_(1973)      46              iso-ir-25 ISO646-FR1 csISO25French
+Latin-greek-1           47              iso-ir-27 csISO27LatinGreek1
+ISO_5427                48              iso-ir-37 csISO5427Cyrillic
+JIS_C6226-1978          49              iso-ir-42 csISO42JISC62261978
+BS_viewdata             50              iso-ir-47 csISO47BSViewdata
+INIS                    51              iso-ir-49 csISO49INIS
+INIS-8                  52              iso-ir-50 csISO50INIS8
+INIS-cyrillic           53              iso-ir-51 csISO51INISCyrillic
+ISO_5427:1981           54              iso-ir-54 ISO5427Cyrillic1981
+ISO_5428:1980           55              iso-ir-55 csISO5428Greek
+GB_1988-80              56              iso-ir-57 cn ISO646-CN csISO57GB1988
+GB_2312-80              57              iso-ir-58 chinese csISO58GB231280
+NS_4551-1               25              iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1
+NS_4551-2               58              ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2
+NF_Z_62-010             26              iso-ir-69 ISO646-FR fr csISO69French
+videotex-suppl          59              iso-ir-70 csISO70VideotexSupp1
+PT2                     60              iso-ir-84 ISO646-PT2 csISO84Portuguese2
+ES2                     61              iso-ir-85 ISO646-ES2 csISO85Spanish2
+MSZ_7795.3              62              iso-ir-86 ISO646-HU hu csISO86Hungarian
+JIS_C6226-1983          63              iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208
+greek7                  64              iso-ir-88 csISO88Greek7
+ASMO_449                65              ISO_9036 arabic7 iso-ir-89 csISO89ASMO449
+iso-ir-90               66              csISO90
+JIS_C6229-1984-a        67              iso-ir-91 jp-ocr-a csISO91JISC62291984a
+JIS_C6229-1984-b        68              iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b
+JIS_C6229-1984-b-add    69              iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd
+JIS_C6229-1984-hand     70              iso-ir-94 jp-ocr-hand csISO94JIS62291984hand
+JIS_C6229-1984-hand-add 71              iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd
+JIS_C6229-1984-kana     72              iso-ir-96 csISO96JISC62291984kana
+ISO_2033-1983           73              iso-ir-98 e13b csISO2033
+ANSI_X3.110-1983        74              iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS
+ISO-8859-1              4               iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1
+ISO-8859-2              5               iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2
+T.61-7bit               75              iso-ir-102 csISO102T617bit
+T.61-8bit               76              T.61 iso-ir-103 csISO103T618bit
+ISO-8859-3              6               iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3
+ISO-8859-4              7               iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4
+ECMA-cyrillic           77              iso-ir-111 KOI8-E csISO111ECMACyrillic
+CSA_Z243.4-1985-1       78              iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1
+CSA_Z243.4-1985-2       79              iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2
+CSA_Z243.4-1985-gr      80              iso-ir-123 csISO123CSAZ24341985gr
+ISO-8859-6              9               iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic
+ISO-8859-6-E            81              csISO88596E ISO_8859-6-E
+ISO-8859-6-I            82              csISO88596I ISO_8859-6-I
+ISO-8859-7              10              iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7
+T.101-G2                83              iso-ir-128 csISO128T101G2
+ISO-8859-8              11              iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8
+ISO-8859-8-E            84              csISO88598E ISO_8859-8-E
+ISO-8859-8-I            85              csISO88598I ISO_8859-8-I
+CSN_369103              86              iso-ir-139 csISO139CSN369103
+JUS_I.B1.002            87              iso-ir-141 ISO646-YU js yu csISO141JUSIB1002
+ISO_6937-2-add          14              iso-ir-142 csISOTextComm
+IEC_P27-1               88              iso-ir-143 csISO143IECP271
+ISO-8859-5              8               iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5
+JUS_I.B1.003-serb       89              iso-ir-146 serbian csISO146Serbian
+JUS_I.B1.003-mac        90              macedonian iso-ir-147 csISO147Macedonian
+ISO-8859-9              12              iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9
+greek-ccitt             91              iso-ir-150 csISO150 csISO150GreekCCITT
+NC_NC00-10:81           92              cuba iso-ir-151 ISO646-CU csISO151Cuba
+ISO_6937-2-25           93              iso-ir-152 csISO6937Add
+GOST_19768-74           94              ST_SEV_358-88 iso-ir-153 csISO153GOST1976874
+ISO_8859-supp           95              iso-ir-154 latin1-2-5 csISO8859Supp
+ISO_10367-box           96              iso-ir-155 csISO10367Box
+ISO-8859-10             13              iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10
+latin-lap               97              lap iso-ir-158 csISO158Lap
+JIS_X0212-1990          98              x0212 iso-ir-159 csISO159JISX02121990
+DS_2089                 99              DS2089 ISO646-DK dk csISO646Danish
+us-dk                   100             csUSDK
+dk-us                   101             csDKUS
+JIS_X0201               15              X0201 csHalfWidthKatakana
+KSC5636                 102             ISO646-KR csKSC5636
+ISO-10646-UCS-2         1000            csUnicode UCS-2 UCS2
+ISO-10646-UCS-4         1001            csUCS4 UCS-4 UCS4
+DEC-MCS                 2008            dec csDECMCS
+hp-roman8               2004            roman8 r8 csHPRoman8
+macintosh               2027            mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN
+IBM037                  2028            cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
+IBM038                  2029            EBCDIC-INT cp038 csIBM038
+IBM273                  2030            CP273 csIBM273
+IBM274                  2031            EBCDIC-BE CP274 csIBM274
+IBM275                  2032            EBCDIC-BR cp275 csIBM275
+IBM277                  2033            EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
+IBM278                  2034            CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
+IBM280                  2035            CP280 ebcdic-cp-it csIBM280
+IBM281                  2036            EBCDIC-JP-E cp281 csIBM281
+IBM284                  2037            CP284 ebcdic-cp-es csIBM284
+IBM285                  2038            CP285 ebcdic-cp-gb csIBM285
+IBM290                  2039            cp290 EBCDIC-JP-kana csIBM290
+IBM297                  2040            cp297 ebcdic-cp-fr csIBM297
+IBM420                  2041            cp420 ebcdic-cp-ar1 csIBM420
+IBM423                  2042            cp423 ebcdic-cp-gr csIBM423
+IBM424                  2043            cp424 ebcdic-cp-he csIBM424
+IBM437                  2011            cp437 437 csPC8CodePage437
+IBM500                  2044            CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500
+IBM775                  2087            cp775 csPC775Baltic
+IBM850                  2009            cp850 850 csPC850Multilingual
+IBM851                  2045            cp851 851 csIBM851
+IBM852                  2010            cp852 852 csPCp852
+IBM855                  2046            cp855 855 csIBM855
+IBM857                  2047            cp857 857 csIBM857
+IBM860                  2048            cp860 860 csIBM860
+IBM861                  2049            cp861 861 cp-is csIBM861
+IBM862                  2013            cp862 862 csPC862LatinHebrew
+IBM863                  2050            cp863 863 csIBM863
+IBM864                  2051            cp864 csIBM864
+IBM865                  2052            cp865 865 csIBM865
+IBM866                  2086            cp866 866 csIBM866
+IBM868                  2053            CP868 cp-ar csIBM868
+IBM869                  2054            cp869 869 cp-gr csIBM869
+IBM870                  2055            CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
+IBM871                  2056            CP871 ebcdic-cp-is csIBM871
+IBM880                  2057            cp880 EBCDIC-Cyrillic csIBM880
+IBM891                  2058            cp891 csIBM891
+IBM903                  2059            cp903 csIBM903
+IBM904                  2060            cp904 904 csIBBM904
+IBM905                  2061            CP905 ebcdic-cp-tr csIBM905
+IBM918                  2062            CP918 ebcdic-cp-ar2 csIBM918
+IBM1026                 2063            CP1026 csIBM1026
+EBCDIC-AT-DE            2064            csIBMEBCDICATDE
+EBCDIC-AT-DE-A          2065            csEBCDICATDEA
+EBCDIC-CA-FR            2066            csEBCDICCAFR
+EBCDIC-DK-NO            2067            csEBCDICDKNO
+EBCDIC-DK-NO-A          2068            csEBCDICDKNOA
+EBCDIC-FI-SE            2069            csEBCDICFISE
+EBCDIC-FI-SE-A          2070            csEBCDICFISEA
+EBCDIC-FR               2071            csEBCDICFR
+EBCDIC-IT               2072            csEBCDICIT
+EBCDIC-PT               2073            csEBCDICPT
+EBCDIC-ES               2074            csEBCDICES
+EBCDIC-ES-A             2075            csEBCDICESA
+EBCDIC-ES-S             2076            csEBCDICESS
+EBCDIC-UK               2077            csEBCDICUK
+EBCDIC-US               2078            csEBCDICUS
+UNKNOWN-8BIT            2079            csUnknown8BiT
+MNEMONIC                2080            csMnemonic
+MNEM                    2081            csMnem
+VISCII                  2082            csVISCII
+VIQR                    2083            csVIQR
+KOI8-R                  2084            csKOI8R
+KOI8-U                  2088
+IBM00858                2089            CCSID00858 CP00858 PC-Multilingual-850+euro
+IBM00924                2090            CCSID00924 CP00924 ebcdic-Latin9--euro
+IBM01140                2091            CCSID01140 CP01140 ebcdic-us-37+euro
+IBM01141                2092            CCSID01141 CP01141 ebcdic-de-273+euro
+IBM01142                2093            CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
+IBM01143                2094            CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
+IBM01144                2095            CCSID01144 CP01144 ebcdic-it-280+euro
+IBM01145                2096            CCSID01145 CP01145 ebcdic-es-284+euro
+IBM01146                2097            CCSID01146 CP01146 ebcdic-gb-285+euro
+IBM01147                2098            CCSID01147 CP01147 ebcdic-fr-297+euro
+IBM01148                2099            CCSID01148 CP01148 ebcdic-international-500+euro
+IBM01149                2100            CCSID01149 CP01149 ebcdic-is-871+euro
+Big5-HKSCS              2101
+IBM1047                 2102            IBM-1047
+PTCP154                 2103            csPTCP154 PT154 CP154 Cyrillic-Asian
+Amiga-1251              2104            Ami1251 Amiga1251 Ami-1251
+KOI7-switched           2105
+UNICODE-1-1             1010            csUnicode11
+SCSU                    1011
+UTF-7                   1012
+UTF-16BE                1013
+UTF-16LE                1014
+UTF-16                  1015
+CESU-8                  1016            csCESU-8
+UTF-32                  1017
+UTF-32BE                1018
+UTF-32LE                1019
+BOCU-1                  1020            csBOCU-1
+UNICODE-1-1-UTF-7       103             csUnicode11UTF7
+UTF-8                   106             UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8
+ISO-8859-13             109             8859_13 ISO8859-13
+ISO-8859-14             110             iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14
+ISO-8859-15             111             ISO_8859-15 Latin-9 8859_15 ISO8859-15
+ISO-8859-16             112             iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10
+GBK                     113             CP936 MS936 windows-936
+GB18030                 114
+OSD_EBCDIC_DF04_15      115
+OSD_EBCDIC_DF03_IRV     116
+OSD_EBCDIC_DF04_1       117
+JIS_Encoding            16              csJISEncoding
+Shift_JIS               17              MS_Kanji csShiftJIS X-SJIS Shift-JIS
+EUC-JP                  18              csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP
+Extended_UNIX_Code_Fixed_Width_for_Japanese     19              csEUCFixWidJapanese
+ISO-10646-UCS-Basic     1002            csUnicodeASCII
+ISO-10646-Unicode-Latin1        1003            csUnicodeLatin1 ISO-10646
+ISO-Unicode-IBM-1261    1005            csUnicodeIBM1261
+ISO-Unicode-IBM-1268    1006            csUnicodeIBM1268
+ISO-Unicode-IBM-1276    1007            csUnicodeIBM1276
+ISO-Unicode-IBM-1264    1008            csUnicodeIBM1264
+ISO-Unicode-IBM-1265    1009            csUnicodeIBM1265
+ISO-8859-1-Windows-3.0-Latin-1  2000            csWindows30Latin1
+ISO-8859-1-Windows-3.1-Latin-1  2001            csWindows31Latin1
+ISO-8859-2-Windows-Latin-2      2002            csWindows31Latin2
+ISO-8859-9-Windows-Latin-5      2003            csWindows31Latin5
+Adobe-Standard-Encoding 2005            csAdobeStandardEncoding
+Ventura-US              2006            csVenturaUS
+Ventura-International   2007            csVenturaInternational
+PC8-Danish-Norwegian    2012            csPC8DanishNorwegian
+PC8-Turkish             2014            csPC8Turkish
+IBM-Symbols             2015            csIBMSymbols
+IBM-Thai                2016            csIBMThai
+HP-Legal                2017            csHPLegal
+HP-Pi-font              2018            csHPPiFont
+HP-Math8                2019            csHPMath8
+Adobe-Symbol-Encoding   2020            csHPPSMath
+HP-DeskTop              2021            csHPDesktop
+Ventura-Math            2022            csVenturaMath
+Microsoft-Publishing    2023            csMicrosoftPublishing
+Windows-31J             2024            csWindows31J
+GB2312                  2025            csGB2312 EUC-CN EUCCN CN-GB
+Big5                    2026            csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE x-x-big5
+windows-1250            2250            CP1250 MS-EE
+windows-1251            2251            CP1251 MS-CYRL
+windows-1252            2252            CP1252 MS-ANSI
+windows-1253            2253            CP1253 MS-GREEK
+windows-1254            2254            CP1254 MS-TURK
+windows-1255            2255
+windows-1256            2256            CP1256 MS-ARAB
+windows-1257            2257            CP1257 WINBALTRIM
+windows-1258            2258
+TIS-620                 2259
+HZ-GB-2312              2085
+# Additional encodings not defined by IANA
+# Arbitrary allocations
+#CP737                  3001
+#CP853                  3002
+#CP856                  3003
+CP874                   3004            WINDOWS-874
+#CP922                  3005
+#CP1046                 3006
+#CP1124                 3007
+#CP1125                 3008            WINDOWS-1125
+#CP1129                 3009
+#CP1133                 3010            IBM-CP1133
+#CP1161                 3011            IBM-1161 IBM1161 CSIBM1161
+#CP1162                 3012            IBM-1162 IBM1162 CSIBM1162
+#CP1163                 3013            IBM-1163 IBM1163 CSIBM1163
+#GEORGIAN-ACADEMY       3014
+#GEORGIAN-PS            3015
+#KOI8-RU                3016
+#KOI8-T                 3017
+#MACARABIC              3018            X-MAC-ARABIC MAC-ARABIC
+#MACCROATIAN            3019            X-MAC-CROATIAN MAC-CROATIAN
+#MACGREEK               3020            X-MAC-GREEK MAC-GREEK
+#MACHEBREW              3021            X-MAC-HEBREW MAC-HEBREW
+#MACICELAND             3022            X-MAC-ICELAND MAC-ICELAND
+#MACROMANIA             3023            X-MAC-ROMANIA MAC-ROMANIA
+#MACTHAI                3024            X-MAC-THAI MAC-THAI
+#MACTURKISH             3025            X-MAC-TURKISH MAC-TURKISH
+#MULELAO-1              3026
+CP949                   3027            WINDOWS-949
+# From Unicode Lib
+ISO-IR-182              4000
+ISO-IR-197              4002
+ISO-2022-JP-1           4008
+MACCYRILLIC             4009            X-MAC-CYRILLIC MAC-CYRILLIC
+MACUKRAINE              4010            X-MAC-UKRAINIAN MAC-UKRAINIAN
+MACCENTRALEUROPE        4011            X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN
+JOHAB                   4012
+ISO-8859-11             4014            iso-ir-166 ISO_8859-11 ISO8859-11 8859_11
+X-CURRENT               4999            X-SYSTEM
+X-ACORN-LATIN1          5001
+X-ACORN-FUZZY           5002

 /programs/network/netsurf/libparserutils/build/Doxyfile
 ,0 → 1,1237
+# Doxyfile 1.4.6
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists, items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+PROJECT_NAME           = Libparserutils
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+PROJECT_NUMBER         =
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+OUTPUT_DIRECTORY       = build/docs
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+CREATE_SUBDIRS         = NO
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish,
+# Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese,
+# Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish,
+# Swedish, and Ukrainian.
+OUTPUT_LANGUAGE        = English
+# This tag can be used to specify the encoding used in the generated output.
+# The encoding is not always determined by the language that is chosen,
+# but also whether or not the output is meant for Windows or non-Windows users.
+# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES
+# forces the Windows encoding (this is the default for the Windows binary),
+# whereas setting the tag to NO uses a Unix-style encoding (the default for
+# all platforms other than Windows).
+USE_WINDOWS_ENCODING   = NO
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+BRIEF_MEMBER_DESC      = YES
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+REPEAT_BRIEF           = YES
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+ABBREVIATE_BRIEF       =
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+ALWAYS_DETAILED_SEC    = NO
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+INLINE_INHERITED_MEMB  = NO
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+FULL_PATH_NAMES        = YES
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+STRIP_FROM_PATH        =
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+STRIP_FROM_INC_PATH    =
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+SHORT_NAMES            = NO
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like the Qt-style comments (thus requiring an
+# explicit @brief command for a brief description).
+JAVADOC_AUTOBRIEF      = YES
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+MULTILINE_CPP_IS_BRIEF = NO
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+DETAILS_AT_TOP         = NO
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+INHERIT_DOCS           = YES
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+SEPARATE_MEMBER_PAGES  = NO
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+TAB_SIZE               = 8
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+ALIASES                =
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+OPTIMIZE_OUTPUT_FOR_C  = YES
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for Java.
+# For instance, namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+OPTIMIZE_OUTPUT_JAVA   = NO
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
+# include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+BUILTIN_STL_SUPPORT    = NO
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+DISTRIBUTE_GROUP_DOC   = NO
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+SUBGROUPING            = YES
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+EXTRACT_ALL            = YES
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+EXTRACT_PRIVATE        = YES
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+EXTRACT_STATIC         = YES
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+EXTRACT_LOCAL_CLASSES  = YES
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+EXTRACT_LOCAL_METHODS  = NO
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+HIDE_UNDOC_MEMBERS     = NO
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+HIDE_UNDOC_CLASSES     = NO
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+HIDE_FRIEND_COMPOUNDS  = NO
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+HIDE_IN_BODY_DOCS      = NO
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+INTERNAL_DOCS          = NO
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+CASE_SENSE_NAMES       = YES
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+HIDE_SCOPE_NAMES       = NO
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+SHOW_INCLUDE_FILES     = YES
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+INLINE_INFO            = YES
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+SORT_MEMBER_DOCS       = YES
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+SORT_BRIEF_DOCS        = NO
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+SORT_BY_SCOPE_NAME     = NO
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+GENERATE_TODOLIST      = YES
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+GENERATE_TESTLIST      = YES
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+GENERATE_BUGLIST       = YES
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+GENERATE_DEPRECATEDLIST= YES
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+ENABLED_SECTIONS       =
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+MAX_INITIALIZER_LINES  = 30
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+SHOW_USED_FILES        = YES
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+SHOW_DIRECTORIES       = YES
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+FILE_VERSION_FILTER    =
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+QUIET                  = NO
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+WARNINGS               = YES
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+WARN_IF_UNDOCUMENTED   = YES
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+WARN_IF_DOC_ERROR      = YES
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+WARN_NO_PARAMDOC       = NO
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+WARN_FORMAT            = "$file:$line: $text"
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+WARN_LOGFILE           =
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+INPUT                  = include src
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py
+FILE_PATTERNS          = *.c *.h
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+RECURSIVE              = YES
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+EXCLUDE                =
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+EXCLUDE_SYMLINKS       = NO
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+EXCLUDE_PATTERNS       = */.svn/*
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+EXAMPLE_PATH           =
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+EXAMPLE_PATTERNS       =
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+EXAMPLE_RECURSIVE      = NO
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+IMAGE_PATH             =
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+INPUT_FILTER           =
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.  Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.  The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+FILTER_PATTERNS        =
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+FILTER_SOURCE_FILES    = NO
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+SOURCE_BROWSER         = YES
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+INLINE_SOURCES         = NO
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+STRIP_CODE_COMMENTS    = YES
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+REFERENCED_BY_RELATION = YES
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+REFERENCES_RELATION    = YES
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+USE_HTAGS              = NO
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+VERBATIM_HEADERS       = YES
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+ALPHABETICAL_INDEX     = NO
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+COLS_IN_ALPHA_INDEX    = 5
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+IGNORE_PREFIX          =
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+GENERATE_HTML          = YES
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+HTML_OUTPUT            = html
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+HTML_FILE_EXTENSION    = .html
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+HTML_HEADER            =
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+HTML_FOOTER            =
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+HTML_STYLESHEET        =
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+HTML_ALIGN_MEMBERS     = YES
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
+# of the generated HTML documentation.
+GENERATE_HTMLHELP      = NO
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+CHM_FILE               =
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+HHC_LOCATION           =
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+GENERATE_CHI           = NO
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+BINARY_TOC             = NO
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+TOC_EXPAND             = NO
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+DISABLE_INDEX          = NO
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+ENUM_VALUES_PER_LINE   = 4
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature.
+GENERATE_TREEVIEW      = NO
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+TREEVIEW_WIDTH         = 250
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+GENERATE_LATEX         = NO
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+LATEX_OUTPUT           = latex
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+LATEX_CMD_NAME         = latex
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+MAKEINDEX_CMD_NAME     = makeindex
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+COMPACT_LATEX          = NO
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+PAPER_TYPE             = a4wide
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+EXTRA_PACKAGES         =
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+LATEX_HEADER           =
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+PDF_HYPERLINKS         = NO
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+USE_PDFLATEX           = NO
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+LATEX_BATCHMODE        = NO
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+LATEX_HIDE_INDICES     = NO
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+GENERATE_RTF           = NO
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+RTF_OUTPUT             = rtf
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+COMPACT_RTF            = NO
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+RTF_HYPERLINKS         = NO
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+RTF_STYLESHEET_FILE    =
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+RTF_EXTENSIONS_FILE    =
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+GENERATE_MAN           = NO
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+MAN_OUTPUT             = man
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+MAN_EXTENSION          = .3
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+MAN_LINKS              = NO
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+GENERATE_XML           = NO
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+XML_OUTPUT             = xml
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+XML_SCHEMA             =
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+XML_DTD                =
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+XML_PROGRAMLISTING     = YES
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+GENERATE_AUTOGEN_DEF   = NO
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+GENERATE_PERLMOD       = NO
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+PERLMOD_LATEX          = NO
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.  This is useful
+# if you want to understand what is going on.  On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+PERLMOD_PRETTY         = YES
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+PERLMOD_MAKEVAR_PREFIX =
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+ENABLE_PREPROCESSING   = YES
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+MACRO_EXPANSION        = NO
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+EXPAND_ONLY_PREDEF     = NO
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+SEARCH_INCLUDES        = YES
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+INCLUDE_PATH           =
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+INCLUDE_FILE_PATTERNS  =
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+PREDEFINED             =
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+EXPAND_AS_DEFINED      =
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+SKIP_FUNCTION_MACROS   = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+TAGFILES               =
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+GENERATE_TAGFILE       =
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+ALLEXTERNALS           = NO
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+EXTERNAL_GROUPS        = YES
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+PERL_PATH              = /usr/bin/perl
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+CLASS_DIAGRAMS         = YES
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+HIDE_UNDOC_RELATIONS   = YES
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+HAVE_DOT               = NO
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+CLASS_GRAPH            = YES
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+COLLABORATION_GRAPH    = YES
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+GROUP_GRAPHS           = YES
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+UML_LOOK               = NO
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+TEMPLATE_RELATIONS     = NO
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+INCLUDE_GRAPH          = YES
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+INCLUDED_BY_GRAPH      = YES
+# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will
+# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+CALL_GRAPH             = NO
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+GRAPHICAL_HIERARCHY    = YES
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+DIRECTORY_GRAPH        = YES
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+DOT_IMAGE_FORMAT       = png
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+DOT_PATH               =
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+DOTFILE_DIRS           =
+# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width
+# (in pixels) of the graphs generated by dot. If a graph becomes larger than
+# this value, doxygen will try to truncate the graph, so that it fits within
+# the specified constraint. Beware that most browsers cannot cope with very
+# large images.
+MAX_DOT_GRAPH_WIDTH    = 1024
+# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allowed height
+# (in pixels) of the graphs generated by dot. If a graph becomes larger than
+# this value, doxygen will try to truncate the graph, so that it fits within
+# the specified constraint. Beware that most browsers cannot cope with very
+# large images.
+MAX_DOT_GRAPH_HEIGHT   = 1024
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that a graph may be further truncated if the graph's
+# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH
+# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default),
+# the graph is not depth-constrained.
+MAX_DOT_GRAPH_DEPTH    = 0
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, which results in a white background.
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+DOT_TRANSPARENT        = NO
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+DOT_MULTI_TARGETS      = NO
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+GENERATE_LEGEND        = YES
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+DOT_CLEANUP            = YES
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+SEARCHENGINE           = NO

 /programs/network/netsurf/libparserutils/build/conv.pl
 ,0 → 1,49
+#!/usr/bin/perl
+use warnings;
+use strict;
+# Convert Unicode mapping tables to C structures
+# Input files may be found at http://unicode.org/Public/MAPPINGS
+#
+# Usage: conv.pl <input_file>
+#
+# Reads a mapping table whose whitespace-separated columns are
+# <byte value> <Unicode value> ..., drops every entry whose byte value is
+# below 0x80, and prints the remaining Unicode values as a C array
+# initialiser on stdout.
+die "Usage: conv.pl <input_file>\n" if (scalar(@ARGV) != 1);
+my @table;
+# Three-argument open with a lexical handle: the file name is never
+# interpreted as an open mode, whatever characters it starts or ends with.
+open(my $map, "<", $ARGV[0]) or die "Failed opening $ARGV[0]: $!\n";
+while (<$map>) {
+        next if (/^#/);
+        my @parts = split(/\s+/);
+        # Blank lines carry no entry at all
+        next unless (@parts);
+        # Ignore ASCII part
+        next if (hex($parts[0]) < 0x80);
+        # Convert undefined entries to U+FFFF. An entry is undefined when
+        # its second column is missing or is already the trailing comment.
+        if (!defined($parts[1]) || $parts[1] =~ /^#/) {
+                push(@table, "0xFFFF");
+        } else {
+                push(@table, $parts[1]);
+        }
+}
+close($map);
+# You'll have to go through and fix up the structure name
+print "static uint32_t ${ARGV[0]}[128] = {\n\t";
+my $count = 0;
+foreach my $item (@table) {
+        print "$item, ";
+        $count++;
+        # Eight values per output row; no dangling row after the last one
+        if ($count % 8 == 0 && $count != 128) {
+                print "\n\t";
+        }
+}
+print "\n};\n\n";
 Property changes:
 Added: svn:executable
 +*
 \ No newline at end of property

 /programs/network/netsurf/libparserutils/build/make-aliases.pl
 ,0 → 1,124
+#!/usr/bin/perl -w
+# This file is part of LibParserUtils.
+# Licensed under the MIT License,
+#                http://www.opensource.org/licenses/mit-license.php
+# Copyright 2010 Daniel Silverstone <dsilvers@netsurf-browser.org>
+#                John-Mark Bell <jmb@netsurf-browser.org>
+#
+# Generate src/charset/aliases.inc from build/Aliases.
+#
+# Each non-comment line of the input is
+#   <canonical name> <MIB enum> [<alias> ...]
+# The output contains a table of canonical names, a table of normalised
+# aliases pointing into it, their counts, and the MIBENUM_IS_UNICODE macro.
+# The output file is only rewritten when its content would change.
+use strict;
+use constant ALIAS_FILE => 'build/Aliases';
+use constant ALIAS_INC  => 'src/charset/aliases.inc';
+use constant UNICODE_CHARSETS =>
+  [
+   qr'^ISO-10646-UCS-[24]$',
+   qr'^UTF-16',
+   qr'^UTF-8$',
+   qr'^UTF-32'
+  ];
+open(INFILE, "<", ALIAS_FILE) || die "Unable to open " . ALIAS_FILE;
+my %charsets;
+while (my $line = <INFILE>) {
+   next if ($line =~ /^#/);
+   chomp $line;
+   next if ($line eq '');
+   my @elements = split /\s+/, $line;
+   my $canon = shift @elements;
+   my $mibenum = shift @elements;
+   # Whatever is left in @elements is the list of aliases
+   $charsets{$canon} = [$mibenum, \@elements];
+}
+close(INFILE);
+my $unicodeexp = "";
+my $output = <<'EOH';
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2010 The NetSurf Project.
+ *
+ * Note: This file is automatically generated by make-aliases.pl
+ *
+ * Do not edit file file, changes will be overwritten during build.
+ */
+static parserutils_charset_aliases_canon canonical_charset_names[] = {
+EOH
+my %aliases;
+my $canonnr = 0;
+foreach my $canon (sort keys %charsets) {
+   my ($mibenum, $elements) = @{$charsets{$canon}};
+   # Ordering must match struct in src/charset/aliases.h
+   $output .= "\t{ " . $mibenum . ", " . length($canon) . ', "' . $canon . '" },' . "\n";
+   my $isunicode = 0;
+   foreach my $unirexp (@{UNICODE_CHARSETS()}) {
+      $isunicode = 1 if ($canon =~ $unirexp);
+   }
+   if ($isunicode == 1) {
+      $unicodeexp .= "((x) == $mibenum) || ";
+   }
+   # Alias keys are lower-cased and stripped of everything but [a-z0-9];
+   # the canonical name is registered as an alias of itself.
+   $canon =~ y/A-Z/a-z/;
+   $canon =~ s/[^a-z0-9]//g;
+   $aliases{$canon} = $canonnr;
+   foreach my $alias (@$elements) {
+      $alias =~ y/A-Z/a-z/;
+      $alias =~ s/[^a-z0-9]//g;
+      $aliases{$alias} = $canonnr;
+   }
+   $canonnr += 1;
+}
+$output .= "};\n\nstatic const uint16_t charset_aliases_canon_count = ${canonnr};\n\n";
+$output .= <<'EOT';
+typedef struct {
+        uint16_t name_len;
+        const char *name;
+        parserutils_charset_aliases_canon *canon;
+} parserutils_charset_aliases_alias;
+static parserutils_charset_aliases_alias charset_aliases[] = {
+EOT
+my $aliascount = 0;
+foreach my $alias (sort keys %aliases) {
+   my $canonnr = $aliases{$alias};
+   $output .= "\t{ " . length($alias) . ', "' . $alias . '", &canonical_charset_names[' . $canonnr . "] },\n";
+   $aliascount += 1;
+}
+$output .= "};\n\n";
+# Drop the final " || "
+$unicodeexp =~ s/ \|\| $//;
+# With no Unicode charset in the input the macro body would be "()", which
+# does not compile; make it a constant false instead.
+$unicodeexp = "0" if ($unicodeexp eq "");
+$output .= <<"EOS";
+static const uint16_t charset_aliases_count = ${aliascount};
+#define MIBENUM_IS_UNICODE(x) ($unicodeexp)
+EOS
+# Leave the existing file untouched when nothing changed
+if (open(EXISTING, "<", ALIAS_INC)) {
+   local $/ = undef();
+   my $now = <EXISTING>;
+   undef($output) if ($output eq $now);
+   close(EXISTING);
+}
+if (defined($output)) {
+   # Fail loudly: a silently missing or truncated include would only show
+   # up later as a confusing compile error.
+   open(OUTF, ">", ALIAS_INC)
+      || die "Unable to open " . ALIAS_INC . " for writing: $!\n";
+   print OUTF $output;
+   close(OUTF) || die "Unable to write " . ALIAS_INC . ": $!\n";
+}

/programs/network/netsurf/libparserutils/docs/Todo
0,0 → 1,5
Todo list
---------

+ Charset conversion should use Unicode Normalisation Form C.

 /programs/network/netsurf/libparserutils/include/parserutils/charset/codec.h
 ,0 → 1,125
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_codec_h_
+#define parserutils_charset_codec_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <inttypes.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+typedef struct parserutils_charset_codec parserutils_charset_codec;
+#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ *   encoding data) or which cannot be converted to UCS-4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ *     U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ *     U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ *     U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum parserutils_charset_codec_errormode {
+        /** Abort processing if unrepresentable character encountered */
+        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT   = 0,
+        /** Replace unrepresentable characters with single alternate */
+        PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE    = 1,
+        /** Transliterate unrepresentable characters, if possible */
+        PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2
+} parserutils_charset_codec_errormode;
+/**
+ * Charset codec option types
+ */
+typedef enum parserutils_charset_codec_opttype {
+        /** Set codec error mode */
+        PARSERUTILS_CHARSET_CODEC_ERROR_MODE  = 1
+} parserutils_charset_codec_opttype;
+/**
+ * Charset codec option parameters
+ */
+typedef union parserutils_charset_codec_optparams {
+        /** Parameters for error mode setting */
+        struct {
+                /** The desired error handling mode */
+                parserutils_charset_codec_errormode mode;
+        } error_mode;
+} parserutils_charset_codec_optparams;
+/* Create a charset codec */
+parserutils_error parserutils_charset_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec);
+/* Destroy a charset codec */
+parserutils_error parserutils_charset_codec_destroy(
+                parserutils_charset_codec *codec);
+/* Configure a charset codec */
+parserutils_error parserutils_charset_codec_setopt(
+                parserutils_charset_codec *codec,
+                parserutils_charset_codec_opttype type,
+                parserutils_charset_codec_optparams *params);
+/* Encode a chunk of UCS-4 data into a codec's charset */
+parserutils_error parserutils_charset_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+/* Decode a chunk of data in a codec's charset into UCS-4 */
+parserutils_error parserutils_charset_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+/* Reset a charset codec */
+parserutils_error parserutils_charset_codec_reset(
+                parserutils_charset_codec *codec);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/charset/mibenum.h
 ,0 → 1,33
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_mibenum_h_
+#define parserutils_charset_mibenum_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <inttypes.h>
+#include <stdbool.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+/* Convert an encoding alias to a MIB enum value */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
+/* Determine if a MIB enum value represents a Unicode variant */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/charset/utf16.h
 ,0 → 1,47
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+/** \file
+ * UTF-16 manipulation functions (interface).
+ */
+#ifndef parserutils_charset_utf16_h_
+#define parserutils_charset_utf16_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <inttypes.h>
+#include <parserutils/errors.h>
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+                size_t len, uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4,
+                uint8_t *s, size_t *len);
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s,
+                size_t max, size_t *len);
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+                size_t *len);
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s,
+                uint32_t off, uint32_t *prevoff);
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s,
+                uint32_t len, uint32_t off, uint32_t *nextoff);
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+                uint32_t len, uint32_t off, uint32_t *nextoff);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/charset/utf8.h
 ,0 → 1,47
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+#ifndef parserutils_charset_utf8_h_
+#define parserutils_charset_utf8_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <inttypes.h>
+#include <parserutils/errors.h>
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+                uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
+                size_t *len);
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+                size_t *len);
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+                size_t *len);
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+                uint32_t *prevoff);
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+                uint32_t off, uint32_t *nextoff);
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+                uint32_t len, uint32_t off, uint32_t *nextoff);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/errors.h
 ,0 → 1,40
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_errors_h_
+#define parserutils_errors_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stddef.h>
+typedef enum parserutils_error {      /* Result codes returned by parserutils functions */
+        PARSERUTILS_OK               = 0,      /**< Success */
+        PARSERUTILS_NOMEM            = 1,      /**< Memory exhaustion */
+        PARSERUTILS_BADPARM          = 2,      /**< Bad parameters were passed */
+        PARSERUTILS_INVALID          = 3,      /**< NOTE(review): presumably invalid input -- confirm */
+        PARSERUTILS_FILENOTFOUND     = 4,      /**< NOTE(review): presumably a missing file -- confirm */
+        PARSERUTILS_NEEDDATA         = 5,      /**< End of the currently available input was reached */
+        PARSERUTILS_BADENCODING      = 6,      /**< Input cannot be decoded */
+        PARSERUTILS_EOF              = 7       /**< End of all input was reached */
+} parserutils_error;
+/* Convert a parserutils error value to a string */
+const char *parserutils_error_to_string(parserutils_error error);
+/* Convert a string to a parserutils error value */
+parserutils_error parserutils_error_from_string(const char *str, size_t len);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/functypes.h
 ,0 → 1,30
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_functypes_h_
+#define parserutils_functypes_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <parserutils/types.h>
+/* Type of allocation function for parserutils */
+typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/input/inputstream.h
 ,0 → 1,188
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_input_inputstream_h_
+#define parserutils_input_inputstream_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stdbool.h>
+#ifndef NDEBUG
+#include <stdio.h>
+#endif
+#include <stdlib.h>
+#include <inttypes.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/utils/buffer.h>
+/**
+ * Type of charset detection function
+ */
+typedef parserutils_error (*parserutils_charset_detect_func)(
+                const uint8_t *data, size_t len,
+                uint16_t *mibenum, uint32_t *source);
+/**
+ * Input stream object
+ *
+ * The inline peek/advance helpers in this header access these fields
+ * directly: they read utf8->data and utf8->length, and treat cursor as a
+ * byte offset into that data.
+ */
+typedef struct parserutils_inputstream
+{
+        parserutils_buffer *utf8;       /**< Buffer containing UTF-8 data */
+        uint32_t cursor;                /**< Byte offset of current position */
+        bool had_eof;                   /**< Whether EOF has been reached */
+} parserutils_inputstream;
+/* Create an input stream */
+parserutils_error parserutils_inputstream_create(const char *enc,
+                uint32_t encsrc, parserutils_charset_detect_func csdetect,
+                parserutils_alloc alloc, void *pw,
+                parserutils_inputstream **stream);
+/* Destroy an input stream */
+parserutils_error parserutils_inputstream_destroy(
+                parserutils_inputstream *stream);
+/* Append data to an input stream */
+parserutils_error parserutils_inputstream_append(
+                parserutils_inputstream *stream,
+                const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+parserutils_error parserutils_inputstream_insert(
+                parserutils_inputstream *stream,
+                const uint8_t *data, size_t len);
+/* Slow form of parserutils_inputstream_peek. */
+parserutils_error parserutils_inputstream_peek_slow(
+                parserutils_inputstream *stream,
+                size_t offset, const uint8_t **ptr, size_t *length);
+/**
+ * Look at the character in the stream that starts at
+ * offset bytes from the cursor
+ *
+ * This inline function is the fast path. When the requested position lies
+ * inside the data already held in the stream's UTF-8 buffer, the character
+ * is returned straight from that buffer: a byte with its top bit clear is
+ * reported as a one-byte character, anything else has its length computed
+ * by parserutils_charset_utf8_char_byte_length(). If the position is exactly
+ * at the end of the buffered data, or the length computation reports
+ * _NEEDDATA, the request is handed to parserutils_inputstream_peek_slow().
+ * A position beyond the end of the buffered data aborts the program.
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param ptr     Pointer to location to receive pointer to character data
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return PARSERUTILS_OK on success,
+ *                    _NEEDDATA on reaching the end of available input,
+ *                    _EOF on reaching the end of all input,
+ *                    _BADENCODING if the input cannot be decoded,
+ *                    _NOMEM on memory exhaustion,
+ *                    _BADPARM if bad parameters are passed.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ */
+static inline parserutils_error parserutils_inputstream_peek(
+                parserutils_inputstream *stream, size_t offset,
+                const uint8_t **ptr, size_t *length)
+{
+        parserutils_error error = PARSERUTILS_OK;
+        const parserutils_buffer *utf8;
+        const uint8_t *utf8_data;
+        size_t len, off, utf8_len;
+        if (stream == NULL || ptr == NULL || length == NULL)
+                return PARSERUTILS_BADPARM;
+#ifndef NDEBUG
+#ifdef VERBOSE_INPUTSTREAM
+        fprintf(stdout, "Peek: len: %zu cur: %u off: %zu\n",
+                        stream->utf8->length, stream->cursor, offset);
+#endif
+#ifdef RANDOMISE_INPUTSTREAM
+        parserutils_buffer_randomise(stream->utf8);
+#endif
+#endif
+        utf8 = stream->utf8;
+        utf8_data = utf8->data;
+        utf8_len = utf8->length;
+        off = stream->cursor + offset; /* Absolute byte position in the buffer */
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+        if (off < utf8_len) {
+                if (IS_ASCII(utf8_data[off])) {
+                        /* Early exit for ASCII case */
+                        (*length) = 1;
+                        (*ptr) = (utf8_data + off);
+                        return PARSERUTILS_OK;
+                } else {
+                        error = parserutils_charset_utf8_char_byte_length(
+                                utf8_data + off, &len);
+                        if (error == PARSERUTILS_OK) {
+                                (*length) = len;
+                                (*ptr) = (utf8_data + off);
+                                return PARSERUTILS_OK;
+                        } else if (error != PARSERUTILS_NEEDDATA) { /* _NEEDDATA falls through to the slow path */
+                                return error;
+                        }
+                }
+        }
+#undef IS_ASCII
+        if (off != utf8_len && error != PARSERUTILS_NEEDDATA) /* Only reachable with off past the buffered data */
+                abort();
+        return parserutils_inputstream_peek_slow(stream, offset, ptr, length);
+}
+/**
+ * Move the stream cursor forwards by a number of bytes
+ *
+ * A NULL stream is ignored. Asking to advance by more bytes than lie between
+ * the cursor and the end of the stream's utf8 buffer calls abort().
+ *
+ * \param stream  The stream whose cursor should be moved
+ * \param bytes   How many bytes to move the cursor by
+ */
+static inline void parserutils_inputstream_advance(
+                parserutils_inputstream *stream, size_t bytes)
+{
+        if (stream != NULL) {
+#if !defined(NDEBUG) && defined(VERBOSE_INPUTSTREAM)
+                fprintf(stdout, "Advance: len: %zu cur: %u bytes: %zu\n",
+                                stream->utf8->length, stream->cursor, bytes);
+#endif
+                /* Refuse to step past the end of the buffered data */
+                if (stream->utf8->length - stream->cursor < bytes)
+                        abort();
+                /* Nothing to do when the cursor already sits at the end */
+                if (stream->cursor != stream->utf8->length)
+                        stream->cursor += bytes;
+        }
+}
+/**
+ * Read the document charset
+ *
+ * \param stream  The stream to query
+ * \param source  NOTE(review): presumably an out-parameter receiving an
+ *                indication of where the charset came from -- confirm against
+ *                the implementation
+ * \return The charset as a string. NOTE(review): ownership and lifetime of
+ *         the returned string are not visible from this declaration.
+ */
+const char *parserutils_inputstream_read_charset(
+                parserutils_inputstream *stream, uint32_t *source);
+/**
+ * Change the document charset
+ *
+ * \param stream  The stream to modify
+ * \param enc     Name of the charset to change to
+ * \param source  NOTE(review): presumably the counterpart of the value
+ *                reported by parserutils_inputstream_read_charset -- confirm
+ * \return A parserutils_error code
+ */
+parserutils_error parserutils_inputstream_change_charset(
+                parserutils_inputstream *stream,
+                const char *enc, uint32_t source);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/parserutils.h
 ,0 → 1,25
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_parserutils_h_
+#define parserutils_parserutils_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/types.h
 ,0 → 1,24
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_types_h_
+#define parserutils_types_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stdbool.h>
+#include <inttypes.h>
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/utils/buffer.h
 ,0 → 1,50
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_utils_buffer_h_
+#define parserutils_utils_buffer_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+/**
+ * Byte buffer
+ *
+ * The members are public: parserutils_inputstream_peek reads \c data and
+ * \c length directly.
+ */
+struct parserutils_buffer
+{
+        uint8_t *data;          /* Buffer contents */
+        size_t length;          /* Bytes of data in use; peek treats indices
+                                 * below this value as readable */
+        size_t allocated;       /* NOTE(review): presumably the capacity of
+                                 * data in bytes -- confirm */
+        parserutils_alloc alloc;        /* NOTE(review): presumably the
+                                         * allocator passed to
+                                         * parserutils_buffer_create */
+        void *pw;               /* NOTE(review): presumably the pw value
+                                 * passed to parserutils_buffer_create */
+};
+typedef struct parserutils_buffer parserutils_buffer;
+/* Buffer operations. Only the declarations are visible here; every one of
+ * them reports its outcome as a parserutils_error. */
+/* Create a buffer; the new buffer is handed back through \c buffer */
+parserutils_error parserutils_buffer_create(parserutils_alloc alloc,
+                void *pw, parserutils_buffer **buffer);
+/* Destroy a buffer */
+parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer);
+/* Append \c len bytes taken from \c data */
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
+                const uint8_t *data, size_t len);
+/* Insert \c len bytes taken from \c data at \c offset */
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
+                size_t offset, const uint8_t *data, size_t len);
+/* Discard \c len bytes starting at \c offset */
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
+                size_t offset, size_t len);
+/* Grow the buffer. NOTE(review): growth policy is not visible here. */
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer);
+/* Debug aid: parserutils_inputstream_peek calls this on the stream's buffer
+ * when RANDOMISE_INPUTSTREAM is defined and NDEBUG is not */
+parserutils_error parserutils_buffer_randomise(parserutils_buffer *buffer);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/utils/stack.h
 ,0 → 1,39
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_utils_stack_h_
+#define parserutils_utils_stack_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stddef.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+/* Opaque stack type: only declared in this header, so callers can hold
+ * pointers to it but cannot reach its members */
+struct parserutils_stack;
+typedef struct parserutils_stack parserutils_stack;
+/* Create a stack; the new stack is handed back through \c stack.
+ * NOTE(review): presumably item_size is the size of one stored item and
+ * chunk_size the allocation granularity -- confirm against the
+ * implementation. */
+parserutils_error parserutils_stack_create(size_t item_size, size_t chunk_size,
+                parserutils_alloc alloc, void *pw, parserutils_stack **stack);
+/* Destroy a stack */
+parserutils_error parserutils_stack_destroy(parserutils_stack *stack);
+/* Push an item. The item is passed by const pointer; NOTE(review): presumably
+ * it is copied into the stack -- confirm. */
+parserutils_error parserutils_stack_push(parserutils_stack *stack,
+                const void *item);
+/* Pop an item. NOTE(review): presumably the popped item is written to
+ * \c item -- confirm, including whether NULL is accepted. */
+parserutils_error parserutils_stack_pop(parserutils_stack *stack, void *item);
+/* Get the current item. Unlike the functions above this returns a pointer
+ * rather than a parserutils_error. */
+void *parserutils_stack_get_current(parserutils_stack *stack);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/include/parserutils/utils/vector.h
 ,0 → 1,45
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_utils_vector_h_
+#define parserutils_utils_vector_h_
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include <stddef.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+/* Opaque vector type: only declared in this header, so callers can hold
+ * pointers to it but cannot reach its members */
+struct parserutils_vector;
+typedef struct parserutils_vector parserutils_vector;
+/* Create a vector; the new vector is handed back through \c vector.
+ * NOTE(review): presumably item_size is the size of one stored item and
+ * chunk_size the allocation granularity -- confirm against the
+ * implementation. */
+parserutils_error parserutils_vector_create(size_t item_size,
+                size_t chunk_size, parserutils_alloc alloc, void *pw,
+                parserutils_vector **vector);
+/* Destroy a vector */
+parserutils_error parserutils_vector_destroy(parserutils_vector *vector);
+/* Append an item */
+parserutils_error parserutils_vector_append(parserutils_vector *vector,
+                void *item);
+/* Clear the vector */
+parserutils_error parserutils_vector_clear(parserutils_vector *vector);
+/* Remove the last item */
+parserutils_error parserutils_vector_remove_last(parserutils_vector *vector);
+/* Report the vector's length through \c length */
+parserutils_error parserutils_vector_get_length(parserutils_vector *vector, size_t *length);
+/* Iterate over the vector. \c ctx is passed by pointer; NOTE(review):
+ * presumably it holds the iteration position and is updated by each call --
+ * confirm, including the value it must start at. */
+const void *parserutils_vector_iterate(const parserutils_vector *vector,
+                int32_t *ctx);
+/* Peek at an item. \c ctx is passed by value here, so this call cannot
+ * change the caller's iteration context. */
+const void *parserutils_vector_peek(const parserutils_vector *vector,
+                int32_t ctx);
+#ifdef __cplusplus
+}
+#endif
+#endif

 /programs/network/netsurf/libparserutils/libparserutils.pc.in
 ,0 → 1,10
+prefix=PREFIX
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+Name: libparserutils
+Description: Utility library for facilitating parser development
+Version: VERSION
+Libs: -L${libdir} -lparserutils
+Cflags: -I${includedir}

/programs/network/netsurf/libparserutils/src/Makefile
0,0 → 1,2

include $(NSBUILD)/Makefile.subdir

/programs/network/netsurf/libparserutils/src/charset/Makefile
0,0 → 1,5

OUTFILE = libo.o
OBJS = aliases.o codec.o
CFLAGS += -I ../../include/ -I ../../../ -I ../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libparserutils/src/charset/aliases.c
 ,0 → 1,150
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "charset/aliases.h"
+#include "utils/utils.h"
+/* Bring in the aliases tables */
+#include "aliases.inc"
+/* Lookup key handed to bsearch() by parserutils__charset_alias_canonicalise:
+ * a string together with an explicit length */
+typedef struct {
+        size_t slen;    /* Number of characters available at s */
+        const char *s;  /* The characters themselves; the comparator never
+                         * reads more than slen of them */
+} lengthed_string;
+/* True for any character outside 'A'-'Z', 'a'-'z' and '0'-'9'.
+ * The argument is evaluated several times, so it must not have side
+ * effects. */
+#define IS_PUNCT_OR_SPACE(x)                    \
+        (!(((x) >= 'A' && (x) <= 'Z') ||        \
+           ((x) >= 'a' && (x) <= 'z') ||        \
+           ((x) >= '0' && (x) <= '9')))
+/**
+ * bsearch() comparator: compare a lookup key with a charset alias
+ *
+ * Characters of the key other than letters and digits are skipped, and the
+ * remaining ones are lower-cased before being compared, one by one, with the
+ * alias name.
+ *
+ * \param a  Pointer to the lengthed_string holding the key
+ * \param b  Pointer to the parserutils_charset_aliases_alias to compare with
+ * \return < 0 if the key sorts before the alias, 0 if they match,
+ *         > 0 if the key sorts after the alias
+ */
+static int parserutils_charset_alias_match(const void *a, const void *b)
+{
+        const lengthed_string *s = a;
+        const parserutils_charset_aliases_alias *alias = b;
+        size_t key_left = s->slen;
+        size_t alias_left = alias->name_len;
+        const char *s_alias = alias->name;
+        const char *s_key = s->s;
+        int cmpret;
+        while ((key_left > 0) && (alias_left > 0)) {
+                /* Skip anything in the key that is not a letter or digit */
+                while ((key_left > 0) && IS_PUNCT_OR_SPACE(*s_key)) {
+                        key_left--;
+                        s_key++;
+                }
+                if (key_left == 0)
+                        break;
+                /* tolower() requires a value representable as unsigned char */
+                cmpret = tolower((unsigned char) *s_key) - *s_alias;
+                if (cmpret != 0) {
+                        return cmpret;
+                }
+                key_left--;
+                s_key++;
+                alias_left--;
+                s_alias++;
+        }
+        /* Trailing punctuation in the key does not count towards its length */
+        while ((key_left > 0) && IS_PUNCT_OR_SPACE(*s_key)) {
+                key_left--;
+                s_key++;
+        }
+        /* Compare the leftover lengths explicitly. Returning the size_t
+         * difference converted to int is implementation-defined for
+         * "negative" results and can truncate a non-zero difference to 0
+         * where size_t is wider than int. */
+        if (key_left < alias_left)
+                return -1;
+        if (key_left > alias_left)
+                return 1;
+        return 0;
+}
+/**
+ * Look up the canonical charset entry for an alias name
+ *
+ * The alias table is binary-searched with parserutils_charset_alias_match
+ * as the comparator.
+ *
+ * \param alias  The alias name to look up
+ * \param len    The length of the alias name
+ * \return The canonical entry the matching alias refers to,
+ *         or NULL if no alias matches
+ */
+parserutils_charset_aliases_canon *parserutils__charset_alias_canonicalise(
+                const char *alias, size_t len)
+{
+        const parserutils_charset_aliases_alias *match;
+        lengthed_string key;
+        key.s = alias;
+        key.slen = len;
+        match = bsearch(&key, charset_aliases, charset_aliases_count,
+                        sizeof(charset_aliases[0]),
+                        parserutils_charset_alias_match);
+        return (match != NULL) ? match->canon : NULL;
+}
+/**
+ * Map an encoding name to its MIB enum value
+ *
+ * \param alias  The encoding name (any known alias) to look up; may be NULL
+ * \param len    The length of the name
+ * \return The MIB enum value, or 0 if the name is NULL or not recognised
+ */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
+{
+        const parserutils_charset_aliases_canon *canon = NULL;
+        if (alias != NULL)
+                canon = parserutils__charset_alias_canonicalise(alias, len);
+        return (canon != NULL) ? canon->mib_enum : 0;
+}
+/**
+ * Map a MIB enum value to the canonical name of its encoding
+ *
+ * The table of canonical names is scanned from the start and the first entry
+ * carrying the requested MIB enum wins.
+ *
+ * \param mibenum  The MIB enum value to look up
+ * \return Pointer to the canonical name, or NULL if no entry matches
+ */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
+{
+        const parserutils_charset_aliases_canon *entry =
+                        canonical_charset_names;
+        const parserutils_charset_aliases_canon *end =
+                        canonical_charset_names + charset_aliases_canon_count;
+        while (entry != end) {
+                if (entry->mib_enum == mibenum)
+                        return entry->name;
+                entry++;
+        }
+        return NULL;
+}
+/**
+ * Test whether a MIB enum value denotes a Unicode encoding
+ *
+ * Function form of the MIBENUM_IS_UNICODE macro.
+ *
+ * \param mibenum  The MIB enum value to test
+ * \return true if MIBENUM_IS_UNICODE holds for the value, false otherwise
+ */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
+{
+        if (MIBENUM_IS_UNICODE(mibenum))
+                return true;
+        return false;
+}

 /programs/network/netsurf/libparserutils/src/charset/aliases.h
 ,0 → 1,26
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_aliases_h_
+#define parserutils_charset_aliases_h_
+#include <inttypes.h>
+#include <parserutils/charset/mibenum.h>
+/**
+ * One canonical charset: an entry of the canonical_charset_names table in
+ * aliases.inc, whose rows are initialised positionally in the member order
+ * given here.
+ */
+typedef struct parserutils_charset_aliases_canon {
+        /* Do not change the ordering here without changing make-aliases.pl */
+        uint16_t mib_enum;      /* MIB enum value of the charset */
+        uint16_t name_len;      /* Length of name, in characters */
+        const char *name;       /* Canonical charset name */
+} parserutils_charset_aliases_canon;
+/**
+ * Canonicalise an alias name
+ *
+ * \param alias  The alias name
+ * \param len    The length of the alias name
+ * \return Pointer to the canonical entry, or NULL if the alias is not known
+ */
+parserutils_charset_aliases_canon *parserutils__charset_alias_canonicalise(
+                const char *alias, size_t len);
+#endif

 /programs/network/netsurf/libparserutils/src/charset/aliases.inc
 ,0 → 1,1142
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2010 The NetSurf Project.
+ *
+ * Note: This file is automatically generated by make-aliases.pl
+ *
+ * Do not edit this file; changes will be overwritten during the build.
+ */
+static parserutils_charset_aliases_canon canonical_charset_names[] = {
+        { 74, 16, "ANSI_X3.110-1983" },
+        { 65, 8, "ASMO_449" },
+        { 2005, 23, "Adobe-Standard-Encoding" },
+        { 2020, 21, "Adobe-Symbol-Encoding" },
+        { 2104, 10, "Amiga-1251" },
+        { 1020, 6, "BOCU-1" },
+        { 20, 7, "BS_4730" },
+        { 50, 11, "BS_viewdata" },
+        { 2026, 4, "Big5" },
+        { 2101, 10, "Big5-HKSCS" },
+        { 1016, 6, "CESU-8" },
+        { 3004, 5, "CP874" },
+        { 3027, 5, "CP949" },
+        { 78, 17, "CSA_Z243.4-1985-1" },
+        { 79, 17, "CSA_Z243.4-1985-2" },
+        { 80, 18, "CSA_Z243.4-1985-gr" },
+        { 86, 10, "CSN_369103" },
+        { 2008, 7, "DEC-MCS" },
+        { 24, 9, "DIN_66003" },
+        { 99, 7, "DS_2089" },
+        { 2064, 12, "EBCDIC-AT-DE" },
+        { 2065, 14, "EBCDIC-AT-DE-A" },
+        { 2066, 12, "EBCDIC-CA-FR" },
+        { 2067, 12, "EBCDIC-DK-NO" },
+        { 2068, 14, "EBCDIC-DK-NO-A" },
+        { 2074, 9, "EBCDIC-ES" },
+        { 2075, 11, "EBCDIC-ES-A" },
+        { 2076, 11, "EBCDIC-ES-S" },
+        { 2069, 12, "EBCDIC-FI-SE" },
+        { 2070, 14, "EBCDIC-FI-SE-A" },
+        { 2071, 9, "EBCDIC-FR" },
+        { 2072, 9, "EBCDIC-IT" },
+        { 2073, 9, "EBCDIC-PT" },
+        { 2077, 9, "EBCDIC-UK" },
+        { 2078, 9, "EBCDIC-US" },
+        { 77, 13, "ECMA-cyrillic" },
+        { 23, 2, "ES" },
+        { 61, 3, "ES2" },
+        { 18, 6, "EUC-JP" },
+        { 38, 6, "EUC-KR" },
+        { 19, 43, "Extended_UNIX_Code_Fixed_Width_for_Japanese" },
+        { 114, 7, "GB18030" },
+        { 2025, 6, "GB2312" },
+        { 113, 3, "GBK" },
+        { 56, 10, "GB_1988-80" },
+        { 57, 10, "GB_2312-80" },
+        { 94, 13, "GOST_19768-74" },
+        { 2021, 10, "HP-DeskTop" },
+        { 2017, 8, "HP-Legal" },
+        { 2019, 8, "HP-Math8" },
+        { 2018, 10, "HP-Pi-font" },
+        { 2085, 10, "HZ-GB-2312" },
+        { 2015, 11, "IBM-Symbols" },
+        { 2016, 8, "IBM-Thai" },
+        { 2089, 8, "IBM00858" },
+        { 2090, 8, "IBM00924" },
+        { 2091, 8, "IBM01140" },
+        { 2092, 8, "IBM01141" },
+        { 2093, 8, "IBM01142" },
+        { 2094, 8, "IBM01143" },
+        { 2095, 8, "IBM01144" },
+        { 2096, 8, "IBM01145" },
+        { 2097, 8, "IBM01146" },
+        { 2098, 8, "IBM01147" },
+        { 2099, 8, "IBM01148" },
+        { 2100, 8, "IBM01149" },
+        { 2028, 6, "IBM037" },
+        { 2029, 6, "IBM038" },
+        { 2063, 7, "IBM1026" },
+        { 2102, 7, "IBM1047" },
+        { 2030, 6, "IBM273" },
+        { 2031, 6, "IBM274" },
+        { 2032, 6, "IBM275" },
+        { 2033, 6, "IBM277" },
+        { 2034, 6, "IBM278" },
+        { 2035, 6, "IBM280" },
+        { 2036, 6, "IBM281" },
+        { 2037, 6, "IBM284" },
+        { 2038, 6, "IBM285" },
+        { 2039, 6, "IBM290" },
+        { 2040, 6, "IBM297" },
+        { 2041, 6, "IBM420" },
+        { 2042, 6, "IBM423" },
+        { 2043, 6, "IBM424" },
+        { 2011, 6, "IBM437" },
+        { 2044, 6, "IBM500" },
+        { 2087, 6, "IBM775" },
+        { 2009, 6, "IBM850" },
+        { 2045, 6, "IBM851" },
+        { 2010, 6, "IBM852" },
+        { 2046, 6, "IBM855" },
+        { 2047, 6, "IBM857" },
+        { 2048, 6, "IBM860" },
+        { 2049, 6, "IBM861" },
+        { 2013, 6, "IBM862" },
+        { 2050, 6, "IBM863" },
+        { 2051, 6, "IBM864" },
+        { 2052, 6, "IBM865" },
+        { 2086, 6, "IBM866" },
+        { 2053, 6, "IBM868" },
+        { 2054, 6, "IBM869" },
+        { 2055, 6, "IBM870" },
+        { 2056, 6, "IBM871" },
+        { 2057, 6, "IBM880" },
+        { 2058, 6, "IBM891" },
+        { 2059, 6, "IBM903" },
+        { 2060, 6, "IBM904" },
+        { 2061, 6, "IBM905" },
+        { 2062, 6, "IBM918" },
+        { 88, 9, "IEC_P27-1" },
+        { 51, 4, "INIS" },
+        { 52, 6, "INIS-8" },
+        { 53, 13, "INIS-cyrillic" },
+        { 29, 9, "INVARIANT" },
+        { 1000, 15, "ISO-10646-UCS-2" },
+        { 1001, 15, "ISO-10646-UCS-4" },
+        { 1002, 19, "ISO-10646-UCS-Basic" },
+        { 27, 15, "ISO-10646-UTF-1" },
+        { 1003, 24, "ISO-10646-Unicode-Latin1" },
+        { 104, 11, "ISO-2022-CN" },
+        { 105, 15, "ISO-2022-CN-EXT" },
+        { 39, 11, "ISO-2022-JP" },
+        { 4008, 13, "ISO-2022-JP-1" },
+        { 40, 13, "ISO-2022-JP-2" },
+        { 37, 11, "ISO-2022-KR" },
+        { 4, 10, "ISO-8859-1" },
+        { 2000, 30, "ISO-8859-1-Windows-3.0-Latin-1" },
+        { 2001, 30, "ISO-8859-1-Windows-3.1-Latin-1" },
+        { 13, 11, "ISO-8859-10" },
+        { 4014, 11, "ISO-8859-11" },
+        { 109, 11, "ISO-8859-13" },
+        { 110, 11, "ISO-8859-14" },
+        { 111, 11, "ISO-8859-15" },
+        { 112, 11, "ISO-8859-16" },
+        { 5, 10, "ISO-8859-2" },
+        { 2002, 26, "ISO-8859-2-Windows-Latin-2" },
+        { 6, 10, "ISO-8859-3" },
+        { 7, 10, "ISO-8859-4" },
+        { 8, 10, "ISO-8859-5" },
+        { 9, 10, "ISO-8859-6" },
+        { 81, 12, "ISO-8859-6-E" },
+        { 82, 12, "ISO-8859-6-I" },
+        { 10, 10, "ISO-8859-7" },
+        { 11, 10, "ISO-8859-8" },
+        { 84, 12, "ISO-8859-8-E" },
+        { 85, 12, "ISO-8859-8-I" },
+        { 12, 10, "ISO-8859-9" },
+        { 2003, 26, "ISO-8859-9-Windows-Latin-5" },
+        { 4000, 10, "ISO-IR-182" },
+        { 4002, 10, "ISO-IR-197" },
+        { 1005, 20, "ISO-Unicode-IBM-1261" },
+        { 1008, 20, "ISO-Unicode-IBM-1264" },
+        { 1009, 20, "ISO-Unicode-IBM-1265" },
+        { 1006, 20, "ISO-Unicode-IBM-1268" },
+        { 1007, 20, "ISO-Unicode-IBM-1276" },
+        { 96, 13, "ISO_10367-box" },
+        { 73, 13, "ISO_2033-1983" },
+        { 48, 8, "ISO_5427" },
+        { 54, 13, "ISO_5427:1981" },
+        { 55, 13, "ISO_5428:1980" },
+        { 28, 18, "ISO_646.basic:1983" },
+        { 30, 16, "ISO_646.irv:1983" },
+        { 93, 13, "ISO_6937-2-25" },
+        { 14, 14, "ISO_6937-2-add" },
+        { 95, 13, "ISO_8859-supp" },
+        { 22, 2, "IT" },
+        { 41, 17, "JIS_C6220-1969-jp" },
+        { 42, 17, "JIS_C6220-1969-ro" },
+        { 49, 14, "JIS_C6226-1978" },
+        { 63, 14, "JIS_C6226-1983" },
+        { 67, 16, "JIS_C6229-1984-a" },
+        { 68, 16, "JIS_C6229-1984-b" },
+        { 69, 20, "JIS_C6229-1984-b-add" },
+        { 70, 19, "JIS_C6229-1984-hand" },
+        { 71, 23, "JIS_C6229-1984-hand-add" },
+        { 72, 19, "JIS_C6229-1984-kana" },
+        { 16, 12, "JIS_Encoding" },
+        { 15, 9, "JIS_X0201" },
+        { 98, 14, "JIS_X0212-1990" },
+        { 4012, 5, "JOHAB" },
+        { 87, 12, "JUS_I.B1.002" },
+        { 90, 16, "JUS_I.B1.003-mac" },
+        { 89, 17, "JUS_I.B1.003-serb" },
+        { 2105, 13, "KOI7-switched" },
+        { 2084, 6, "KOI8-R" },
+        { 2088, 6, "KOI8-U" },
+        { 102, 7, "KSC5636" },
+        { 36, 14, "KS_C_5601-1987" },
+        { 47, 13, "Latin-greek-1" },
+        { 4011, 16, "MACCENTRALEUROPE" },
+        { 4009, 11, "MACCYRILLIC" },
+        { 4010, 10, "MACUKRAINE" },
+        { 2081, 4, "MNEM" },
+        { 2080, 8, "MNEMONIC" },
+        { 62, 10, "MSZ_7795.3" },
+        { 2023, 20, "Microsoft-Publishing" },
+        { 33, 9, "NATS-DANO" },
+        { 34, 13, "NATS-DANO-ADD" },
+        { 31, 9, "NATS-SEFI" },
+        { 32, 13, "NATS-SEFI-ADD" },
+        { 92, 13, "NC_NC00-10:81" },
+        { 26, 11, "NF_Z_62-010" },
+        { 46, 18, "NF_Z_62-010_(1973)" },
+        { 25, 9, "NS_4551-1" },
+        { 58, 9, "NS_4551-2" },
+        { 116, 19, "OSD_EBCDIC_DF03_IRV" },
+        { 117, 17, "OSD_EBCDIC_DF04_1" },
+        { 115, 18, "OSD_EBCDIC_DF04_15" },
+        { 2012, 20, "PC8-Danish-Norwegian" },
+        { 2014, 11, "PC8-Turkish" },
+        { 43, 2, "PT" },
+        { 60, 3, "PT2" },
+        { 2103, 7, "PTCP154" },
+        { 1011, 4, "SCSU" },
+        { 35, 12, "SEN_850200_B" },
+        { 21, 12, "SEN_850200_C" },
+        { 17, 9, "Shift_JIS" },
+        { 83, 8, "T.101-G2" },
+        { 75, 9, "T.61-7bit" },
+        { 76, 9, "T.61-8bit" },
+        { 2259, 7, "TIS-620" },
+        { 1010, 11, "UNICODE-1-1" },
+        { 103, 17, "UNICODE-1-1-UTF-7" },
+        { 2079, 12, "UNKNOWN-8BIT" },
+        { 3, 8, "US-ASCII" },
+        { 1015, 6, "UTF-16" },
+        { 1013, 8, "UTF-16BE" },
+        { 1014, 8, "UTF-16LE" },
+        { 1017, 6, "UTF-32" },
+        { 1018, 8, "UTF-32BE" },
+        { 1019, 8, "UTF-32LE" },
+        { 1012, 5, "UTF-7" },
+        { 106, 5, "UTF-8" },
+        { 2083, 4, "VIQR" },
+        { 2082, 6, "VISCII" },
+        { 2007, 21, "Ventura-International" },
+        { 2022, 12, "Ventura-Math" },
+        { 2006, 10, "Ventura-US" },
+        { 2024, 11, "Windows-31J" },
+        { 5002, 13, "X-ACORN-FUZZY" },
+        { 5001, 14, "X-ACORN-LATIN1" },
+        { 4999, 9, "X-CURRENT" },
+        { 101, 5, "dk-us" },
+        { 91, 11, "greek-ccitt" },
+        { 64, 6, "greek7" },
+        { 44, 10, "greek7-old" },
+        { 2004, 9, "hp-roman8" },
+        { 66, 9, "iso-ir-90" },
+        { 45, 11, "latin-greek" },
+        { 97, 9, "latin-lap" },
+        { 2027, 9, "macintosh" },
+        { 100, 5, "us-dk" },
+        { 59, 14, "videotex-suppl" },
+        { 2250, 12, "windows-1250" },
+        { 2251, 12, "windows-1251" },
+        { 2252, 12, "windows-1252" },
+        { 2253, 12, "windows-1253" },
+        { 2254, 12, "windows-1254" },
+        { 2255, 12, "windows-1255" },
+        { 2256, 12, "windows-1256" },
+        { 2257, 12, "windows-1257" },
+        { 2258, 12, "windows-1258" },
+};
+/* Number of entries in canonical_charset_names; used as the loop bound when
+ * that table is scanned */
+static const uint16_t charset_aliases_canon_count = 262;
+/* One alias: an entry of the charset_aliases table, which is searched with
+ * bsearch() */
+typedef struct {
+        uint16_t name_len;      /* Length of name, in characters */
+        const char *name;       /* Alias as stored in the table */
+        parserutils_charset_aliases_canon *canon;       /* Entry of
+                                 * canonical_charset_names this alias
+                                 * stands for */
+} parserutils_charset_aliases_alias;
+static parserutils_charset_aliases_alias charset_aliases[] = {
+        { 3, "437", &canonical_charset_names[84] },
+        { 3, "850", &canonical_charset_names[87] },
+        { 3, "851", &canonical_charset_names[88] },
+        { 3, "852", &canonical_charset_names[89] },
+        { 3, "855", &canonical_charset_names[90] },
+        { 3, "857", &canonical_charset_names[91] },
+        { 3, "860", &canonical_charset_names[92] },
+        { 3, "861", &canonical_charset_names[93] },
+        { 3, "862", &canonical_charset_names[94] },
+        { 3, "863", &canonical_charset_names[95] },
+        { 3, "865", &canonical_charset_names[97] },
+        { 3, "866", &canonical_charset_names[98] },
+        { 3, "869", &canonical_charset_names[100] },
+        { 5, "88591", &canonical_charset_names[125] },
+        { 6, "885910", &canonical_charset_names[128] },
+        { 6, "885911", &canonical_charset_names[129] },
+        { 6, "885913", &canonical_charset_names[130] },
+        { 6, "885914", &canonical_charset_names[131] },
+        { 6, "885915", &canonical_charset_names[132] },
+        { 5, "88592", &canonical_charset_names[134] },
+        { 5, "88593", &canonical_charset_names[136] },
+        { 5, "88594", &canonical_charset_names[137] },
+        { 5, "88595", &canonical_charset_names[138] },
+        { 5, "88597", &canonical_charset_names[142] },
+        { 5, "88598", &canonical_charset_names[143] },
+        { 5, "88599", &canonical_charset_names[146] },
+        { 3, "904", &canonical_charset_names[106] },
+        { 21, "adobestandardencoding", &canonical_charset_names[2] },
+        { 19, "adobesymbolencoding", &canonical_charset_names[3] },
+        { 7, "ami1251", &canonical_charset_names[4] },
+        { 9, "amiga1251", &canonical_charset_names[4] },
+        { 13, "ansix31101983", &canonical_charset_names[0] },
+        { 11, "ansix341968", &canonical_charset_names[224] },
+        { 11, "ansix341986", &canonical_charset_names[224] },
+        { 6, "arabic", &canonical_charset_names[139] },
+        { 7, "arabic7", &canonical_charset_names[1] },
+        { 5, "ascii", &canonical_charset_names[224] },
+        { 7, "asmo449", &canonical_charset_names[1] },
+        { 7, "asmo708", &canonical_charset_names[139] },
+        { 4, "big5", &canonical_charset_names[8] },
+        { 9, "big5hkscs", &canonical_charset_names[9] },
+        { 7, "bigfive", &canonical_charset_names[8] },
+        { 5, "bocu1", &canonical_charset_names[5] },
+        { 6, "bs4730", &canonical_charset_names[6] },
+        { 10, "bsviewdata", &canonical_charset_names[7] },
+        { 2, "ca", &canonical_charset_names[13] },
+        { 10, "ccsid00858", &canonical_charset_names[54] },
+        { 10, "ccsid00924", &canonical_charset_names[55] },
+        { 10, "ccsid01140", &canonical_charset_names[56] },
+        { 10, "ccsid01141", &canonical_charset_names[57] },
+        { 10, "ccsid01142", &canonical_charset_names[58] },
+        { 10, "ccsid01143", &canonical_charset_names[59] },
+        { 10, "ccsid01144", &canonical_charset_names[60] },
+        { 10, "ccsid01145", &canonical_charset_names[61] },
+        { 10, "ccsid01146", &canonical_charset_names[62] },
+        { 10, "ccsid01147", &canonical_charset_names[63] },
+        { 10, "ccsid01148", &canonical_charset_names[64] },
+        { 10, "ccsid01149", &canonical_charset_names[65] },
+        { 5, "cesu8", &canonical_charset_names[10] },
+        { 7, "chinese", &canonical_charset_names[45] },
+        { 2, "cn", &canonical_charset_names[44] },
+        { 6, "cnbig5", &canonical_charset_names[8] },
+        { 4, "cngb", &canonical_charset_names[42] },
+        { 7, "cp00858", &canonical_charset_names[54] },
+        { 7, "cp00924", &canonical_charset_names[55] },
+        { 7, "cp01140", &canonical_charset_names[56] },
+        { 7, "cp01141", &canonical_charset_names[57] },
+        { 7, "cp01142", &canonical_charset_names[58] },
+        { 7, "cp01143", &canonical_charset_names[59] },
+        { 7, "cp01144", &canonical_charset_names[60] },
+        { 7, "cp01145", &canonical_charset_names[61] },
+        { 7, "cp01146", &canonical_charset_names[62] },
+        { 7, "cp01147", &canonical_charset_names[63] },
+        { 7, "cp01148", &canonical_charset_names[64] },
+        { 7, "cp01149", &canonical_charset_names[65] },
+        { 5, "cp037", &canonical_charset_names[66] },
+        { 5, "cp038", &canonical_charset_names[67] },
+        { 6, "cp1026", &canonical_charset_names[68] },
+        { 6, "cp1250", &canonical_charset_names[253] },
+        { 6, "cp1251", &canonical_charset_names[254] },
+        { 6, "cp1252", &canonical_charset_names[255] },
+        { 6, "cp1253", &canonical_charset_names[256] },
+        { 6, "cp1254", &canonical_charset_names[257] },
+        { 6, "cp1256", &canonical_charset_names[259] },
+        { 6, "cp1257", &canonical_charset_names[260] },
+        { 5, "cp154", &canonical_charset_names[212] },
+        { 5, "cp273", &canonical_charset_names[70] },
+        { 5, "cp274", &canonical_charset_names[71] },
+        { 5, "cp275", &canonical_charset_names[72] },
+        { 5, "cp278", &canonical_charset_names[74] },
+        { 5, "cp280", &canonical_charset_names[75] },
+        { 5, "cp281", &canonical_charset_names[76] },
+        { 5, "cp284", &canonical_charset_names[77] },
+        { 5, "cp285", &canonical_charset_names[78] },
+        { 5, "cp290", &canonical_charset_names[79] },
+        { 5, "cp297", &canonical_charset_names[80] },
+        { 5, "cp367", &canonical_charset_names[224] },
+        { 5, "cp420", &canonical_charset_names[81] },
+        { 5, "cp423", &canonical_charset_names[82] },
+        { 5, "cp424", &canonical_charset_names[83] },
+        { 5, "cp437", &canonical_charset_names[84] },
+        { 5, "cp500", &canonical_charset_names[85] },
+        { 5, "cp775", &canonical_charset_names[86] },
+        { 5, "cp819", &canonical_charset_names[125] },
+        { 5, "cp850", &canonical_charset_names[87] },
+        { 5, "cp851", &canonical_charset_names[88] },
+        { 5, "cp852", &canonical_charset_names[89] },
+        { 5, "cp855", &canonical_charset_names[90] },
+        { 5, "cp857", &canonical_charset_names[91] },
+        { 5, "cp860", &canonical_charset_names[92] },
+        { 5, "cp861", &canonical_charset_names[93] },
+        { 5, "cp862", &canonical_charset_names[94] },
+        { 5, "cp863", &canonical_charset_names[95] },
+        { 5, "cp864", &canonical_charset_names[96] },
+        { 5, "cp865", &canonical_charset_names[97] },
+        { 5, "cp866", &canonical_charset_names[98] },
+        { 5, "cp868", &canonical_charset_names[99] },
+        { 5, "cp869", &canonical_charset_names[100] },
+        { 5, "cp870", &canonical_charset_names[101] },
+        { 5, "cp871", &canonical_charset_names[102] },
+        { 5, "cp874", &canonical_charset_names[11] },
+        { 5, "cp880", &canonical_charset_names[103] },
+        { 5, "cp891", &canonical_charset_names[104] },
+        { 5, "cp903", &canonical_charset_names[105] },
+        { 5, "cp904", &canonical_charset_names[106] },
+        { 5, "cp905", &canonical_charset_names[107] },
+        { 5, "cp918", &canonical_charset_names[108] },
+        { 5, "cp936", &canonical_charset_names[43] },
+        { 5, "cp949", &canonical_charset_names[12] },
+        { 4, "cpar", &canonical_charset_names[99] },
+        { 4, "cpgr", &canonical_charset_names[100] },
+        { 4, "cpis", &canonical_charset_names[93] },
+        { 5, "csa71", &canonical_charset_names[13] },
+        { 5, "csa72", &canonical_charset_names[14] },
+        { 23, "csadobestandardencoding", &canonical_charset_names[2] },
+        { 7, "csascii", &canonical_charset_names[224] },
+        { 11, "csat5001983", &canonical_charset_names[0] },
+        { 13, "csaz243419851", &canonical_charset_names[13] },
+        { 13, "csaz243419852", &canonical_charset_names[14] },
+        { 14, "csaz24341985gr", &canonical_charset_names[15] },
+        { 6, "csbig5", &canonical_charset_names[8] },
+        { 7, "csbocu1", &canonical_charset_names[5] },
+        { 7, "cscesu8", &canonical_charset_names[10] },
+        { 8, "csdecmcs", &canonical_charset_names[17] },
+        { 6, "csdkus", &canonical_charset_names[242] },
+        { 13, "csebcdicatdea", &canonical_charset_names[21] },
+        { 12, "csebcdiccafr", &canonical_charset_names[22] },
+        { 12, "csebcdicdkno", &canonical_charset_names[23] },
+        { 13, "csebcdicdknoa", &canonical_charset_names[24] },
+        { 10, "csebcdices", &canonical_charset_names[25] },
+        { 11, "csebcdicesa", &canonical_charset_names[26] },
+        { 11, "csebcdicess", &canonical_charset_names[27] },
+        { 12, "csebcdicfise", &canonical_charset_names[28] },
+        { 13, "csebcdicfisea", &canonical_charset_names[29] },
+        { 10, "csebcdicfr", &canonical_charset_names[30] },
+        { 10, "csebcdicit", &canonical_charset_names[31] },
+        { 10, "csebcdicpt", &canonical_charset_names[32] },
+        { 10, "csebcdicuk", &canonical_charset_names[33] },
+        { 10, "csebcdicus", &canonical_charset_names[34] },
+        { 19, "cseucfixwidjapanese", &canonical_charset_names[40] },
+        { 7, "cseuckr", &canonical_charset_names[39] },
+        { 19, "cseucpkdfmtjapanese", &canonical_charset_names[38] },
+        { 8, "csgb2312", &canonical_charset_names[42] },
+        { 19, "cshalfwidthkatakana", &canonical_charset_names[177] },
+        { 11, "cshpdesktop", &canonical_charset_names[47] },
+        { 9, "cshplegal", &canonical_charset_names[48] },
+        { 9, "cshpmath8", &canonical_charset_names[49] },
+        { 10, "cshppifont", &canonical_charset_names[50] },
+        { 10, "cshppsmath", &canonical_charset_names[3] },
+        { 10, "cshproman8", &canonical_charset_names[246] },
+        { 9, "csibbm904", &canonical_charset_names[106] },
+        { 8, "csibm037", &canonical_charset_names[66] },
+        { 8, "csibm038", &canonical_charset_names[67] },
+        { 9, "csibm1026", &canonical_charset_names[68] },
+        { 8, "csibm273", &canonical_charset_names[70] },
+        { 8, "csibm274", &canonical_charset_names[71] },
+        { 8, "csibm275", &canonical_charset_names[72] },
+        { 8, "csibm277", &canonical_charset_names[73] },
+        { 8, "csibm278", &canonical_charset_names[74] },
+        { 8, "csibm280", &canonical_charset_names[75] },
+        { 8, "csibm281", &canonical_charset_names[76] },
+        { 8, "csibm284", &canonical_charset_names[77] },
+        { 8, "csibm285", &canonical_charset_names[78] },
+        { 8, "csibm290", &canonical_charset_names[79] },
+        { 8, "csibm297", &canonical_charset_names[80] },
+        { 8, "csibm420", &canonical_charset_names[81] },
+        { 8, "csibm423", &canonical_charset_names[82] },
+        { 8, "csibm424", &canonical_charset_names[83] },
+        { 8, "csibm500", &canonical_charset_names[85] },
+        { 8, "csibm851", &canonical_charset_names[88] },
+        { 8, "csibm855", &canonical_charset_names[90] },
+        { 8, "csibm857", &canonical_charset_names[91] },
+        { 8, "csibm860", &canonical_charset_names[92] },
+        { 8, "csibm861", &canonical_charset_names[93] },
+        { 8, "csibm863", &canonical_charset_names[95] },
+        { 8, "csibm864", &canonical_charset_names[96] },
+        { 8, "csibm865", &canonical_charset_names[97] },
+        { 8, "csibm866", &canonical_charset_names[98] },
+        { 8, "csibm868", &canonical_charset_names[99] },
+        { 8, "csibm869", &canonical_charset_names[100] },
+        { 8, "csibm870", &canonical_charset_names[101] },
+        { 8, "csibm871", &canonical_charset_names[102] },
+        { 8, "csibm880", &canonical_charset_names[103] },
+        { 8, "csibm891", &canonical_charset_names[104] },
+        { 8, "csibm903", &canonical_charset_names[105] },
+        { 8, "csibm905", &canonical_charset_names[107] },
+        { 8, "csibm918", &canonical_charset_names[108] },
+        { 15, "csibmebcdicatde", &canonical_charset_names[20] },
+        { 12, "csibmsymbols", &canonical_charset_names[52] },
+        { 9, "csibmthai", &canonical_charset_names[53] },
+        { 11, "csinvariant", &canonical_charset_names[113] },
+        { 15, "csiso102t617bit", &canonical_charset_names[218] },
+        { 13, "csiso10367box", &canonical_charset_names[155] },
+        { 15, "csiso103t618bit", &canonical_charset_names[219] },
+        { 14, "csiso10646utf1", &canonical_charset_names[117] },
+        { 14, "csiso10swedish", &canonical_charset_names[214] },
+        { 20, "csiso111ecmacyrillic", &canonical_charset_names[35] },
+        { 22, "csiso11swedishfornames", &canonical_charset_names[215] },
+        { 17, "csiso121canadian1", &canonical_charset_names[13] },
+        { 17, "csiso122canadian2", &canonical_charset_names[14] },
+        { 22, "csiso123csaz24341985gr", &canonical_charset_names[15] },
+        { 14, "csiso128t101g2", &canonical_charset_names[217] },
+        { 17, "csiso139csn369103", &canonical_charset_names[16] },
+        { 17, "csiso13jisc6220jp", &canonical_charset_names[166] },
+        { 17, "csiso141jusib1002", &canonical_charset_names[180] },
+        { 15, "csiso143iecp271", &canonical_charset_names[109] },
+        { 15, "csiso146serbian", &canonical_charset_names[182] },
+        { 18, "csiso147macedonian", &canonical_charset_names[181] },
+        { 17, "csiso14jisc6220ro", &canonical_charset_names[167] },
+        { 8, "csiso150", &canonical_charset_names[243] },
+        { 18, "csiso150greekccitt", &canonical_charset_names[243] },
+        { 12, "csiso151cuba", &canonical_charset_names[200] },
+        { 19, "csiso153gost1976874", &canonical_charset_names[46] },
+        { 11, "csiso158lap", &canonical_charset_names[249] },
+        { 20, "csiso159jisx02121990", &canonical_charset_names[178] },
+        { 14, "csiso15italian", &canonical_charset_names[165] },
+        { 17, "csiso16portuguese", &canonical_charset_names[210] },
+        { 14, "csiso17spanish", &canonical_charset_names[36] },
+        { 16, "csiso18greek7old", &canonical_charset_names[245] },
+        { 17, "csiso19latingreek", &canonical_charset_names[248] },
+        { 11, "csiso2022jp", &canonical_charset_names[121] },
+        { 12, "csiso2022jp2", &canonical_charset_names[123] },
+        { 11, "csiso2022kr", &canonical_charset_names[124] },
+        { 9, "csiso2033", &canonical_charset_names[156] },
+        { 13, "csiso21german", &canonical_charset_names[18] },
+        { 13, "csiso25french", &canonical_charset_names[202] },
+        { 18, "csiso27latingreek1", &canonical_charset_names[188] },
+        { 20, "csiso2intlrefversion", &canonical_charset_names[161] },
+        { 19, "csiso42jisc62261978", &canonical_charset_names[168] },
+        { 17, "csiso47bsviewdata", &canonical_charset_names[7] },
+        { 11, "csiso49inis", &canonical_charset_names[110] },
+        { 19, "csiso4unitedkingdom", &canonical_charset_names[6] },
+        { 12, "csiso50inis8", &canonical_charset_names[111] },
+        { 19, "csiso51iniscyrillic", &canonical_charset_names[112] },
+        { 17, "csiso5427cyrillic", &canonical_charset_names[157] },
+        { 14, "csiso5428greek", &canonical_charset_names[159] },
+        { 13, "csiso57gb1988", &canonical_charset_names[44] },
+        { 15, "csiso58gb231280", &canonical_charset_names[45] },
+        { 22, "csiso60danishnorwegian", &canonical_charset_names[203] },
+        { 17, "csiso60norwegian1", &canonical_charset_names[203] },
+        { 17, "csiso61norwegian2", &canonical_charset_names[204] },
+        { 17, "csiso646basic1983", &canonical_charset_names[160] },
+        { 14, "csiso646danish", &canonical_charset_names[19] },
+        { 12, "csiso6937add", &canonical_charset_names[162] },
+        { 13, "csiso69french", &canonical_charset_names[201] },
+        { 20, "csiso70videotexsupp1", &canonical_charset_names[252] },
+        { 18, "csiso84portuguese2", &canonical_charset_names[211] },
+        { 15, "csiso85spanish2", &canonical_charset_names[37] },
+        { 16, "csiso86hungarian", &canonical_charset_names[194] },
+        { 15, "csiso87jisx0208", &canonical_charset_names[169] },
+        { 11, "csiso88596e", &canonical_charset_names[140] },
+        { 11, "csiso88596i", &canonical_charset_names[141] },
+        { 11, "csiso88598e", &canonical_charset_names[144] },
+        { 11, "csiso88598i", &canonical_charset_names[145] },
+        { 13, "csiso8859supp", &canonical_charset_names[164] },
+        { 13, "csiso88greek7", &canonical_charset_names[244] },
+        { 14, "csiso89asmo449", &canonical_charset_names[1] },
+        { 7, "csiso90", &canonical_charset_names[247] },
+        { 20, "csiso91jisc62291984a", &canonical_charset_names[170] },
+        { 20, "csiso92jisc62991984b", &canonical_charset_names[171] },
+        { 22, "csiso93jis62291984badd", &canonical_charset_names[172] },
+        { 22, "csiso94jis62291984hand", &canonical_charset_names[173] },
+        { 25, "csiso95jis62291984handadd", &canonical_charset_names[174] },
+        { 23, "csiso96jisc62291984kana", &canonical_charset_names[175] },
+        { 13, "csiso99naplps", &canonical_charset_names[0] },
+        { 11, "csisolatin1", &canonical_charset_names[125] },
+        { 11, "csisolatin2", &canonical_charset_names[134] },
+        { 11, "csisolatin3", &canonical_charset_names[136] },
+        { 11, "csisolatin4", &canonical_charset_names[137] },
+        { 11, "csisolatin5", &canonical_charset_names[146] },
+        { 11, "csisolatin6", &canonical_charset_names[128] },
+        { 16, "csisolatinarabic", &canonical_charset_names[139] },
+        { 18, "csisolatincyrillic", &canonical_charset_names[138] },
+        { 15, "csisolatingreek", &canonical_charset_names[142] },
+        { 16, "csisolatinhebrew", &canonical_charset_names[143] },
+        { 13, "csisotextcomm", &canonical_charset_names[163] },
+        { 13, "csjisencoding", &canonical_charset_names[176] },
+        { 7, "cskoi8r", &canonical_charset_names[184] },
+        { 13, "csksc56011987", &canonical_charset_names[187] },
+        { 9, "csksc5636", &canonical_charset_names[186] },
+        { 11, "csmacintosh", &canonical_charset_names[250] },
+        { 21, "csmicrosoftpublishing", &canonical_charset_names[195] },
+        { 6, "csmnem", &canonical_charset_names[192] },
+        { 10, "csmnemonic", &canonical_charset_names[193] },
+        { 9, "csn369103", &canonical_charset_names[16] },
+        { 10, "csnatsdano", &canonical_charset_names[196] },
+        { 13, "csnatsdanoadd", &canonical_charset_names[197] },
+        { 10, "csnatssefi", &canonical_charset_names[198] },
+        { 13, "csnatssefiadd", &canonical_charset_names[199] },
+        { 13, "cspc775baltic", &canonical_charset_names[86] },
+        { 19, "cspc850multilingual", &canonical_charset_names[87] },
+        { 18, "cspc862latinhebrew", &canonical_charset_names[94] },
+        { 16, "cspc8codepage437", &canonical_charset_names[84] },
+        { 20, "cspc8danishnorwegian", &canonical_charset_names[208] },
+        { 12, "cspc8turkish", &canonical_charset_names[209] },
+        { 8, "cspcp852", &canonical_charset_names[89] },
+        { 9, "csptcp154", &canonical_charset_names[212] },
+        { 10, "csshiftjis", &canonical_charset_names[216] },
+        { 6, "csucs4", &canonical_charset_names[115] },
+        { 9, "csunicode", &canonical_charset_names[114] },
+        { 11, "csunicode11", &canonical_charset_names[221] },
+        { 15, "csunicode11utf7", &canonical_charset_names[222] },
+        { 14, "csunicodeascii", &canonical_charset_names[116] },
+        { 16, "csunicodeibm1261", &canonical_charset_names[150] },
+        { 16, "csunicodeibm1264", &canonical_charset_names[151] },
+        { 16, "csunicodeibm1265", &canonical_charset_names[152] },
+        { 16, "csunicodeibm1268", &canonical_charset_names[153] },
+        { 16, "csunicodeibm1276", &canonical_charset_names[154] },
+        { 15, "csunicodelatin1", &canonical_charset_names[118] },
+        { 13, "csunknown8bit", &canonical_charset_names[223] },
+        { 6, "csusdk", &canonical_charset_names[251] },
+        { 22, "csventurainternational", &canonical_charset_names[235] },
+        { 13, "csventuramath", &canonical_charset_names[236] },
+        { 11, "csventuraus", &canonical_charset_names[237] },
+        { 6, "csviqr", &canonical_charset_names[233] },
+        { 8, "csviscii", &canonical_charset_names[234] },
+        { 17, "cswindows30latin1", &canonical_charset_names[126] },
+        { 12, "cswindows31j", &canonical_charset_names[238] },
+        { 17, "cswindows31latin1", &canonical_charset_names[127] },
+        { 17, "cswindows31latin2", &canonical_charset_names[135] },
+        { 17, "cswindows31latin5", &canonical_charset_names[147] },
+        { 4, "cuba", &canonical_charset_names[200] },
+        { 8, "cyrillic", &canonical_charset_names[138] },
+        { 13, "cyrillicasian", &canonical_charset_names[212] },
+        { 2, "de", &canonical_charset_names[18] },
+        { 3, "dec", &canonical_charset_names[17] },
+        { 6, "decmcs", &canonical_charset_names[17] },
+        { 8, "din66003", &canonical_charset_names[18] },
+        { 2, "dk", &canonical_charset_names[19] },
+        { 4, "dkus", &canonical_charset_names[242] },
+        { 6, "ds2089", &canonical_charset_names[19] },
+        { 4, "e13b", &canonical_charset_names[156] },
+        { 10, "ebcdicatde", &canonical_charset_names[20] },
+        { 11, "ebcdicatdea", &canonical_charset_names[21] },
+        { 8, "ebcdicbe", &canonical_charset_names[71] },
+        { 8, "ebcdicbr", &canonical_charset_names[72] },
+        { 10, "ebcdiccafr", &canonical_charset_names[22] },
+        { 11, "ebcdiccpar1", &canonical_charset_names[81] },
+        { 11, "ebcdiccpar2", &canonical_charset_names[108] },
+        { 10, "ebcdiccpbe", &canonical_charset_names[85] },
+        { 10, "ebcdiccpca", &canonical_charset_names[66] },
+        { 10, "ebcdiccpch", &canonical_charset_names[85] },
+        { 10, "ebcdiccpdk", &canonical_charset_names[73] },
+        { 10, "ebcdiccpes", &canonical_charset_names[77] },
+        { 10, "ebcdiccpfi", &canonical_charset_names[74] },
+        { 10, "ebcdiccpfr", &canonical_charset_names[80] },
+        { 10, "ebcdiccpgb", &canonical_charset_names[78] },
+        { 10, "ebcdiccpgr", &canonical_charset_names[82] },
+        { 10, "ebcdiccphe", &canonical_charset_names[83] },
+        { 10, "ebcdiccpis", &canonical_charset_names[102] },
+        { 10, "ebcdiccpit", &canonical_charset_names[75] },
+        { 10, "ebcdiccpnl", &canonical_charset_names[66] },
+        { 10, "ebcdiccpno", &canonical_charset_names[73] },
+        { 13, "ebcdiccproece", &canonical_charset_names[101] },
+        { 10, "ebcdiccpse", &canonical_charset_names[74] },
+        { 10, "ebcdiccptr", &canonical_charset_names[107] },
+        { 10, "ebcdiccpus", &canonical_charset_names[66] },
+        { 10, "ebcdiccpwt", &canonical_charset_names[66] },
+        { 10, "ebcdiccpyu", &canonical_charset_names[101] },
+        { 14, "ebcdiccyrillic", &canonical_charset_names[103] },
+        { 15, "ebcdicde273euro", &canonical_charset_names[57] },
+        { 15, "ebcdicdk277euro", &canonical_charset_names[58] },
+        { 10, "ebcdicdkno", &canonical_charset_names[23] },
+        { 11, "ebcdicdknoa", &canonical_charset_names[24] },
+        { 8, "ebcdices", &canonical_charset_names[25] },
+        { 15, "ebcdices284euro", &canonical_charset_names[61] },
+        { 9, "ebcdicesa", &canonical_charset_names[26] },
+        { 9, "ebcdicess", &canonical_charset_names[27] },
+        { 15, "ebcdicfi278euro", &canonical_charset_names[59] },
+        { 10, "ebcdicfise", &canonical_charset_names[28] },
+        { 11, "ebcdicfisea", &canonical_charset_names[29] },
+        { 8, "ebcdicfr", &canonical_charset_names[30] },
+        { 15, "ebcdicfr297euro", &canonical_charset_names[63] },
+        { 15, "ebcdicgb285euro", &canonical_charset_names[62] },
+        { 9, "ebcdicint", &canonical_charset_names[67] },
+        { 26, "ebcdicinternational500euro", &canonical_charset_names[64] },
+        { 15, "ebcdicis871euro", &canonical_charset_names[65] },
+        { 8, "ebcdicit", &canonical_charset_names[31] },
+        { 15, "ebcdicit280euro", &canonical_charset_names[60] },
+        { 9, "ebcdicjpe", &canonical_charset_names[76] },
+        { 12, "ebcdicjpkana", &canonical_charset_names[79] },
+        { 16, "ebcdiclatin9euro", &canonical_charset_names[55] },
+        { 15, "ebcdicno277euro", &canonical_charset_names[58] },
+        { 8, "ebcdicpt", &canonical_charset_names[32] },
+        { 15, "ebcdicse278euro", &canonical_charset_names[59] },
+        { 8, "ebcdicuk", &canonical_charset_names[33] },
+        { 8, "ebcdicus", &canonical_charset_names[34] },
+        { 14, "ebcdicus37euro", &canonical_charset_names[56] },
+        { 7, "ecma114", &canonical_charset_names[139] },
+        { 7, "ecma118", &canonical_charset_names[142] },
+        { 12, "ecmacyrillic", &canonical_charset_names[35] },
+        { 7, "elot928", &canonical_charset_names[142] },
+        { 2, "es", &canonical_charset_names[36] },
+        { 3, "es2", &canonical_charset_names[37] },
+        { 5, "euccn", &canonical_charset_names[42] },
+        { 5, "eucjp", &canonical_charset_names[38] },
+        { 5, "euckr", &canonical_charset_names[39] },
+        { 37, "extendedunixcodefixedwidthforjapanese", &canonical_charset_names[40] },
+        { 39, "extendedunixcodepackedformatforjapanese", &canonical_charset_names[38] },
+        { 2, "fi", &canonical_charset_names[214] },
+        { 2, "fr", &canonical_charset_names[201] },
+        { 2, "gb", &canonical_charset_names[6] },
+        { 7, "gb18030", &canonical_charset_names[41] },
+        { 8, "gb198880", &canonical_charset_names[44] },
+        { 6, "gb2312", &canonical_charset_names[42] },
+        { 8, "gb231280", &canonical_charset_names[45] },
+        { 3, "gbk", &canonical_charset_names[43] },
+        { 11, "gost1976874", &canonical_charset_names[46] },
+        { 5, "greek", &canonical_charset_names[142] },
+        { 6, "greek7", &canonical_charset_names[244] },
+        { 9, "greek7old", &canonical_charset_names[245] },
+        { 6, "greek8", &canonical_charset_names[142] },
+        { 10, "greekccitt", &canonical_charset_names[243] },
+        { 6, "hebrew", &canonical_charset_names[143] },
+        { 9, "hpdesktop", &canonical_charset_names[47] },
+        { 7, "hplegal", &canonical_charset_names[48] },
+        { 7, "hpmath8", &canonical_charset_names[49] },
+        { 8, "hppifont", &canonical_charset_names[50] },
+        { 8, "hproman8", &canonical_charset_names[246] },
+        { 2, "hu", &canonical_charset_names[194] },
+        { 8, "hzgb2312", &canonical_charset_names[51] },
+        { 8, "ibm00858", &canonical_charset_names[54] },
+        { 8, "ibm00924", &canonical_charset_names[55] },
+        { 8, "ibm01140", &canonical_charset_names[56] },
+        { 8, "ibm01141", &canonical_charset_names[57] },
+        { 8, "ibm01142", &canonical_charset_names[58] },
+        { 8, "ibm01143", &canonical_charset_names[59] },
+        { 8, "ibm01144", &canonical_charset_names[60] },
+        { 8, "ibm01145", &canonical_charset_names[61] },
+        { 8, "ibm01146", &canonical_charset_names[62] },
+        { 8, "ibm01147", &canonical_charset_names[63] },
+        { 8, "ibm01148", &canonical_charset_names[64] },
+        { 8, "ibm01149", &canonical_charset_names[65] },
+        { 6, "ibm037", &canonical_charset_names[66] },
+        { 6, "ibm038", &canonical_charset_names[67] },
+        { 7, "ibm1026", &canonical_charset_names[68] },
+        { 7, "ibm1047", &canonical_charset_names[69] },
+        { 6, "ibm273", &canonical_charset_names[70] },
+        { 6, "ibm274", &canonical_charset_names[71] },
+        { 6, "ibm275", &canonical_charset_names[72] },
+        { 6, "ibm277", &canonical_charset_names[73] },
+        { 6, "ibm278", &canonical_charset_names[74] },
+        { 6, "ibm280", &canonical_charset_names[75] },
+        { 6, "ibm281", &canonical_charset_names[76] },
+        { 6, "ibm284", &canonical_charset_names[77] },
+        { 6, "ibm285", &canonical_charset_names[78] },
+        { 6, "ibm290", &canonical_charset_names[79] },
+        { 6, "ibm297", &canonical_charset_names[80] },
+        { 6, "ibm367", &canonical_charset_names[224] },
+        { 6, "ibm420", &canonical_charset_names[81] },
+        { 6, "ibm423", &canonical_charset_names[82] },
+        { 6, "ibm424", &canonical_charset_names[83] },
+        { 6, "ibm437", &canonical_charset_names[84] },
+        { 6, "ibm500", &canonical_charset_names[85] },
+        { 6, "ibm775", &canonical_charset_names[86] },
+        { 6, "ibm819", &canonical_charset_names[125] },
+        { 6, "ibm850", &canonical_charset_names[87] },
+        { 6, "ibm851", &canonical_charset_names[88] },
+        { 6, "ibm852", &canonical_charset_names[89] },
+        { 6, "ibm855", &canonical_charset_names[90] },
+        { 6, "ibm857", &canonical_charset_names[91] },
+        { 6, "ibm860", &canonical_charset_names[92] },
+        { 6, "ibm861", &canonical_charset_names[93] },
+        { 6, "ibm862", &canonical_charset_names[94] },
+        { 6, "ibm863", &canonical_charset_names[95] },
+        { 6, "ibm864", &canonical_charset_names[96] },
+        { 6, "ibm865", &canonical_charset_names[97] },
+        { 6, "ibm866", &canonical_charset_names[98] },
+        { 6, "ibm868", &canonical_charset_names[99] },
+        { 6, "ibm869", &canonical_charset_names[100] },
+        { 6, "ibm870", &canonical_charset_names[101] },
+        { 6, "ibm871", &canonical_charset_names[102] },
+        { 6, "ibm880", &canonical_charset_names[103] },
+        { 6, "ibm891", &canonical_charset_names[104] },
+        { 6, "ibm903", &canonical_charset_names[105] },
+        { 6, "ibm904", &canonical_charset_names[106] },
+        { 6, "ibm905", &canonical_charset_names[107] },
+        { 6, "ibm918", &canonical_charset_names[108] },
+        { 10, "ibmsymbols", &canonical_charset_names[52] },
+        { 7, "ibmthai", &canonical_charset_names[53] },
+        { 7, "iecp271", &canonical_charset_names[109] },
+        { 4, "inis", &canonical_charset_names[110] },
+        { 5, "inis8", &canonical_charset_names[111] },
+        { 12, "iniscyrillic", &canonical_charset_names[112] },
+        { 9, "invariant", &canonical_charset_names[113] },
+        { 3, "irv", &canonical_charset_names[161] },
+        { 11, "iso10367box", &canonical_charset_names[155] },
+        { 8, "iso10646", &canonical_charset_names[118] },
+        { 12, "iso10646ucs2", &canonical_charset_names[114] },
+        { 12, "iso10646ucs4", &canonical_charset_names[115] },
+        { 16, "iso10646ucsbasic", &canonical_charset_names[116] },
+        { 21, "iso10646unicodelatin1", &canonical_charset_names[118] },
+        { 12, "iso10646utf1", &canonical_charset_names[117] },
+        { 9, "iso2022cn", &canonical_charset_names[119] },
+        { 12, "iso2022cnext", &canonical_charset_names[120] },
+        { 9, "iso2022jp", &canonical_charset_names[121] },
+        { 10, "iso2022jp1", &canonical_charset_names[122] },
+        { 10, "iso2022jp2", &canonical_charset_names[123] },
+        { 9, "iso2022kr", &canonical_charset_names[124] },
+        { 11, "iso20331983", &canonical_charset_names[156] },
+        { 7, "iso5427", &canonical_charset_names[157] },
+        { 11, "iso54271981", &canonical_charset_names[158] },
+        { 19, "iso5427cyrillic1981", &canonical_charset_names[158] },
+        { 11, "iso54281980", &canonical_charset_names[159] },
+        { 15, "iso646basic1983", &canonical_charset_names[160] },
+        { 8, "iso646ca", &canonical_charset_names[13] },
+        { 9, "iso646ca2", &canonical_charset_names[14] },
+        { 8, "iso646cn", &canonical_charset_names[44] },
+        { 8, "iso646cu", &canonical_charset_names[200] },
+        { 8, "iso646de", &canonical_charset_names[18] },
+        { 8, "iso646dk", &canonical_charset_names[19] },
+        { 8, "iso646es", &canonical_charset_names[36] },
+        { 9, "iso646es2", &canonical_charset_names[37] },
+        { 8, "iso646fi", &canonical_charset_names[214] },
+        { 8, "iso646fr", &canonical_charset_names[201] },
+        { 9, "iso646fr1", &canonical_charset_names[202] },
+        { 8, "iso646gb", &canonical_charset_names[6] },
+        { 8, "iso646hu", &canonical_charset_names[194] },
+        { 13, "iso646irv1983", &canonical_charset_names[161] },
+        { 13, "iso646irv1991", &canonical_charset_names[224] },
+        { 8, "iso646it", &canonical_charset_names[165] },
+        { 8, "iso646jp", &canonical_charset_names[167] },
+        { 12, "iso646jpocrb", &canonical_charset_names[171] },
+        { 8, "iso646kr", &canonical_charset_names[186] },
+        { 8, "iso646no", &canonical_charset_names[203] },
+        { 9, "iso646no2", &canonical_charset_names[204] },
+        { 8, "iso646pt", &canonical_charset_names[210] },
+        { 9, "iso646pt2", &canonical_charset_names[211] },
+        { 8, "iso646se", &canonical_charset_names[214] },
+        { 9, "iso646se2", &canonical_charset_names[215] },
+        { 8, "iso646us", &canonical_charset_names[224] },
+        { 8, "iso646yu", &canonical_charset_names[180] },
+        { 10, "iso6937225", &canonical_charset_names[162] },
+        { 11, "iso69372add", &canonical_charset_names[163] },
+        { 8, "iso88591", &canonical_charset_names[125] },
+        { 9, "iso885910", &canonical_charset_names[128] },
+        { 13, "iso8859101992", &canonical_charset_names[128] },
+        { 9, "iso885911", &canonical_charset_names[129] },
+        { 12, "iso885911987", &canonical_charset_names[125] },
+        { 9, "iso885913", &canonical_charset_names[130] },
+        { 9, "iso885914", &canonical_charset_names[131] },
+        { 13, "iso8859141998", &canonical_charset_names[131] },
+        { 9, "iso885915", &canonical_charset_names[132] },
+        { 9, "iso885916", &canonical_charset_names[133] },
+        { 13, "iso8859162001", &canonical_charset_names[133] },
+        { 23, "iso88591windows30latin1", &canonical_charset_names[126] },
+        { 23, "iso88591windows31latin1", &canonical_charset_names[127] },
+        { 8, "iso88592", &canonical_charset_names[134] },
+        { 12, "iso885921987", &canonical_charset_names[134] },
+        { 21, "iso88592windowslatin2", &canonical_charset_names[135] },
+        { 8, "iso88593", &canonical_charset_names[136] },
+        { 12, "iso885931988", &canonical_charset_names[136] },
+        { 8, "iso88594", &canonical_charset_names[137] },
+        { 12, "iso885941988", &canonical_charset_names[137] },
+        { 8, "iso88595", &canonical_charset_names[138] },
+        { 12, "iso885951988", &canonical_charset_names[138] },
+        { 8, "iso88596", &canonical_charset_names[139] },
+        { 12, "iso885961987", &canonical_charset_names[139] },
+        { 9, "iso88596e", &canonical_charset_names[140] },
+        { 9, "iso88596i", &canonical_charset_names[141] },
+        { 8, "iso88597", &canonical_charset_names[142] },
+        { 12, "iso885971987", &canonical_charset_names[142] },
+        { 8, "iso88598", &canonical_charset_names[143] },
+        { 12, "iso885981988", &canonical_charset_names[143] },
+        { 9, "iso88598e", &canonical_charset_names[144] },
+        { 9, "iso88598i", &canonical_charset_names[145] },
+        { 8, "iso88599", &canonical_charset_names[146] },
+        { 12, "iso885991989", &canonical_charset_names[146] },
+        { 21, "iso88599windowslatin5", &canonical_charset_names[147] },
+        { 11, "iso8859supp", &canonical_charset_names[164] },
+        { 7, "iso9036", &canonical_charset_names[1] },
+        { 9, "isoceltic", &canonical_charset_names[131] },
+        { 7, "isoir10", &canonical_charset_names[214] },
+        { 8, "isoir100", &canonical_charset_names[125] },
+        { 8, "isoir101", &canonical_charset_names[134] },
+        { 8, "isoir102", &canonical_charset_names[218] },
+        { 8, "isoir103", &canonical_charset_names[219] },
+        { 8, "isoir109", &canonical_charset_names[136] },
+        { 7, "isoir11", &canonical_charset_names[215] },
+        { 8, "isoir110", &canonical_charset_names[137] },
+        { 8, "isoir111", &canonical_charset_names[35] },
+        { 8, "isoir121", &canonical_charset_names[13] },
+        { 8, "isoir122", &canonical_charset_names[14] },
+        { 8, "isoir123", &canonical_charset_names[15] },
+        { 8, "isoir126", &canonical_charset_names[142] },
+        { 8, "isoir127", &canonical_charset_names[139] },
+        { 8, "isoir128", &canonical_charset_names[217] },
+        { 7, "isoir13", &canonical_charset_names[166] },
+        { 8, "isoir138", &canonical_charset_names[143] },
+        { 8, "isoir139", &canonical_charset_names[16] },
+        { 7, "isoir14", &canonical_charset_names[167] },
+        { 8, "isoir141", &canonical_charset_names[180] },
+        { 8, "isoir142", &canonical_charset_names[163] },
+        { 8, "isoir143", &canonical_charset_names[109] },
+        { 8, "isoir144", &canonical_charset_names[138] },
+        { 8, "isoir146", &canonical_charset_names[182] },
+        { 8, "isoir147", &canonical_charset_names[181] },
+        { 8, "isoir148", &canonical_charset_names[146] },
+        { 8, "isoir149", &canonical_charset_names[187] },
+        { 7, "isoir15", &canonical_charset_names[165] },
+        { 8, "isoir150", &canonical_charset_names[243] },
+        { 8, "isoir151", &canonical_charset_names[200] },
+        { 8, "isoir152", &canonical_charset_names[162] },
+        { 8, "isoir153", &canonical_charset_names[46] },
+        { 8, "isoir154", &canonical_charset_names[164] },
+        { 8, "isoir155", &canonical_charset_names[155] },
+        { 8, "isoir157", &canonical_charset_names[128] },
+        { 8, "isoir158", &canonical_charset_names[249] },
+        { 8, "isoir159", &canonical_charset_names[178] },
+        { 7, "isoir16", &canonical_charset_names[210] },
+        { 8, "isoir166", &canonical_charset_names[129] },
+        { 7, "isoir17", &canonical_charset_names[36] },
+        { 7, "isoir18", &canonical_charset_names[245] },
+        { 8, "isoir182", &canonical_charset_names[148] },
+        { 7, "isoir19", &canonical_charset_names[248] },
+        { 8, "isoir197", &canonical_charset_names[149] },
+        { 8, "isoir199", &canonical_charset_names[131] },
+        { 6, "isoir2", &canonical_charset_names[161] },
+        { 7, "isoir21", &canonical_charset_names[18] },
+        { 8, "isoir226", &canonical_charset_names[133] },
+        { 7, "isoir25", &canonical_charset_names[202] },
+        { 7, "isoir27", &canonical_charset_names[188] },
+        { 7, "isoir37", &canonical_charset_names[157] },
+        { 6, "isoir4", &canonical_charset_names[6] },
+        { 7, "isoir42", &canonical_charset_names[168] },
+        { 7, "isoir47", &canonical_charset_names[7] },
+        { 7, "isoir49", &canonical_charset_names[110] },
+        { 7, "isoir50", &canonical_charset_names[111] },
+        { 7, "isoir51", &canonical_charset_names[112] },
+        { 7, "isoir54", &canonical_charset_names[158] },
+        { 7, "isoir55", &canonical_charset_names[159] },
+        { 7, "isoir57", &canonical_charset_names[44] },
+        { 7, "isoir58", &canonical_charset_names[45] },
+        { 6, "isoir6", &canonical_charset_names[224] },
+        { 7, "isoir60", &canonical_charset_names[203] },
+        { 7, "isoir61", &canonical_charset_names[204] },
+        { 7, "isoir69", &canonical_charset_names[201] },
+        { 7, "isoir70", &canonical_charset_names[252] },
+        { 7, "isoir81", &canonical_charset_names[198] },
+        { 7, "isoir82", &canonical_charset_names[199] },
+        { 7, "isoir84", &canonical_charset_names[211] },
+        { 7, "isoir85", &canonical_charset_names[37] },
+        { 7, "isoir86", &canonical_charset_names[194] },
+        { 7, "isoir87", &canonical_charset_names[169] },
+        { 7, "isoir88", &canonical_charset_names[244] },
+        { 7, "isoir89", &canonical_charset_names[1] },
+        { 7, "isoir90", &canonical_charset_names[247] },
+        { 7, "isoir91", &canonical_charset_names[196] },
+        { 7, "isoir92", &canonical_charset_names[197] },
+        { 7, "isoir93", &canonical_charset_names[172] },
+        { 7, "isoir94", &canonical_charset_names[173] },
+        { 7, "isoir95", &canonical_charset_names[174] },
+        { 7, "isoir96", &canonical_charset_names[175] },
+        { 7, "isoir98", &canonical_charset_names[156] },
+        { 7, "isoir99", &canonical_charset_names[0] },
+        { 17, "isounicodeibm1261", &canonical_charset_names[150] },
+        { 17, "isounicodeibm1264", &canonical_charset_names[151] },
+        { 17, "isounicodeibm1265", &canonical_charset_names[152] },
+        { 17, "isounicodeibm1268", &canonical_charset_names[153] },
+        { 17, "isounicodeibm1276", &canonical_charset_names[154] },
+        { 2, "it", &canonical_charset_names[165] },
+        { 12, "jisc62201969", &canonical_charset_names[166] },
+        { 14, "jisc62201969jp", &canonical_charset_names[166] },
+        { 14, "jisc62201969ro", &canonical_charset_names[167] },
+        { 12, "jisc62261978", &canonical_charset_names[168] },
+        { 12, "jisc62261983", &canonical_charset_names[169] },
+        { 13, "jisc62291984a", &canonical_charset_names[170] },
+        { 13, "jisc62291984b", &canonical_charset_names[171] },
+        { 16, "jisc62291984badd", &canonical_charset_names[172] },
+        { 16, "jisc62291984hand", &canonical_charset_names[173] },
+        { 19, "jisc62291984handadd", &canonical_charset_names[174] },
+        { 16, "jisc62291984kana", &canonical_charset_names[175] },
+        { 11, "jisencoding", &canonical_charset_names[176] },
+        { 8, "jisx0201", &canonical_charset_names[177] },
+        { 12, "jisx02081983", &canonical_charset_names[169] },
+        { 12, "jisx02121990", &canonical_charset_names[178] },
+        { 5, "johab", &canonical_charset_names[179] },
+        { 2, "jp", &canonical_charset_names[167] },
+        { 6, "jpocra", &canonical_charset_names[170] },
+        { 6, "jpocrb", &canonical_charset_names[171] },
+        { 9, "jpocrbadd", &canonical_charset_names[172] },
+        { 9, "jpocrhand", &canonical_charset_names[173] },
+        { 12, "jpocrhandadd", &canonical_charset_names[174] },
+        { 2, "js", &canonical_charset_names[180] },
+        { 9, "jusib1002", &canonical_charset_names[180] },
+        { 12, "jusib1003mac", &canonical_charset_names[181] },
+        { 13, "jusib1003serb", &canonical_charset_names[182] },
+        { 8, "katakana", &canonical_charset_names[166] },
+        { 12, "koi7switched", &canonical_charset_names[183] },
+        { 5, "koi8e", &canonical_charset_names[35] },
+        { 5, "koi8r", &canonical_charset_names[184] },
+        { 5, "koi8u", &canonical_charset_names[185] },
+        { 6, "korean", &canonical_charset_names[187] },
+        { 7, "ksc5601", &canonical_charset_names[187] },
+        { 11, "ksc56011987", &canonical_charset_names[187] },
+        { 11, "ksc56011989", &canonical_charset_names[187] },
+        { 7, "ksc5636", &canonical_charset_names[186] },
+        { 2, "l1", &canonical_charset_names[125] },
+        { 3, "l10", &canonical_charset_names[133] },
+        { 2, "l2", &canonical_charset_names[134] },
+        { 2, "l3", &canonical_charset_names[136] },
+        { 2, "l4", &canonical_charset_names[137] },
+        { 2, "l5", &canonical_charset_names[146] },
+        { 2, "l6", &canonical_charset_names[128] },
+        { 2, "l8", &canonical_charset_names[131] },
+        { 3, "lap", &canonical_charset_names[249] },
+        { 6, "latin1", &canonical_charset_names[125] },
+        { 7, "latin10", &canonical_charset_names[133] },
+        { 8, "latin125", &canonical_charset_names[164] },
+        { 6, "latin2", &canonical_charset_names[134] },
+        { 6, "latin3", &canonical_charset_names[136] },
+        { 6, "latin4", &canonical_charset_names[137] },
+        { 6, "latin5", &canonical_charset_names[146] },
+        { 6, "latin6", &canonical_charset_names[128] },
+        { 6, "latin8", &canonical_charset_names[131] },
+        { 6, "latin9", &canonical_charset_names[132] },
+        { 10, "latingreek", &canonical_charset_names[248] },
+        { 11, "latingreek1", &canonical_charset_names[188] },
+        { 8, "latinlap", &canonical_charset_names[249] },
+        { 3, "mac", &canonical_charset_names[250] },
+        { 16, "maccentraleurope", &canonical_charset_names[189] },
+        { 18, "maccentraleurroman", &canonical_charset_names[189] },
+        { 11, "maccyrillic", &canonical_charset_names[190] },
+        { 10, "macedonian", &canonical_charset_names[181] },
+        { 9, "macintosh", &canonical_charset_names[250] },
+        { 8, "macroman", &canonical_charset_names[250] },
+        { 10, "macukraine", &canonical_charset_names[191] },
+        { 12, "macukrainian", &canonical_charset_names[191] },
+        { 19, "microsoftpublishing", &canonical_charset_names[195] },
+        { 4, "mnem", &canonical_charset_names[192] },
+        { 8, "mnemonic", &canonical_charset_names[193] },
+        { 5, "ms936", &canonical_charset_names[43] },
+        { 6, "msansi", &canonical_charset_names[255] },
+        { 6, "msarab", &canonical_charset_names[259] },
+        { 6, "mscyrl", &canonical_charset_names[254] },
+        { 4, "msee", &canonical_charset_names[253] },
+        { 7, "msgreek", &canonical_charset_names[256] },
+        { 7, "mskanji", &canonical_charset_names[216] },
+        { 6, "msturk", &canonical_charset_names[257] },
+        { 8, "msz77953", &canonical_charset_names[194] },
+        { 6, "naplps", &canonical_charset_names[0] },
+        { 8, "natsdano", &canonical_charset_names[196] },
+        { 11, "natsdanoadd", &canonical_charset_names[197] },
+        { 8, "natssefi", &canonical_charset_names[198] },
+        { 11, "natssefiadd", &canonical_charset_names[199] },
+        { 10, "ncnc001081", &canonical_charset_names[200] },
+        { 8, "nfz62010", &canonical_charset_names[201] },
+        { 12, "nfz620101973", &canonical_charset_names[202] },
+        { 2, "no", &canonical_charset_names[203] },
+        { 3, "no2", &canonical_charset_names[204] },
+        { 7, "ns45511", &canonical_charset_names[203] },
+        { 7, "ns45512", &canonical_charset_names[204] },
+        { 16, "osdebcdicdf03irv", &canonical_charset_names[205] },
+        { 14, "osdebcdicdf041", &canonical_charset_names[206] },
+        { 15, "osdebcdicdf0415", &canonical_charset_names[207] },
+        { 18, "pc8danishnorwegian", &canonical_charset_names[208] },
+        { 10, "pc8turkish", &canonical_charset_names[209] },
+        { 21, "pcmultilingual850euro", &canonical_charset_names[54] },
+        { 2, "pt", &canonical_charset_names[210] },
+        { 5, "pt154", &canonical_charset_names[212] },
+        { 3, "pt2", &canonical_charset_names[211] },
+        { 7, "ptcp154", &canonical_charset_names[212] },
+        { 2, "r8", &canonical_charset_names[246] },
+        { 3, "ref", &canonical_charset_names[160] },
+        { 6, "roman8", &canonical_charset_names[246] },
+        { 4, "scsu", &canonical_charset_names[213] },
+        { 2, "se", &canonical_charset_names[214] },
+        { 3, "se2", &canonical_charset_names[215] },
+        { 10, "sen850200b", &canonical_charset_names[214] },
+        { 10, "sen850200c", &canonical_charset_names[215] },
+        { 7, "serbian", &canonical_charset_names[182] },
+        { 8, "shiftjis", &canonical_charset_names[216] },
+        { 10, "stsev35888", &canonical_charset_names[46] },
+        { 6, "t101g2", &canonical_charset_names[217] },
+        { 3, "t61", &canonical_charset_names[219] },
+        { 7, "t617bit", &canonical_charset_names[218] },
+        { 7, "t618bit", &canonical_charset_names[219] },
+        { 6, "tis620", &canonical_charset_names[220] },
+        { 4, "ucs2", &canonical_charset_names[114] },
+        { 4, "ucs4", &canonical_charset_names[115] },
+        { 2, "uk", &canonical_charset_names[6] },
+        { 9, "unicode11", &canonical_charset_names[221] },
+        { 13, "unicode11utf7", &canonical_charset_names[222] },
+        { 13, "unicode11utf8", &canonical_charset_names[232] },
+        { 13, "unicode20utf8", &canonical_charset_names[232] },
+        { 11, "unknown8bit", &canonical_charset_names[223] },
+        { 2, "us", &canonical_charset_names[224] },
+        { 7, "usascii", &canonical_charset_names[224] },
+        { 4, "usdk", &canonical_charset_names[251] },
+        { 5, "utf16", &canonical_charset_names[225] },
+        { 7, "utf16be", &canonical_charset_names[226] },
+        { 7, "utf16le", &canonical_charset_names[227] },
+        { 5, "utf32", &canonical_charset_names[228] },
+        { 7, "utf32be", &canonical_charset_names[229] },
+        { 7, "utf32le", &canonical_charset_names[230] },
+        { 4, "utf7", &canonical_charset_names[231] },
+        { 4, "utf8", &canonical_charset_names[232] },
+        { 20, "venturainternational", &canonical_charset_names[235] },
+        { 11, "venturamath", &canonical_charset_names[236] },
+        { 9, "venturaus", &canonical_charset_names[237] },
+        { 13, "videotexsuppl", &canonical_charset_names[252] },
+        { 4, "viqr", &canonical_charset_names[233] },
+        { 6, "viscii", &canonical_charset_names[234] },
+        { 10, "winbaltrim", &canonical_charset_names[260] },
+        { 11, "windows1250", &canonical_charset_names[253] },
+        { 11, "windows1251", &canonical_charset_names[254] },
+        { 11, "windows1252", &canonical_charset_names[255] },
+        { 11, "windows1253", &canonical_charset_names[256] },
+        { 11, "windows1254", &canonical_charset_names[257] },
+        { 11, "windows1255", &canonical_charset_names[258] },
+        { 11, "windows1256", &canonical_charset_names[259] },
+        { 11, "windows1257", &canonical_charset_names[260] },
+        { 11, "windows1258", &canonical_charset_names[261] },
+        { 10, "windows31j", &canonical_charset_names[238] },
+        { 10, "windows874", &canonical_charset_names[11] },
+        { 10, "windows936", &canonical_charset_names[43] },
+        { 10, "windows949", &canonical_charset_names[12] },
+        { 5, "x0201", &canonical_charset_names[177] },
+        { 6, "x02017", &canonical_charset_names[166] },
+        { 5, "x0208", &canonical_charset_names[169] },
+        { 5, "x0212", &canonical_charset_names[178] },
+        { 11, "xacornfuzzy", &canonical_charset_names[239] },
+        { 12, "xacornlatin1", &canonical_charset_names[240] },
+        { 8, "xcurrent", &canonical_charset_names[241] },
+        { 19, "xmaccentraleurroman", &canonical_charset_names[189] },
+        { 12, "xmaccyrillic", &canonical_charset_names[190] },
+        { 9, "xmacroman", &canonical_charset_names[250] },
+        { 13, "xmacukrainian", &canonical_charset_names[191] },
+        { 5, "xsjis", &canonical_charset_names[216] },
+        { 7, "xsystem", &canonical_charset_names[241] },
+        { 6, "xxbig5", &canonical_charset_names[8] },
+        { 2, "yu", &canonical_charset_names[180] },
+};
+/* Presumably the number of entries in the alias table that ends on the line
+ * above. NOTE(review): hard-coded rather than derived with sizeof, so it has
+ * to be kept in sync with the initialiser by hand. */
+static const uint16_t charset_aliases_count = 852;
+/* Non-zero when x equals one of the nine MIB enum values listed below.
+ * NOTE(review): these look like the IANA MIBenums for UCS-2 (1000),
+ * UCS-4 (1001), UTF-16BE/UTF-16LE/UTF-16 (1013/1014/1015),
+ * UTF-32/UTF-32BE/UTF-32LE (1017/1018/1019) and UTF-8 (106); confirm
+ * against the registry.
+ * x is expanded once per comparison, so pass a side-effect-free expression. */
+#define MIBENUM_IS_UNICODE(x) (((x) == 1000) || ((x) == 1001) || ((x) == 1015) || ((x) == 1013) || ((x) == 1014) || ((x) == 1017) || ((x) == 1018) || ((x) == 1019) || ((x) == 106))

 /programs/network/netsurf/libparserutils/src/charset/codec.c
 0,0 → 1,196
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <string.h>
+#include "charset/aliases.h"
+#include "charset/codecs/codec_impl.h"
+/* Handlers supplied by the individual codec implementations */
+extern parserutils_charset_handler charset_utf8_codec_handler;
+extern parserutils_charset_handler charset_utf16_codec_handler;
+extern parserutils_charset_handler charset_8859_codec_handler;
+extern parserutils_charset_handler charset_ext8_codec_handler;
+extern parserutils_charset_handler charset_ascii_codec_handler;
+/*
+ * Handlers in the order parserutils_charset_codec_create() probes them:
+ * the first one whose handles_charset() accepts the canonical name wins.
+ * The trailing NULL terminates the scan.
+ */
+static parserutils_charset_handler *handler_table[] = {
+        &charset_utf8_codec_handler,
+        &charset_utf16_codec_handler,
+        &charset_8859_codec_handler,
+        &charset_ext8_codec_handler,
+        &charset_ascii_codec_handler,
+        NULL
+};
+/**
+ * Instantiate a codec for the named charset
+ *
+ * \param charset  Name (or alias) of the charset to handle
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \param codec    Pointer to location to receive the new codec instance
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if charset, alloc or codec is NULL,
+ *         PARSERUTILS_BADENCODING if the name cannot be canonicalised or
+ *         no registered handler accepts it,
+ *         otherwise the error reported by the handler's create()
+ */
+parserutils_error parserutils_charset_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        const parserutils_charset_aliases_canon *canonical;
+        parserutils_charset_handler *chosen;
+        parserutils_charset_codec *instance;
+        parserutils_error status;
+        size_t slot = 0;
+        if (!charset || !alloc || !codec)
+                return PARSERUTILS_BADPARM;
+        /* Resolve the supplied name to its canonical charset record */
+        canonical = parserutils__charset_alias_canonicalise(charset,
+                        strlen(charset));
+        if (!canonical)
+                return PARSERUTILS_BADENCODING;
+        /* Walk the NULL-terminated table until a handler claims the name */
+        while (handler_table[slot] != NULL &&
+                        !handler_table[slot]->handles_charset(canonical->name))
+                slot++;
+        chosen = handler_table[slot];
+        /* Ran off the end: nobody supports this charset */
+        if (chosen == NULL)
+                return PARSERUTILS_BADENCODING;
+        /* Let the handler build its codec object */
+        status = chosen->create(canonical->name, alloc, pw, &instance);
+        if (status != PARSERUTILS_OK)
+                return status;
+        /* Fill in the fields shared by every codec */
+        instance->mibenum = canonical->mib_enum;
+        instance->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+        instance->alloc = alloc;
+        instance->alloc_pw = pw;
+        *codec = instance;
+        return PARSERUTILS_OK;
+}
+/**
+ * Tear down a charset codec and release its memory
+ *
+ * \param codec  The codec to destroy
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if codec is NULL
+ */
+parserutils_error parserutils_charset_codec_destroy(
+                parserutils_charset_codec *codec)
+{
+        if (!codec)
+                return PARSERUTILS_BADPARM;
+        /* Handler-specific cleanup first; its result is not inspected */
+        (void) codec->handler.destroy(codec);
+        /* Hand the object back through the client allocator (size 0) */
+        (void) codec->alloc(codec, 0, codec->alloc_pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Set an option on a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if codec or params is NULL
+ *
+ * Option types other than PARSERUTILS_CHARSET_CODEC_ERROR_MODE are accepted
+ * and have no effect.
+ */
+parserutils_error parserutils_charset_codec_setopt(
+                parserutils_charset_codec *codec,
+                parserutils_charset_codec_opttype type,
+                parserutils_charset_codec_optparams *params)
+{
+        if (!codec || !params)
+                return PARSERUTILS_BADPARM;
+        /* Record the requested error-handling mode on the codec */
+        if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
+                codec->errormode = params->error_mode.mode;
+        return PARSERUTILS_OK;
+}
+/**
+ * Convert a run of UCS-4 data into the codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any argument, *source or *dest is NULL,
+ *         otherwise the result of the codec handler's encode()
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+parserutils_error parserutils_charset_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        if (codec == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Input side: need the cursor, the data it points at and a length */
+        if (source == NULL || *source == NULL || sourcelen == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Output side: same three requirements */
+        if (dest == NULL || *dest == NULL || destlen == NULL)
+                return PARSERUTILS_BADPARM;
+        /* The real work is done by the charset-specific handler */
+        return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+/**
+ * Convert a run of data in the codec's charset into UCS-4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any argument, *source or *dest is NULL,
+ *         otherwise the result of the codec handler's decode()
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_charset_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        if (codec == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Input side: need the cursor, the data it points at and a length */
+        if (source == NULL || *source == NULL || sourcelen == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Output side: same three requirements */
+        if (dest == NULL || *dest == NULL || destlen == NULL)
+                return PARSERUTILS_BADPARM;
+        /* The real work is done by the charset-specific handler */
+        return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+/**
+ * Discard a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_BADPARM if codec is NULL,
+ *         otherwise the result of the codec handler's reset()
+ */
+parserutils_error parserutils_charset_codec_reset(
+                parserutils_charset_codec *codec)
+{
+        return (codec == NULL) ? PARSERUTILS_BADPARM
+                        : codec->handler.reset(codec);
+}

 /programs/network/netsurf/libparserutils/src/charset/codecs/8859_tables.h
 0,0 → 1,241
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_codecs_8859tables_h_
+#define parserutils_charset_codecs_8859tables_h_
+/* Mapping tables for ISO-8859-n -> UCS4.
+ * Undefined characters are mapped to U+FFFF,
+ * which is a guaranteed non-character
+ */
+/* ISO-8859-1: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t1[96] = {
+        0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+        0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+        0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+        0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+        0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+        0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+        0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
+        0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+        0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+        0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
+};
+/* ISO-8859-2: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t2[96] = {
+        0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
+        0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
+        0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
+        0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
+        0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
+        0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
+        0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
+        0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
+        0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
+        0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
+        0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
+        0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9,
+};
+static uint32_t t3[96] = {
+x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFF, 0x0124, 0x00A7,
+x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFF, 0x017B,
+x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7,
+x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFF, 0x017C,
+x00C0, 0x00C1, 0x00C2, 0xFFFF, 0x00C4, 0x010A, 0x0108, 0x00C7,
+x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+xFFFF, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7,
+x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
+x00E0, 0x00E1, 0x00E2, 0xFFFF, 0x00E4, 0x010B, 0x0109, 0x00E7,
+x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+xFFFF, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7,
+x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9,
+};
+/* ISO-8859-4: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t4[96] = {
+        0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7,
+        0x00A8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF,
+        0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7,
+        0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E, 0x014B,
+        0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
+        0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A,
+        0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+        0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x0168, 0x016A, 0x00DF,
+        0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
+        0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B,
+        0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+        0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9,
+};
+/* ISO-8859-5: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t5[96] = {
+        0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+        0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F,
+        0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+        0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
+        0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+        0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
+        0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+        0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
+        0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+        0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
+        0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
+        0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F,
+};
+static uint32_t t6[96] = {
+x00A0, 0xFFFF, 0xFFFF, 0xFFFF, 0x00A4, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x060C, 0x00AD, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0x061B, 0xFFFF, 0xFFFF, 0xFFFF, 0x061F,
+xFFFF, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
+x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
+x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
+x0638, 0x0639, 0x063A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
+x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
+x0650, 0x0651, 0x0652, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+};
+static uint32_t t7[96] = {
+x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFF, 0x2015,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7,
+x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
+x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
+x03A0, 0x03A1, 0xFFFF, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
+x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
+x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
+x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
+x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
+x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFF,
+};
+static uint32_t t8[96] = {
+x00A0, 0xFFFF, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2017,
+x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
+x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
+x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
+x05E8, 0x05E9, 0x05EA, 0xFFFF, 0xFFFF, 0x200E, 0x200F, 0xFFFF,
+};
+/* ISO-8859-9: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t9[96] = {
+        0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+        0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+        0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+        0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+        0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+        0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+        0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
+        0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+        0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+        0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF,
+};
+/* ISO-8859-10: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t10[96] = {
+        0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7,
+        0x013B, 0x0110, 0x0160, 0x0166, 0x017D, 0x00AD, 0x016A, 0x014A,
+        0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7,
+        0x013C, 0x0111, 0x0161, 0x0167, 0x017E, 0x2015, 0x016B, 0x014B,
+        0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
+        0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x00CF,
+        0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168,
+        0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
+        0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
+        0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x00EF,
+        0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169,
+        0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x0138,
+};
+static uint32_t t11[96] = {
+x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07,
+x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F,
+x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17,
+x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F,
+x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27,
+x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F,
+x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37,
+x0E38, 0x0E39, 0x0E3A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0E3F,
+x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47,
+x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
+x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57,
+x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+};
+/* ISO-8859-13: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t13[96] = {
+        0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7,
+        0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
+        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7,
+        0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
+        0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
+        0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
+        0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
+        0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
+        0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
+        0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
+        0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
+        0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x2019,
+};
+/* ISO-8859-14: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t14[96] = {
+        0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
+        0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
+        0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
+        0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
+        0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+        0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+        0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
+        0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
+        0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+        0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
+        0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF,
+};
+/* ISO-8859-15: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t15[96] = {
+        0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7,
+        0x0161, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7,
+        0x017E, 0x00B9, 0x00BA, 0x00BB, 0x0152, 0x0153, 0x0178, 0x00BF,
+        0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+        0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+        0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+        0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
+        0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+        0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+        0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
+};
+/* ISO-8859-16: UCS-4 values for bytes 0xA0-0xFF (96 entries) */
+static uint32_t t16[96] = {
+        0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
+        0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
+        0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
+        0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
+        0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
+        0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+        0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
+        0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
+        0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+        0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
+        0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF,
+};
+#endif

/programs/network/netsurf/libparserutils/src/charset/codecs/Makefile
0,0 → 1,5
OUTFILE = libo.o
OBJS = codec_ascii.o codec_8859.o codec_ext8.o \
codec_utf8.o codec_utf16.o
CFLAGS += -I ../../../include/ -I ../../../../ -I ../../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_8859.c
 0,0 → 1,596
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include "charset/codecs/codec_impl.h"
+#include "utils/endian.h"
+#include "utils/utils.h"
+#include "charset/codecs/8859_tables.h"
+static struct {
+        uint16_t mib;
+        const char *name;
+        size_t len;
+        uint32_t *table;
+} known_charsets[] = {
+        { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
+        { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
+        { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
+        { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
+        { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
+        { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
+        { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
+        { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
+        { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
+        { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
+        { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
+        { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
+        { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
+        { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
+        { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
+};
+/* Capacity, in characters, of the codec's two pending-output buffers */
+#define READ_BUFSIZE (8)
+#define WRITE_BUFSIZE (8)
+/**
+ * Per-instance state of an ISO-8859-n codec
+ */
+typedef struct charset_8859_codec {
+        /** Base class */
+        parserutils_charset_codec base;
+        /** Mapping table for 0xA0-0xFF */
+        uint32_t *table;
+        /** Partial output sequences produced while decoding (host-endian) */
+        uint32_t read_buf[READ_BUFSIZE];
+        /** Character length of read_buf */
+        size_t read_len;
+        /** Partial output sequences produced while encoding (host-endian) */
+        uint32_t write_buf[WRITE_BUFSIZE];
+        /** Character length of write_buf */
+        size_t write_len;
+} charset_8859_codec;
+/* Codec operations */
+static bool charset_8859_codec_handles_charset(
+                const char *charset);
+static parserutils_error charset_8859_codec_create(
+                const char *charset,
+                parserutils_alloc alloc,
+                void *pw,
+                parserutils_charset_codec **codec);
+static parserutils_error charset_8859_codec_destroy(
+                parserutils_charset_codec *codec);
+static parserutils_error charset_8859_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source,
+                size_t *sourcelen,
+                uint8_t **dest,
+                size_t *destlen);
+static parserutils_error charset_8859_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source,
+                size_t *sourcelen,
+                uint8_t **dest,
+                size_t *destlen);
+static parserutils_error charset_8859_codec_reset(
+                parserutils_charset_codec *codec);
+/* Inline helpers operating on the concrete codec type */
+static inline parserutils_error charset_8859_codec_read_char(
+                charset_8859_codec *c,
+                const uint8_t **source,
+                size_t *sourcelen,
+                uint8_t **dest,
+                size_t *destlen);
+static inline parserutils_error charset_8859_codec_output_decoded_char(
+                charset_8859_codec *c,
+                uint32_t ucs4,
+                uint8_t **dest,
+                size_t *destlen);
+static inline parserutils_error charset_8859_from_ucs4(
+                charset_8859_codec *c,
+                uint32_t ucs4,
+                uint8_t **s,
+                size_t *len);
+static inline parserutils_error charset_8859_to_ucs4(
+                charset_8859_codec *c,
+                const uint8_t *s,
+                size_t len,
+                uint32_t *ucs4);
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ *
+ * The MIB enums in known_charsets are resolved from their names the first
+ * time this function runs. A charset whose own MIB enum comes back as 0
+ * is never reported as handled, so it cannot accidentally match a table
+ * entry whose MIB enum is also still 0.
+ */
+bool charset_8859_codec_handles_charset(const char *charset)
+{
+        uint32_t i;
+        uint16_t match = parserutils_charset_mibenum_from_name(charset,
+                        strlen(charset));
+        /* Lazily resolve the table; done before any early exit so that
+         * later lookups see populated MIB enums */
+        if (known_charsets[0].mib == 0) {
+                for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
+                        known_charsets[i].mib =
+                                parserutils_charset_mibenum_from_name(
+                                                known_charsets[i].name,
+                                                known_charsets[i].len);
+                }
+        }
+        /* 0 is the "unresolved" value (the US-ASCII codec applies the same
+         * guard); it must not be compared against the table */
+        if (match == 0)
+                return false;
+        for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
+                if (known_charsets[i].mib == match)
+                        return true;
+        }
+        return false;
+}
+/**
+ * Create an ISO-8859-n codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \param codec    Pointer to location to receive codec
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if charset matches no entry in known_charsets,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ *
+ * The lookup compares against the MIB enums stored in known_charsets, which
+ * are filled in by charset_8859_codec_handles_charset(); that function is
+ * therefore expected to have been called first.
+ */
+parserutils_error charset_8859_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        uint32_t i;
+        charset_8859_codec *c;
+        uint16_t match = parserutils_charset_mibenum_from_name(
+                        charset, strlen(charset));
+        uint32_t *table = NULL;
+        for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
+                if (known_charsets[i].mib == match) {
+                        table = known_charsets[i].table;
+                        break;
+                }
+        }
+        /* Catch misuse loudly in debug builds */
+        assert(table != NULL);
+        /* When assertions are compiled out, refuse the request rather than
+         * build a codec that would dereference a NULL table later */
+        if (table == NULL)
+                return PARSERUTILS_BADPARM;
+        c = alloc(NULL, sizeof(charset_8859_codec), pw);
+        if (c == NULL)
+                return PARSERUTILS_NOMEM;
+        c->table = table;
+        /* Nothing is pending in either direction yet */
+        c->read_buf[0] = 0;
+        c->read_len = 0;
+        c->write_buf[0] = 0;
+        c->write_len = 0;
+        /* Finally, populate vtable */
+        c->base.handler.destroy = charset_8859_codec_destroy;
+        c->base.handler.encode = charset_8859_codec_encode;
+        c->base.handler.decode = charset_8859_codec_decode;
+        c->base.handler.reset = charset_8859_codec_reset;
+        *codec = (parserutils_charset_codec *) c;
+        return PARSERUTILS_OK;
+}
+/**
+ * Destroy an ISO-8859-n codec
+ *
+ * \param codec  The codec to destroy
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * This implementation releases nothing and always returns PARSERUTILS_OK.
+ * It does not free the codec object itself; presumably whoever invokes
+ * this handler does that -- TODO confirm against the caller.
+ */
+parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
+{
+        UNUSED(codec);
+        return PARSERUTILS_OK;
+}
+/**
+ * Encode a chunk of UCS-4 (big endian) data into ISO-8859-n
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          once all input has been consumed,
+ *         PARSERUTILS_NOMEM       if the output buffer filled up,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                                 codec's error handling mode is set to STRICT
+ *
+ * Characters held over from the previous call are written first; if that
+ * fails, ::source and ::sourcelen are left untouched.
+ *
+ * Input is then consumed four bytes at a time. When the output buffer fills
+ * up, the character that did not fit is stored in the codec and counted as
+ * consumed, so ::source points just past it on return. On _INVALID,
+ * ::source is left pointing at the offending character.
+ *
+ * ::dest is advanced past, and ::destlen reduced by, the bytes written.
+ */
+parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_8859_codec *self = (charset_8859_codec *) codec;
+        parserutils_error status;
+        size_t sent = 0;
+        /* Drain what the previous call had to hold back (no-op if empty) */
+        while (self->write_len > 0) {
+                status = charset_8859_from_ucs4(self, self->write_buf[sent],
+                                dest, destlen);
+                if (status != PARSERUTILS_OK) {
+                        size_t k;
+                        assert(status == PARSERUTILS_NOMEM);
+                        /* Move the unsent tail to the front of the buffer */
+                        for (k = 0; k < self->write_len; k++)
+                                self->write_buf[k] = self->write_buf[sent + k];
+                        return status;
+                }
+                sent++;
+                self->write_len--;
+        }
+        /* Then work through this call's input, one character at a time */
+        while (*sourcelen > 0) {
+                uint32_t ch = endian_big_to_host(
+                                *((uint32_t *) (void *) *source));
+                uint32_t *out = &ch;
+                size_t remaining;
+                for (remaining = 1; remaining > 0; remaining--, out++) {
+                        size_t k;
+                        status = charset_8859_from_ucs4(self, out[0], dest,
+                                        destlen);
+                        if (status == PARSERUTILS_OK)
+                                continue;
+                        if (status != PARSERUTILS_NOMEM)
+                                return status;
+                        /* Output is full: park the rest for next time */
+                        if (remaining >= WRITE_BUFSIZE)
+                                abort();
+                        self->write_len = remaining;
+                        for (k = 0; k < remaining; k++)
+                                self->write_buf[k] = out[k];
+                        /* The parked character counts as consumed, so it
+                         * is not presented again by the caller */
+                        *source += 4;
+                        *sourcelen -= 4;
+                        return PARSERUTILS_NOMEM;
+                }
+                *source += 4;
+                *sourcelen -= 4;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode a chunk of ISO-8859-n data into UCS-4 (big endian)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          once all input has been consumed,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if an input byte is not valid in the
+ *                                 charset and the codec's error handling
+ *                                 mode is set to STRICT
+ *
+ * Output held over from the previous call is written first. If the output
+ * buffer cannot take all of it, nothing is written, _NOMEM is returned and
+ * ::source and ::sourcelen are unchanged.
+ *
+ * After that, input is converted one byte at a time by
+ * charset_8859_codec_read_char() until it is used up or that function
+ * reports something other than _OK, which is returned as is. On _OK or
+ * _NOMEM ::source points just past the last byte read; on _INVALID it
+ * points at the offending byte.
+ *
+ * ::dest is advanced past, and ::destlen reduced by, the bytes written.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_8859_codec *self = (charset_8859_codec *) codec;
+        parserutils_error status = PARSERUTILS_OK;
+        size_t flushed = 0;
+        /* Write out leftovers while all that remain still fit; with
+         * nothing pending neither this loop nor the check below fires */
+        while (self->read_len > 0 && *destlen >= self->read_len * 4) {
+                *((uint32_t *) (void *) *dest) =
+                                endian_host_to_big(self->read_buf[flushed]);
+                *dest += 4;
+                *destlen -= 4;
+                flushed++;
+                self->read_len--;
+        }
+        if (*destlen < self->read_len * 4) {
+                size_t k;
+                /* No room: keep what is left at the start of the buffer */
+                for (k = 0; k < self->read_len; k++)
+                        self->read_buf[k] = self->read_buf[flushed + k];
+                return PARSERUTILS_NOMEM;
+        }
+        /* Convert input until it runs out or a character fails */
+        while (*sourcelen > 0 && status == PARSERUTILS_OK) {
+                status = charset_8859_codec_read_char(self,
+                                source, sourcelen, dest, destlen);
+        }
+        return status;
+}
+/**
+ * Clear an ISO-8859-n codec's encoding state
+ *
+ * Empties both holding buffers, dropping any characters still waiting to
+ * be written by encode or decode.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (this implementation cannot fail)
+ */
+parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
+{
+        charset_8859_codec *self = (charset_8859_codec *) codec;
+        /* Decode side */
+        self->read_len = 0;
+        self->read_buf[0] = 0;
+        /* Encode side */
+        self->write_len = 0;
+        self->write_buf[0] = 0;
+        return PARSERUTILS_OK;
+}
+/**
+ * Read a character from the ISO-8859-n to UCS-4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_NEEDDATA    if the source buffer is empty,
+ *         PARSERUTILS_INVALID     if the input byte is not valid in the
+ *                                 charset and the codec's error handling
+ *                                 mode is set to STRICT
+ *
+ * On exit, ::source will point immediately _after_ the input byte read, if
+ * the result is _OK or _NOMEM. In the _NOMEM case the decoded character is
+ * buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * offending byte; nothing will be written or buffered for it. It is up to
+ * the client to fix the cause of the failure and retry the decoding process.
+ *
+ * When the error handling mode is not STRICT, an invalid byte is replaced
+ * by U+FFFD and processing continues as for a valid character.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_8859_codec_read_char(charset_8859_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        uint32_t ucs4;
+        parserutils_error error;
+        /* Convert a single character */
+        error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
+        if (error == PARSERUTILS_OK) {
+                /* Read a character */
+                error = charset_8859_codec_output_decoded_char(c,
+                                ucs4, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Written or buffered either way; consume the byte */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        } else if (error == PARSERUTILS_NEEDDATA) {
+                /* Can only happen if sourcelen == 0 */
+                return error;
+        } else if (error == PARSERUTILS_INVALID) {
+                /* Illegal input sequence */
+                /* Strict errormode; simply flag invalid character */
+                if (c->base.errormode ==
+                                PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+                        return PARSERUTILS_INVALID;
+                }
+                /* output U+FFFD and continue processing. */
+                error = charset_8859_codec_output_decoded_char(c,
+                                0xFFFD, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Written or buffered either way; consume the byte */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Output a UCS-4 character (big endian)
+ *
+ * If fewer than four bytes of output space remain, the character is stored
+ * as the codec's single pending decoded character instead of being written.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS-4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          if the character was written,
+ *         PARSERUTILS_NOMEM       if it was buffered for lack of space
+ */
+parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+        if (*destlen >= 4) {
+                /* Room for one more: store it big-endian and advance */
+                *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
+                *dest += 4;
+                *destlen -= 4;
+                return PARSERUTILS_OK;
+        }
+        /* Output exhausted: hold the character until the next decode call */
+        c->read_buf[0] = ucs4;
+        c->read_len = 1;
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Convert a UCS4 (host endian) character to ISO-8859-n
+ *
+ * \param c     The codec instance
+ * \param ucs4  The UCS4 character to convert
+ * \param s     Pointer to pointer to destination buffer
+ * \param len   Pointer to destination buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
+ *         PARSERUTILS_INVALID if the character cannot be represented
+ *
+ * _INVALID will only be returned if the codec's conversion mode is STRICT.
+ * Otherwise, '?' will be output.
+ *
+ * Values below 0x80 are emitted unchanged. Anything else must appear in
+ * the codec's 96-entry mapping table and is emitted as 0xA0 plus its index.
+ * 0xFFFF is never looked up, because the table uses that value to mark
+ * bytes with no mapping; it is always treated as unrepresentable.
+ *
+ * On successful conversion, *s and *len will be updated.
+ */
+parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
+                uint32_t ucs4, uint8_t **s, size_t *len)
+{
+        uint8_t out = 0;
+        if (*len < 1)
+                return PARSERUTILS_NOMEM;
+        if (ucs4 < 0x80) {
+                /* ASCII */
+                out = ucs4;
+        } else {
+                /* Start at "not found" so a skipped search falls through */
+                uint32_t i = 96;
+                /* 0xFFFF is the unmapped-slot marker checked by
+                 * charset_8859_to_ucs4(); matching on it would emit a byte
+                 * that has no meaning in this charset */
+                if (ucs4 != 0xFFFF) {
+                        for (i = 0; i < 96; i++) {
+                                if (ucs4 == c->table[i])
+                                        break;
+                        }
+                }
+                if (i == 96) {
+                        if (c->base.errormode ==
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
+                                return PARSERUTILS_INVALID;
+                        else
+                                out = '?';
+                } else {
+                        out = 0xA0 + i;
+                }
+        }
+        *(*s) = out;
+        (*s)++;
+        (*len)--;
+        return PARSERUTILS_OK;
+}
+/**
+ * Convert an ISO-8859-n character to UCS4 (host endian)
+ *
+ * Bytes below 0x80 map to themselves, bytes from 0xA0 upwards go through
+ * the codec's mapping table, and bytes in between are rejected.
+ *
+ * \param c     The codec instance
+ * \param s     Pointer to source buffer
+ * \param len   Source buffer length
+ * \param ucs4  Pointer to destination buffer (written only on success)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NEEDDATA if there's insufficient input data
+ *         PARSERUTILS_INVALID if the byte is in 0x80-0x9F or its table
+ *                             entry is 0xFFFF
+ */
+parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
+                const uint8_t *s, size_t len, uint32_t *ucs4)
+{
+        uint32_t mapped;
+        if (len < 1)
+                return PARSERUTILS_NEEDDATA;
+        if (*s < 0x80) {
+                /* Lower half passes straight through */
+                *ucs4 = *s;
+                return PARSERUTILS_OK;
+        }
+        if (*s < 0xA0)
+                return PARSERUTILS_INVALID;
+        /* Upper range: table entry 0 corresponds to byte 0xA0 */
+        mapped = c->table[*s - 0xA0];
+        if (mapped == 0xFFFF)
+                return PARSERUTILS_INVALID;
+        *ucs4 = mapped;
+        return PARSERUTILS_OK;
+}
+/**
+ * Handler descriptor for the ISO-8859-n codec: the charset test followed
+ * by the constructor.
+ */
+const parserutils_charset_handler charset_8859_codec_handler = {
+        charset_8859_codec_handles_charset,
+        charset_8859_codec_create
+};

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_ascii.c
 ,0 → 1,536
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include "charset/codecs/codec_impl.h"
+#include "utils/endian.h"
+#include "utils/utils.h"
+/**
+ * US-ASCII charset codec
+ *
+ * Beyond the base class, the only state is holding space for characters
+ * that could not be written out yet.
+ */
+typedef struct charset_ascii_codec {
+        parserutils_charset_codec base; /**< Base class */
+#define READ_BUFSIZE (8)
+        uint32_t read_buf[READ_BUFSIZE];        /**< Decoded characters still
+                                                 * to be written to the output
+                                                 * buffer (decode)
+                                                 * (host-endian) */
+        size_t read_len;                /**< Character length of read_buf */
+#define WRITE_BUFSIZE (8)
+        uint32_t write_buf[WRITE_BUFSIZE];      /**< Characters awaiting
+                                                 * encoding because the output
+                                                 * buffer was full (encode)
+                                                 * (host-endian) */
+        size_t write_len;               /**< Character length of write_buf */
+} charset_ascii_codec;
+/* Charset test and constructor */
+static bool charset_ascii_codec_handles_charset(const char *charset);
+static parserutils_error charset_ascii_codec_create(
+                const char *charset, parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec);
+/* Implementations installed in each codec's vtable by _create() */
+static parserutils_error charset_ascii_codec_destroy(
+                parserutils_charset_codec *codec);
+static parserutils_error charset_ascii_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static parserutils_error charset_ascii_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static parserutils_error charset_ascii_codec_reset(
+                parserutils_charset_codec *codec);
+/* Decode helpers */
+static inline parserutils_error charset_ascii_codec_read_char(
+                charset_ascii_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_ascii_codec_output_decoded_char(
+                charset_ascii_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen);
+/* Single-character conversions between UCS-4 and US-ASCII */
+static inline parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
+                uint32_t ucs4, uint8_t **s, size_t *len);
+static inline parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
+                const uint8_t *s, size_t len, uint32_t *ucs4);
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * The MIB enum for "US-ASCII" is looked up on first use and cached; while
+ * the cached value is still 0 the lookup is retried on every call and no
+ * charset is reported as handled.
+ *
+ * \param charset  Charset to test
+ * \return true if charset resolves to the same MIB enum as US-ASCII,
+ *         false otherwise
+ */
+bool charset_ascii_codec_handles_charset(const char *charset)
+{
+        static uint16_t ascii_mib;
+        const uint16_t wanted = parserutils_charset_mibenum_from_name(charset,
+                        strlen(charset));
+        if (ascii_mib == 0) {
+                ascii_mib = parserutils_charset_mibenum_from_name(
+                                "US-ASCII", SLEN("US-ASCII"));
+        }
+        return (ascii_mib != 0 && ascii_mib == wanted);
+}
+/**
+ * Create a US-ASCII codec
+ *
+ * \param charset  The charset to read from / write to (not inspected)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \param codec    Pointer to location to receive codec
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ */
+parserutils_error charset_ascii_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        charset_ascii_codec *self;
+        UNUSED(charset);
+        self = alloc(NULL, sizeof(*self), pw);
+        if (self == NULL)
+                return PARSERUTILS_NOMEM;
+        /* Install this codec's operations in the vtable */
+        self->base.handler.reset = charset_ascii_codec_reset;
+        self->base.handler.decode = charset_ascii_codec_decode;
+        self->base.handler.encode = charset_ascii_codec_encode;
+        self->base.handler.destroy = charset_ascii_codec_destroy;
+        /* Both holding buffers start out empty */
+        self->write_len = 0;
+        self->write_buf[0] = 0;
+        self->read_len = 0;
+        self->read_buf[0] = 0;
+        *codec = (parserutils_charset_codec *) self;
+        return PARSERUTILS_OK;
+}
+/**
+ * Destroy a US-ASCII codec
+ *
+ * \param codec  The codec to destroy
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * This implementation releases nothing and always returns PARSERUTILS_OK.
+ * It does not free the codec object itself; presumably whoever invokes
+ * this handler does that -- TODO confirm against the caller.
+ */
+parserutils_error charset_ascii_codec_destroy (parserutils_charset_codec *codec)
+{
+        UNUSED(codec);
+        return PARSERUTILS_OK;
+}
+/**
+ * Encode a chunk of UCS-4 (big endian) data into US-ASCII
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          once all input has been consumed,
+ *         PARSERUTILS_NOMEM       if the output buffer filled up,
+ *         otherwise whatever error charset_ascii_from_ucs4() reported
+ *         (per the original contract, PARSERUTILS_INVALID for an
+ *         unrepresentable character in STRICT mode)
+ *
+ * Characters held over from the previous call are written first; if that
+ * fails, ::source and ::sourcelen are left untouched.
+ *
+ * Input is then consumed four bytes at a time. When the output buffer fills
+ * up, the character that did not fit is stored in the codec and counted as
+ * consumed, so ::source points just past it on return. On any other
+ * error, ::source is left pointing at the offending character.
+ *
+ * ::dest and ::destlen are handed to charset_ascii_from_ucs4(), which is
+ * expected to advance them as it writes.
+ */
+parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_ascii_codec *self = (charset_ascii_codec *) codec;
+        parserutils_error status;
+        size_t sent = 0;
+        /* Drain what the previous call had to hold back (no-op if empty) */
+        while (self->write_len > 0) {
+                status = charset_ascii_from_ucs4(self, self->write_buf[sent],
+                                dest, destlen);
+                if (status != PARSERUTILS_OK) {
+                        size_t k;
+                        assert(status == PARSERUTILS_NOMEM);
+                        /* Move the unsent tail to the front of the buffer */
+                        for (k = 0; k < self->write_len; k++)
+                                self->write_buf[k] = self->write_buf[sent + k];
+                        return status;
+                }
+                sent++;
+                self->write_len--;
+        }
+        /* Then work through this call's input, one character at a time */
+        while (*sourcelen > 0) {
+                uint32_t ch = endian_big_to_host(
+                                *((uint32_t *) (void *) *source));
+                uint32_t *out = &ch;
+                size_t remaining;
+                for (remaining = 1; remaining > 0; remaining--, out++) {
+                        size_t k;
+                        status = charset_ascii_from_ucs4(self, out[0], dest,
+                                        destlen);
+                        if (status == PARSERUTILS_OK)
+                                continue;
+                        if (status != PARSERUTILS_NOMEM)
+                                return status;
+                        /* Output is full: park the rest for next time */
+                        if (remaining >= WRITE_BUFSIZE)
+                                abort();
+                        self->write_len = remaining;
+                        for (k = 0; k < remaining; k++)
+                                self->write_buf[k] = out[k];
+                        /* The parked character counts as consumed, so it
+                         * is not presented again by the caller */
+                        *source += 4;
+                        *sourcelen -= 4;
+                        return PARSERUTILS_NOMEM;
+                }
+                *source += 4;
+                *sourcelen -= 4;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode a chunk of US-ASCII data into UCS-4 (big endian)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          once all input has been consumed,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         otherwise the first non-OK result reported by
+ *         charset_ascii_codec_read_char()
+ *
+ * Output held over from the previous call is written first. If the output
+ * buffer cannot take all of it, nothing is written, _NOMEM is returned and
+ * ::source and ::sourcelen are unchanged.
+ *
+ * After that, input is handed to charset_ascii_codec_read_char() until
+ * ::sourcelen reaches zero or that function reports something other than
+ * _OK; it is responsible for updating ::source, ::sourcelen, ::dest and
+ * ::destlen for the characters it processes.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_ascii_codec *self = (charset_ascii_codec *) codec;
+        parserutils_error status = PARSERUTILS_OK;
+        size_t flushed = 0;
+        /* Write out leftovers while all that remain still fit; with
+         * nothing pending neither this loop nor the check below fires */
+        while (self->read_len > 0 && *destlen >= self->read_len * 4) {
+                *((uint32_t *) (void *) *dest) =
+                                endian_host_to_big(self->read_buf[flushed]);
+                *dest += 4;
+                *destlen -= 4;
+                flushed++;
+                self->read_len--;
+        }
+        if (*destlen < self->read_len * 4) {
+                size_t k;
+                /* No room: keep what is left at the start of the buffer */
+                for (k = 0; k < self->read_len; k++)
+                        self->read_buf[k] = self->read_buf[flushed + k];
+                return PARSERUTILS_NOMEM;
+        }
+        /* Convert input until it runs out or a character fails */
+        while (*sourcelen > 0 && status == PARSERUTILS_OK) {
+                status = charset_ascii_codec_read_char(self,
+                                source, sourcelen, dest, destlen);
+        }
+        return status;
+}
+/**
+ * Reset a US-ASCII codec to its initial state
+ *
+ * Both pending-character counts are zeroed and the first slot of each
+ * holding buffer is cleared, so nothing buffered by an earlier encode or
+ * decode call will be emitted afterwards.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (no failure path exists in this implementation)
+ */
+parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
+{
+        charset_ascii_codec *ascii = (charset_ascii_codec *) codec;
+        /* Encode-side holding buffer */
+        ascii->write_len = 0;
+        ascii->write_buf[0] = 0;
+        /* Decode-side holding buffer */
+        ascii->read_len = 0;
+        ascii->read_buf[0] = 0;
+        return PARSERUTILS_OK;
+}
+/**
+ * Read a character from US-ASCII to UCS-4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_NEEDDATA    if ::sourcelen is zero,
+ *         PARSERUTILS_INVALID     if the input byte is not US-ASCII and the
+ *                                 codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the input character
+ * read, if the result is _OK or _NOMEM. In the _NOMEM case the decoded
+ * character has been buffered by the codec for writing on the next call,
+ * which is why the input byte is still consumed.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * offending input character; nothing will be written or buffered for it.
+ * It is up to the client to fix the cause of the failure and retry the
+ * decoding process.
+ *
+ * When the error mode is not STRICT, an invalid input byte is replaced by
+ * U+FFFD (REPLACEMENT CHARACTER) and consumed like a valid one.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        uint32_t ucs4;
+        parserutils_error error;
+        /* Convert a single character */
+        error = charset_ascii_to_ucs4(c, *source, *sourcelen, &ucs4);
+        if (error == PARSERUTILS_OK) {
+                /* Read a character */
+                error = charset_ascii_codec_output_decoded_char(c,
+                                ucs4, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Character was written or buffered by the codec;
+                         * either way the input byte has been used up */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        } else if (error == PARSERUTILS_NEEDDATA) {
+                /* Can only happen if sourcelen == 0 */
+                return error;
+        } else if (error == PARSERUTILS_INVALID) {
+                /* Illegal input sequence */
+                /* Strict errormode; simply flag invalid character */
+                if (c->base.errormode ==
+                                PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+                        return PARSERUTILS_INVALID;
+                }
+                /* output U+FFFD and continue processing. */
+                error = charset_ascii_codec_output_decoded_char(c,
+                                0xFFFD, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Replacement was written or buffered; skip the
+                         * offending input byte */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Emit one decoded character as big-endian UCS-4
+ *
+ * When fewer than four bytes of output space remain, the character is
+ * parked in the codec's read buffer (as its sole entry) instead of being
+ * written, and _NOMEM is reported.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS-4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer (advanced by 4 on
+ *                 success)
+ * \param destlen  Pointer to output buffer length (reduced by 4 on success)
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error charset_ascii_codec_output_decoded_char(
+                charset_ascii_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+        if (*destlen >= 4) {
+                uint32_t *slot = (uint32_t *) (void *) *dest;
+                *slot = endian_host_to_big(ucs4);
+                *dest += 4;
+                *destlen -= 4;
+                return PARSERUTILS_OK;
+        }
+        /* No room for a full UCS-4 unit: hold it for the next call */
+        c->read_buf[0] = ucs4;
+        c->read_len = 1;
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Convert a UCS4 (host endian) character to US-ASCII
+ *
+ * Code points below 0x80 are emitted unchanged. Anything else is either
+ * rejected (STRICT error mode) or replaced by '?'.
+ *
+ * \param c     The codec instance
+ * \param ucs4  The UCS4 character to convert
+ * \param s     Pointer to pointer to destination buffer
+ * \param len   Pointer to destination buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
+ *         PARSERUTILS_INVALID if the character cannot be represented and
+ *                             the codec's error mode is STRICT
+ *
+ * On successful conversion, *s is advanced by one byte and *len is reduced
+ * by one. On any error neither is modified.
+ */
+parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
+                uint32_t ucs4, uint8_t **s, size_t *len)
+{
+        const int is_ascii = (ucs4 < 0x80);
+        /* The space check comes first, so _NOMEM wins over _INVALID */
+        if (*len < 1)
+                return PARSERUTILS_NOMEM;
+        if (!is_ascii && c->base.errormode ==
+                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
+                return PARSERUTILS_INVALID;
+        **s = is_ascii ? (uint8_t) ucs4 : (uint8_t) '?';
+        *s += 1;
+        *len -= 1;
+        return PARSERUTILS_OK;
+}
+/**
+ * Convert a US-ASCII character to UCS4 (host endian)
+ *
+ * Only the first byte of the source buffer is examined.
+ *
+ * \param c     The codec instance (unused)
+ * \param s     Pointer to source buffer
+ * \param len   Source buffer length
+ * \param ucs4  Pointer to destination buffer; written only on success
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NEEDDATA if the source buffer is empty
+ *         PARSERUTILS_INVALID if the byte is 0x80 or above
+ */
+parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
+                const uint8_t *s, size_t len, uint32_t *ucs4)
+{
+        UNUSED(c);
+        if (len < 1)
+                return PARSERUTILS_NEEDDATA;
+        /* Bytes with the top bit set are outside US-ASCII */
+        if (*s >= 0x80)
+                return PARSERUTILS_INVALID;
+        *ucs4 = *s;
+        return PARSERUTILS_OK;
+}
+const parserutils_charset_handler charset_ascii_codec_handler = {
+        charset_ascii_codec_handles_charset,    /* handles_charset */
+        charset_ascii_codec_create              /* create */
+};

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_ext8.c
 ,0 → 1,588
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include "charset/codecs/codec_impl.h"
+#include "utils/endian.h"
+#include "utils/utils.h"
+#include "charset/codecs/ext8_tables.h"
+static struct {
+        uint16_t mib;           /* MIB enum for the charset; starts as 0 and
+                                 * is filled in on first use by
+                                 * charset_ext8_codec_handles_charset() */
+        const char *name;       /* Charset name used for the MIB lookup */
+        size_t len;             /* Length of name, computed with SLEN() */
+        uint32_t *table;        /* Mapping table for the charset; presumably
+                                 * declared in ext8_tables.h (TODO confirm) */
+} known_charsets[] = {
+        { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
+        { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
+        { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
+        { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
+        { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
+        { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
+        { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
+        { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
+        { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
+};
+/**
+ * Extended 8bit charset codec (used for the Windows-125x charsets listed
+ * in known_charsets)
+ *
+ * Bytes below 0x80 map to themselves; bytes 0x80-0xFF are looked up in
+ * ::table. The read/write buffers hold characters that could not be
+ * emitted because the caller's output buffer was full.
+ */
+typedef struct charset_ext8_codec {
+        parserutils_charset_codec base; /**< Base class */
+        uint32_t *table;                /**< Mapping table for 0x80-0xFF,
+                                         * indexed by (byte - 0x80); an
+                                         * entry of 0xFFFF marks a byte
+                                         * with no mapping */
+#define READ_BUFSIZE (8)
+        uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
+                                                 * output sequences (decode)
+                                                 * (host-endian) */
+        size_t read_len;                /**< Character length of read_buf */
+#define WRITE_BUFSIZE (8)
+        uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
+                                                 * output sequences (encode)
+                                                 * (host-endian) */
+        size_t write_len;               /**< Character length of write_buf */
+} charset_ext8_codec;
+static bool charset_ext8_codec_handles_charset(const char *charset);
+static parserutils_error charset_ext8_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec);
+static parserutils_error charset_ext8_codec_destroy(
+                parserutils_charset_codec *codec);
+static parserutils_error charset_ext8_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static parserutils_error charset_ext8_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static parserutils_error charset_ext8_codec_reset(
+                parserutils_charset_codec *codec);
+static inline parserutils_error charset_ext8_codec_read_char(
+                charset_ext8_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_ext8_codec_output_decoded_char(
+                charset_ext8_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
+                uint32_t ucs4, uint8_t **s, size_t *len);
+static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
+                const uint8_t *s, size_t len, uint32_t *ucs4);
+/**
+ * Report whether this codec can handle the named charset
+ *
+ * The name is resolved to a MIB enum and compared against the MIB enums
+ * of the entries in known_charsets. Those MIB enums are resolved lazily:
+ * the table is filled in on a call that finds the first entry still 0.
+ *
+ * \param charset  Charset to test (NUL-terminated name)
+ * \return true if handleable, false otherwise
+ */
+bool charset_ext8_codec_handles_charset(const char *charset)
+{
+        size_t idx;
+        const uint16_t wanted = parserutils_charset_mibenum_from_name(charset,
+                        strlen(charset));
+        /* First entry still zero => the MIB enums have not been
+         * resolved yet, so resolve every entry now */
+        if (known_charsets[0].mib == 0) {
+                for (idx = 0; idx < N_ELEMENTS(known_charsets); idx++) {
+                        const char *name = known_charsets[idx].name;
+                        size_t namelen = known_charsets[idx].len;
+                        known_charsets[idx].mib =
+                                parserutils_charset_mibenum_from_name(
+                                                name, namelen);
+                }
+        }
+        idx = 0;
+        while (idx < N_ELEMENTS(known_charsets)) {
+                if (known_charsets[idx].mib == wanted)
+                        return true;
+                idx++;
+        }
+        return false;
+}
+/**
+ * Create an extended 8bit codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \param codec    Pointer to location to receive codec
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if the charset is not one this codec knows,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ *
+ * The charset is matched by MIB enum against known_charsets.
+ * NOTE(review): the MIB enums in known_charsets are only filled in by
+ * charset_ext8_codec_handles_charset(); presumably callers always invoke
+ * that before creating a codec -- confirm against the codec factory.
+ *
+ * Only the mapping table, the holding buffers and the vtable are set up
+ * here; the remaining base-class fields are not touched by this function.
+ */
+parserutils_error charset_ext8_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        uint32_t i;
+        charset_ext8_codec *c;
+        uint16_t match = parserutils_charset_mibenum_from_name(
+                        charset, strlen(charset));
+        uint32_t *table = NULL;
+        for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
+                if (known_charsets[i].mib == match) {
+                        table = known_charsets[i].table;
+                        break;
+                }
+        }
+        /* Debug builds still trap here. With NDEBUG the assert vanishes,
+         * so fail explicitly rather than create a codec whose NULL table
+         * would be dereferenced on the first non-ASCII character. */
+        assert(table != NULL);
+        if (table == NULL)
+                return PARSERUTILS_BADPARM;
+        c = alloc(NULL, sizeof(charset_ext8_codec), pw);
+        if (c == NULL)
+                return PARSERUTILS_NOMEM;
+        c->table = table;
+        c->read_buf[0] = 0;
+        c->read_len = 0;
+        c->write_buf[0] = 0;
+        c->write_len = 0;
+        /* Finally, populate vtable */
+        c->base.handler.destroy = charset_ext8_codec_destroy;
+        c->base.handler.encode = charset_ext8_codec_encode;
+        c->base.handler.decode = charset_ext8_codec_decode;
+        c->base.handler.reset = charset_ext8_codec_reset;
+        *codec = (parserutils_charset_codec *) c;
+        return PARSERUTILS_OK;
+}
+/**
+ * Tear down an extended 8bit codec
+ *
+ * This implementation performs no work of its own: the argument is ignored
+ * and success is always reported.
+ *
+ * \param codec  The codec to destroy (unused)
+ * \return PARSERUTILS_OK
+ */
+parserutils_error charset_ext8_codec_destroy(parserutils_charset_codec *codec)
+{
+        /* Nothing to release here */
+        UNUSED(codec);
+        return PARSERUTILS_OK;
+}
+/**
+ * Encode a chunk of UCS-4 (big endian) data into extended 8bit
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                                 codec's error handling mode is set to STRICT,
+ *
+ * The input is consumed one 4-byte UCS-4 unit at a time, so ::sourcelen is
+ * assumed to be a multiple of 4 (NOTE(review): this is not checked here --
+ * confirm that callers guarantee it).
+ *
+ * On exit with _OK or _NOMEM, ::source will point immediately _after_ the
+ * last input character read. If the output buffer filled up while writing
+ * a character, that character is saved in the codec's write buffer for
+ * writing on the next call and is still counted as read.
+ *
+ * On exit with _INVALID, ::source is left pointing _at_ the character
+ * that could not be represented; it is neither written nor buffered.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_ext8_codec *c = (charset_ext8_codec *) codec;
+        uint32_t ucs4;
+        uint32_t *towrite;
+        size_t towritelen;
+        parserutils_error error;
+        /* Process any outstanding characters from the previous call */
+        if (c->write_len > 0) {
+                uint32_t *pwrite = c->write_buf;
+                while (c->write_len > 0) {
+                        error = charset_ext8_from_ucs4(c, pwrite[0],
+                                        dest, destlen);
+                        if (error != PARSERUTILS_OK) {
+                                uint32_t len;
+                                /* Only lack of output space is expected
+                                 * when flushing buffered characters */
+                                assert(error == PARSERUTILS_NOMEM);
+                                /* Move the still-unwritten characters to
+                                 * the front of the buffer for next time */
+                                for (len = 0; len < c->write_len; len++) {
+                                        c->write_buf[len] = pwrite[len];
+                                }
+                                return error;
+                        }
+                        pwrite++;
+                        c->write_len--;
+                }
+        }
+        /* Now process the characters for this call */
+        while (*sourcelen > 0) {
+                /* Fetch the next big-endian UCS-4 unit in host order */
+                ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
+                towrite = &ucs4;
+                towritelen = 1;
+                /* Output current characters */
+                while (towritelen > 0) {
+                        error = charset_ext8_from_ucs4(c, towrite[0], dest,
+                                        destlen);
+                        if (error != PARSERUTILS_OK) {
+                                uint32_t len;
+                                if (error != PARSERUTILS_NOMEM) {
+                                        /* Source is deliberately not
+                                         * advanced on this path */
+                                        return error;
+                                }
+                                /* Insufficient output space */
+                                if (towritelen >= WRITE_BUFSIZE)
+                                        abort();
+                                c->write_len = towritelen;
+                                /* Copy pending chars to save area, for
+                                 * processing next call. */
+                                for (len = 0; len < towritelen; len++)
+                                        c->write_buf[len] = towrite[len];
+                                /* Claim character we've just buffered,
+                                 * so it's not reprocessed */
+                                *source += 4;
+                                *sourcelen -= 4;
+                                return PARSERUTILS_NOMEM;
+                        }
+                        towrite++;
+                        towritelen--;
+                }
+                /* Character fully written; move on to the next unit */
+                *source += 4;
+                *sourcelen -= 4;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode a chunk of extended 8bit data into UCS-4 (big endian)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                                 codec's error handling mode is set to STRICT,
+ *
+ * Output left over from the previous call is written first. If that
+ * cannot be completed, _NOMEM is returned and ::source / ::sourcelen are
+ * left unchanged, as nothing more has been read.
+ *
+ * Otherwise the input is converted one byte at a time until it is used up
+ * or a conversion reports something other than _OK.
+ *
+ * On exit with _OK or _NOMEM, ::source points immediately _after_ the
+ * last input character read; a character that did not fit in the output
+ * buffer is held by the codec for writing on the next call.
+ *
+ * On exit with _INVALID, ::source points _at_ the offending input
+ * character; nothing is written or buffered for it. It is up to the
+ * client to fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen and ::destlen are reduced appropriately on exit, and ::dest
+ * points immediately _after_ the last character written.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_ext8_codec *ext8 = (charset_ext8_codec *) codec;
+        size_t flushed = 0;
+        /* Step 1: drain UCS-4 output held over from the previous call.
+         * A unit is only emitted while the output buffer has room for
+         * every unit that is still pending. */
+        while (ext8->read_len > 0 && *destlen >= ext8->read_len * 4) {
+                uint32_t *slot = (uint32_t *) (void *) *dest;
+                *slot = endian_host_to_big(ext8->read_buf[flushed]);
+                *dest += 4;
+                *destlen -= 4;
+                flushed++;
+                ext8->read_len--;
+        }
+        if (*destlen < ext8->read_len * 4) {
+                /* Output buffer exhausted: move the unwritten units to the
+                 * front of read_buf so the next call can resume there. */
+                size_t k;
+                for (k = 0; k < ext8->read_len; k++)
+                        ext8->read_buf[k] = ext8->read_buf[flushed + k];
+                return PARSERUTILS_NOMEM;
+        }
+        /* Step 2: convert the new input one character at a time, giving up
+         * at the first result that is not _OK. */
+        while (*sourcelen > 0) {
+                parserutils_error error = charset_ext8_codec_read_char(
+                                ext8, source, sourcelen, dest, destlen);
+                if (error != PARSERUTILS_OK)
+                        return error;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Reset an extended 8bit codec to its initial state
+ *
+ * Both pending-character counts are zeroed and the first slot of each
+ * holding buffer is cleared, so nothing buffered by an earlier encode or
+ * decode call will be emitted afterwards. The mapping table is kept.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (no failure path exists in this implementation)
+ */
+parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
+{
+        charset_ext8_codec *ext8 = (charset_ext8_codec *) codec;
+        /* Encode-side holding buffer */
+        ext8->write_len = 0;
+        ext8->write_buf[0] = 0;
+        /* Decode-side holding buffer */
+        ext8->read_len = 0;
+        ext8->read_buf[0] = 0;
+        return PARSERUTILS_OK;
+}
+/**
+ * Read a character from the extended 8bit to UCS-4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_NEEDDATA    if ::sourcelen is zero,
+ *         PARSERUTILS_INVALID     if the input byte has no mapping and the
+ *                                 codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the input character
+ * read, if the result is _OK or _NOMEM. In the _NOMEM case the decoded
+ * character has been buffered by the codec for writing on the next call,
+ * which is why the input byte is still consumed.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * offending input character; nothing will be written or buffered for it.
+ * It is up to the client to fix the cause of the failure and retry the
+ * decoding process.
+ *
+ * When the error mode is not STRICT, an unmapped input byte is replaced by
+ * U+FFFD (REPLACEMENT CHARACTER) and consumed like a valid one.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        uint32_t ucs4;
+        parserutils_error error;
+        /* Convert a single character */
+        error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
+        if (error == PARSERUTILS_OK) {
+                /* Read a character */
+                error = charset_ext8_codec_output_decoded_char(c,
+                                ucs4, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Character was written or buffered by the codec;
+                         * either way the input byte has been used up */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        } else if (error == PARSERUTILS_NEEDDATA) {
+                /* Can only happen if sourcelen == 0 */
+                return error;
+        } else if (error == PARSERUTILS_INVALID) {
+                /* Illegal input sequence */
+                /* Strict errormode; simply flag invalid character */
+                if (c->base.errormode ==
+                                PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+                        return PARSERUTILS_INVALID;
+                }
+                /* output U+FFFD and continue processing. */
+                error = charset_ext8_codec_output_decoded_char(c,
+                                0xFFFD, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* Replacement was written or buffered; skip the
+                         * offending input byte */
+                        *source += 1;
+                        *sourcelen -= 1;
+                }
+                return error;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Emit one decoded character as big-endian UCS-4
+ *
+ * When fewer than four bytes of output space remain, the character is
+ * parked in the codec's read buffer (as its sole entry) instead of being
+ * written, and _NOMEM is reported.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS-4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer (advanced by 4 on
+ *                 success)
+ * \param destlen  Pointer to output buffer length (reduced by 4 on success)
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+        if (*destlen >= 4) {
+                uint32_t *slot = (uint32_t *) (void *) *dest;
+                *slot = endian_host_to_big(ucs4);
+                *dest += 4;
+                *destlen -= 4;
+                return PARSERUTILS_OK;
+        }
+        /* No room for a full UCS-4 unit: hold it for the next call */
+        c->read_buf[0] = ucs4;
+        c->read_len = 1;
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Convert a UCS4 (host endian) character to extended 8bit
+ *
+ * Code points below 0x80 are emitted unchanged. Anything else is searched
+ * for in the codec's mapping table; the matching index i yields the output
+ * byte 0x80 + i.
+ *
+ * \param c     The codec instance
+ * \param ucs4  The UCS4 character to convert
+ * \param s     Pointer to pointer to destination buffer
+ * \param len   Pointer to destination buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
+ *         PARSERUTILS_INVALID if the character cannot be represented
+ *
+ * _INVALID will only be returned if the codec's conversion mode is STRICT.
+ * Otherwise, '?' will be output.
+ *
+ * The value 0xFFFF is used in the mapping table to mark bytes that have no
+ * mapping, so U+FFFF itself is always treated as unrepresentable rather
+ * than being matched against such a marker.
+ *
+ * On successful conversion, *s and *len will be updated.
+ */
+parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
+                uint32_t ucs4, uint8_t **s, size_t *len)
+{
+        uint8_t out = 0;
+        if (*len < 1)
+                return PARSERUTILS_NOMEM;
+        if (ucs4 < 0x80) {
+                /* ASCII */
+                out = ucs4;
+        } else {
+                /* Start at "not found"; the search below only runs for
+                 * values that cannot collide with the table's 0xFFFF
+                 * "no mapping" marker */
+                uint32_t i = 128;
+                if (ucs4 != 0xFFFF) {
+                        for (i = 0; i < 128; i++) {
+                                if (ucs4 == c->table[i])
+                                        break;
+                        }
+                }
+                if (i == 128) {
+                        if (c->base.errormode ==
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
+                                return PARSERUTILS_INVALID;
+                        else
+                                out = '?';
+                } else {
+                        out = 0x80 + i;
+                }
+        }
+        *(*s) = out;
+        (*s)++;
+        (*len)--;
+        return PARSERUTILS_OK;
+}
+/**
+ * Convert an extended 8bit character to UCS4 (host endian)
+ *
+ * Only the first byte of the source buffer is examined. Bytes below 0x80
+ * map to themselves; other bytes are looked up in the codec's mapping
+ * table at index (byte - 0x80), where 0xFFFF means "no mapping".
+ *
+ * \param c     The codec instance
+ * \param s     Pointer to source buffer
+ * \param len   Source buffer length
+ * \param ucs4  Pointer to destination buffer; written only on success
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NEEDDATA if the source buffer is empty
+ *         PARSERUTILS_INVALID if the byte has no mapping
+ */
+parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
+                const uint8_t *s, size_t len, uint32_t *ucs4)
+{
+        uint32_t mapped;
+        if (len < 1)
+                return PARSERUTILS_NEEDDATA;
+        if (*s < 0x80) {
+                /* ASCII range passes straight through */
+                *ucs4 = *s;
+                return PARSERUTILS_OK;
+        }
+        mapped = c->table[*s - 0x80];
+        if (mapped == 0xFFFF)
+                return PARSERUTILS_INVALID;
+        *ucs4 = mapped;
+        return PARSERUTILS_OK;
+}
+const parserutils_charset_handler charset_ext8_codec_handler = {
+        charset_ext8_codec_handles_charset,     /* handles_charset */
+        charset_ext8_codec_create               /* create */
+};

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_impl.h
 ,0 → 1,49
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_codecs_codecimpl_h_
+#define parserutils_charset_codecs_codecimpl_h_
+#include <stdbool.h>
+#include <inttypes.h>
+#include <parserutils/charset/codec.h>
+/**
+ * Core charset codec definition; implementations extend this
+ *
+ * Concrete codecs embed this structure as their first member, so a pointer
+ * to the concrete codec can be cast to and from a pointer to this type.
+ */
+struct parserutils_charset_codec {
+        uint16_t mibenum;                       /**< MIB enum for charset */
+        parserutils_charset_codec_errormode errormode;  /**< error mode;
+                                                 * consulted by codecs when
+                                                 * a character cannot be
+                                                 * converted */
+        parserutils_alloc alloc;                /**< allocation function */
+        void *alloc_pw;                         /**< private word */
+        struct {
+                parserutils_error (*destroy)(parserutils_charset_codec *codec);
+                parserutils_error (*encode)(parserutils_charset_codec *codec,
+                                const uint8_t **source, size_t *sourcelen,
+                                uint8_t **dest, size_t *destlen);
+                parserutils_error (*decode)(parserutils_charset_codec *codec,
+                                const uint8_t **source, size_t *sourcelen,
+                                uint8_t **dest, size_t *destlen);
+                parserutils_error (*reset)(parserutils_charset_codec *codec);
+        } handler; /**< Vtable for handler code; filled in by each codec's
+                    * create function */
+};
+/**
+ * Codec factory component definition
+ *
+ * Each codec implementation exports one of these:
+ *  - handles_charset reports whether the implementation supports the
+ *    named charset;
+ *  - create builds a codec instance for the named charset and stores it
+ *    in *codec.
+ */
+typedef struct parserutils_charset_handler {
+        bool (*handles_charset)(const char *charset);
+        parserutils_error (*create)(const char *charset,
+                        parserutils_alloc alloc, void *pw,
+                        parserutils_charset_codec **codec);
+} parserutils_charset_handler;
+#endif

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_utf16.c
 ,0 → 1,552
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf16.h>
+#include "charset/codecs/codec_impl.h"
+#include "utils/endian.h"
+#include "utils/utils.h"
+/**
+ * UTF-16 charset codec
+ */
+typedef struct charset_utf16_codec {
+        parserutils_charset_codec base; /**< Base class */
+#define INVAL_BUFSIZE (32)
+        uint8_t inval_buf[INVAL_BUFSIZE];       /**< Buffer for fixing up
+                                                 * incomplete input
+                                                 * sequences */
+        size_t inval_len;               /**< Byte length of inval_buf */
+#define READ_BUFSIZE (8)
+        uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
+                                                 * output sequences (decode)
+                                                 * (host-endian) */
+        size_t read_len;                /**< Character length of read_buf */
+#define WRITE_BUFSIZE (8)
+        uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
+                                                 * output sequences (encode)
+                                                 * (host-endian) */
+        size_t write_len;               /**< Character length of write_buf */
+} charset_utf16_codec;
+static bool charset_utf16_codec_handles_charset(const char *charset); /* factory: charset test */
+static parserutils_error charset_utf16_codec_create(
+                const char *charset, parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec); /* factory: constructor */
+static parserutils_error charset_utf16_codec_destroy(
+                parserutils_charset_codec *codec); /* vtable: destroy */
+static parserutils_error charset_utf16_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* vtable: encode (UCS-4 to UTF-16) */
+static parserutils_error charset_utf16_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* vtable: decode (UTF-16 to UCS-4) */
+static parserutils_error charset_utf16_codec_reset(
+                parserutils_charset_codec *codec); /* vtable: reset */
+static inline parserutils_error charset_utf16_codec_read_char(
+                charset_utf16_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* decode helper: one input character */
+static inline parserutils_error charset_utf16_codec_output_decoded_char(
+                charset_utf16_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen); /* decode helper: emit one UCS-4 value */
+/**
+ * Report whether a charset name designates UTF-16
+ *
+ * The name is resolved to its MIB enum and compared against the MIB enum
+ * that "UTF-16" itself resolves to.
+ *
+ * \param charset  Name of the charset being queried
+ * \return true if this codec handles it, false otherwise
+ */
+bool charset_utf16_codec_handles_charset(const char *charset)
+{
+        if (parserutils_charset_mibenum_from_name("UTF-16", SLEN("UTF-16")) !=
+                        parserutils_charset_mibenum_from_name(charset,
+                                strlen(charset)))
+                return false;
+        return true;
+}
+/**
+ * Construct a UTF-16 codec instance
+ *
+ * \param charset  Charset name (not used by this constructor)
+ * \param alloc    Allocator used to obtain storage for the codec
+ * \param pw       Private data handed through to ::alloc (may be NULL)
+ * \param codec    Location that receives the new codec
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if the allocator returns NULL
+ */
+parserutils_error charset_utf16_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        charset_utf16_codec *self;
+        UNUSED(charset);
+        self = alloc(NULL, sizeof(*self), pw);
+        if (self == NULL)
+                return PARSERUTILS_NOMEM;
+        /* Hook up the vtable */
+        self->base.handler.reset = charset_utf16_codec_reset;
+        self->base.handler.decode = charset_utf16_codec_decode;
+        self->base.handler.encode = charset_utf16_codec_encode;
+        self->base.handler.destroy = charset_utf16_codec_destroy;
+        /* All three save areas start out empty */
+        self->write_len = 0;
+        self->write_buf[0] = 0;
+        self->read_len = 0;
+        self->read_buf[0] = 0;
+        self->inval_len = 0;
+        self->inval_buf[0] = '\0';
+        /* The base object is the first member, so its address is the
+         * address of the whole codec */
+        *codec = &self->base;
+        return PARSERUTILS_OK;
+}
+/**
+ * Tear down a UTF-16 codec
+ *
+ * Nothing is done here: the codec's storage is not released by this
+ * function.
+ *
+ * \param codec  The codec being destroyed (unused)
+ * \return PARSERUTILS_OK always
+ */
+parserutils_error charset_utf16_codec_destroy(parserutils_charset_codec *codec)
+{
+        (void) codec;
+        return PARSERUTILS_OK;
+}
+/**
+ * Encode a chunk of UCS-4 (big endian) data into UTF-16
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small
+ *
+ * The process is aborted if a character cannot be converted to UTF-16.
+ *
+ * Input is consumed in whole 4-byte units. Should ::sourcelen not be a
+ * multiple of 4, the trailing 1-3 bytes are left unread and PARSERUTILS_OK
+ * is returned with ::sourcelen non-zero.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_utf16_codec *c = (charset_utf16_codec *) codec;
+        uint32_t ucs4;
+        uint32_t *towrite;
+        size_t towritelen;
+        parserutils_error error;
+        /* Process any outstanding characters from the previous call */
+        if (c->write_len > 0) {
+                uint32_t *pwrite = c->write_buf;
+                uint8_t buf[4];
+                size_t len;
+                while (c->write_len > 0) {
+                        error = parserutils_charset_utf16_from_ucs4(
+                                        pwrite[0], buf, &len);
+                        if (error != PARSERUTILS_OK)
+                                abort();
+                        if (*destlen < len) {
+                                /* Insufficient output buffer space;
+                                 * shuffle what is still pending down to
+                                 * the start of the save area */
+                                for (len = 0; len < c->write_len; len++)
+                                        c->write_buf[len] = pwrite[len];
+                                return PARSERUTILS_NOMEM;
+                        }
+                        memcpy(*dest, buf, len);
+                        *dest += len;
+                        *destlen -= len;
+                        pwrite++;
+                        c->write_len--;
+                }
+        }
+        /* Now process the characters for this call. Only complete 4-byte
+         * units are taken, so a ragged tail can neither be read past the
+         * end of the input nor wrap ::sourcelen around. */
+        while (*sourcelen >= 4) {
+                /* The source is a byte buffer with no alignment guarantee,
+                 * so copy the unit out instead of dereferencing a cast
+                 * pointer. */
+                memcpy(&ucs4, *source, sizeof(ucs4));
+                ucs4 = endian_big_to_host(ucs4);
+                towrite = &ucs4;
+                towritelen = 1;
+                /* Output current characters */
+                while (towritelen > 0) {
+                        uint8_t buf[4];
+                        size_t len;
+                        error = parserutils_charset_utf16_from_ucs4(
+                                        towrite[0], buf, &len);
+                        if (error != PARSERUTILS_OK)
+                                abort();
+                        if (*destlen < len) {
+                                /* Insufficient output space */
+                                if (towritelen >= WRITE_BUFSIZE)
+                                        abort();
+                                c->write_len = towritelen;
+                                /* Copy pending chars to save area, for
+                                 * processing next call. */
+                                for (len = 0; len < towritelen; len++)
+                                        c->write_buf[len] = towrite[len];
+                                /* Claim character we've just buffered,
+                                 * so it's not reprocessed */
+                                *source += 4;
+                                *sourcelen -= 4;
+                                return PARSERUTILS_NOMEM;
+                        }
+                        memcpy(*dest, buf, len);
+                        *dest += len;
+                        *destlen -= len;
+                        towrite++;
+                        towritelen--;
+                }
+                *source += 4;
+                *sourcelen -= 4;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode a chunk of UTF-16 data into UCS-4 (big endian)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is illegal and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_utf16_codec *c = (charset_utf16_codec *) codec;
+        parserutils_error error;
+        if (c->read_len > 0) {
+                /* Output left over from last decode */
+                uint32_t *pread = c->read_buf;
+                while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+                        /* dest is a byte buffer with no alignment
+                         * guarantee, so copy the value in rather than
+                         * storing through a cast pointer. */
+                        uint32_t be = endian_host_to_big(pread[0]);
+                        memcpy(*dest, &be, sizeof(be));
+                        *dest += 4;
+                        *destlen -= 4;
+                        pread++;
+                        c->read_len--;
+                }
+                if (*destlen < c->read_len * 4) {
+                        /* Ran out of output buffer */
+                        size_t i;
+                        /* Shuffle remaining output down */
+                        for (i = 0; i < c->read_len; i++)
+                                c->read_buf[i] = pread[i];
+                        return PARSERUTILS_NOMEM;
+                }
+        }
+        if (c->inval_len > 0) {
+                /* The last decode ended in an incomplete sequence.
+                 * Fill up inval_buf with data from the start of the
+                 * new chunk and process it. */
+                uint8_t *in = c->inval_buf;
+                size_t ol = c->inval_len;
+                /* One byte of inval_buf is kept back for the terminator */
+                size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+                size_t orig_l = l;
+                memcpy(c->inval_buf + ol, *source, l);
+                l += c->inval_len;
+                error = charset_utf16_codec_read_char(c,
+                                (const uint8_t **) &in, &l, dest, destlen);
+                if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+                        return error;
+                }
+                /* And now, fix up source pointers: l is what remains
+                 * unread in inval_buf, so orig_l - l is the number of
+                 * bytes of the new chunk that were used */
+                *source += max((signed) (orig_l - l), 0);
+                *sourcelen -= max((signed) (orig_l - l), 0);
+                /* Failed to resolve an incomplete character and
+                 * ran out of buffer space. No recovery strategy
+                 * possible, so explode everywhere. */
+                if ((orig_l + ol) - l == 0)
+                        abort();
+                /* Report memory exhaustion case from above */
+                if (error != PARSERUTILS_OK)
+                        return error;
+        }
+        /* Finally, the "normal" case; process all outstanding characters */
+        while (*sourcelen > 0) {
+                error = charset_utf16_codec_read_char(c,
+                                source, sourcelen, dest, destlen);
+                if (error != PARSERUTILS_OK) {
+                        return error;
+                }
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Return a UTF-16 codec to its freshly-created state
+ *
+ * Drops any buffered incomplete input along with any pending decoder and
+ * encoder output.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK always
+ */
+parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
+{
+        charset_utf16_codec *self = (charset_utf16_codec *) codec;
+        /* Pending encoder output */
+        self->write_len = 0;
+        self->write_buf[0] = 0;
+        /* Pending decoder output */
+        self->read_len = 0;
+        self->read_buf[0] = 0;
+        /* Incomplete input sequence */
+        self->inval_len = 0;
+        self->inval_buf[0] = '\0';
+        return PARSERUTILS_OK;
+}
+/**
+ * Read one character of UTF-16 input and emit it as UCS-4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if the input is illegal and the
+ *                            codec's error handling mode is set to STRICT,
+ *         any other error reported by the UTF-16 routines, unchanged
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * An incomplete trailing sequence is moved into the codec's
+ * incomplete-sequence buffer, all of the input is claimed and _OK is
+ * returned. The process is aborted if such a sequence (plus its
+ * terminator) does not fit in that buffer.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * When the error mode is not STRICT, illegal input is replaced by U+FFFD
+ * and reading resumes at the next valid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        uint32_t ucs4;
+        size_t sucs4;
+        parserutils_error error;
+        /* Convert a single character */
+        error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
+                        &ucs4, &sucs4);
+        if (error == PARSERUTILS_OK) {
+                /* Read a character */
+                error = charset_utf16_codec_output_decoded_char(c,
+                                ucs4, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* output succeeded; update source pointers */
+                        *source += sucs4;
+                        *sourcelen -= sucs4;
+                }
+                /* Clear inval buffer */
+                c->inval_buf[0] = '\0';
+                c->inval_len = 0;
+                return error;
+        } else if (error == PARSERUTILS_NEEDDATA) {
+                /* Incomplete input sequence. The saved copy is
+                 * NUL-terminated, so it has to be strictly shorter
+                 * than the buffer. */
+                if (*sourcelen >= INVAL_BUFSIZE)
+                        abort();
+                /* memmove: ::source may already point into inval_buf */
+                memmove(c->inval_buf, *source, *sourcelen);
+                c->inval_buf[*sourcelen] = '\0';
+                c->inval_len = *sourcelen;
+                *source += *sourcelen;
+                *sourcelen = 0;
+                return PARSERUTILS_OK;
+        } else if (error == PARSERUTILS_INVALID) {
+                /* Illegal input sequence */
+                uint32_t nextchar;
+                /* Clear inval buffer */
+                c->inval_buf[0] = '\0';
+                c->inval_len = 0;
+                /* Strict errormode; simply flag invalid character */
+                if (c->base.errormode ==
+                                PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+                        return PARSERUTILS_INVALID;
+                }
+                /* Find next valid UTF-16 sequence.
+                 * We're processing client-provided data, so let's
+                 * be paranoid about its validity. */
+                error = parserutils_charset_utf16_next_paranoid(
+                                *source, *sourcelen, 0, &nextchar);
+                if (error != PARSERUTILS_OK) {
+                        if (error == PARSERUTILS_NEEDDATA) {
+                                /* Need more data to be sure; save what
+                                 * there is, leaving room for the
+                                 * terminator */
+                                if (*sourcelen >= INVAL_BUFSIZE)
+                                        abort();
+                                memmove(c->inval_buf, *source, *sourcelen);
+                                c->inval_buf[*sourcelen] = '\0';
+                                c->inval_len = *sourcelen;
+                                *source += *sourcelen;
+                                *sourcelen = 0;
+                                nextchar = 0;
+                        } else {
+                                return error;
+                        }
+                }
+                /* output U+FFFD and continue processing. */
+                error = charset_utf16_codec_output_decoded_char(c,
+                                0xFFFD, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* output succeeded; update source pointers */
+                        *source += nextchar;
+                        *sourcelen -= nextchar;
+                }
+                return error;
+        }
+        /* Anything else has consumed no input. Hand the error back
+         * rather than reporting success, which would leave the caller's
+         * loop making no progress. */
+        return error;
+}
+/**
+ * Output a UCS-4 character (big endian)
+ *
+ * If fewer than 4 bytes of output space remain, the character is stored as
+ * the codec's single pending decoded character (replacing whatever was
+ * pending) and nothing is written.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS-4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+        uint32_t be;
+        if (*destlen < 4) {
+                /* Run out of output buffer */
+                c->read_len = 1;
+                c->read_buf[0] = ucs4;
+                return PARSERUTILS_NOMEM;
+        }
+        /* dest is a byte buffer with no alignment guarantee, so copy the
+         * value in rather than storing through a cast pointer. */
+        be = endian_host_to_big(ucs4);
+        memcpy(*dest, &be, sizeof(be));
+        *dest += 4;
+        *destlen -= 4;
+        return PARSERUTILS_OK;
+}
+/** Factory entry for the UTF-16 codec: charset test plus constructor */
+const parserutils_charset_handler charset_utf16_codec_handler = {
+        .handles_charset = charset_utf16_codec_handles_charset,
+        .create = charset_utf16_codec_create
+};

 /programs/network/netsurf/libparserutils/src/charset/codecs/codec_utf8.c
 ,0 → 1,555
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include "charset/codecs/codec_impl.h"
+#include "charset/encodings/utf8impl.h"
+#include "utils/endian.h"
+#include "utils/utils.h"
+/**
+ * UTF-8 charset codec
+ *
+ * The base member comes first: the create function hands this object out
+ * as a parserutils_charset_codec pointer and the handlers cast it back.
+ */
+typedef struct charset_utf8_codec {
+        parserutils_charset_codec base; /**< Base class */
+#define INVAL_BUFSIZE (32)
+        uint8_t inval_buf[INVAL_BUFSIZE];       /**< Buffer for fixing up
+                                                 * incomplete input
+                                                 * sequences */
+        size_t inval_len;               /**< Byte length of inval_buf */
+#define READ_BUFSIZE (8)
+        uint32_t read_buf[READ_BUFSIZE];        /**< Buffer for partial
+                                                 * output sequences (decode)
+                                                 * (host-endian) */
+        size_t read_len;                /**< Character length of read_buf */
+#define WRITE_BUFSIZE (8)
+        uint32_t write_buf[WRITE_BUFSIZE];      /**< Buffer for partial
+                                                 * output sequences (encode)
+                                                 * (host-endian) */
+        size_t write_len;               /**< Character length of write_buf */
+} charset_utf8_codec;
+static bool charset_utf8_codec_handles_charset(const char *charset); /* factory: charset test */
+static parserutils_error charset_utf8_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec); /* factory: constructor */
+static parserutils_error charset_utf8_codec_destroy(
+                parserutils_charset_codec *codec); /* vtable: destroy */
+static parserutils_error charset_utf8_codec_encode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* vtable: encode (UCS-4 to UTF-8) */
+static parserutils_error charset_utf8_codec_decode(
+                parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* vtable: decode (UTF-8 to UCS-4) */
+static parserutils_error charset_utf8_codec_reset(
+                parserutils_charset_codec *codec); /* vtable: reset */
+static inline parserutils_error charset_utf8_codec_read_char(
+                charset_utf8_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen); /* decode helper: one input character */
+static inline parserutils_error charset_utf8_codec_output_decoded_char(
+                charset_utf8_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen); /* decode helper: emit one UCS-4 value */
+/**
+ * Report whether a charset name designates UTF-8
+ *
+ * The name is resolved to its MIB enum and compared against the MIB enum
+ * that "UTF-8" itself resolves to.
+ *
+ * \param charset  Name of the charset being queried
+ * \return true if this codec handles it, false otherwise
+ */
+bool charset_utf8_codec_handles_charset(const char *charset)
+{
+        if (parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8")) !=
+                        parserutils_charset_mibenum_from_name(charset,
+                                strlen(charset)))
+                return false;
+        return true;
+}
+/**
+ * Construct a UTF-8 codec instance
+ *
+ * \param charset  Charset name (not used by this constructor)
+ * \param alloc    Allocator used to obtain storage for the codec
+ * \param pw       Private data handed through to ::alloc (may be NULL)
+ * \param codec    Location that receives the new codec
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if the allocator returns NULL
+ */
+parserutils_error charset_utf8_codec_create(const char *charset,
+                parserutils_alloc alloc, void *pw,
+                parserutils_charset_codec **codec)
+{
+        charset_utf8_codec *self;
+        UNUSED(charset);
+        self = alloc(NULL, sizeof(*self), pw);
+        if (self == NULL)
+                return PARSERUTILS_NOMEM;
+        /* Hook up the vtable */
+        self->base.handler.reset = charset_utf8_codec_reset;
+        self->base.handler.decode = charset_utf8_codec_decode;
+        self->base.handler.encode = charset_utf8_codec_encode;
+        self->base.handler.destroy = charset_utf8_codec_destroy;
+        /* All three save areas start out empty */
+        self->write_len = 0;
+        self->write_buf[0] = 0;
+        self->read_len = 0;
+        self->read_buf[0] = 0;
+        self->inval_len = 0;
+        self->inval_buf[0] = '\0';
+        /* The base object is the first member, so its address is the
+         * address of the whole codec */
+        *codec = &self->base;
+        return PARSERUTILS_OK;
+}
+/**
+ * Tear down a UTF-8 codec
+ *
+ * Nothing is done here: the codec's storage is not released by this
+ * function.
+ *
+ * \param codec  The codec being destroyed (unused)
+ * \return PARSERUTILS_OK always
+ */
+parserutils_error charset_utf8_codec_destroy(parserutils_charset_codec *codec)
+{
+        (void) codec;
+        return PARSERUTILS_OK;
+}
+/**
+ * Encode a chunk of UCS-4 (big endian) data into UTF-8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small
+ *
+ * The process is aborted if converting a character fails for any reason
+ * other than lack of output space.
+ *
+ * Input is consumed in whole 4-byte units. Should ::sourcelen not be a
+ * multiple of 4, the trailing 1-3 bytes are left unread and PARSERUTILS_OK
+ * is returned with ::sourcelen non-zero.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_utf8_codec *c = (charset_utf8_codec *) codec;
+        uint32_t ucs4;
+        uint32_t *towrite;
+        size_t towritelen;
+        parserutils_error error;
+        /* Process any outstanding characters from the previous call */
+        if (c->write_len > 0) {
+                uint32_t *pwrite = c->write_buf;
+                while (c->write_len > 0) {
+                        UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
+                        if (error != PARSERUTILS_OK) {
+                                uint32_t len;
+                                if (error != PARSERUTILS_NOMEM)
+                                        abort();
+                                /* Insufficient output buffer space;
+                                 * shuffle what is still pending down to
+                                 * the start of the save area */
+                                for (len = 0; len < c->write_len; len++) {
+                                        c->write_buf[len] = pwrite[len];
+                                }
+                                return PARSERUTILS_NOMEM;
+                        }
+                        pwrite++;
+                        c->write_len--;
+                }
+        }
+        /* Now process the characters for this call. Only complete 4-byte
+         * units are taken, so a ragged tail can neither be read past the
+         * end of the input nor wrap ::sourcelen around. */
+        while (*sourcelen >= 4) {
+                /* The source is a byte buffer with no alignment guarantee,
+                 * so copy the unit out instead of dereferencing a cast
+                 * pointer. */
+                memcpy(&ucs4, *source, sizeof(ucs4));
+                ucs4 = endian_big_to_host(ucs4);
+                towrite = &ucs4;
+                towritelen = 1;
+                /* Output current characters */
+                while (towritelen > 0) {
+                        UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
+                        if (error != PARSERUTILS_OK) {
+                                uint32_t len;
+                                if (error != PARSERUTILS_NOMEM)
+                                        abort();
+                                /* Insufficient output space */
+                                if (towritelen >= WRITE_BUFSIZE)
+                                        abort();
+                                c->write_len = towritelen;
+                                /* Copy pending chars to save area, for
+                                 * processing next call. */
+                                for (len = 0; len < towritelen; len++)
+                                        c->write_buf[len] = towrite[len];
+                                /* Claim character we've just buffered,
+                                 * so it's not reprocessed */
+                                *source += 4;
+                                *sourcelen -= 4;
+                                return PARSERUTILS_NOMEM;
+                        }
+                        towrite++;
+                        towritelen--;
+                }
+                *source += 4;
+                *sourcelen -= 4;
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Decode a chunk of UTF-8 data into UCS-4 (big endian)
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        charset_utf8_codec *c = (charset_utf8_codec *) codec;
+        parserutils_error error;
+        if (c->read_len > 0) {
+                /* Output left over from last decode */
+                uint32_t *pread = c->read_buf;
+                while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+                        /* dest is a byte buffer with no alignment
+                         * guarantee, so copy the value in rather than
+                         * storing through a cast pointer. */
+                        uint32_t be = endian_host_to_big(pread[0]);
+                        memcpy(*dest, &be, sizeof(be));
+                        *dest += 4;
+                        *destlen -= 4;
+                        pread++;
+                        c->read_len--;
+                }
+                if (*destlen < c->read_len * 4) {
+                        /* Ran out of output buffer */
+                        size_t i;
+                        /* Shuffle remaining output down */
+                        for (i = 0; i < c->read_len; i++)
+                                c->read_buf[i] = pread[i];
+                        return PARSERUTILS_NOMEM;
+                }
+        }
+        if (c->inval_len > 0) {
+                /* The last decode ended in an incomplete sequence.
+                 * Fill up inval_buf with data from the start of the
+                 * new chunk and process it. */
+                uint8_t *in = c->inval_buf;
+                size_t ol = c->inval_len;
+                /* One byte of inval_buf is kept back for the terminator */
+                size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+                size_t orig_l = l;
+                memcpy(c->inval_buf + ol, *source, l);
+                l += c->inval_len;
+                error = charset_utf8_codec_read_char(c,
+                                (const uint8_t **) &in, &l, dest, destlen);
+                if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+                        return error;
+                }
+                /* And now, fix up source pointers: l is what remains
+                 * unread in inval_buf, so orig_l - l is the number of
+                 * bytes of the new chunk that were used */
+                *source += max((signed) (orig_l - l), 0);
+                *sourcelen -= max((signed) (orig_l - l), 0);
+                /* Failed to resolve an incomplete character and
+                 * ran out of buffer space. No recovery strategy
+                 * possible, so explode everywhere. */
+                if ((orig_l + ol) - l == 0)
+                        abort();
+                /* Report memory exhaustion case from above */
+                if (error != PARSERUTILS_OK)
+                        return error;
+        }
+        /* Finally, the "normal" case; process all outstanding characters */
+        while (*sourcelen > 0) {
+                error = charset_utf8_codec_read_char(c,
+                                source, sourcelen, dest, destlen);
+                if (error != PARSERUTILS_OK) {
+                        return error;
+                }
+        }
+        return PARSERUTILS_OK;
+}
+/**
+ * Reset a UTF-8 codec, discarding everything it has buffered
+ *
+ * The incomplete-input buffer, the read buffer and the write buffer are
+ * all emptied (length set to zero, first element cleared).
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (this implementation has no failure path)
+ */
+parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
+{
+        charset_utf8_codec *utf8 = (charset_utf8_codec *) codec;
+        /* Drop any pending output held in the read / write buffers */
+        utf8->read_len = 0;
+        utf8->read_buf[0] = 0;
+        utf8->write_len = 0;
+        utf8->write_buf[0] = 0;
+        /* Drop any partially received input sequence */
+        utf8->inval_len = 0;
+        utf8->inval_buf[0] = '\0';
+        return PARSERUTILS_OK;
+}
+/**
+ * Read a single UTF-8 character and output it as UCS-4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ *         PARSERUTILS_INVALID     if a character cannot be represented and the
+ *                            codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is not an error: its bytes are copied
+ * into the codec's inval_buf, the whole of the input is consumed and
+ * PARSERUTILS_OK is returned.
+ *
+ * In non-STRICT mode an illegal sequence is replaced by U+FFFD in the
+ * output and the input is advanced to the next valid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
+                const uint8_t **source, size_t *sourcelen,
+                uint8_t **dest, size_t *destlen)
+{
+        uint32_t ucs4;
+        size_t sucs4;
+        parserutils_error error;
+        /* Convert a single character. The macro is handed local copies of
+         * the source pointer and length, so *source / *sourcelen are only
+         * updated explicitly further down. */
+        {
+                const uint8_t *src = *source;
+                size_t srclen = *sourcelen;
+                uint32_t *uptr = &ucs4;
+                size_t *usptr = &sucs4;
+                UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
+        }
+        if (error == PARSERUTILS_OK) {
+                /* Read a character */
+                error = charset_utf8_codec_output_decoded_char(c,
+                                ucs4, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* output succeeded (or was buffered by the codec);
+                         * update source pointers */
+                        *source += sucs4;
+                        *sourcelen -= sucs4;
+                }
+                /* Clear inval buffer */
+                c->inval_buf[0] = '\0';
+                c->inval_len = 0;
+                return error;
+        } else if (error == PARSERUTILS_NEEDDATA) {
+                /* Incomplete input sequence: stash what we have so the
+                 * next decode call can complete it */
+                if (*sourcelen > INVAL_BUFSIZE)
+                        abort();
+                memmove(c->inval_buf, *source, *sourcelen);
+                c->inval_buf[*sourcelen] = '\0';
+                c->inval_len = *sourcelen;
+                *source += *sourcelen;
+                *sourcelen = 0;
+                return PARSERUTILS_OK;
+        } else if (error == PARSERUTILS_INVALID) {
+                /* Illegal input sequence */
+                uint32_t nextchar;
+                /* Strict errormode; simply flag invalid character */
+                if (c->base.errormode ==
+                                PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+                        /* Clear inval buffer */
+                        c->inval_buf[0] = '\0';
+                        c->inval_len = 0;
+                        return PARSERUTILS_INVALID;
+                }
+                /* Find next valid UTF-8 sequence.
+                 * We're processing client-provided data, so let's
+                 * be paranoid about its validity. */
+                {
+                        const uint8_t *src = *source;
+                        size_t srclen = *sourcelen;
+                        uint32_t off = 0;
+                        uint32_t *ncptr = &nextchar;
+                        UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
+                }
+                if (error != PARSERUTILS_OK) {
+                        if (error == PARSERUTILS_NEEDDATA) {
+                                /* Need more data to be sure */
+                                if (*sourcelen > INVAL_BUFSIZE)
+                                        abort();
+                                memmove(c->inval_buf, *source, *sourcelen);
+                                c->inval_buf[*sourcelen] = '\0';
+                                c->inval_len = *sourcelen;
+                                *source += *sourcelen;
+                                *sourcelen = 0;
+                                /* Everything has been consumed already */
+                                nextchar = 0;
+                        } else {
+                                return error;
+                        }
+                }
+                /* Clear inval buffer.
+                 * NOTE(review): this also discards the bytes stashed by
+                 * the NEEDDATA branch just above -- confirm that dropping
+                 * them in favour of the U+FFFD below is intended. */
+                c->inval_buf[0] = '\0';
+                c->inval_len = 0;
+                /* output U+FFFD and continue processing. */
+                error = charset_utf8_codec_output_decoded_char(c,
+                                0xFFFD, dest, destlen);
+                if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+                        /* output succeeded; skip the illegal bytes */
+                        *source += nextchar;
+                        *sourcelen -= nextchar;
+                }
+                return error;
+        }
+        /* Any other status from UTF8_TO_UCS4 is not reported to the caller */
+        return PARSERUTILS_OK;
+}
+/**
+ * Emit one UCS-4 character into the output buffer (big endian)
+ *
+ * When fewer than 4 bytes of output space remain, nothing is written;
+ * instead the character is parked in the codec's read buffer so that a
+ * later decode call can flush it.
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS-4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer (advanced by 4 on
+ *                 success)
+ * \param destlen  Pointer to output buffer length (reduced by 4 on success)
+ * \return PARSERUTILS_OK          on success,
+ *         PARSERUTILS_NOMEM       if output buffer is too small,
+ */
+parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
+                uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+        if (*destlen >= 4) {
+                /* Room for a whole character: store it and advance */
+                uint32_t *out = (uint32_t *) (void *) *dest;
+                *out = endian_host_to_big(ucs4);
+                *dest += 4;
+                *destlen -= 4;
+                return PARSERUTILS_OK;
+        }
+        /* Output buffer exhausted: keep the character for the next call */
+        c->read_buf[0] = ucs4;
+        c->read_len = 1;
+        return PARSERUTILS_NOMEM;
+}
+const parserutils_charset_handler charset_utf8_codec_handler = { /* exported handler entry for the UTF-8 codec */
+        charset_utf8_codec_handles_charset, /* presumably the "does this codec handle charset X" test -- struct layout not shown here */
+        charset_utf8_codec_create /* presumably the codec constructor -- confirm against parserutils_charset_handler */
+};

 /programs/network/netsurf/libparserutils/src/charset/codecs/ext8_tables.h
 0,0 → 1,187
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_codecs_ext8tables_h_
+#define parserutils_charset_codecs_ext8tables_h_
+/* Mapping tables for extended 8bit -> UCS4.
+ * Undefined characters are mapped to U+FFFF,
+ * which is a guaranteed non-character
+ */
+static uint32_t w1250[128] = {
+x20AC, 0xFFFF, 0x201A, 0xFFFF, 0x201E, 0x2026, 0x2020, 0x2021,
+xFFFF, 0x2030, 0x0160, 0x2039, 0x015A, 0x0164, 0x017D, 0x0179,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+xFFFF, 0x2122, 0x0161, 0x203A, 0x015B, 0x0165, 0x017E, 0x017A,
+x00A0, 0x02C7, 0x02D8, 0x0141, 0x00A4, 0x0104, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x015E, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x017B,
+x00B0, 0x00B1, 0x02DB, 0x0142, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x0105, 0x015F, 0x00BB, 0x013D, 0x02DD, 0x013E, 0x017C,
+x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
+x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
+x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
+x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
+x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
+x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
+x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
+x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9,
+};
+static uint32_t w1251[128] = {
+x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
+x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
+x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+xFFFF, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
+x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
+x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
+x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
+x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457,
+x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
+x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
+x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
+x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
+x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
+x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
+};
+static uint32_t w1252[128] = {
+x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFF, 0x017D, 0xFFFF,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFF, 0x017E, 0x0178,
+x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
+x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
+};
+static uint32_t w1253[128] = {
+x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+xFFFF, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+xFFFF, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0xFFFF, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x2015,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7,
+x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
+x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
+x03A0, 0x03A1, 0xFFFF, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
+x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
+x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
+x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
+x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
+x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFF,
+};
+static uint32_t w1254[128] = {
+x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFF, 0xFFFF, 0x0178,
+x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
+x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
+x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
+x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF,
+};
+static uint32_t w1255[128] = {
+x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+x02C6, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+x02DC, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7,
+x05B8, 0x05B9, 0xFFFF, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
+x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3,
+x05F4, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
+x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
+x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
+x05E8, 0x05E9, 0x05EA, 0xFFFF, 0xFFFF, 0x200E, 0x200F, 0xFFFF,
+};
+static uint32_t w1256[128] = { /* presumably Windows-1256, one entry per byte 0x80..0xFF */
+        0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+        0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688,
+        0x06AF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+        0x06A9, 0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA,
+        0x00A0, 0x060C, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+        0x00A8, 0x00A9, 0x06BE, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+        0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+        0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x061F,
+        0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
+        0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
+        0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7,
+        0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643,
+        0x00E0, 0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7,
+        0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF,
+        0x064B, 0x064C, 0x064D, 0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7,
+        0x0651, 0x00F9, 0x0652, 0x00FB, 0x00FC, 0x200E, 0x200F, 0x06D2,
+};
+static uint32_t w1257[128] = {
+x20AC, 0xFFFF, 0x201A, 0xFFFF, 0x201E, 0x2026, 0x2020, 0x2021,
+xFFFF, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0x00A8, 0x02C7, 0x00B8,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+xFFFF, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0x00AF, 0x02DB, 0xFFFF,
+x00A0, 0xFFFF, 0x00A2, 0x00A3, 0x00A4, 0xFFFF, 0x00A6, 0x00A7,
+x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
+x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
+x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
+x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
+x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
+x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
+x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
+x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
+x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x02D9,
+};
+static uint32_t w1258[128] = {
+x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+x02C6, 0x2030, 0xFFFF, 0x2039, 0x0152, 0xFFFF, 0xFFFF, 0xFFFF,
+xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+x02DC, 0x2122, 0xFFFF, 0x203A, 0x0153, 0xFFFF, 0xFFFF, 0x0178,
+x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
+x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
+x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
+x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
+x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
+x00C8, 0x00C9, 0x00CA, 0x00CB, 0x0300, 0x00CD, 0x00CE, 0x00CF,
+x0110, 0x00D1, 0x0309, 0x00D3, 0x00D4, 0x01A0, 0x00D6, 0x00D7,
+x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x01AF, 0x0303, 0x00DF,
+x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
+x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0301, 0x00ED, 0x00EE, 0x00EF,
+x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
+x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF,
+};
+#endif

/programs/network/netsurf/libparserutils/src/charset/encodings/Makefile
0,0 → 1,5

# Output object name and the object files built in this directory; both are
# consumed by the included Makefile_for_o_lib (presumably it combines OBJS
# into OUTFILE -- confirm against that makefile).
OUTFILE = libo.o
OBJS = utf8.o utf16.o
# Extra header search paths, given relative to this directory.
CFLAGS += -I ../../../include/ -I ../../../../ -I ../../
# Shared build rules; MENUETDEV must point at the toolchain directory.
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libparserutils/src/charset/encodings/utf16.c
 0,0 → 1,245
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <parserutils/charset/utf16.h>
+/**
+ * Decode one UTF-16 sequence (one or two 16-bit code units) into UCS-4
+ *
+ * The code units are read through a uint16_t pointer, i.e. in host byte
+ * order.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence in bytes
+ * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK       on success,
+ *         PARSERUTILS_BADPARM  if any pointer argument is NULL,
+ *         PARSERUTILS_NEEDDATA if the sequence is truncated,
+ *         PARSERUTILS_INVALID  if it starts with a low surrogate or a high
+ *                              surrogate is not followed by a low surrogate
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+                size_t len, uint32_t *ucs4, size_t *clen)
+{
+        const uint16_t *units = (const uint16_t *) (const void *) s;
+        uint16_t lead;
+        if (s == NULL || ucs4 == NULL || clen == NULL)
+                return PARSERUTILS_BADPARM;
+        if (len < 2)
+                return PARSERUTILS_NEEDDATA;
+        lead = units[0];
+        /* A low surrogate can never start a sequence */
+        if (0xDC00 <= lead && lead <= 0xDFFF)
+                return PARSERUTILS_INVALID;
+        /* Anything else outside the surrogate block stands for itself */
+        if (lead < 0xD800) {
+                *ucs4 = lead;
+                *clen = 2;
+                return PARSERUTILS_OK;
+        }
+        if (lead > 0xDFFF) {
+                *ucs4 = lead;
+                *clen = 2;
+                return PARSERUTILS_OK;
+        }
+        /* High surrogate: a second code unit is required */
+        if (len < 4)
+                return PARSERUTILS_NEEDDATA;
+        if (units[1] < 0xDC00 || units[1] > 0xDFFF)
+                return PARSERUTILS_INVALID;
+        /* Valid pair: 10 bits from each unit, offset by 0x10000 */
+        *ucs4 = (((lead & 0x3FF) << 10) | (units[1] & 0x3FF)) + (1<<16);
+        *clen = 4;
+        return PARSERUTILS_OK;
+}
+/**
+ * Convert a single UCS-4 character into a UTF-16 sequence
+ *
+ * Characters below 0x10000 are written as a single code unit; characters
+ * in the range 0x10000..0x10FFFF are written as a surrogate pair. Code
+ * units are stored through a uint16_t pointer, i.e. in host byte order.
+ *
+ * \param ucs4  The character to process (host endian); values of 0x110000
+ *              and above cannot be represented and yield PARSERUTILS_INVALID
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ *              (2 or 4 bytes; not written on failure)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::len is NULL,
+ *         PARSERUTILS_INVALID if ::ucs4 is out of range
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+                size_t *len)
+{
+        uint16_t *ss = (uint16_t *) (void *) s;
+        uint32_t l = 0;
+        if (s == NULL || len == NULL)
+                return PARSERUTILS_BADPARM;
+        else if (ucs4 < 0x10000) {
+                *ss = (uint16_t) ucs4;
+                l = 2;
+        } else if (ucs4 < 0x110000) {
+                /* A surrogate pair carries the 20-bit value
+                 * (ucs4 - 0x10000): the top 10 bits go into the high
+                 * surrogate, the bottom 10 bits into the low surrogate. */
+                uint32_t v = ucs4 - 0x10000;
+                ss[0] = (uint16_t) (0xD800 | (v >> 10));
+                ss[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
+                l = 4;
+        } else {
+                return PARSERUTILS_INVALID;
+        }
+        *len = l;
+        return PARSERUTILS_OK;
+}
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * Every code unit outside the surrogate block counts as one character;
+ * any surrogate code unit is taken to start a two-unit character. No
+ * validation of surrogate pairing is performed. A trailing odd byte (an
+ * incomplete code unit) is ignored rather than read.
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::len is NULL
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+                size_t *len)
+{
+        const uint16_t *ss = (const uint16_t *) (const void *) s;
+        size_t off = 0;   /* byte offset of the next code unit */
+        size_t count = 0; /* characters seen so far */
+        if (s == NULL || len == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Only look at code units that lie completely inside the bound */
+        while (off < max && max - off >= 2) {
+                uint16_t unit = ss[off / 2];
+                if (unit < 0xD800 || 0xDFFF < unit)
+                        off += 2;
+                else
+                        off += 4;
+                count++;
+        }
+        *len = count;
+        return PARSERUTILS_OK;
+}
+/**
+ * Report how many bytes the UTF-16 character at ::s occupies
+ *
+ * Only the first code unit is inspected: a surrogate (0xD800..0xDFFF)
+ * yields 4, anything else yields 2.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::len is NULL
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+                size_t *len)
+{
+        const uint16_t *unit = (const uint16_t *) (const void *) s;
+        if (s == NULL || len == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Surrogates take two code units, everything else one */
+        *len = (0xD800 <= *unit && *unit <= 0xDFFF) ? 4 : 2;
+        return PARSERUTILS_OK;
+}
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * The code unit immediately before ::off decides the result: if it is a
+ * low surrogate the previous character is taken to be a surrogate pair
+ * (4 bytes), otherwise a single code unit (2 bytes). The result is clamped
+ * to the start of the string.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at, in bytes
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::prevoff is NULL
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+                uint32_t *prevoff)
+{
+        const uint16_t *ss;
+        if (s == NULL || prevoff == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Address code units relative to the starting offset, so that
+         * ss[-1] is the unit just before it (and never before ::s, as it
+         * is only read when off >= 2). */
+        ss = (const uint16_t *) (const void *) (s + off);
+        if (off < 2)
+                *prevoff = 0;
+        else if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
+                *prevoff = off - 2;
+        else
+                *prevoff = (off < 4) ? 0 : off - 4;
+        return PARSERUTILS_OK;
+}
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * The code unit at ::off decides how far to advance: a high surrogate
+ * starts a 4 byte surrogate pair, anything else occupies 2 bytes. If
+ * fewer than 4 bytes remain after ::off, ::len is returned.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::nextoff is NULL, or ::off is not
+ *         less than ::len
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+                uint32_t off, uint32_t *nextoff)
+{
+        const uint16_t *ss;
+        if (s == NULL || off >= len || nextoff == NULL)
+                return PARSERUTILS_BADPARM;
+        /* ss[0] is the code unit at the starting offset */
+        ss = (const uint16_t *) (const void *) (s + off);
+        if (len - off < 4)
+                *nextoff = len;
+        else if (ss[0] < 0xD800 || ss[0] > 0xDBFF)
+                /* Not a high surrogate: the character is one unit long */
+                *nextoff = off + 2;
+        else
+                /* High surrogate: skip the whole pair */
+                *nextoff = (len - off < 6) ? len : off + 4;
+        return PARSERUTILS_OK;
+}
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * Starting from ::off, the code units that follow the one at ::off are
+ * examined; units that cannot start a legal character (a high surrogate
+ * without a following low surrogate, or a lone low surrogate) are skipped.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if ::s or ::nextoff is NULL, or ::off is not
+ *         less than ::len,
+ *         PARSERUTILS_NEEDDATA if the input ends before a decision can be
+ *         made
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+                uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+        const uint16_t *ss;
+        if (s == NULL || off >= len || nextoff == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Address code units relative to the starting offset: the length
+         * checks below (len - off) guard ss[1] and ss[2] on that basis. */
+        ss = (const uint16_t *) (const void *) (s + off);
+        while (1) {
+                if (len - off < 4) {
+                        return PARSERUTILS_NEEDDATA;
+                } else if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
+                        /* Following unit is not a surrogate */
+                        *nextoff = off + 2;
+                        break;
+                } else if (ss[1] >= 0xD800 && ss[1] <= 0xDBFF) {
+                        /* Following unit is a high surrogate; it needs a
+                         * low surrogate after it */
+                        if (len - off < 6)
+                                return PARSERUTILS_NEEDDATA;
+                        if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
+                                /* NOTE(review): off + 4 is the offset of
+                                 * the low surrogate at ss[2]; confirm
+                                 * whether off + 2 (start of the pair) was
+                                 * intended. */
+                                *nextoff = off + 4;
+                                break;
+                        } else {
+                                /* Unpaired high surrogate: step over it */
+                                ss++;
+                                off += 2;
+                        }
+                } else {
+                        /* Lone low surrogate: step over it too, so that
+                         * every iteration makes progress */
+                        ss++;
+                        off += 2;
+                }
+        }
+        return PARSERUTILS_OK;
+}

 /programs/network/netsurf/libparserutils/src/charset/encodings/utf8.c
 0,0 → 1,175
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+/** Number of continuation bytes for a given start byte
+ *
+ * Indexed by the value of the first byte of a sequence; one table row
+ * per group of 16 byte values (0x00-0x0F, 0x10-0x1F, ...). */
+const uint8_t numContinuations[256] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS-4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * This is a plain function wrapper: all of the work is done by the
+ * UTF8_TO_UCS4 macro from charset/encodings/utf8impl.h, and the status
+ * that macro stores is returned unchanged.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence
+ * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+                uint32_t *ucs4, size_t *clen)
+{
+        parserutils_error error; /* filled in by the macro below */
+        UTF8_TO_UCS4(s, len, ucs4, clen, error);
+        return error;
+}
+/**
+ * Convert a single UCS-4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * This is a plain function wrapper around the UTF8_FROM_UCS4 macro
+ * (presumably provided by charset/encodings/utf8impl.h); the status the
+ * macro stores is returned unchanged.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to pointer to output buffer, updated on exit
+ * \param len   Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
+                uint8_t **s, size_t *len)
+{
+        parserutils_error error; /* expected to be set by the macro below */
+        UTF8_FROM_UCS4(ucs4, s, len, error);
+        return error;
+}
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * This is a plain function wrapper around the UTF8_LENGTH macro
+ * (presumably provided by charset/encodings/utf8impl.h); the status the
+ * macro stores is returned unchanged.
+ *
+ * \param s    The string
+ * \param max  Maximum length
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+                size_t *len)
+{
+        parserutils_error error; /* expected to be set by the macro below */
+        UTF8_LENGTH(s, max, len, error);
+        return error;
+}
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * This is a plain function wrapper around the UTF8_CHAR_BYTE_LENGTH macro
+ * (presumably provided by charset/encodings/utf8impl.h); the status the
+ * macro stores is returned unchanged.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+                size_t *len)
+{
+        parserutils_error error; /* expected to be set by the macro below */
+        UTF8_CHAR_BYTE_LENGTH(s, len, error);
+        return error;
+}
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * This is a plain function wrapper around the UTF8_PREV macro (presumably
+ * provided by charset/encodings/utf8impl.h); the status the macro stores
+ * is returned unchanged.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+                uint32_t *prevoff)
+{
+        parserutils_error error; /* expected to be set by the macro below */
+        UTF8_PREV(s, off, prevoff, error);
+        return error;
+}
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * This is a plain function wrapper around the UTF8_NEXT macro (presumably
+ * provided by charset/encodings/utf8impl.h); the status the macro stores
+ * is returned unchanged.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+                uint32_t off, uint32_t *nextoff)
+{
+        parserutils_error error; /* expected to be set by the macro below */
+        UTF8_NEXT(s, len, off, nextoff, error);
+        return error;
+}
+/**
+ * Locate the next legal UTF-8 character in a string of dubious validity
+ *
+ * Function form of the UTF8_NEXT_PARANOID macro.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to search forwards from
+ * \param nextoff  Pointer to location to receive the offset of the first
+ *                 byte of the next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+                uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+        /* Result code filled in by the macro expansion */
+        parserutils_error status;
+        UTF8_NEXT_PARANOID(s, len, off, nextoff, status);
+        return status;
+}

 /programs/network/netsurf/libparserutils/src/charset/encodings/utf8impl.h
 ,0 → 1,342
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+/**
+ * Number of continuation bytes for a given start byte.
+ *
+ * Indexed by the value of the start byte (one entry per possible byte
+ * value). Only declared here; the table itself is defined elsewhere.
+ */
+extern const uint8_t numContinuations[256];
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS-4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * Sequences of one to six bytes are accepted. Overlong encodings, surrogates
+ * (0xD800-0xDFFF), 0xFFFE and 0xFFFF are rejected with PARSERUTILS_INVALID,
+ * as are bad start bytes and bad continuation bytes. PARSERUTILS_NEEDDATA is
+ * reported when len is 0 or smaller than the sequence length announced by
+ * the start byte. *ucs4 and *clen are written only on success.
+ *
+ * The expansion declares locals named c, min, n, i and t, so argument
+ * expressions must not refer to identifiers with those names.
+ *
+ * \param s      The sequence to process
+ * \param len    Length of sequence
+ * \param ucs4   Pointer to location to receive UCS-4 character (host endian)
+ * \param clen   Pointer to location to receive byte length of UTF-8 sequence
+ * \param error  Location to receive error code (assignable lvalue)
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error)                         \
+do {                                                                    \
+        uint32_t c, min;                                                \
+        uint8_t n;                                                      \
+        uint8_t i;                                                      \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+                                                                        \
+        if (s == NULL || ucs4 == NULL || clen == NULL) {                \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        if (len == 0) {                                                 \
+                error = PARSERUTILS_NEEDDATA;                           \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        c = s[0];                                                       \
+                                                                        \
+        /* Classify the start byte: n is the sequence length in bytes, */ \
+        /* min the smallest value that legitimately needs n bytes      */ \
+        if (c < 0x80) {                                                 \
+                n = 1;                                                  \
+                min = 0;                                                \
+        } else if ((c & 0xE0) == 0xC0) {                                \
+                c &= 0x1F;                                              \
+                n = 2;                                                  \
+                min = 0x80;                                             \
+        } else if ((c & 0xF0) == 0xE0) {                                \
+                c &= 0x0F;                                              \
+                n = 3;                                                  \
+                min = 0x800;                                            \
+        } else if ((c & 0xF8) == 0xF0) {                                \
+                c &= 0x07;                                              \
+                n = 4;                                                  \
+                min = 0x10000;                                          \
+        } else if ((c & 0xFC) == 0xF8) {                                \
+                c &= 0x03;                                              \
+                n = 5;                                                  \
+                min = 0x200000;                                         \
+        } else if ((c & 0xFE) == 0xFC) {                                \
+                c &= 0x01;                                              \
+                n = 6;                                                  \
+                min = 0x4000000;                                        \
+        } else {                                                        \
+                error = PARSERUTILS_INVALID;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        if (len < n) {                                                  \
+                error = PARSERUTILS_NEEDDATA;                           \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        /* Fold in six payload bits from each continuation byte */      \
+        for (i = 1; i < n; i++) {                                       \
+                uint32_t t = s[i];                                      \
+                                                                        \
+                if ((t & 0xC0) != 0x80) {                               \
+                        error = PARSERUTILS_INVALID;                    \
+                        /* Leaves the for loop only; see check below */ \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                c <<= 6;                                                \
+                c |= t & 0x3F;                                          \
+        }                                                               \
+                                                                        \
+        if (error == PARSERUTILS_OK) {                                  \
+                /* Detect overlong sequences, surrogates and fffe/ffff */ \
+                if (c < min || (c >= 0xD800 && c <= 0xDFFF) ||          \
+                                c == 0xFFFE || c == 0xFFFF) {           \
+                        error = PARSERUTILS_INVALID;                    \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                *ucs4 = c;                                              \
+                *clen = n;                                              \
+        }                                                               \
+} while(0)
+/**
+ * Convert a single UCS-4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * On success *s is advanced past the bytes written and *len is reduced by
+ * the same amount. PARSERUTILS_NOMEM is reported, and nothing is written,
+ * when the output buffer is too small for the sequence.
+ *
+ * For multi-byte output the ucs4 argument is shifted in place, so it must
+ * be a modifiable lvalue and its value is not preserved. The expansion
+ * declares locals named buf, l and i, so argument expressions must not
+ * refer to identifiers with those names.
+ *
+ * \param ucs4   The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s      Pointer to pointer to output buffer, updated on exit
+ * \param len    Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error  Location to receive error code (assignable lvalue)
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error)                             \
+do {                                                                    \
+        uint8_t *buf;                                                   \
+        uint8_t l = 0;                                                  \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+                                                                        \
+        if (s == NULL || *s == NULL || len == NULL) {                   \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        /* l is the number of bytes needed to encode ucs4 */            \
+        if (ucs4 < 0x80) {                                              \
+                l = 1;                                                  \
+        } else if (ucs4 < 0x800) {                                      \
+                l = 2;                                                  \
+        } else if (ucs4 < 0x10000) {                                    \
+                l = 3;                                                  \
+        } else if (ucs4 < 0x200000) {                                   \
+                l = 4;                                                  \
+        } else if (ucs4 < 0x4000000) {                                  \
+                l = 5;                                                  \
+        } else if (ucs4 <= 0x7FFFFFFF) {                                \
+                l = 6;                                                  \
+        } else {                                                        \
+                error = PARSERUTILS_INVALID;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        if (l > *len) {                                                 \
+                error = PARSERUTILS_NOMEM;                              \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        buf = *s;                                                       \
+                                                                        \
+        if (l == 1) {                                                   \
+                buf[0] = (uint8_t) ucs4;                                \
+        } else {                                                        \
+                uint8_t i;                                              \
+                /* Continuation bytes are filled from the last one */   \
+                /* backwards, six bits of ucs4 at a time              */ \
+                for (i = l; i > 1; i--) {                               \
+                        buf[i - 1] = 0x80 | (ucs4 & 0x3F);              \
+                        ucs4 >>= 6;                                     \
+                }                                                       \
+                /* Lead byte: top l bits set, then what is left of ucs4 */ \
+                buf[0] = ~((1 << (8 - l)) - 1) | ucs4;                  \
+        }                                                               \
+                                                                        \
+        *s += l;                                                        \
+        *len -= l;                                                      \
+} while(0)
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * Only start bytes are inspected; the bytes that follow them are skipped
+ * without being validated. A sequence whose start byte lies before the
+ * bound is counted as one character even if the bound cuts it short.
+ * *len is written only on success.
+ *
+ * The s argument is advanced by the expansion, so it must be a modifiable
+ * pointer lvalue. The expansion declares locals named end, l and c, so
+ * argument expressions must not refer to identifiers with those names.
+ *
+ * \param s      The string
+ * \param max    Maximum length
+ * \param len    Pointer to location to receive length of string
+ * \param error  Location to receive error code (assignable lvalue)
+ */
+#define UTF8_LENGTH(s, max, len, error)                                 \
+do {                                                                    \
+        const uint8_t *end;                                             \
+        /* Unsigned and as wide as *len, so the count cannot overflow */ \
+        size_t l = 0;                                                   \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+                                                                        \
+        if (s == NULL || len == NULL) {                                 \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        /* Derive the bound only once s is known to be non-NULL */      \
+        end = s + max;                                                  \
+                                                                        \
+        while (s < end) {                                               \
+                uint32_t c = s[0];                                      \
+                                                                        \
+                /* Step over the whole sequence named by the start byte */ \
+                if ((c & 0x80) == 0x00)                                 \
+                        s += 1;                                         \
+                else if ((c & 0xE0) == 0xC0)                            \
+                        s += 2;                                         \
+                else if ((c & 0xF0) == 0xE0)                            \
+                        s += 3;                                         \
+                else if ((c & 0xF8) == 0xF0)                            \
+                        s += 4;                                         \
+                else if ((c & 0xFC) == 0xF8)                            \
+                        s += 5;                                         \
+                else if ((c & 0xFE) == 0xFC)                            \
+                        s += 6;                                         \
+                else {                                                  \
+                        error = PARSERUTILS_INVALID;                    \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                l++;                                                    \
+        }                                                               \
+                                                                        \
+        if (error == PARSERUTILS_OK)                                    \
+                *len = l;                                               \
+} while(0)
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * The length is taken from the numContinuations table using the first byte
+ * only; no further bytes of the character are read.
+ *
+ * \param s      Pointer to start of character
+ * \param len    Pointer to location to receive length
+ * \param error  Location to receive error code (assignable lvalue)
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error)                            \
+do {                                                                    \
+        if (s == NULL || len == NULL) {                                 \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        *len = numContinuations[s[0]] + 1 /* Start byte */;             \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+} while(0)
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * Steps backwards from off over continuation bytes (10xxxxxx) until a byte
+ * that is not a continuation byte, or offset 0, is reached. An off of 0
+ * yields a result of 0.
+ *
+ * The off argument is decremented in place, so it must be a modifiable
+ * lvalue.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \param error    Location to receive error code (assignable lvalue)
+ */
+#define UTF8_PREV(s, off, prevoff, error)                               \
+do {                                                                    \
+        if (s == NULL || prevoff == NULL) {                             \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        /* The pre-decrement means the byte at off itself is never read */ \
+        while (off != 0 && (s[--off] & 0xC0) == 0x80)                   \
+                /* do nothing */;                                       \
+                                                                        \
+        *prevoff = off;                                                 \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+} while(0)
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * The result may equal len when no further character starts before the end
+ * of the string. PARSERUTILS_BADPARM is reported when off is not less than
+ * len.
+ *
+ * The off argument is incremented in place, so it must be a modifiable
+ * lvalue.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \param error    Location to receive error code (assignable lvalue)
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error)                          \
+do {                                                                    \
+        if (s == NULL || off >= len || nextoff == NULL) {               \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        /* Skip current start byte (if present - may be mid-sequence) */\
+        if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)                   \
+                off++;                                                  \
+                                                                        \
+        /* Then pass over continuation bytes, never reading at len */   \
+        while (off < len && (s[off] & 0xC0) == 0x80)                    \
+                off++;                                                  \
+                                                                        \
+        *nextoff = off;                                                 \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+} while(0)
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * When off points at a continuation byte, the result is simply off + 1.
+ * When it points at a start byte, the continuation bytes announced by
+ * numContinuations are checked one by one and the result is the offset of
+ * the first byte that is not a continuation byte, or of the byte after the
+ * sequence when all of them check out.
+ *
+ * PARSERUTILS_NEEDDATA is reported when off + continuation count + 1 is not
+ * less than len; this includes a sequence whose last byte is the last byte
+ * of the input. NOTE(review): confirm that rejecting a complete final
+ * sequence is intended.
+ *
+ * The off argument is updated in place, so it must be a modifiable lvalue.
+ * The expansion declares locals named c, nCont and nToSkip, so argument
+ * expressions must not refer to identifiers with those names.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \param error    Location to receive error code (assignable lvalue)
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)                 \
+do {                                                                    \
+        uint8_t c;                                                      \
+                                                                        \
+        error = PARSERUTILS_OK;                                         \
+                                                                        \
+        if (s == NULL || off >= len || nextoff == NULL) {               \
+                error = PARSERUTILS_BADPARM;                            \
+                break;                                                  \
+        }                                                               \
+                                                                        \
+        c = s[off];                                                     \
+                                                                        \
+        /* If we're mid-sequence, simply advance to next byte */        \
+        if (!(c < 0x80 || (c & 0xC0) == 0xC0)) {                        \
+                off++;                                                  \
+        } else {                                                        \
+                uint32_t nCont = numContinuations[c];                   \
+                uint32_t nToSkip;                                       \
+                                                                        \
+                if (off + nCont + 1 >= len) {                           \
+                        error = PARSERUTILS_NEEDDATA;                   \
+                        break;                                          \
+                }                                                       \
+                                                                        \
+                /* Verify continuation bytes */                         \
+                for (nToSkip = 1; nToSkip <= nCont; nToSkip++) {        \
+                        if ((s[off + nToSkip] & 0xC0) != 0x80)          \
+                                break;                                  \
+                }                                                       \
+                                                                        \
+                /* Skip over the valid bytes: nToSkip counts the start */ \
+                /* byte plus the continuation bytes that checked out   */ \
+                off += nToSkip;                                         \
+        }                                                               \
+                                                                        \
+        *nextoff = off;                                                 \
+} while(0)
+#endif

/programs/network/netsurf/libparserutils/src/input/Makefile
0,0 → 1,6


OUTFILE = libo.o
OBJS = filter.o inputstream.o
CFLAGS += -I ../../include/ -I ../../../ -I ../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libparserutils/src/input/filter.c
 ,0 → 1,419
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#define WITHOUT_ICONV_FILTER
+#ifndef WITHOUT_ICONV_FILTER
+#include <iconv.h>
+#endif
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/codec.h>
+#include "input/filter.h"
+#include "utils/utils.h"
+/**
+ * Input filter
+ *
+ * Holds the conversion state for one filter instance. The conversion fields
+ * depend on the build: an iconv descriptor when WITHOUT_ICONV_FILTER is not
+ * defined, otherwise a pair of charset codecs joined by a pivot buffer.
+ */
+struct parserutils_filter {
+#ifndef WITHOUT_ICONV_FILTER
+        iconv_t cd;                     /**< Iconv conversion descriptor */
+        uint16_t int_enc;               /**< The internal encoding */
+#else
+        parserutils_charset_codec *read_codec;  /**< Read codec */
+        parserutils_charset_codec *write_codec; /**< Write codec */
+        uint32_t pivot_buf[64];         /**< Conversion pivot buffer */
+        bool leftover;                  /**< Data remains from last call */
+        uint8_t *pivot_left;            /**< Remaining pivot to write */
+        size_t pivot_len;               /**< Length of pivot remaining */
+#endif
+        struct {
+                uint16_t encoding;      /**< Input encoding */
+        } settings;                     /**< Filter settings */
+        parserutils_alloc alloc;        /**< Memory (de)allocation function */
+        void *pw;                       /**< Client private data */
+};
+/* Forward declarations of the file-local configuration helpers */
+static parserutils_error filter_set_defaults(parserutils_filter *input);
+static parserutils_error filter_set_encoding(parserutils_filter *input,
+                const char *enc);
+/**
+ * Create an input filter
+ *
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Function used to (de)allocate data
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \param filter   Pointer to location to receive filter instance
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM on bad parameters,
+ *         PARSERUTILS_NOMEM on memory exhaustion,
+ *         PARSERUTILS_BADENCODING if the encoding is unsupported
+ */
+parserutils_error parserutils__filter_create(const char *int_enc,
+                parserutils_alloc alloc, void *pw, parserutils_filter **filter)
+{
+        parserutils_filter *inst;
+        parserutils_error status;
+        if (filter == NULL || alloc == NULL || int_enc == NULL)
+                return PARSERUTILS_BADPARM;
+        /* A NULL pointer asks the client allocator for fresh storage */
+        inst = alloc(NULL, sizeof(*inst), pw);
+        if (inst == NULL)
+                return PARSERUTILS_NOMEM;
+        /* Remember the allocator and its private data for later use */
+        inst->alloc = alloc;
+        inst->pw = pw;
+#ifndef WITHOUT_ICONV_FILTER
+        inst->cd = (iconv_t) -1;
+        inst->int_enc = parserutils_charset_mibenum_from_name(
+                        int_enc, strlen(int_enc));
+        if (inst->int_enc == 0) {
+                alloc(inst, 0, pw);
+                return PARSERUTILS_BADENCODING;
+        }
+#else
+        /* No pivot data is pending on a new filter */
+        inst->pivot_len = 0;
+        inst->pivot_left = NULL;
+        inst->leftover = false;
+#endif
+        status = filter_set_defaults(inst);
+        if (status != PARSERUTILS_OK) {
+                inst->alloc(inst, 0, pw);
+                return status;
+        }
+#ifdef WITHOUT_ICONV_FILTER
+        /* The write codec is created for the requested document encoding */
+        status = parserutils_charset_codec_create(int_enc, alloc, pw,
+                        &inst->write_codec);
+        if (status != PARSERUTILS_OK) {
+                /* Drop any read codec the defaults may have set up */
+                if (inst->read_codec != NULL) {
+                        parserutils_charset_codec_destroy(inst->read_codec);
+                        inst->read_codec = NULL;
+                }
+                inst->alloc(inst, 0, pw);
+                return status;
+        }
+#endif
+        *filter = inst;
+        return PARSERUTILS_OK;
+}
+/**
+ * Destroy an input filter
+ *
+ * Releases the conversion resources held by the filter and then the filter
+ * structure itself, using the allocator stored in it.
+ *
+ * \param input  Pointer to filter instance
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils__filter_destroy(parserutils_filter *input)
+{
+        if (input == NULL) {
+                return PARSERUTILS_BADPARM;
+        }
+#ifndef WITHOUT_ICONV_FILTER
+        /* (iconv_t) -1 marks a descriptor that is not open */
+        if (input->cd != (iconv_t) -1) {
+                iconv_close(input->cd);
+                input->cd = (iconv_t) -1;
+        }
+#else
+        /* Tear down whichever codecs exist, read side first */
+        if (input->read_codec != NULL) {
+                parserutils_charset_codec_destroy(input->read_codec);
+                input->read_codec = NULL;
+        }
+        if (input->write_codec != NULL) {
+                parserutils_charset_codec_destroy(input->write_codec);
+                input->write_codec = NULL;
+        }
+#endif
+        /* Finally hand the structure itself back to the allocator */
+        input->alloc(input, 0, input->pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Configure an input filter
+ *
+ * \param input   Pointer to filter instance
+ * \param type    Input option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils__filter_setopt(parserutils_filter *input,
+                parserutils_filter_opttype type,
+                parserutils_filter_optparams *params)
+{
+        if (input == NULL || params == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Only the encoding option is acted upon here */
+        if (type == PARSERUTILS_FILTER_SET_ENCODING)
+                return filter_set_encoding(input, params->encoding.name);
+        /* Any other option type is accepted and ignored */
+        return PARSERUTILS_OK;
+}
+/**
+ * Process a chunk of data
+ *
+ * Converts input bytes into the output buffer, advancing *data and *output
+ * and reducing *len and *outlen as conversion proceeds. Two implementations
+ * exist, selected at build time: one built on iconv, one built on a read
+ * codec and a write codec joined by the filter's pivot buffer.
+ *
+ * \param input   Pointer to filter instance
+ * \param data    Pointer to pointer to input buffer
+ * \param len     Pointer to length of input buffer
+ * \param output  Pointer to pointer to output buffer
+ * \param outlen  Pointer to length of output buffer
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+parserutils_error parserutils__filter_process_chunk(parserutils_filter *input,
+                const uint8_t **data, size_t *len,
+                uint8_t **output, size_t *outlen)
+{
+        if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+                        output == NULL || *output == NULL || outlen == NULL)
+                return PARSERUTILS_BADPARM;
+#ifndef WITHOUT_ICONV_FILTER
+        if (iconv(input->cd, (void *) data, len,
+                        (char **) output, outlen) == (size_t) -1) {
+                /* Any errno not listed below falls out of the switch and
+                 * is reported as PARSERUTILS_OK */
+                switch (errno) {
+                case E2BIG:
+                        return PARSERUTILS_NOMEM;
+                case EILSEQ:
+                        /* Emit the bytes EF BF BD (U+FFFD REPLACEMENT
+                         * CHARACTER in UTF-8) and skip one input byte */
+                        if (*outlen < 3)
+                                return PARSERUTILS_NOMEM;
+                        (*output)[0] = 0xef;
+                        (*output)[1] = 0xbf;
+                        (*output)[2] = 0xbd;
+                        *output += 3;
+                        *outlen -= 3;
+                        (*data)++;
+                        (*len)--;
+                        /* Keep converting; every further illegal sequence
+                         * gets the same replacement treatment */
+                        while (*len > 0) {
+                                size_t ret;
+                                ret = iconv(input->cd, (void *) data, len,
+                                                (char **) output, outlen);
+                                if (ret != (size_t) -1 || errno != EILSEQ)
+                                        break;
+                                if (*outlen < 3)
+                                        return PARSERUTILS_NOMEM;
+                                (*output)[0] = 0xef;
+                                (*output)[1] = 0xbf;
+                                (*output)[2] = 0xbd;
+                                *output += 3;
+                                *outlen -= 3;
+                                (*data)++;
+                                (*len)--;
+                        }
+                        return errno == E2BIG ? PARSERUTILS_NOMEM
+                                              : PARSERUTILS_OK;
+                }
+        }
+        return PARSERUTILS_OK;
+#else
+        if (input->leftover) {
+                parserutils_error write_error;
+                /* Some data left to be written from last call */
+                /* Attempt to flush the remaining data. */
+                write_error = parserutils_charset_codec_encode(
+                                input->write_codec,
+                                (const uint8_t **) &input->pivot_left,
+                                &input->pivot_len,
+                                output, outlen);
+                if (write_error != PARSERUTILS_OK)
+                        return write_error;
+                /* And clear leftover */
+                input->pivot_left = NULL;
+                input->pivot_len = 0;
+                input->leftover = false;
+        }
+        while (*len > 0) {
+                parserutils_error read_error, write_error;
+                size_t pivot_len = sizeof(input->pivot_buf);
+                uint8_t *pivot = (uint8_t *) input->pivot_buf;
+                /* Decode as much input as possible into the pivot buffer */
+                read_error = parserutils_charset_codec_decode(input->read_codec,
+                                data, len,
+                                (uint8_t **) &pivot, &pivot_len);
+                /* Rewind to the start of the pivot buffer and turn the
+                 * space left into the amount filled (presumably decode
+                 * reduces pivot_len by what it wrote - TODO confirm) */
+                pivot = (uint8_t *) input->pivot_buf;
+                pivot_len = sizeof(input->pivot_buf) - pivot_len;
+                if (pivot_len > 0) {
+                        write_error = parserutils_charset_codec_encode(
+                                        input->write_codec,
+                                        (const uint8_t **) &pivot,
+                                        &pivot_len,
+                                        output, outlen);
+                        if (write_error != PARSERUTILS_OK) {
+                                /* Record the unwritten pivot data so the
+                                 * next call can flush it first */
+                                input->leftover = true;
+                                input->pivot_left = pivot;
+                                input->pivot_len = pivot_len;
+                                return write_error;
+                        }
+                }
+                /* A decode result of PARSERUTILS_NOMEM is tolerated and the
+                 * loop goes round again (presumably it means the pivot
+                 * buffer filled up - TODO confirm) */
+                if (read_error != PARSERUTILS_OK &&
+                                read_error != PARSERUTILS_NOMEM)
+                        return read_error;
+        }
+        return PARSERUTILS_OK;
+#endif
+}
+/**
+ * Return an input filter to its initial conversion state
+ *
+ * \param input  The input filter to reset
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if input is NULL,
+ *         otherwise the error reported by a codec reset (codec builds only)
+ */
+parserutils_error parserutils__filter_reset(parserutils_filter *input)
+{
+#ifdef WITHOUT_ICONV_FILTER
+        parserutils_error status;
+#endif
+        if (input == NULL)
+                return PARSERUTILS_BADPARM;
+#ifndef WITHOUT_ICONV_FILTER
+        /* A NULL input/output pair asks iconv to put the descriptor back
+         * into its initial state; the result is deliberately not checked */
+        iconv(input->cd, NULL, 0, NULL, 0);
+        return PARSERUTILS_OK;
+#else
+        /* Forget any pivot data still waiting to be written */
+        input->leftover = false;
+        input->pivot_len = 0;
+        input->pivot_left = NULL;
+        /* Reset the read codec first, then the write codec, stopping at
+         * the first failure */
+        status = parserutils_charset_codec_reset(input->read_codec);
+        if (status == PARSERUTILS_OK)
+                status = parserutils_charset_codec_reset(input->write_codec);
+        return status;
+#endif
+}
+/**
+ * Put an input filter into its default configuration
+ *
+ * Clears the recorded encoding (and, in codec builds, both codec pointers)
+ * and then selects UTF-8 as the input encoding.
+ *
+ * \param input  Input filter to configure
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if input is NULL,
+ *         otherwise the error from selecting the UTF-8 encoding
+ */
+parserutils_error filter_set_defaults(parserutils_filter *input)
+{
+        if (input == NULL)
+                return PARSERUTILS_BADPARM;
+#ifdef WITHOUT_ICONV_FILTER
+        input->write_codec = NULL;
+        input->read_codec = NULL;
+#endif
+        /* Zero never matches a real MIB enum, so filter_set_encoding cannot
+         * take its "already using this encoding" shortcut and always
+         * performs the full setup */
+        input->settings.encoding = 0;
+        return filter_set_encoding(input, "UTF-8");
+}
+/**
+ * Set an input filter's encoding
+ *
+ * The existing converter (iconv descriptor or read codec) is torn down
+ * before its replacement is created. From that point until the replacement
+ * exists the filter records no encoding, so if creation fails a later call
+ * naming the previous encoding rebuilds the converter instead of returning
+ * early with a destroyed one.
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return PARSERUTILS_OK on success (including when enc is already in use),
+ *         PARSERUTILS_BADPARM if input or enc is NULL,
+ *         PARSERUTILS_BADENCODING if enc is unknown, or (iconv builds)
+ *                                 iconv_open fails with EINVAL,
+ *         PARSERUTILS_NOMEM (iconv builds) on any other iconv_open failure,
+ *         otherwise the codec creation error (codec builds)
+ */
+parserutils_error filter_set_encoding(parserutils_filter *input,
+                const char *enc)
+{
+        parserutils_error error = PARSERUTILS_OK;
+        uint16_t mibenum;
+        if (input == NULL || enc == NULL)
+                return PARSERUTILS_BADPARM;
+        mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+        if (mibenum == 0)
+                return PARSERUTILS_BADENCODING;
+        /* Exit early if we're already using this encoding */
+        if (input->settings.encoding == mibenum)
+                return PARSERUTILS_OK;
+        /* The current converter is about to be destroyed, so stop claiming
+         * that its encoding is active. It is set again only on success. */
+        input->settings.encoding = 0;
+#ifndef WITHOUT_ICONV_FILTER
+        if (input->cd != (iconv_t) -1) {
+                iconv_close(input->cd);
+                input->cd = (iconv_t) -1;
+        }
+        /* iconv_open takes the target encoding first, then the source */
+        input->cd = iconv_open(
+                parserutils_charset_mibenum_to_name(input->int_enc),
+                parserutils_charset_mibenum_to_name(mibenum));
+        if (input->cd == (iconv_t) -1) {
+                return (errno == EINVAL) ? PARSERUTILS_BADENCODING
+                                         : PARSERUTILS_NOMEM;
+        }
+#else
+        if (input->read_codec != NULL) {
+                parserutils_charset_codec_destroy(input->read_codec);
+                input->read_codec = NULL;
+        }
+        error = parserutils_charset_codec_create(enc, input->alloc,
+                        input->pw, &input->read_codec);
+        if (error != PARSERUTILS_OK)
+                return error;
+#endif
+        input->settings.encoding = mibenum;
+        return error;
+}

 /programs/network/netsurf/libparserutils/src/input/filter.h
 ,0 → 1,57
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_input_filter_h_
+#define parserutils_input_filter_h_
+#include <inttypes.h>
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+typedef struct parserutils_filter parserutils_filter;
+/**
+ * Input filter option types
+ *
+ * Passed as the type argument of parserutils__filter_setopt to say which
+ * option is being set.
+ */
+typedef enum parserutils_filter_opttype {
+        PARSERUTILS_FILTER_SET_ENCODING       = 0
+} parserutils_filter_opttype;
+/**
+ * Input filter option parameters
+ *
+ * Passed as the params argument of parserutils__filter_setopt; there is
+ * one member per option type.
+ */
+typedef union parserutils_filter_optparams {
+        /** Parameters for encoding setting
+         * (used with PARSERUTILS_FILTER_SET_ENCODING) */
+        struct {
+                /** Encoding name */
+                const char *name;
+        } encoding;
+} parserutils_filter_optparams;
+/* Create an input filter
+ *
+ * int_enc names the filter's internal encoding; alloc and pw are the
+ * client's memory (de)allocation function and its private data.
+ * NOTE(review): presumably the new filter is returned through *filter on
+ * success -- confirm against the definition in filter.c. */
+parserutils_error parserutils__filter_create(const char *int_enc,
+                parserutils_alloc alloc, void *pw, parserutils_filter **filter);
+/* Destroy an input filter
+ *
+ * NOTE(review): presumably the filter must not be used after this call --
+ * confirm against the definition in filter.c. */
+parserutils_error parserutils__filter_destroy(parserutils_filter *input);
+/* Configure an input filter
+ *
+ * type selects the option to set and params carries its value; for
+ * PARSERUTILS_FILTER_SET_ENCODING callers fill in params->encoding.name. */
+parserutils_error parserutils__filter_setopt(parserutils_filter *input,
+                parserutils_filter_opttype type,
+                parserutils_filter_optparams *params);
+/* Process a chunk of data
+ *
+ * Converts input taken from *data (*len bytes available) into *output
+ * (*outlen bytes of space). On return *len is the amount of input not yet
+ * consumed and *outlen is the output space still unused.
+ * PARSERUTILS_NOMEM means the output space ran out before all input was
+ * converted; callers may treat that as "call again with more space". */
+parserutils_error parserutils__filter_process_chunk(parserutils_filter *input,
+                const uint8_t **data, size_t *len,
+                uint8_t **output, size_t *outlen);
+/* Reset an input filter's state
+ *
+ * Returns PARSERUTILS_BADPARM if input is NULL. */
+parserutils_error parserutils__filter_reset(parserutils_filter *input);
+#endif

 /programs/network/netsurf/libparserutils/src/input/inputstream.c
 ,0 → 1,615
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+#include "input/filter.h"
+#include "utils/utils.h"
+/**
+ * Private input stream definition
+ *
+ * Instances are handed to clients as parserutils_inputstream pointers and
+ * cast back to this type internally, which is why the public part must be
+ * the first member.
+ */
+typedef struct parserutils_inputstream_private {
+        parserutils_inputstream public; /**< Public part. Must be first */
+        parserutils_buffer *raw;        /**< Buffer containing raw data */
+        bool done_first_chunk;          /**< Whether the first chunk has
+                                         * been processed */
+        uint16_t mibenum;               /**< MIB enum for charset, or 0 */
+        uint32_t encsrc;                /**< Charset source */
+        parserutils_filter *input;      /**< Charset conversion filter */
+        parserutils_charset_detect_func csdetect; /**< Charset detection func.*/
+        parserutils_alloc alloc;        /**< Memory (de)allocation function */
+        void *pw;                       /**< Client private data */
+} parserutils_inputstream_private;
+static inline parserutils_error parserutils_inputstream_refill_buffer(
+                parserutils_inputstream_private *stream);
+static inline parserutils_error parserutils_inputstream_strip_bom(
+                uint16_t *mibenum, parserutils_buffer *buffer);
+/**
+ * Create an input stream
+ *
+ * \param enc       Document charset, or NULL to autodetect
+ * \param encsrc    Value for encoding source, if specified, or 0
+ * \param csdetect  Charset detection function, or NULL
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \param stream    Pointer to location to receive stream instance
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM on bad parameters,
+ *         PARSERUTILS_NOMEM on memory exhaustion,
+ *         PARSERUTILS_BADENCODING on unsupported encoding
+ *
+ * The value 0 is defined as being the lowest priority encoding source
+ * (i.e. the default fallback encoding). Beyond this, no further
+ * interpretation is made upon the encoding source.
+ *
+ * On any failure everything allocated so far is released and *stream is
+ * left untouched.
+ */
+parserutils_error parserutils_inputstream_create(const char *enc,
+                uint32_t encsrc, parserutils_charset_detect_func csdetect,
+                parserutils_alloc alloc, void *pw,
+                parserutils_inputstream **stream)
+{
+        parserutils_inputstream_private *s;
+        parserutils_error error;
+        if (alloc == NULL || stream == NULL)
+                return PARSERUTILS_BADPARM;
+        s = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
+        if (s == NULL)
+                return PARSERUTILS_NOMEM;
+        error = parserutils_buffer_create(alloc, pw, &s->raw);
+        if (error != PARSERUTILS_OK) {
+                alloc(s, 0, pw);
+                return error;
+        }
+        error = parserutils_buffer_create(alloc, pw, &s->public.utf8);
+        if (error != PARSERUTILS_OK) {
+                parserutils_buffer_destroy(s->raw);
+                alloc(s, 0, pw);
+                return error;
+        }
+        s->public.cursor = 0;
+        s->public.had_eof = false;
+        s->done_first_chunk = false;
+        /* The stream always converts into UTF-8 internally */
+        error = parserutils__filter_create("UTF-8", alloc, pw, &s->input);
+        if (error != PARSERUTILS_OK) {
+                parserutils_buffer_destroy(s->public.utf8);
+                parserutils_buffer_destroy(s->raw);
+                alloc(s, 0, pw);
+                return error;
+        }
+        if (enc != NULL) {
+                parserutils_filter_optparams params;
+                s->mibenum =
+                        parserutils_charset_mibenum_from_name(enc, strlen(enc));
+                if (s->mibenum == 0) {
+                        /* Unknown charset: release everything built so far
+                         * rather than leaking the filter, both buffers and
+                         * the stream itself */
+                        parserutils__filter_destroy(s->input);
+                        parserutils_buffer_destroy(s->public.utf8);
+                        parserutils_buffer_destroy(s->raw);
+                        alloc(s, 0, pw);
+                        return PARSERUTILS_BADENCODING;
+                }
+                params.encoding.name = enc;
+                error = parserutils__filter_setopt(s->input,
+                                PARSERUTILS_FILTER_SET_ENCODING,
+                                &params);
+                if (error != PARSERUTILS_OK) {
+                        parserutils__filter_destroy(s->input);
+                        parserutils_buffer_destroy(s->public.utf8);
+                        parserutils_buffer_destroy(s->raw);
+                        alloc(s, 0, pw);
+                        return error;
+                }
+                s->encsrc = encsrc;
+        } else {
+                /* No charset given: leave it to detection / the UTF-8
+                 * default applied when the first chunk is processed */
+                s->mibenum = 0;
+                s->encsrc = 0;
+        }
+        s->csdetect = csdetect;
+        s->alloc = alloc;
+        s->pw = pw;
+        *stream = (parserutils_inputstream *) s;
+        return PARSERUTILS_OK;
+}
+/**
+ * Destroy an input stream
+ *
+ * Releases the conversion filter, the UTF-8 and raw buffers and finally the
+ * stream object itself, using the allocator recorded in the stream.
+ *
+ * \param stream  Input stream to destroy
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if stream is NULL
+ */
+parserutils_error parserutils_inputstream_destroy(
+                parserutils_inputstream *stream)
+{
+        parserutils_inputstream_private *priv;
+        if (stream == NULL)
+                return PARSERUTILS_BADPARM;
+        priv = (parserutils_inputstream_private *) stream;
+        parserutils__filter_destroy(priv->input);
+        parserutils_buffer_destroy(priv->public.utf8);
+        parserutils_buffer_destroy(priv->raw);
+        /* Free the stream last: its allocator and pw are read from it */
+        priv->alloc(priv, 0, priv->pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if stream is NULL,
+ *         otherwise the error from appending to the raw buffer
+ */
+parserutils_error parserutils_inputstream_append(
+                parserutils_inputstream *stream,
+                const uint8_t *data, size_t len)
+{
+        parserutils_inputstream_private *priv =
+                        (parserutils_inputstream_private *) stream;
+        if (stream == NULL)
+                return PARSERUTILS_BADPARM;
+        if (data != NULL)
+                return parserutils_buffer_append(priv->raw, data, len);
+        /* A NULL data pointer is the EOF marker; len is ignored */
+        priv->public.had_eof = true;
+        return PARSERUTILS_OK;
+}
+/**
+ * Insert data into stream at current location
+ *
+ * The data goes straight into the UTF-8 buffer at the cursor, bypassing
+ * the charset conversion filter.
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded)
+ * \param len     Length, in bytes, of data
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if stream or data is NULL,
+ *         otherwise the error from the buffer insertion
+ */
+parserutils_error parserutils_inputstream_insert(
+                parserutils_inputstream *stream,
+                const uint8_t *data, size_t len)
+{
+        parserutils_inputstream_private *priv;
+        if (stream == NULL || data == NULL)
+                return PARSERUTILS_BADPARM;
+        priv = (parserutils_inputstream_private *) stream;
+        return parserutils_buffer_insert(priv->public.utf8,
+                        priv->public.cursor, data, len);
+}
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+/**
+ * Look at the character in the stream that starts at
+ * offset bytes from the cursor (slow version)
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param ptr     Pointer to location to receive pointer to character data
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return PARSERUTILS_OK on success,
+ *                    _NEEDDATA on reaching the end of available input,
+ *                    _EOF on reaching the end of all input,
+ *                    _BADENCODING if the input cannot be decoded,
+ *                    _NOMEM on memory exhaustion,
+ *                    _BADPARM if bad parameters are passed.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ *
+ * An offset that lies at or beyond the end of the converted data after the
+ * refill yields _NEEDDATA; the buffer is never read past its length.
+ */
+parserutils_error parserutils_inputstream_peek_slow(
+                parserutils_inputstream *stream,
+                size_t offset, const uint8_t **ptr, size_t *length)
+{
+        parserutils_inputstream_private *s =
+                        (parserutils_inputstream_private *) stream;
+        parserutils_error error = PARSERUTILS_OK;
+        size_t len;
+        if (stream == NULL || ptr == NULL || length == NULL)
+                return PARSERUTILS_BADPARM;
+        /* There's insufficient data in the buffer, so read some more */
+        if (s->raw->length == 0) {
+                /* No more data to be had */
+                return s->public.had_eof ? PARSERUTILS_EOF
+                                         : PARSERUTILS_NEEDDATA;
+        }
+        /* Refill utf8 buffer from raw buffer */
+        error = parserutils_inputstream_refill_buffer(s);
+        if (error != PARSERUTILS_OK)
+                return error;
+        /* Refill may have succeeded, but not produced enough data to reach
+         * the requested offset. Compare with >= rather than == so that an
+         * offset beyond the end of the buffer is never dereferenced. */
+        if (s->public.cursor + offset >= s->public.utf8->length)
+                return PARSERUTILS_NEEDDATA;
+        /* Now try the read */
+        if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
+                len = 1;
+        } else {
+                error = parserutils_charset_utf8_char_byte_length(
+                        s->public.utf8->data + s->public.cursor + offset,
+                        &len);
+                if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
+                        return error;
+                if (error == PARSERUTILS_NEEDDATA) {
+                        return s->public.had_eof ? PARSERUTILS_EOF
+                                                 : PARSERUTILS_NEEDDATA;
+                }
+        }
+        (*length) = len;
+        (*ptr) = (s->public.utf8->data + s->public.cursor + offset);
+        return PARSERUTILS_OK;
+}
+#undef IS_ASCII
+/**
+ * Read the source charset of the input stream
+ *
+ * While the encoding source is 0 (the lowest-priority, fallback source)
+ * the name reported is always "UTF-8", whatever MIB enum is stored.
+ *
+ * \param stream  Input stream to query
+ * \param source  Pointer to location to receive charset source identifier
+ * \return Pointer to charset name (constant; do not free),
+ *         or NULL if stream or source is NULL
+ */
+const char *parserutils_inputstream_read_charset(
+                parserutils_inputstream *stream, uint32_t *source)
+{
+        parserutils_inputstream_private *priv;
+        if (stream == NULL || source == NULL)
+                return NULL;
+        priv = (parserutils_inputstream_private *) stream;
+        *source = priv->encsrc;
+        return (priv->encsrc == 0)
+                        ? "UTF-8"
+                        : parserutils_charset_mibenum_to_name(priv->mibenum);
+}
+/**
+ * Change the source charset of the input stream
+ *
+ * The stream's recorded charset and source are only replaced once the
+ * conversion filter has accepted the new encoding.
+ *
+ * \param stream   Input stream to modify
+ * \param enc      Charset name
+ * \param source   Charset source identifier
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM on invalid parameters,
+ *         PARSERUTILS_INVALID if called after data has been read from stream,
+ *         PARSERUTILS_BADENCODING if the encoding is unsupported,
+ *         PARSERUTILS_NOMEM on memory exhaustion.
+ */
+parserutils_error parserutils_inputstream_change_charset(
+                parserutils_inputstream *stream,
+                const char *enc, uint32_t source)
+{
+        parserutils_inputstream_private *priv =
+                        (parserutils_inputstream_private *) stream;
+        parserutils_filter_optparams opt;
+        parserutils_error status;
+        uint16_t new_mibenum;
+        if (stream == NULL || enc == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Too late once the first chunk has gone through the filter */
+        if (priv->done_first_chunk)
+                return PARSERUTILS_INVALID;
+        new_mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+        if (new_mibenum == 0)
+                return PARSERUTILS_BADENCODING;
+        /* Point the filter at the new encoding first */
+        opt.encoding.name = enc;
+        status = parserutils__filter_setopt(priv->input,
+                        PARSERUTILS_FILTER_SET_ENCODING,
+                        &opt);
+        if (status == PARSERUTILS_OK) {
+                /* Filter accepted it, so commit the new settings */
+                priv->mibenum = new_mibenum;
+                priv->encsrc = source;
+        }
+        return status;
+}
+/******************************************************************************
+ ******************************************************************************/
+/**
+ * Refill the UTF-8 buffer from the raw buffer
+ *
+ * On the first call this also settles the stream's charset (detection
+ * routine, UTF-8 fallback, BOM stripping) and configures the filter.
+ * Unread UTF-8 data is moved to the start of the buffer and the cursor is
+ * reset to 0 straight away, so the cursor and buffer stay consistent even
+ * if a later step fails.
+ *
+ * \param stream  The inputstream to operate on
+ * \return PARSERUTILS_OK on success, otherwise the error reported by
+ *         charset detection, BOM stripping, filter configuration, buffer
+ *         growth, conversion or discarding of consumed raw data
+ */
+parserutils_error parserutils_inputstream_refill_buffer(
+                parserutils_inputstream_private *stream)
+{
+        const uint8_t *raw;
+        uint8_t *utf8;
+        size_t raw_length, utf8_space;
+        parserutils_error error;
+        /* If this is the first chunk of data, we must detect the charset and
+         * strip the BOM, if one exists */
+        if (stream->done_first_chunk == false) {
+                parserutils_filter_optparams params;
+                /* If there is a charset detection routine, give it an
+                 * opportunity to override any charset specified when the
+                 * inputstream was created */
+                if (stream->csdetect != NULL) {
+                        error = stream->csdetect(stream->raw->data,
+                                stream->raw->length,
+                                &stream->mibenum, &stream->encsrc);
+                        if (error != PARSERUTILS_OK) {
+                                if (error != PARSERUTILS_NEEDDATA ||
+                                                stream->public.had_eof == false)
+                                        return error;
+                                /* We don't have enough data to detect the
+                                 * input encoding, but we're not going to get
+                                 * any more as we've been notified of EOF.
+                                 * Therefore, leave the encoding alone
+                                 * so that any charset specified when the
+                                 * inputstream was created will be preserved.
+                                 * If there was no charset specified, then
+                                 * we'll default to UTF-8, below */
+                        }
+                }
+                /* Default to UTF-8 if there is still no encoding information
+                 * We'll do this if there was no encoding specified up-front
+                 * and:
+                 *    1) there was no charset detection routine
+                 * or 2) there was insufficient data for the charset
+                 *       detection routine to detect an encoding
+                 */
+                if (stream->mibenum == 0) {
+                        stream->mibenum =
+                                parserutils_charset_mibenum_from_name("UTF-8",
+                                        SLEN("UTF-8"));
+                        stream->encsrc = 0;
+                }
+                if (stream->mibenum == 0)
+                        abort();
+                /* Strip any BOM, and update encoding as appropriate */
+                error = parserutils_inputstream_strip_bom(&stream->mibenum,
+                                stream->raw);
+                if (error != PARSERUTILS_OK)
+                        return error;
+                /* Ensure filter is using the correct encoding */
+                params.encoding.name =
+                        parserutils_charset_mibenum_to_name(stream->mibenum);
+                error = parserutils__filter_setopt(stream->input,
+                                PARSERUTILS_FILTER_SET_ENCODING,
+                                &params);
+                if (error != PARSERUTILS_OK)
+                        return error;
+                stream->done_first_chunk = true;
+        }
+        /* Work out how to perform the buffer fill */
+        if (stream->public.cursor == stream->public.utf8->length) {
+                /* Cursor's at the end, so simply reuse the entire buffer */
+                utf8 = stream->public.utf8->data;
+                utf8_space = stream->public.utf8->allocated;
+        } else {
+                /* Cursor's not at the end, so shift data after cursor to the
+                 * bottom of the buffer. If the buffer's still over half full,
+                 * extend it. */
+                memmove(stream->public.utf8->data,
+                        stream->public.utf8->data + stream->public.cursor,
+                        stream->public.utf8->length - stream->public.cursor);
+                stream->public.utf8->length -= stream->public.cursor;
+                /* The unread data now starts at offset 0. Move the cursor
+                 * with it immediately, so that an error return below cannot
+                 * leave a stale cursor pointing into the shifted buffer. */
+                stream->public.cursor = 0;
+                if (stream->public.utf8->length >
+                                stream->public.utf8->allocated / 2) {
+                        error = parserutils_buffer_grow(stream->public.utf8);
+                        if (error != PARSERUTILS_OK)
+                                return error;
+                }
+                utf8 = stream->public.utf8->data + stream->public.utf8->length;
+                utf8_space = stream->public.utf8->allocated -
+                                stream->public.utf8->length;
+        }
+        raw = stream->raw->data;
+        raw_length = stream->raw->length;
+        /* Try to fill utf8 buffer from the raw data */
+        error = parserutils__filter_process_chunk(stream->input,
+                        &raw, &raw_length, &utf8, &utf8_space);
+        /* _NOMEM implies that there's more input to read than available space
+         * in the utf8 buffer. That's fine, so we'll ignore that error. */
+        if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
+                return error;
+        /* Remove the raw data we've processed from the raw buffer */
+        error = parserutils_buffer_discard(stream->raw, 0,
+                        stream->raw->length - raw_length);
+        if (error != PARSERUTILS_OK)
+                return error;
+        /* Fix up the utf8 buffer information */
+        stream->public.utf8->length =
+                        stream->public.utf8->allocated - utf8_space;
+        /* Finally, fix up the cursor */
+        stream->public.cursor = 0;
+        return PARSERUTILS_OK;
+}
+/**
+ * Test whether a buffer begins with a given byte sequence
+ *
+ * \param buffer  The buffer to inspect
+ * \param bytes   The byte sequence to look for
+ * \param len     Length, in bytes, of the sequence
+ * \return true if the buffer holds at least len bytes and they match
+ */
+static inline bool inputstream_buffer_has_prefix(
+                const parserutils_buffer *buffer,
+                const uint8_t *bytes, size_t len)
+{
+        return buffer->length >= len &&
+                        memcmp(buffer->data, bytes, len) == 0;
+}
+/**
+ * Strip a BOM from a buffer in the given encoding
+ *
+ * For the endian-neutral names (UTF-16, UTF-32) the charset is narrowed to
+ * the big-endian variant unless a little-endian BOM is found.
+ *
+ * \param mibenum  Pointer to the character set of the buffer, updated on exit
+ * \param buffer   The buffer to process
+ * \return PARSERUTILS_OK if there was no BOM to strip, otherwise the result
+ *         of discarding the BOM bytes from the buffer
+ */
+parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum,
+                parserutils_buffer *buffer)
+{
+        static const uint8_t bom_utf8[] = { 0xEF, 0xBB, 0xBF };
+        static const uint8_t bom_utf16be[] = { 0xFE, 0xFF };
+        static const uint8_t bom_utf16le[] = { 0xFF, 0xFE };
+        static const uint8_t bom_utf32be[] = { 0x00, 0x00, 0xFE, 0xFF };
+        static const uint8_t bom_utf32le[] = { 0xFF, 0xFE, 0x00, 0x00 };
+        /* MIB enums for the charsets of interest, looked up on first use */
+        static uint16_t utf8;
+        static uint16_t utf16;
+        static uint16_t utf16be;
+        static uint16_t utf16le;
+        static uint16_t utf32;
+        static uint16_t utf32be;
+        static uint16_t utf32le;
+        uint16_t charset;
+        size_t strip = 0;
+        if (utf8 == 0) {
+                utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+                                SLEN("UTF-8"));
+                utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+                                SLEN("UTF-16"));
+                utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+                                SLEN("UTF-16BE"));
+                utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+                                SLEN("UTF-16LE"));
+                utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+                                SLEN("UTF-32"));
+                utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+                                SLEN("UTF-32BE"));
+                utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+                                SLEN("UTF-32LE"));
+        }
+        /* Decide on the charset as it was on entry; the branches below may
+         * overwrite *mibenum */
+        charset = *mibenum;
+        if (charset == utf8) {
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf8, sizeof(bom_utf8)))
+                        strip = sizeof(bom_utf8);
+        } else if (charset == utf16be) {
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf16be, sizeof(bom_utf16be)))
+                        strip = sizeof(bom_utf16be);
+        } else if (charset == utf16le) {
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf16le, sizeof(bom_utf16le)))
+                        strip = sizeof(bom_utf16le);
+        } else if (charset == utf16) {
+                /* Big-endian unless the BOM says otherwise */
+                *mibenum = utf16be;
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf16be, sizeof(bom_utf16be))) {
+                        strip = sizeof(bom_utf16be);
+                } else if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf16le, sizeof(bom_utf16le))) {
+                        *mibenum = utf16le;
+                        strip = sizeof(bom_utf16le);
+                }
+        } else if (charset == utf32be) {
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf32be, sizeof(bom_utf32be)))
+                        strip = sizeof(bom_utf32be);
+        } else if (charset == utf32le) {
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf32le, sizeof(bom_utf32le)))
+                        strip = sizeof(bom_utf32le);
+        } else if (charset == utf32) {
+                /* Big-endian unless the BOM says otherwise */
+                *mibenum = utf32be;
+                if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf32be, sizeof(bom_utf32be))) {
+                        strip = sizeof(bom_utf32be);
+                } else if (inputstream_buffer_has_prefix(buffer,
+                                bom_utf32le, sizeof(bom_utf32le))) {
+                        *mibenum = utf32le;
+                        strip = sizeof(bom_utf32le);
+                }
+        }
+        if (strip == 0)
+                return PARSERUTILS_OK;
+        return parserutils_buffer_discard(buffer, 0, strip);
+}

/programs/network/netsurf/libparserutils/src/utils/Makefile
0,0 → 1,5

OUTFILE = libo.o
OBJS = buffer.o errors.o stack.o vector.o
CFLAGS += -I ../../include/ -I ../../../ -I ../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib

 /programs/network/netsurf/libparserutils/src/utils/buffer.c
 ,0 → 1,196
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <string.h>
+#include <parserutils/utils/buffer.h>
+#define DEFAULT_SIZE (4096)
+/**
+ * Create a memory buffer
+ *
+ * The buffer starts empty with DEFAULT_SIZE bytes of storage and remembers
+ * the allocator it was created with.
+ *
+ * \param alloc   Memory (de)allocation function
+ * \param pw      Pointer to client-specific private data
+ * \param buffer  Pointer to location to receive memory buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM on bad parameters,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ */
+parserutils_error parserutils_buffer_create(parserutils_alloc alloc, void *pw,
+                parserutils_buffer **buffer)
+{
+        parserutils_buffer *fresh;
+        if (alloc == NULL || buffer == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Header first, then its initial storage */
+        fresh = alloc(NULL, sizeof(parserutils_buffer), pw);
+        if (fresh == NULL)
+                return PARSERUTILS_NOMEM;
+        fresh->data = alloc(NULL, DEFAULT_SIZE, pw);
+        if (fresh->data == NULL) {
+                /* Don't leak the header when the storage can't be had */
+                alloc(fresh, 0, pw);
+                return PARSERUTILS_NOMEM;
+        }
+        fresh->allocated = DEFAULT_SIZE;
+        fresh->length = 0;
+        fresh->pw = pw;
+        fresh->alloc = alloc;
+        *buffer = fresh;
+        return PARSERUTILS_OK;
+}
+/**
+ * Destroy a memory buffer
+ *
+ * Frees the buffer's storage and then the buffer object itself, using the
+ * allocator the buffer was created with.
+ *
+ * \param buffer  The buffer to destroy
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if buffer is NULL
+ */
+parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer)
+{
+        parserutils_alloc release;
+        void *pw;
+        if (buffer == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Take copies: the buffer itself is about to be freed */
+        release = buffer->alloc;
+        pw = buffer->pw;
+        release(buffer->data, 0, pw);
+        release(buffer, 0, pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Append data to a memory buffer
+ *
+ * The buffer is grown as often as needed before the copy takes place.
+ *
+ * \param buffer  The buffer to append to
+ * \param data    The data to append
+ * \param len     The length, in bytes, of the data to append
+ * \return PARSERUTILS_OK on success, otherwise the error from growing the
+ *         buffer.
+ */
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
+                const uint8_t *data, size_t len)
+{
+        parserutils_error status;
+        /* Keep growing until the free space strictly exceeds len */
+        while (buffer->allocated - buffer->length <= len) {
+                status = parserutils_buffer_grow(buffer);
+                if (status != PARSERUTILS_OK)
+                        return status;
+        }
+        memcpy(&buffer->data[buffer->length], data, len);
+        buffer->length = buffer->length + len;
+        return PARSERUTILS_OK;
+}
+/**
+ * Insert data into a memory buffer at a given offset
+ *
+ * Existing bytes from \a offset onwards are shifted up by \a len bytes.
+ * Inserting at the current end of the buffer is treated as an append.
+ *
+ * \param buffer  The buffer to insert into
+ * \param offset  The offset into the buffer to insert at
+ * \param data    The data to insert
+ * \param len     The length, in bytes, of the data to insert
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a offset lies beyond the buffer's length,
+ *         or the error reported when growing the buffer fails
+ */
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
+                size_t offset, const uint8_t *data, size_t len)
+{
+        uint8_t *gap;
+        if (offset > buffer->length)
+                return PARSERUTILS_BADPARM;
+        if (offset == buffer->length)
+                return parserutils_buffer_append(buffer, data, len);
+        /* Grow until strictly more than len bytes are spare */
+        for (;;) {
+                parserutils_error err;
+                if (buffer->allocated - buffer->length > len)
+                        break;
+                err = parserutils_buffer_grow(buffer);
+                if (err != PARSERUTILS_OK)
+                        return err;
+        }
+        /* Open a gap of len bytes at offset, then fill it */
+        gap = buffer->data + offset;
+        memmove(gap + len, gap, buffer->length - offset);
+        memcpy(gap, data, len);
+        buffer->length += len;
+        return PARSERUTILS_OK;
+}
+/**
+ * Discard a section of a memory buffer
+ *
+ * The bytes following the discarded section are moved down to close the gap
+ * and the buffer's length is reduced by \a len.
+ *
+ * \param buffer  The buffer to discard data from
+ * \param offset  The offset into the buffer of the start of the section
+ * \param len     The number of bytes to discard
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if the section does not lie wholly within
+ *         the buffer's current contents.
+ */
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
+                size_t offset, size_t len)
+{
+        /* Compare against the space after offset so that offset + len
+         * cannot wrap around and slip past the range check */
+        if (offset >= buffer->length || len > buffer->length - offset)
+                return PARSERUTILS_BADPARM;
+        /* Only the bytes after the discarded section need to move */
+        memmove(buffer->data + offset, buffer->data + offset + len,
+                        buffer->length - (offset + len));
+        buffer->length -= len;
+        return PARSERUTILS_OK;
+}
+/**
+ * Double the amount of space allocated for a memory buffer
+ *
+ * On failure the buffer is left untouched.
+ *
+ * \param buffer  The buffer to extend
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if the allocation function returned NULL.
+ */
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
+{
+        uint8_t *resized = buffer->alloc(buffer->data,
+                        buffer->allocated * 2, buffer->pw);
+        if (resized != NULL) {
+                buffer->allocated *= 2;
+                buffer->data = resized;
+                return PARSERUTILS_OK;
+        }
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Move a buffer's contents to freshly allocated storage (debug aid)
+ *
+ * When NDEBUG is not defined, the used bytes are copied into a new
+ * allocation of the same size, the used part of the old storage is
+ * overwritten with 0xff, and the old storage is deliberately not released.
+ * When NDEBUG is defined, only the parameter check is performed.
+ *
+ * \param buffer  The buffer to relocate
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a buffer is NULL,
+ *         PARSERUTILS_NOMEM if the new storage could not be allocated.
+ */
+parserutils_error parserutils_buffer_randomise(parserutils_buffer *buffer)
+{
+        if (buffer == NULL)
+                return PARSERUTILS_BADPARM;
+#ifndef NDEBUG
+        {
+                uint8_t *fresh = buffer->alloc(NULL, buffer->allocated,
+                                buffer->pw);
+                if (fresh == NULL)
+                        return PARSERUTILS_NOMEM;
+                memcpy(fresh, buffer->data, buffer->length);
+                memset(buffer->data, 0xff, buffer->length);
+                /* The old storage is leaked on purpose, so that it is
+                 * never handed out again */
+                buffer->data = fresh;
+        }
+#endif
+        return PARSERUTILS_OK;
+}

 /programs/network/netsurf/libparserutils/src/utils/endian.h
 ,0 → 1,40
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2009 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_endian_h_
+#define parserutils_endian_h_
+/**
+ * Determine whether the host stores the low-order byte of a 32-bit value
+ * first in memory
+ *
+ * \return true if the first byte of the probe value is its low-order byte
+ */
+static inline bool endian_host_is_le(void)
+{
+        static uint32_t probe = 0x10000002;
+        const uint8_t *first_byte = (const uint8_t *) &probe;
+        return (*first_byte == 0x02);
+}
+/**
+ * Reverse the byte order of a 32-bit value
+ *
+ * \param val  The value to swap
+ * \return \a val with its four bytes in the opposite order
+ */
+static inline uint32_t endian_swap(uint32_t val)
+{
+        uint32_t top = val & 0xff000000;
+        uint32_t upper = val & 0x00ff0000;
+        uint32_t lower = val & 0x0000ff00;
+        uint32_t bottom = val & 0x000000ff;
+        return (top >> 24) | (upper >> 8) | (lower << 8) | (bottom << 24);
+}
+/**
+ * Convert a 32-bit value from host byte order to big-endian
+ *
+ * \param host  Value in host byte order
+ * \return The value in big-endian byte order
+ */
+static inline uint32_t endian_host_to_big(uint32_t host)
+{
+        return endian_host_is_le() ? endian_swap(host) : host;
+}
+/**
+ * Convert a 32-bit value from big-endian to host byte order
+ *
+ * \param big  Value in big-endian byte order
+ * \return The value in host byte order
+ */
+static inline uint32_t endian_big_to_host(uint32_t big)
+{
+        return endian_host_is_le() ? endian_swap(big) : big;
+}
+#endif

 /programs/network/netsurf/libparserutils/src/utils/errors.c
 ,0 → 1,80
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <string.h>
+#include <parserutils/errors.h>
+/**
+ * Obtain a human-readable description of a parserutils error code
+ *
+ * \param error  The error code to convert
+ * \return Pointer to a constant description string, or NULL if the code
+ *         is not one of the known values.
+ */
+const char *parserutils_error_to_string(parserutils_error error)
+{
+        switch (error) {
+        case PARSERUTILS_OK:
+                return "No error";
+        case PARSERUTILS_NOMEM:
+                return "Insufficient memory";
+        case PARSERUTILS_BADPARM:
+                return "Bad parameter";
+        case PARSERUTILS_INVALID:
+                return "Invalid input";
+        case PARSERUTILS_FILENOTFOUND:
+                return "File not found";
+        case PARSERUTILS_NEEDDATA:
+                return "Insufficient data";
+        case PARSERUTILS_BADENCODING:
+                return "Unsupported encoding";
+        case PARSERUTILS_EOF:
+                return "EOF";
+        }
+        /* Not a value this function knows about */
+        return NULL;
+}
+/**
+ * Convert a string representation of an error name to a parserutils error code
+ *
+ * The name must match one of the PARSERUTILS_* identifiers exactly over
+ * \a len bytes; a truncated name (e.g. "PARSERUTILS_N") is treated as
+ * unknown rather than being matched against the first name it prefixes.
+ *
+ * \param str  String containing error name (need not be NUL terminated)
+ * \param len  Length of string (bytes)
+ * \return Error code, or PARSERUTILS_OK if unknown or \a str is NULL
+ */
+parserutils_error parserutils_error_from_string(const char *str, size_t len)
+{
+        static const struct {
+                const char *name;
+                parserutils_error code;
+        } names[] = {
+                { "PARSERUTILS_OK", PARSERUTILS_OK },
+                { "PARSERUTILS_NOMEM", PARSERUTILS_NOMEM },
+                { "PARSERUTILS_BADPARM", PARSERUTILS_BADPARM },
+                { "PARSERUTILS_INVALID", PARSERUTILS_INVALID },
+                { "PARSERUTILS_FILENOTFOUND", PARSERUTILS_FILENOTFOUND },
+                { "PARSERUTILS_NEEDDATA", PARSERUTILS_NEEDDATA },
+                { "PARSERUTILS_BADENCODING", PARSERUTILS_BADENCODING },
+                { "PARSERUTILS_EOF", PARSERUTILS_EOF }
+        };
+        size_t i;
+        if (str == NULL)
+                return PARSERUTILS_OK;
+        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+                /* Checking the length first stops a proper prefix of a
+                 * name from comparing equal under strncmp */
+                if (strlen(names[i].name) == len &&
+                                strncmp(str, names[i].name, len) == 0)
+                        return names[i].code;
+        }
+        return PARSERUTILS_OK;
+}

 /programs/network/netsurf/libparserutils/src/utils/stack.c
 ,0 → 1,190
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <inttypes.h>
+#include <string.h>
+#include <parserutils/utils/stack.h>
+/**
+ * Stack object
+ *
+ * Items are stored by value, back to back, in a single allocation that is
+ * extended by chunk_size slots whenever a push finds it full.
+ */
+struct parserutils_stack
+{
+        size_t item_size;               /**< Size, in bytes, of an item in the stack */
+        size_t chunk_size;              /**< Number of slots added per chunk */
+        size_t items_allocated;         /**< Number of slots allocated */
+        int32_t current_item;           /**< Index of current (top) item, -1 if empty */
+        void *items;                    /**< Items in stack */
+        parserutils_alloc alloc;        /**< Memory (de)allocation function */
+        void *pw;                       /**< Client-specific data */
+};
+/**
+ * Allocate and initialise an empty stack
+ *
+ * Space for one chunk of items is reserved up front.
+ *
+ * \param item_size   Length, in bytes, of an item in the stack
+ * \param chunk_size  Number of stack slots in a chunk
+ * \param alloc       Memory (de)allocation function
+ * \param pw          Pointer to client-specific private data
+ * \param stack       Pointer to location to receive stack instance
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a size is zero or a pointer is NULL,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ */
+parserutils_error parserutils_stack_create(size_t item_size, size_t chunk_size,
+                parserutils_alloc alloc, void *pw, parserutils_stack **stack)
+{
+        parserutils_stack *result;
+        if (stack == NULL || alloc == NULL ||
+                        chunk_size == 0 || item_size == 0)
+                return PARSERUTILS_BADPARM;
+        result = alloc(NULL, sizeof(*result), pw);
+        if (result == NULL)
+                return PARSERUTILS_NOMEM;
+        result->items = alloc(NULL, item_size * chunk_size, pw);
+        if (result->items != NULL) {
+                result->alloc = alloc;
+                result->pw = pw;
+                result->item_size = item_size;
+                result->chunk_size = chunk_size;
+                result->items_allocated = chunk_size;
+                result->current_item = -1;
+                *stack = result;
+                return PARSERUTILS_OK;
+        }
+        /* No room for the items: hand the stack object back and fail */
+        alloc(result, 0, pw);
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Destroy a stack instance, handing back its item storage and the stack
+ * object itself to the allocation function
+ *
+ * \param stack  The stack to destroy
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a stack is NULL.
+ */
+parserutils_error parserutils_stack_destroy(parserutils_stack *stack)
+{
+        parserutils_alloc alloc;
+        void *pw;
+        if (stack == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Copy the callback details out; the object holding them goes last */
+        alloc = stack->alloc;
+        pw = stack->pw;
+        alloc(stack->items, 0, pw);
+        alloc(stack, 0, pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Push a copy of an item onto the stack
+ *
+ * The item storage is extended by one chunk if every slot is in use.
+ *
+ * \param stack  The stack to push onto
+ * \param item   The item to push (item_size bytes are copied from it)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a stack or \a item is NULL,
+ *         PARSERUTILS_INVALID if the current index cannot be advanced,
+ *         PARSERUTILS_NOMEM if the storage could not be extended
+ */
+parserutils_error parserutils_stack_push(parserutils_stack *stack,
+                const void *item)
+{
+        int32_t slot;
+        uint8_t *dest;
+        if (item == NULL || stack == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Ensure we'll get a valid slot */
+        if (stack->current_item == INT32_MAX || stack->current_item < -1)
+                return PARSERUTILS_INVALID;
+        slot = stack->current_item + 1;
+        if ((size_t) slot >= stack->items_allocated) {
+                size_t new_slots = stack->items_allocated + stack->chunk_size;
+                void *grown = stack->alloc(stack->items,
+                                new_slots * stack->item_size, stack->pw);
+                if (grown == NULL)
+                        return PARSERUTILS_NOMEM;
+                stack->items = grown;
+                stack->items_allocated = new_slots;
+        }
+        dest = (uint8_t *) stack->items + (slot * stack->item_size);
+        memcpy(dest, item, stack->item_size);
+        stack->current_item = slot;
+        return PARSERUTILS_OK;
+}
+/**
+ * Pop the current item off a stack
+ *
+ * \param stack  The stack to pop from
+ * \param item   Pointer to location to receive a copy of the popped item,
+ *               or NULL if the item's value is not wanted
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a stack is NULL,
+ *         PARSERUTILS_INVALID if the stack is empty.
+ */
+parserutils_error parserutils_stack_pop(parserutils_stack *stack, void *item)
+{
+        if (stack == NULL)
+                return PARSERUTILS_BADPARM;
+        if (stack->current_item < 0)
+                return PARSERUTILS_INVALID;
+        if (item != NULL) {
+                const uint8_t *top = (uint8_t *) stack->items +
+                                (stack->current_item * stack->item_size);
+                memcpy(item, top, stack->item_size);
+        }
+        stack->current_item--;
+        return PARSERUTILS_OK;
+}
+/**
+ * Obtain a pointer to the item currently on top of the stack
+ *
+ * The pointer refers to the stack's own storage; no copy is made.
+ *
+ * \param stack  The stack to inspect
+ * \return Pointer to item on stack, or NULL if \a stack is NULL or empty
+ */
+void *parserutils_stack_get_current(parserutils_stack *stack)
+{
+        if (stack != NULL && stack->current_item >= 0) {
+                return (uint8_t *) stack->items +
+                                (stack->current_item * stack->item_size);
+        }
+        return NULL;
+}
+#ifndef NDEBUG
+#include <stdio.h>
+extern void parserutils_stack_dump(parserutils_stack *stack, const char *prefix,
+                void (*printer)(void *item));
+/**
+ * Print every item on a stack to stdout, oldest first (debug builds only)
+ *
+ * Each line has the form "<prefix> <index>: " followed by whatever
+ * \a printer emits for that item.
+ *
+ * \param stack    The stack to dump; nothing is printed if NULL
+ * \param prefix   Text to start each line with, or NULL for none
+ * \param printer  Callback that prints a single item; nothing is printed
+ *                 if NULL
+ */
+void parserutils_stack_dump(parserutils_stack *stack, const char *prefix,
+                void (*printer)(void *item))
+{
+        int32_t i;
+        if (stack == NULL || printer == NULL)
+                return;
+        for (i = 0; i <= stack->current_item; i++) {
+                /* int32_t is not necessarily int, so use its own specifier */
+                printf("%s %" PRId32 ": ", prefix != NULL ? prefix : "", i);
+                printer((uint8_t *) stack->items + (i * stack->item_size));
+                printf("\n");
+        }
+}
+#endif

 /programs/network/netsurf/libparserutils/src/utils/utils.h
 ,0 → 1,36
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#ifndef parserutils_utils_h_
+#define parserutils_utils_h_
+#ifndef max
+/* Larger of two values; either argument may be evaluated twice */
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+#ifndef min
+/* Smaller of two values; either argument may be evaluated twice */
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+#ifndef UNUSED
+/* Mark a variable as deliberately unused. A cast to void is used rather
+ * than a self-assignment so that it also works on const objects */
+#define UNUSED(x) ((void) (x))
+#endif
+#ifndef N_ELEMENTS
+/* Number of elements in an array (not valid for pointers) */
+#define N_ELEMENTS(s) (sizeof((s)) / sizeof((s)[0]))
+#endif
+#ifndef ALIGN
+/* Round val up to the next multiple of 4 */
+#define ALIGN(val) (((val) + 3) & ~(3))
+#endif
+#endif

 /programs/network/netsurf/libparserutils/src/utils/vector.c
 ,0 → 1,257
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+#include <inttypes.h>
+#include <string.h>
+#include <parserutils/utils/vector.h>
+/**
+ * Vector object
+ *
+ * Items are stored by value, back to back, in a single allocation that is
+ * extended by chunk_size slots whenever an append finds it full.
+ */
+struct parserutils_vector
+{
+        size_t item_size;               /**< Size, in bytes, of an item in the vector */
+        size_t chunk_size;              /**< Number of slots added per chunk */
+        size_t items_allocated;         /**< Number of slots allocated */
+        int32_t current_item;           /**< Index of last item, -1 if empty */
+        void *items;                    /**< Items in vector */
+        parserutils_alloc alloc;        /**< Memory (de)allocation function */
+        void *pw;                       /**< Client-specific data */
+};
+/**
+ * Allocate and initialise an empty vector
+ *
+ * Space for one chunk of items is reserved up front.
+ *
+ * \param item_size   Length, in bytes, of an item in the vector
+ * \param chunk_size  Number of vector slots in a chunk
+ * \param alloc       Memory (de)allocation function
+ * \param pw          Pointer to client-specific private data
+ * \param vector      Pointer to location to receive vector instance
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a size is zero or a pointer is NULL,
+ *         PARSERUTILS_NOMEM on memory exhaustion
+ */
+parserutils_error parserutils_vector_create(size_t item_size,
+                size_t chunk_size, parserutils_alloc alloc, void *pw,
+                parserutils_vector **vector)
+{
+        parserutils_vector *result;
+        if (vector == NULL || alloc == NULL ||
+                        chunk_size == 0 || item_size == 0)
+                return PARSERUTILS_BADPARM;
+        result = alloc(NULL, sizeof(*result), pw);
+        if (result == NULL)
+                return PARSERUTILS_NOMEM;
+        result->items = alloc(NULL, item_size * chunk_size, pw);
+        if (result->items != NULL) {
+                result->alloc = alloc;
+                result->pw = pw;
+                result->item_size = item_size;
+                result->chunk_size = chunk_size;
+                result->items_allocated = chunk_size;
+                result->current_item = -1;
+                *vector = result;
+                return PARSERUTILS_OK;
+        }
+        /* No room for the items: hand the vector object back and fail */
+        alloc(result, 0, pw);
+        return PARSERUTILS_NOMEM;
+}
+/**
+ * Destroy a vector instance, handing back its item storage and the vector
+ * object itself to the allocation function
+ *
+ * \param vector  The vector to destroy
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a vector is NULL.
+ */
+parserutils_error parserutils_vector_destroy(parserutils_vector *vector)
+{
+        parserutils_alloc alloc;
+        void *pw;
+        if (vector == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Copy the callback details out; the object holding them goes last */
+        alloc = vector->alloc;
+        pw = vector->pw;
+        alloc(vector->items, 0, pw);
+        alloc(vector, 0, pw);
+        return PARSERUTILS_OK;
+}
+/**
+ * Append a copy of an item to the end of the vector
+ *
+ * The item storage is extended by one chunk if every slot is in use.
+ *
+ * \param vector  The vector to append to
+ * \param item    The item to append (item_size bytes are copied from it)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a vector or \a item is NULL,
+ *         PARSERUTILS_INVALID if the current index cannot be advanced,
+ *         PARSERUTILS_NOMEM if the storage could not be extended
+ */
+parserutils_error parserutils_vector_append(parserutils_vector *vector,
+                void *item)
+{
+        int32_t slot;
+        uint8_t *dest;
+        if (item == NULL || vector == NULL)
+                return PARSERUTILS_BADPARM;
+        /* Ensure we'll get a valid slot */
+        if (vector->current_item == INT32_MAX || vector->current_item < -1)
+                return PARSERUTILS_INVALID;
+        slot = vector->current_item + 1;
+        if ((size_t) slot >= vector->items_allocated) {
+                size_t new_slots = vector->items_allocated +
+                                vector->chunk_size;
+                void *grown = vector->alloc(vector->items,
+                                new_slots * vector->item_size, vector->pw);
+                if (grown == NULL)
+                        return PARSERUTILS_NOMEM;
+                vector->items = grown;
+                vector->items_allocated = new_slots;
+        }
+        dest = (uint8_t *) vector->items + (slot * vector->item_size);
+        memcpy(dest, item, vector->item_size);
+        vector->current_item = slot;
+        return PARSERUTILS_OK;
+}
+/**
+ * Remove every item from a vector
+ *
+ * Only the current index is reset; the item storage is kept.
+ *
+ * \param vector  The vector to clear
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a vector is NULL,
+ *         PARSERUTILS_INVALID if the vector is already empty.
+ */
+parserutils_error parserutils_vector_clear(parserutils_vector *vector)
+{
+        if (vector == NULL)
+                return PARSERUTILS_BADPARM;
+        if (vector->current_item >= 0) {
+                vector->current_item = -1;
+                return PARSERUTILS_OK;
+        }
+        return PARSERUTILS_INVALID;
+}
+/**
+ * Drop the last item from a vector
+ *
+ * \param vector  The vector to remove from
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a vector is NULL,
+ *         PARSERUTILS_INVALID if the vector is empty
+ */
+parserutils_error parserutils_vector_remove_last(parserutils_vector *vector)
+{
+        if (vector == NULL)
+                return PARSERUTILS_BADPARM;
+        if (vector->current_item >= 0) {
+                vector->current_item -= 1;
+                return PARSERUTILS_OK;
+        }
+        return PARSERUTILS_INVALID;
+}
+/**
+ * Report how many items a vector currently holds
+ *
+ * \param vector  The vector to interrogate.
+ * \param length  Pointer to location to receive the item count.
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if \a vector or \a length is NULL
+ */
+parserutils_error parserutils_vector_get_length(parserutils_vector *vector,
+                                                size_t *length)
+{
+        if (vector == NULL || length == NULL)
+                return PARSERUTILS_BADPARM;
+        /* current_item is the index of the last item, so the count is one more */
+        *length = vector->current_item + 1;
+        return PARSERUTILS_OK;
+}
+/**
+ * Iterate over a vector
+ *
+ * Each call returns the item at index \a *ctx and then advances \a *ctx.
+ *
+ * \param vector  The vector to iterate over
+ * \param ctx     Pointer to an integer for the iterator to use as context.
+ * \return Pointer to current item, or NULL if no more (or if \a *ctx does
+ *         not index an item in the vector)
+ *
+ * \note The value pointed to by \a ctx must be zero to begin the iteration.
+ */
+const void *parserutils_vector_iterate(const parserutils_vector *vector,
+                int32_t *ctx)
+{
+        void *item;
+        if (vector == NULL || ctx == NULL || vector->current_item < 0)
+                return NULL;
+        /* A negative index would address memory before the item array,
+         * so reject it along with indices past the last item */
+        if ((*ctx) < 0 || (*ctx) > vector->current_item)
+                return NULL;
+        item = (uint8_t *) vector->items + ((*ctx) * vector->item_size);
+        (*ctx)++;
+        return item;
+}
+/**
+ * Peek at an item in a vector
+ *
+ * \param vector  The vector to inspect
+ * \param ctx     Index of the item to look at.
+ * \return Pointer to item, or NULL if \a ctx does not index an item in
+ *         the vector
+ */
+const void *parserutils_vector_peek(const parserutils_vector *vector,
+                int32_t ctx)
+{
+        if (vector == NULL || vector->current_item < 0)
+                return NULL;
+        /* A negative index would address memory before the item array,
+         * so reject it along with indices past the last item */
+        if (ctx < 0 || ctx > vector->current_item)
+                return NULL;
+        return (uint8_t *) vector->items + (ctx * vector->item_size);
+}
+#ifndef NDEBUG
+#include <stdio.h>
+extern void parserutils_vector_dump(parserutils_vector *vector,
+                const char *prefix, void (*printer)(void *item));
+/**
+ * Print every item in a vector to stdout, in index order (debug builds only)
+ *
+ * Each line has the form "<prefix> <index>: " followed by whatever
+ * \a printer emits for that item.
+ *
+ * \param vector   The vector to dump; nothing is printed if NULL
+ * \param prefix   Text to start each line with, or NULL for none
+ * \param printer  Callback that prints a single item; nothing is printed
+ *                 if NULL
+ */
+void parserutils_vector_dump(parserutils_vector *vector, const char *prefix,
+                void (*printer)(void *item))
+{
+        int32_t i;
+        if (vector == NULL || printer == NULL)
+                return;
+        for (i = 0; i <= vector->current_item; i++) {
+                /* int32_t is not necessarily int, so use its own specifier */
+                printf("%s %" PRId32 ": ", prefix != NULL ? prefix : "", i);
+                printer((uint8_t *) vector->items + (i * vector->item_size));
+                printf("\n");
+        }
+}
+#endif

 /programs/network/netsurf/libparserutils/test/INDEX
 ,0 → 1,11
+# Index for testcases
+#
+# Test          Description                             DataDir
+aliases         Encoding alias handling
+cscodec-utf8    UTF-8 charset codec implementation      cscodec-utf8
+cscodec-utf16   UTF-16 charset codec implementation     cscodec-utf16
+cscodec-ext8    Extended 8bit charset codec             cscodec-ext8
+cscodec-8859    ISO-8859-n codec                        cscodec-8859
+filter          Input stream filtering
+inputstream     Inputstream handling                    input

/programs/network/netsurf/libparserutils/test/Makefile
0,0 → 1,7
# Tests
DIR_TEST_ITEMS := aliases:aliases.c cscodec-8859:cscodec-8859.c \
cscodec-ext8:cscodec-ext8.c cscodec-utf8:cscodec-utf8.c \
cscodec-utf16:cscodec-utf16.c filter:filter.c \
inputstream:inputstream.c

include $(NSBUILD)/Makefile.subdir

 /programs/network/netsurf/libparserutils/test/README
 ,0 → 1,84
+Libcharset testcases
+====================
+Testcases for Libcharset are self-contained binaries which test various parts
+of the charset library. These may make use of external data files to drive
+the testing.
+Testcase command lines
+----------------------
+Testcase command lines are in a unified format, thus:
+        <aliases_file> [ <data_file> ]
+The aliases file parameter will always be specified (as it is required for
+the library to work at all).
+The data file parameter is optional and may be provided on a test-by-test
+basis.
+Testcase output
+---------------
+Testcases may output anything at all to stdout. The final line of the
+output must begin with either PASS or FAIL (case sensitive), indicating
+the success status of the test.
+Test Index
+----------
+In the test sources directory is a file named INDEX, which provides an
+index of all available test binaries. Any new test applications should be
+added to this index as they are created.
+The test index file format is as follows:
+        file         = *line
+        line         = ( entry / comment / blank ) LF
+        entry        = testname 1*HTAB description [ 1*HTAB datadir ]
+        comment      = "#" *non-newline
+        blank        = 0<OCTET>
+        testname     = 1*non-reserved
+        description  = 1*non-reserved
+        datadir      = 1*non-reserved
+        non-newline  = VCHAR / WSP
+        non-reserved = VCHAR / SP
+Each entry contains a mandatory binary name and description followed by
+an optional data directory specifier. The data directory specifier is
+used to state the name of the directory containing data files for the
+test name. This directory will be searched for within the "data"
+directory in the source tree.
+If a data directory is specified, the test binary will be invoked for
+each data file listed within the data directory INDEX, passing the
+filename as the second parameter (<data_file>, above).
+Data Index
+----------
+Each test data directory contains a file, named INDEX, which provides an
+index of all available test data files.
+The data index file format is as follows:
+        file         = *line
+        line         = ( entry / comment / blank ) LF
+        entry        = dataname 1*HTAB description
+        comment      = "#" *non-newline
+        blank        = 0<OCTET>
+        dataname     = 1*non-reserved
+        description  = 1*non-reserved
+        non-newline  = VCHAR / WSP
+        non-reserved = VCHAR / SP
+Each entry contains a mandatory data file name and description.

 /programs/network/netsurf/libparserutils/test/aliases.c
 ,0 → 1,62
+#include <stdio.h>
+#include <string.h>
+#include "charset/aliases.h"
+#include "testutils.h"
+/*
+ * Exercise charset alias canonicalisation: one name that must not resolve,
+ * followed by several that must. Prints PASS on success; on the first
+ * failure prints a FAIL line and exits with status 1.
+ */
+int main (int argc, char **argv)
+{
+        parserutils_charset_aliases_canon *c;
+        UNUSED(argc);
+        UNUSED(argv);
+        /* An unknown name must not canonicalise to anything */
+        c = parserutils__charset_alias_canonicalise("moose", 5);
+        if (c != NULL) {
+                printf("FAIL - found invalid encoding 'moose'\n");
+                return 1;
+        }
+        c = parserutils__charset_alias_canonicalise("csinvariant", 11);
+        if (c == NULL) {
+                printf("FAIL - failed finding encoding 'csinvariant'\n");
+                return 1;
+        }
+        printf("%s %d\n", c->name, c->mib_enum);
+        /* Same alias again, this time with a trailing double quote */
+        c = parserutils__charset_alias_canonicalise("csinvariant\"", 12);
+        if (c == NULL) {
+                printf("FAIL - failed finding encoding 'csinvariant'\n");
+                return 1;
+        }
+        printf("%s %d\n", c->name, c->mib_enum);
+        c = parserutils__charset_alias_canonicalise("nats-sefi-add", 13);
+        if (c == NULL) {
+                printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
+                return 1;
+        }
+        printf("%s %d\n", c->name, c->mib_enum);
+        /* Look the canonical entry up again by name and by MIB enum */
+        printf("%d\n", parserutils_charset_mibenum_from_name(c->name,
+                        strlen(c->name)));
+        printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum));
+        c = parserutils__charset_alias_canonicalise("u.t.f.8", 7);
+        if (c == NULL) {
+                printf("FAIL - failed finding encoding 'u.t.f.8'\n");
+                return 1;
+        }
+        printf("%s %d\n", c->name, c->mib_enum);
+        printf("PASS\n");
+        return 0;
+}

 /programs/network/netsurf/libparserutils/test/cscodec-8859.c
 ,0 → 1,263
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <parserutils/charset/codec.h>
+#include "utils/utils.h"
+#include "testutils.h"
+/* State shared between main(), handle_line() and run_test() while a test
+ * data file is being processed */
+typedef struct line_ctx {
+        parserutils_charset_codec *codec;       /* Codec created by the "#enc" directive */
+        size_t buflen;                          /* Capacity of buf, taken from the data file's size */
+        size_t bufused;                         /* Bytes of input collected in buf so far */
+        uint8_t *buf;                           /* Input data for the current testcase */
+        size_t explen;                          /* Capacity of exp (set equal to buflen) */
+        size_t expused;                         /* Bytes of expected output collected in exp so far */
+        uint8_t *exp;                           /* Expected output; second half of buf's allocation */
+        bool hadenc;                            /* Set once an "#enc" directive has created a codec */
+        bool indata;                            /* Following lines are appended to buf */
+        bool inexp;                             /* Following lines are appended to exp */
+        parserutils_error exp_ret;              /* Result code named on the "#expected" line */
+        enum { ENCODE, DECODE, BOTH } dir;      /* Direction selected by the "#data" line */
+} line_ctx;
+static bool handle_line(const char *data, size_t datalen, void *pw);
+static void run_test(line_ctx *ctx);
+/* Allocation callback handed to the codec API: forwards to realloc() and
+ * ignores the private data pointer */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        void *result = realloc(ptr, len);
+        UNUSED(pw);
+        return result;
+}
+/*
+ * Test driver: checks that an unsupported charset is rejected, then feeds
+ * the data file named on the command line through handle_line() and runs
+ * whatever testcase is still pending when the file ends.
+ *
+ * Usage: <program> <filename>
+ * Returns 0 (after printing PASS) on success, 1 on a usage or setup error.
+ */
+int main(int argc, char **argv)
+{
+        parserutils_charset_codec *codec;
+        line_ctx ctx;
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
+                return 1;
+        }
+        assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+                        myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
+        ctx.buflen = parse_filesize(argv[1]);
+        if (ctx.buflen == 0)
+                return 1;
+        ctx.buf = malloc(2 * ctx.buflen);
+        if (ctx.buf == NULL) {
+                printf("Failed allocating %u bytes\n",
+                                (unsigned int) ctx.buflen);
+                return 1;
+        }
+        /* The expected-output area is the second half of the allocation */
+        ctx.exp = ctx.buf + ctx.buflen;
+        ctx.explen = ctx.buflen;
+        ctx.buf[0] = '\0';
+        ctx.exp[0] = '\0';
+        ctx.bufused = 0;
+        ctx.expused = 0;
+        ctx.hadenc = false;
+        ctx.indata = false;
+        ctx.inexp = false;
+        ctx.exp_ret = PARSERUTILS_OK;
+        /* Give these defined values in case the data file never supplies
+         * an "#enc" or "#data" directive */
+        ctx.codec = NULL;
+        ctx.dir = BOTH;
+        assert(parse_testfile(argv[1], handle_line, &ctx) == true);
+        /* and run final test */
+        if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+                ctx.bufused -= 1;
+        if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+                ctx.expused -= 1;
+        run_test(&ctx);
+        free(ctx.buf);
+        /* Only destroy a codec that was actually created */
+        if (ctx.hadenc)
+                parserutils_charset_codec_destroy(ctx.codec);
+        printf("PASS\n");
+        return 0;
+}
+/*
+ * Per-line callback for parse_testfile().
+ *
+ * Lines starting with '#' are directives:
+ *   #data [decode|encode] [LOOSE|STRICT|...]  start collecting input data
+ *   #expected <error name>                    start collecting expected output
+ *   #reset                                    reset the codec
+ *   #enc <name>                               create a codec for <name>
+ * A directive seen while expected output is being collected first runs the
+ * testcase gathered so far. Any other line is appended to the input or
+ * expected-output area, depending on the section currently open.
+ *
+ * \param data     Start of the line
+ * \param datalen  Length of the line, in bytes
+ * \param pw       The line_ctx for this run
+ * \return true, always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+        line_ctx *ctx = (line_ctx *) pw;
+        if (data[0] == '#') {
+                if (ctx->inexp) {
+                        /* This marks end of testcase, so run it */
+                        /* Check the counters first: an empty section has
+                         * no last byte to inspect */
+                        if (ctx->bufused > 0 &&
+                                        ctx->buf[ctx->bufused - 1] == '\n')
+                                ctx->bufused -= 1;
+                        if (ctx->expused > 0 &&
+                                        ctx->exp[ctx->expused - 1] == '\n')
+                                ctx->expused -= 1;
+                        run_test(ctx);
+                        ctx->buf[0] = '\0';
+                        ctx->exp[0] = '\0';
+                        ctx->bufused = 0;
+                        ctx->expused = 0;
+                        ctx->exp_ret = PARSERUTILS_OK;
+                }
+                if (strncasecmp(data+1, "data", 4) == 0) {
+                        parserutils_charset_codec_optparams params;
+                        const char *ptr = data + 6;
+                        ctx->indata = true;
+                        ctx->inexp = false;
+                        if (strncasecmp(ptr, "decode", 6) == 0)
+                                ctx->dir = DECODE;
+                        else if (strncasecmp(ptr, "encode", 6) == 0)
+                                ctx->dir = ENCODE;
+                        else
+                                ctx->dir = BOTH;
+                        /* Step over the direction word and its separator */
+                        ptr += 7;
+                        if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+                        } else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+                        } else {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+                        }
+                        assert(parserutils_charset_codec_setopt(ctx->codec,
+                                PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+                                &params) == PARSERUTILS_OK);
+                } else if (strncasecmp(data+1, "expected", 8) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = true;
+                        ctx->exp_ret = parserutils_error_from_string(data + 10,
+                                        datalen - 10 - 1 /* \n */);
+                } else if (strncasecmp(data+1, "reset", 5) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = false;
+                        parserutils_charset_codec_reset(ctx->codec);
+                } else if (strncasecmp(data+1, "enc", 3) == 0) {
+                        /* NOTE(review): assumes the line is at least
+                         * "#enc " long (datalen >= 5) - confirm against
+                         * parse_testfile() */
+                        const char *enc = data + 5;
+                        const char *limit = data + datalen;
+                        const char *end;
+                        char *enc_name;
+                        /* Stop at the end of the line even when it holds
+                         * no whitespace; isspace() needs an unsigned char
+                         * value */
+                        for (end = enc; end < limit &&
+                                        !isspace((unsigned char) *end); end++)
+                                ;
+                        enc_name = alloca(end - enc + 1);
+                        memcpy(enc_name, enc, end - enc);
+                        enc_name[end - enc] = 0;
+                        /* Release any codec left over from an earlier
+                         * "#enc" line before replacing it */
+                        if (ctx->hadenc) {
+                                parserutils_charset_codec_destroy(ctx->codec);
+                                ctx->hadenc = false;
+                        }
+                        assert(parserutils_charset_codec_create(enc_name,
+                                        myrealloc, NULL, &ctx->codec) ==
+                                        PARSERUTILS_OK);
+                        ctx->hadenc = true;
+                }
+        } else {
+                if (ctx->indata) {
+                        memcpy(ctx->buf + ctx->bufused, data, datalen);
+                        ctx->bufused += datalen;
+                }
+                if (ctx->inexp) {
+                        memcpy(ctx->exp + ctx->expused, data, datalen);
+                        ctx->expused += datalen;
+                }
+        }
+        return true;
+}
+/**
+ * Run the currently buffered test case through the codec and check it.
+ *
+ * The bytes in ctx->buf are decoded, encoded, or decoded and then
+ * re-encoded, as selected by ctx->dir.  Every codec call must return
+ * ctx->exp_ret, and the final output must equal ctx->exp byte for byte.
+ * The produced and the expected bytes are both printed in hex.
+ *
+ * \param ctx  Test context holding the codec, input and expected output
+ */
+void run_test(line_ctx *ctx)
+{
+        static const char hexdigits[] = "0123456789abcdef";
+        static int testnum;
+        /* Every output buffer gets four bytes per input byte */
+        const size_t capacity = ctx->bufused * 4;
+        size_t outleft = capacity;
+        uint8_t *out = alloca(outleft);
+        uint8_t *outpos = out;
+        const uint8_t *inpos = ctx->buf;
+        size_t inleft = ctx->bufused;
+        size_t idx;
+        switch (ctx->dir) {
+        case DECODE:
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        case ENCODE:
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        default: {
+                /* Round trip: decode into a scratch buffer, then encode
+                   the scratch contents into the output buffer */
+                size_t midleft = capacity;
+                uint8_t *mid = alloca(midleft);
+                uint8_t *midpos = mid;
+                const uint8_t *midsrc;
+                size_t midlen;
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &midpos, &midleft) == ctx->exp_ret);
+                /* \todo currently there is no way to specify the number of
+                   consumed & produced data in case of a deliberate bad input
+                   data set.  */
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(mid + (capacity - midleft) == midpos);
+                }
+                midsrc = mid;
+                midlen = capacity - midleft;
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &midsrc, &midlen,
+                                &outpos, &outleft) == ctx->exp_ret);
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(midlen == 0);
+                        assert(mid + (capacity - midleft) == midsrc);
+                }
+                break;
+        }
+        }
+        if (ctx->exp_ret == PARSERUTILS_OK) {
+                /* All input consumed, and the output length is as expected */
+                assert(inleft == 0);
+                assert(ctx->buf + ctx->bufused == inpos);
+                assert(out + (capacity - outleft) == outpos);
+                assert(capacity - outleft == ctx->expused);
+        }
+        printf("%d: Read '", ++testnum);
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = out[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("' Expected '");
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = ctx->exp[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("'\n");
+        assert(outpos == out + ctx->expused);
+        assert(memcmp(out, ctx->exp, ctx->expused) == 0);
+}

 /programs/network/netsurf/libparserutils/test/cscodec-ext8.c
 ,0 → 1,263
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <parserutils/charset/codec.h>
+#include "utils/utils.h"
+#include "testutils.h"
+typedef struct line_ctx {   /* State shared by main(), handle_line() and run_test() while a test file is processed */
+        parserutils_charset_codec *codec;   /* Codec under test; created by the "#enc" directive in handle_line() */
+        size_t buflen;   /* Value returned by parse_filesize(); capacity of the buf region */
+        size_t bufused;   /* Number of input bytes currently stored in buf */
+        uint8_t *buf;   /* Input bytes of the current test case (first half of a 2 * buflen allocation) */
+        size_t explen;   /* Capacity of the exp region; set equal to buflen */
+        size_t expused;   /* Number of expected-output bytes currently stored in exp */
+        uint8_t *exp;   /* Expected output bytes; points at buf + buflen */
+        bool hadenc;   /* Set to true once an "#enc" directive has created a codec */
+        bool indata;   /* True while plain lines are appended to buf ("#data" section) */
+        bool inexp;   /* True while plain lines are appended to exp ("#expected" section) */
+        parserutils_error exp_ret;   /* Return code every codec call in run_test() must produce */
+        enum { ENCODE, DECODE, BOTH } dir;   /* Conversion(s) run_test() performs; set by the "#data" directive */
+} line_ctx;
+static bool handle_line(const char *data, size_t datalen, void *pw);   /* Callback handed to parse_testfile() */
+static void run_test(line_ctx *ctx);   /* Runs and checks the buffered test case */
+/**
+ * Allocation callback handed to the codec constructor; defers to realloc().
+ *
+ * \param ptr  Existing block, passed straight to realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Private word; not used
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        void *block = realloc(ptr, len);
+        UNUSED(pw);
+        return block;
+}
+/**
+ * Test driver: feeds the test file named on the command line through
+ * handle_line() and then runs the last buffered test case.
+ *
+ * \param argc  Argument count; must be 2
+ * \param argv  argv[1] is the path of the test file
+ * \return 0 on success; 1 on a usage error, when parse_filesize() returns
+ *         0, or when the buffer cannot be allocated
+ */
+int main(int argc, char **argv)
+{
+        parserutils_charset_codec *codec;
+        line_ctx ctx;
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
+                return 1;
+        }
+        /* An unknown charset name must be rejected */
+        assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+                        myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
+        ctx.buflen = parse_filesize(argv[1]);
+        if (ctx.buflen == 0)
+                return 1;
+        /* One allocation: first half holds input, second half expected */
+        ctx.buf = malloc(2 * ctx.buflen);
+        if (ctx.buf == NULL) {
+                printf("Failed allocating %u bytes\n",
+                                (unsigned int) ctx.buflen);
+                return 1;
+        }
+        ctx.exp = ctx.buf + ctx.buflen;
+        ctx.explen = ctx.buflen;
+        ctx.buf[0] = '\0';
+        ctx.exp[0] = '\0';
+        ctx.bufused = 0;
+        ctx.expused = 0;
+        /* No codec exists until an "#enc" line is seen; give codec and dir
+           defined values so nothing reads an indeterminate field */
+        ctx.codec = NULL;
+        ctx.dir = BOTH;
+        ctx.hadenc = false;
+        ctx.indata = false;
+        ctx.inexp = false;
+        ctx.exp_ret = PARSERUTILS_OK;
+        assert(parse_testfile(argv[1], handle_line, &ctx) == true);
+        /* and run final test */
+        if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+                ctx.bufused -= 1;
+        if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+                ctx.expused -= 1;
+        run_test(&ctx);
+        free(ctx.buf);
+        /* Only destroy a codec that was actually created */
+        if (ctx.hadenc)
+                parserutils_charset_codec_destroy(ctx.codec);
+        printf("PASS\n");
+        return 0;
+}
+/**
+ * Handle one line of the test file (callback given to parse_testfile()).
+ *
+ * A line starting with '#' is a directive.  If an "#expected" section is
+ * open, the directive first ends the current test case: one trailing
+ * newline is trimmed from the input and expected buffers, run_test() is
+ * called and the buffers are reset.  The directive itself is then handled:
+ *   "#data"     - start collecting input; the text at offset 6 selects
+ *                 decode or encode (anything else means both) and the text
+ *                 7 bytes further on selects LOOSE or STRICT (anything
+ *                 else means TRANSLIT) as the codec error mode
+ *   "#expected" - start collecting expected output; the text from offset
+ *                 10 is converted to the expected return code
+ *   "#reset"    - reset the codec
+ *   "#enc"      - create a codec for the encoding named at offset 5
+ * Any other line is appended verbatim to whichever buffer is being
+ * collected.
+ *
+ * \param data     Line contents
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true, always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+        line_ctx *ctx = (line_ctx *) pw;
+        if (data[0] == '#') {
+                if (ctx->inexp) {
+                        /* This marks end of testcase, so run it */
+                        /* Guard against empty buffers before looking at
+                           the last byte, as main() does */
+                        if (ctx->bufused > 0 &&
+                                        ctx->buf[ctx->bufused - 1] == '\n')
+                                ctx->bufused -= 1;
+                        if (ctx->expused > 0 &&
+                                        ctx->exp[ctx->expused - 1] == '\n')
+                                ctx->expused -= 1;
+                        run_test(ctx);
+                        ctx->buf[0] = '\0';
+                        ctx->exp[0] = '\0';
+                        ctx->bufused = 0;
+                        ctx->expused = 0;
+                        ctx->exp_ret = PARSERUTILS_OK;
+                }
+                if (strncasecmp(data+1, "data", 4) == 0) {
+                        parserutils_charset_codec_optparams params;
+                        const char *ptr = data + 6;
+                        ctx->indata = true;
+                        ctx->inexp = false;
+                        if (strncasecmp(ptr, "decode", 6) == 0)
+                                ctx->dir = DECODE;
+                        else if (strncasecmp(ptr, "encode", 6) == 0)
+                                ctx->dir = ENCODE;
+                        else
+                                ctx->dir = BOTH;
+                        /* Step over the six-character direction word and
+                           the separator that follows it */
+                        ptr += 7;
+                        if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+                        } else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+                        } else {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+                        }
+                        assert(parserutils_charset_codec_setopt(ctx->codec,
+                                PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+                                (parserutils_charset_codec_optparams *) &params)
+                                == PARSERUTILS_OK);
+                } else if (strncasecmp(data+1, "expected", 8) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = true;
+                        ctx->exp_ret = parserutils_error_from_string(data + 10,
+                                        datalen - 10 - 1 /* \n */);
+                } else if (strncasecmp(data+1, "reset", 5) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = false;
+                        parserutils_charset_codec_reset(ctx->codec);
+                } else if (strncasecmp(data+1, "enc", 3) == 0) {
+                        const char *enc = data + 5;
+                        const char *end;
+                        char *enc_name;
+                        /* The name runs up to the next whitespace; never
+                           scan past the end of the line.  <ctype.h>
+                           functions need an unsigned char value. */
+                        for (end = enc; end < data + datalen &&
+                                        !isspace((unsigned char) *end); end++)
+                                ;
+                        enc_name = alloca(end - enc + 1);
+                        memcpy(enc_name, enc, end - enc);
+                        enc_name[end - enc] = 0;
+                        /* A later "#enc" replaces the codec: release the
+                           previous one instead of leaking it */
+                        if (ctx->hadenc) {
+                                parserutils_charset_codec_destroy(ctx->codec);
+                                ctx->hadenc = false;
+                        }
+                        assert(parserutils_charset_codec_create(enc_name,
+                                        myrealloc, NULL, &ctx->codec) ==
+                                        PARSERUTILS_OK);
+                        ctx->hadenc = true;
+                }
+        } else {
+                if (ctx->indata) {
+                        memcpy(ctx->buf + ctx->bufused, data, datalen);
+                        ctx->bufused += datalen;
+                }
+                if (ctx->inexp) {
+                        memcpy(ctx->exp + ctx->expused, data, datalen);
+                        ctx->expused += datalen;
+                }
+        }
+        return true;
+}
+/**
+ * Run the currently buffered test case through the codec and check it.
+ *
+ * The bytes in ctx->buf are decoded, encoded, or decoded and then
+ * re-encoded, as selected by ctx->dir.  Every codec call must return
+ * ctx->exp_ret, and the final output must equal ctx->exp byte for byte.
+ * The produced and the expected bytes are both printed in hex.
+ *
+ * \param ctx  Test context holding the codec, input and expected output
+ */
+void run_test(line_ctx *ctx)
+{
+        static const char hexdigits[] = "0123456789abcdef";
+        static int testnum;
+        /* Every output buffer gets four bytes per input byte */
+        const size_t capacity = ctx->bufused * 4;
+        size_t outleft = capacity;
+        uint8_t *out = alloca(outleft);
+        uint8_t *outpos = out;
+        const uint8_t *inpos = ctx->buf;
+        size_t inleft = ctx->bufused;
+        size_t idx;
+        switch (ctx->dir) {
+        case DECODE:
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        case ENCODE:
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        default: {
+                /* Round trip: decode into a scratch buffer, then encode
+                   the scratch contents into the output buffer */
+                size_t midleft = capacity;
+                uint8_t *mid = alloca(midleft);
+                uint8_t *midpos = mid;
+                const uint8_t *midsrc;
+                size_t midlen;
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &midpos, &midleft) == ctx->exp_ret);
+                /* \todo currently there is no way to specify the number of
+                   consumed & produced data in case of a deliberate bad input
+                   data set.  */
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(mid + (capacity - midleft) == midpos);
+                }
+                midsrc = mid;
+                midlen = capacity - midleft;
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &midsrc, &midlen,
+                                &outpos, &outleft) == ctx->exp_ret);
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(midlen == 0);
+                        assert(mid + (capacity - midleft) == midsrc);
+                }
+                break;
+        }
+        }
+        if (ctx->exp_ret == PARSERUTILS_OK) {
+                /* All input consumed, and the output length is as expected */
+                assert(inleft == 0);
+                assert(ctx->buf + ctx->bufused == inpos);
+                assert(out + (capacity - outleft) == outpos);
+                assert(capacity - outleft == ctx->expused);
+        }
+        printf("%d: Read '", ++testnum);
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = out[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("' Expected '");
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = ctx->exp[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("'\n");
+        assert(outpos == out + ctx->expused);
+        assert(memcmp(out, ctx->exp, ctx->expused) == 0);
+}

 /programs/network/netsurf/libparserutils/test/cscodec-utf16.c
 ,0 → 1,321
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <parserutils/charset/codec.h>
+#include "utils/utils.h"
+#include "testutils.h"
+typedef struct line_ctx {   /* State shared by main(), handle_line() and run_test() while a test file is processed */
+        parserutils_charset_codec *codec;   /* Codec under test; main() creates it for "UTF-16" */
+        size_t buflen;   /* Value returned by parse_filesize(); allocation size of buf and of exp */
+        size_t bufused;   /* Number of input bytes currently stored in buf */
+        uint8_t *buf;   /* Input bytes of the current test case (16-bit units parsed from "&#xNNNN") */
+        size_t explen;   /* Capacity of exp; set equal to buflen */
+        size_t expused;   /* Number of expected-output bytes currently stored in exp */
+        uint8_t *exp;   /* Expected output bytes (32-bit units parsed from "&#xXXXXYYYY") */
+        bool indata;   /* True while plain lines are parsed into buf ("#data" section) */
+        bool inexp;   /* True while plain lines are parsed into exp ("#expected" section) */
+        parserutils_error exp_ret;   /* Return code every codec call in run_test() must produce */
+        enum { ENCODE, DECODE, BOTH } dir;   /* Conversion(s) run_test() performs; set by the "#data" directive */
+} line_ctx;
+static bool handle_line(const char *data, size_t datalen, void *pw);   /* Callback handed to parse_testfile() */
+static void run_test(line_ctx *ctx);   /* Runs and checks the buffered test case */
+/**
+ * Allocation callback handed to the codec constructor; defers to realloc().
+ *
+ * \param ptr  Existing block, passed straight to realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Private word; not used
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        void *block = realloc(ptr, len);
+        UNUSED(pw);
+        return block;
+}
+/**
+ * Test driver for the UTF-16 codec: feeds the test file named on the
+ * command line through handle_line() and then runs the last buffered test
+ * case.
+ *
+ * \param argc  Argument count; must be 2
+ * \param argv  argv[1] is the path of the test file
+ * \return 0 on success; 1 on a usage error, when parse_filesize() returns
+ *         0, or when a buffer cannot be allocated
+ */
+int main(int argc, char **argv)
+{
+        parserutils_charset_codec *codec;
+        line_ctx ctx;
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
+                return 1;
+        }
+        /* An unknown charset name must be rejected */
+        assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+                        myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
+        assert(parserutils_charset_codec_create("UTF-16", myrealloc, NULL,
+                        &ctx.codec) == PARSERUTILS_OK);
+        ctx.buflen = parse_filesize(argv[1]);
+        if (ctx.buflen == 0) {
+                parserutils_charset_codec_destroy(ctx.codec);
+                return 1;
+        }
+        ctx.buf = malloc(ctx.buflen);
+        if (ctx.buf == NULL) {
+                printf("Failed allocating %u bytes\n",
+                                (unsigned int) ctx.buflen);
+                parserutils_charset_codec_destroy(ctx.codec);
+                return 1;
+        }
+        ctx.exp = malloc(ctx.buflen);
+        if (ctx.exp == NULL) {
+                printf("Failed allocating %u bytes\n",
+                                (unsigned int) ctx.buflen);
+                free(ctx.buf);
+                parserutils_charset_codec_destroy(ctx.codec);
+                return 1;
+        }
+        ctx.explen = ctx.buflen;
+        ctx.buf[0] = '\0';
+        ctx.exp[0] = '\0';
+        ctx.bufused = 0;
+        ctx.expused = 0;
+        ctx.indata = false;
+        ctx.inexp = false;
+        ctx.exp_ret = PARSERUTILS_OK;
+        /* run_test() reads dir even if the file has no "#data" line */
+        ctx.dir = BOTH;
+        assert(parse_testfile(argv[1], handle_line, &ctx) == true);
+        /* and run final test */
+        if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+                ctx.bufused -= 1;
+        if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+                ctx.expused -= 1;
+        run_test(&ctx);
+        /* buf and exp are separate allocations here: release both */
+        free(ctx.exp);
+        free(ctx.buf);
+        parserutils_charset_codec_destroy(ctx.codec);
+        printf("PASS\n");
+        return 0;
+}
+/**
+ * Map a single hexadecimal character to its numeric value.
+ *
+ * \param hex  Hex character; must be one of '0'-'9', 'a'-'f' or 'A'-'F'
+ * \return Value of the digit (0 to 15 for valid input)
+ */
+static inline int hex2digit(char hex)
+{
+        if (hex <= '9')
+                return hex - '0';
+        /* OR-ing in 0x20 folds an ASCII upper-case letter onto lower case */
+        return (hex | 0x20) - 'a' + 10;
+}
+/**
+ * Handle one line of the test file (callback given to parse_testfile()).
+ *
+ * A line starting with '#' is a directive.  If an "#expected" section is
+ * open, the directive first ends the current test case: one trailing
+ * newline is trimmed from the input and expected buffers, run_test() is
+ * called and the buffers are reset.  The directive itself is then handled:
+ *   "#data"     - start collecting input; the text at offset 6 selects
+ *                 decode or encode (anything else means both) and the text
+ *                 7 bytes further on selects LOOSE or STRICT (anything
+ *                 else means TRANSLIT) as the codec error mode
+ *   "#expected" - start collecting expected output; the text from offset
+ *                 10 is converted to the expected return code
+ *   "#reset"    - reset the codec
+ * Any other line is parsed as a sequence of character references:
+ * "&#xNNNN" items become host-endian 16-bit units in the input buffer,
+ * "&#xXXXXYYYY" items become big-endian 32-bit units in the expected
+ * buffer.  Newline bytes are copied through unchanged.
+ *
+ * \param data     Line contents
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true, always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+        line_ctx *ctx = (line_ctx *) pw;
+        if (data[0] == '#') {
+                if (ctx->inexp) {
+                        /* This marks end of testcase, so run it */
+                        /* Guard against empty buffers before looking at
+                           the last byte, as main() does */
+                        if (ctx->bufused > 0 &&
+                                        ctx->buf[ctx->bufused - 1] == '\n')
+                                ctx->bufused -= 1;
+                        if (ctx->expused > 0 &&
+                                        ctx->exp[ctx->expused - 1] == '\n')
+                                ctx->expused -= 1;
+                        run_test(ctx);
+                        ctx->buf[0] = '\0';
+                        ctx->exp[0] = '\0';
+                        ctx->bufused = 0;
+                        ctx->expused = 0;
+                        ctx->exp_ret = PARSERUTILS_OK;
+                }
+                if (strncasecmp(data+1, "data", 4) == 0) {
+                        parserutils_charset_codec_optparams params;
+                        const char *ptr = data + 6;
+                        ctx->indata = true;
+                        ctx->inexp = false;
+                        if (strncasecmp(ptr, "decode", 6) == 0)
+                                ctx->dir = DECODE;
+                        else if (strncasecmp(ptr, "encode", 6) == 0)
+                                ctx->dir = ENCODE;
+                        else
+                                ctx->dir = BOTH;
+                        /* Step over the six-character direction word and
+                           the separator that follows it */
+                        ptr += 7;
+                        if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+                        } else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+                        } else {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+                        }
+                        assert(parserutils_charset_codec_setopt(ctx->codec,
+                                PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+                                (parserutils_charset_codec_optparams *) &params)
+                                == PARSERUTILS_OK);
+                } else if (strncasecmp(data+1, "expected", 8) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = true;
+                        ctx->exp_ret = parserutils_error_from_string(data + 10,
+                                        datalen - 10 - 1 /* \n */);
+                } else if (strncasecmp(data+1, "reset", 5) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = false;
+                        parserutils_charset_codec_reset(ctx->codec);
+                }
+        } else {
+                if (ctx->indata) {
+                        /* Process "&#xNNNN" as 16-bit code units.  */
+                        while (datalen) {
+                                uint16_t nCodePoint;
+                                if (data[0] == '\n') {
+                                        ctx->buf[ctx->bufused++] = *data++;
+                                        --datalen;
+                                        continue;
+                                }
+                                /* <ctype.h> functions need an unsigned
+                                   char value */
+                                assert(datalen >= sizeof ("&#xNNNN")-1
+                                        && data[0] == '&' && data[1] == '#'
+                                        && data[2] == 'x'
+                                        && isxdigit((unsigned char) data[3])
+                                        && isxdigit((unsigned char) data[4])
+                                        && isxdigit((unsigned char) data[5])
+                                        && isxdigit((unsigned char) data[6]));
+                                /* UTF-16 code is always host endian (different
+                                   than UCS-4 !).  */
+                                nCodePoint = (uint16_t)
+                                                ((hex2digit(data[3]) << 12) |
+                                                (hex2digit(data[4]) <<  8) |
+                                                (hex2digit(data[5]) <<  4) |
+                                                hex2digit(data[6]));
+                                /* bufused is odd once a newline byte has
+                                   been stored, so copy the unit bytewise
+                                   rather than storing through a cast
+                                   (possibly misaligned) pointer */
+                                memcpy(ctx->buf + ctx->bufused, &nCodePoint,
+                                                sizeof nCodePoint);
+                                ctx->bufused += sizeof nCodePoint;
+                                data += sizeof ("&#xNNNN")-1;
+                                datalen -= sizeof ("&#xNNNN")-1;
+                        }
+                }
+                if (ctx->inexp) {
+                        /* Process "&#xXXXXYYYY" as 32-bit code units.  */
+                        while (datalen) {
+                                uint32_t nCodePoint;
+                                if (data[0] == '\n') {
+                                        ctx->exp[ctx->expused++] = *data++;
+                                        --datalen;
+                                        continue;
+                                }
+                                assert(datalen >= sizeof ("&#xXXXXYYYY")-1
+                                        && data[0] == '&' && data[1] == '#'
+                                        && data[2] == 'x'
+                                        && isxdigit((unsigned char) data[3])
+                                        && isxdigit((unsigned char) data[4])
+                                        && isxdigit((unsigned char) data[5])
+                                        && isxdigit((unsigned char) data[6])
+                                        && isxdigit((unsigned char) data[7])
+                                        && isxdigit((unsigned char) data[8])
+                                        && isxdigit((unsigned char) data[9])
+                                        && isxdigit((unsigned char) data[10]));
+                                /* UCS-4 code is always big endian, so convert
+                                   host endian to big endian.  The digits are
+                                   widened to uint32_t first: shifting an int
+                                   digit >= 8 left by 28 would overflow.  */
+                                nCodePoint =
+                                        htonl(((uint32_t) hex2digit(data[3]) << 28)
+                                        | ((uint32_t) hex2digit(data[4]) << 24)
+                                        | ((uint32_t) hex2digit(data[5]) << 20)
+                                        | ((uint32_t) hex2digit(data[6]) << 16)
+                                        | ((uint32_t) hex2digit(data[7]) << 12)
+                                        | ((uint32_t) hex2digit(data[8]) << 8)
+                                        | ((uint32_t) hex2digit(data[9]) << 4)
+                                        | (uint32_t) hex2digit(data[10]));
+                                /* expused need not be a multiple of four
+                                   (newline bytes), so copy bytewise */
+                                memcpy(ctx->exp + ctx->expused, &nCodePoint,
+                                                sizeof nCodePoint);
+                                ctx->expused += sizeof nCodePoint;
+                                data += sizeof ("&#xXXXXYYYY")-1;
+                                datalen -= sizeof ("&#xXXXXYYYY")-1;
+                        }
+                }
+        }
+        return true;
+}
+/**
+ * Run the currently buffered test case through the codec and check it.
+ *
+ * The bytes in ctx->buf are decoded, encoded, or decoded and then
+ * re-encoded, as selected by ctx->dir.  Every codec call must return
+ * ctx->exp_ret, and the final output must equal ctx->exp byte for byte.
+ * The produced and the expected bytes are both printed in hex.
+ *
+ * \param ctx  Test context holding the codec, input and expected output
+ */
+void run_test(line_ctx *ctx)
+{
+        static const char hexdigits[] = "0123456789abcdef";
+        static int testnum;
+        /* Every output buffer gets four bytes per input byte */
+        const size_t capacity = ctx->bufused * 4;
+        size_t outleft = capacity;
+        uint8_t *out = alloca(outleft);
+        uint8_t *outpos = out;
+        const uint8_t *inpos = ctx->buf;
+        size_t inleft = ctx->bufused;
+        size_t idx;
+        switch (ctx->dir) {
+        case DECODE:
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        case ENCODE:
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &inpos, &inleft,
+                                &outpos, &outleft) == ctx->exp_ret);
+                break;
+        default: {
+                /* Round trip: decode into a scratch buffer, then encode
+                   the scratch contents into the output buffer */
+                size_t midleft = capacity;
+                uint8_t *mid = alloca(midleft);
+                uint8_t *midpos = mid;
+                const uint8_t *midsrc;
+                size_t midlen;
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &inpos, &inleft,
+                                &midpos, &midleft) == ctx->exp_ret);
+                /* \todo currently there is no way to specify the number of
+                   consumed & produced data in case of a deliberate bad input
+                   data set.  */
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(mid + (capacity - midleft) == midpos);
+                }
+                midsrc = mid;
+                midlen = capacity - midleft;
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &midsrc, &midlen,
+                                &outpos, &outleft) == ctx->exp_ret);
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(midlen == 0);
+                        assert(mid + (capacity - midleft) == midsrc);
+                }
+                break;
+        }
+        }
+        if (ctx->exp_ret == PARSERUTILS_OK) {
+                /* All input consumed, and the output length is as expected */
+                assert(inleft == 0);
+                assert(ctx->buf + ctx->bufused == inpos);
+                assert(out + (capacity - outleft) == outpos);
+                assert(capacity - outleft == ctx->expused);
+        }
+        printf("%d: Read '", ++testnum);
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = out[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("' Expected '");
+        for (idx = 0; idx < ctx->expused; idx++) {
+                const uint8_t byte = ctx->exp[idx];
+                printf("%c%c ", hexdigits[(byte >> 4) & 0xf],
+                                hexdigits[byte & 0xf]);
+        }
+        printf("'\n");
+        assert(outpos == out + ctx->expused);
+        assert(memcmp(out, ctx->exp, ctx->expused) == 0);
+}

 /programs/network/netsurf/libparserutils/test/cscodec-utf8.c
 ,0 → 1,246
+#include <stdio.h>
+#include <string.h>
+#include <parserutils/charset/codec.h>
+#include "utils/utils.h"
+#include "testutils.h"
+typedef struct line_ctx {   /* State shared by main(), handle_line() and run_test() while a test file is processed */
+        parserutils_charset_codec *codec;   /* Codec under test; main() creates it for "UTF-8" */
+        size_t buflen;   /* Value returned by parse_filesize(); capacity of the buf region */
+        size_t bufused;   /* Number of input bytes currently stored in buf */
+        uint8_t *buf;   /* Input bytes of the current test case (first half of a 2 * buflen allocation) */
+        size_t explen;   /* Capacity of the exp region; set equal to buflen */
+        size_t expused;   /* Number of expected-output bytes currently stored in exp */
+        uint8_t *exp;   /* Expected output bytes; points at buf + buflen */
+        bool indata;   /* True while plain lines are appended to buf ("#data" section) */
+        bool inexp;   /* True while plain lines are appended to exp ("#expected" section) */
+        parserutils_error exp_ret;   /* Return code every codec call in run_test() must produce */
+        enum { ENCODE, DECODE, BOTH } dir;   /* Conversion(s) run_test() performs; set by the "#data" directive */
+} line_ctx;
+static bool handle_line(const char *data, size_t datalen, void *pw);   /* Callback handed to parse_testfile() */
+static void run_test(line_ctx *ctx);   /* Runs and checks the buffered test case */
+/**
+ * Allocation callback handed to the codec constructor; defers to realloc().
+ *
+ * \param ptr  Existing block, passed straight to realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Private word; not used
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        void *block = realloc(ptr, len);
+        UNUSED(pw);
+        return block;
+}
+/**
+ * Test driver for the UTF-8 codec: feeds the test file named on the
+ * command line through handle_line() and then runs the last buffered test
+ * case.
+ *
+ * \param argc  Argument count; must be 2
+ * \param argv  argv[1] is the path of the test file
+ * \return 0 on success; 1 on a usage error, when parse_filesize() returns
+ *         0, or when the buffer cannot be allocated
+ */
+int main(int argc, char **argv)
+{
+        parserutils_charset_codec *codec;
+        line_ctx ctx;
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
+                return 1;
+        }
+        /* An unknown charset name must be rejected */
+        assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+                        myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
+        assert(parserutils_charset_codec_create("UTF-8", myrealloc, NULL,
+                        &ctx.codec) == PARSERUTILS_OK);
+        ctx.buflen = parse_filesize(argv[1]);
+        if (ctx.buflen == 0) {
+                parserutils_charset_codec_destroy(ctx.codec);
+                return 1;
+        }
+        /* One allocation: first half holds input, second half expected */
+        ctx.buf = malloc(2 * ctx.buflen);
+        if (ctx.buf == NULL) {
+                printf("Failed allocating %u bytes\n",
+                                (unsigned int) ctx.buflen);
+                parserutils_charset_codec_destroy(ctx.codec);
+                return 1;
+        }
+        ctx.exp = ctx.buf + ctx.buflen;
+        ctx.explen = ctx.buflen;
+        ctx.buf[0] = '\0';
+        ctx.exp[0] = '\0';
+        ctx.bufused = 0;
+        ctx.expused = 0;
+        ctx.indata = false;
+        ctx.inexp = false;
+        ctx.exp_ret = PARSERUTILS_OK;
+        /* run_test() reads dir even if the file has no "#data" line */
+        ctx.dir = BOTH;
+        assert(parse_testfile(argv[1], handle_line, &ctx) == true);
+        /* and run final test */
+        if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+                ctx.bufused -= 1;
+        if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+                ctx.expused -= 1;
+        run_test(&ctx);
+        free(ctx.buf);
+        parserutils_charset_codec_destroy(ctx.codec);
+        printf("PASS\n");
+        return 0;
+}
+/**
+ * Handle one line of the test file (callback given to parse_testfile()).
+ *
+ * A line starting with '#' is a directive.  If an "#expected" section is
+ * open, the directive first ends the current test case: one trailing
+ * newline is trimmed from the input and expected buffers, run_test() is
+ * called and the buffers are reset.  The directive itself is then handled:
+ *   "#data"     - start collecting input; the text at offset 6 selects
+ *                 decode or encode (anything else means both) and the text
+ *                 7 bytes further on selects LOOSE or STRICT (anything
+ *                 else means TRANSLIT) as the codec error mode
+ *   "#expected" - start collecting expected output; the text from offset
+ *                 10 is converted to the expected return code
+ *   "#reset"    - reset the codec
+ * Any other line is appended verbatim to whichever buffer is being
+ * collected.
+ *
+ * \param data     Line contents
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true, always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+        line_ctx *ctx = (line_ctx *) pw;
+        if (data[0] == '#') {
+                if (ctx->inexp) {
+                        /* This marks end of testcase, so run it */
+                        /* Guard against empty buffers before looking at
+                           the last byte, as main() does */
+                        if (ctx->bufused > 0 &&
+                                        ctx->buf[ctx->bufused - 1] == '\n')
+                                ctx->bufused -= 1;
+                        if (ctx->expused > 0 &&
+                                        ctx->exp[ctx->expused - 1] == '\n')
+                                ctx->expused -= 1;
+                        run_test(ctx);
+                        ctx->buf[0] = '\0';
+                        ctx->exp[0] = '\0';
+                        ctx->bufused = 0;
+                        ctx->expused = 0;
+                        ctx->exp_ret = PARSERUTILS_OK;
+                }
+                if (strncasecmp(data+1, "data", 4) == 0) {
+                        parserutils_charset_codec_optparams params;
+                        const char *ptr = data + 6;
+                        ctx->indata = true;
+                        ctx->inexp = false;
+                        if (strncasecmp(ptr, "decode", 6) == 0)
+                                ctx->dir = DECODE;
+                        else if (strncasecmp(ptr, "encode", 6) == 0)
+                                ctx->dir = ENCODE;
+                        else
+                                ctx->dir = BOTH;
+                        /* Step over the six-character direction word and
+                           the separator that follows it */
+                        ptr += 7;
+                        if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+                        } else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+                        } else {
+                                params.error_mode.mode =
+                                        PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+                        }
+                        assert(parserutils_charset_codec_setopt(ctx->codec,
+                                PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+                                (parserutils_charset_codec_optparams *) &params)
+                                == PARSERUTILS_OK);
+                } else if (strncasecmp(data+1, "expected", 8) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = true;
+                        ctx->exp_ret = parserutils_error_from_string(data + 10,
+                                        datalen - 10 - 1 /* \n */);
+                } else if (strncasecmp(data+1, "reset", 5) == 0) {
+                        ctx->indata = false;
+                        ctx->inexp = false;
+                        parserutils_charset_codec_reset(ctx->codec);
+                }
+        } else {
+                if (ctx->indata) {
+                        memcpy(ctx->buf + ctx->bufused, data, datalen);
+                        ctx->bufused += datalen;
+                }
+                if (ctx->inexp) {
+                        memcpy(ctx->exp + ctx->expused, data, datalen);
+                        ctx->expused += datalen;
+                }
+        }
+        return true;
+}
+void run_test(line_ctx *ctx)
+{       /* Run the test case accumulated in ctx and check its outcome.
+         *
+         * The bytes ctx->buf[0 .. bufused) are pushed through ctx->codec in
+         * the direction selected by ctx->dir.  The status returned by each
+         * codec call must equal ctx->exp_ret, and the bytes written must
+         * equal ctx->exp[0 .. expused).  Every check is an assert; nothing
+         * is returned to the caller.
+         */
+        static int testnum;     /* tests run so far; only used in the log line */
+        size_t destlen = ctx->bufused * 4;      /* output space: 4 bytes per input byte */
+        uint8_t *dest = alloca(destlen);        /* output buffer, taken from the stack */
+        uint8_t *pdest = dest;                  /* output cursor handed to the codec */
+        const uint8_t *psrc = ctx->buf;         /* input cursor handed to the codec */
+        size_t srclen = ctx->bufused;           /* input length handed to the codec */
+        size_t i;
+        if (ctx->dir == DECODE) {       /* decode buf directly into dest */
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &psrc, &srclen,
+                                &pdest, &destlen) == ctx->exp_ret);
+        } else if (ctx->dir == ENCODE) {        /* encode buf directly into dest */
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &psrc, &srclen,
+                                &pdest, &destlen) == ctx->exp_ret);
+        } else {        /* any other direction: decode buf into temp, then
+                           encode temp into dest (a round trip) */
+                size_t templen = ctx->bufused * 4;      /* space left in temp */
+                uint8_t *temp = alloca(templen);        /* intermediate buffer */
+                uint8_t *ptemp = temp;                  /* write cursor for the decode step */
+                const uint8_t *ptemp2;                  /* read cursor for the encode step */
+                size_t templen2;                        /* bytes the encode step has to read */
+                assert(parserutils_charset_codec_decode(ctx->codec,
+                                &psrc, &srclen,
+                                &ptemp, &templen) == ctx->exp_ret);
+                /* \todo currently there is no way to specify the number of
+                   consumed & produced data in case of a deliberate bad input
+                   data set.  */
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(temp + (ctx->bufused * 4 - templen) == ptemp);   /* cursor advanced by exactly the amount templen shrank */
+                }
+                ptemp2 = temp;  /* re-read the intermediate data from its start */
+                templen2 = ctx->bufused * 4 - templen;  /* = bytes the decode step wrote */
+                assert(parserutils_charset_codec_encode(ctx->codec,
+                                &ptemp2, &templen2,
+                                &pdest, &destlen) == ctx->exp_ret);
+                if (ctx->exp_ret == PARSERUTILS_OK) {
+                        assert(templen2 == 0);  /* all intermediate data was consumed */
+                        assert(temp + (ctx->bufused * 4 - templen) == ptemp2);  /* read cursor ended where the decode step stopped writing */
+                }
+        }
+        if (ctx->exp_ret == PARSERUTILS_OK) {   /* bookkeeping checks, only when success is expected */
+                assert(srclen == 0);    /* the whole input was consumed */
+                assert(ctx->buf + ctx->bufused == psrc);        /* input cursor is at the end of buf */
+                assert(dest + (ctx->bufused * 4 - destlen) == pdest);   /* output cursor matches the space used */
+                assert(ctx->bufused * 4 - destlen == ctx->expused);     /* produced as many bytes as expected */
+        }
+        printf("%d: Read '", ++testnum);        /* log: hex dump of the first expused bytes of dest ... */
+        for (i = 0; i < ctx->expused; i++) {
+                printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
+                                "0123456789abcdef"[dest[i] & 0xf]);
+        }
+        printf("' Expected '");                 /* ... followed by a hex dump of the expected bytes */
+        for (i = 0; i < ctx->expused; i++) {
+                printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
+                                "0123456789abcdef"[ctx->exp[i] & 0xf]);
+        }
+        printf("'\n");
+        assert(pdest == dest + ctx->expused);   /* final checks run whatever exp_ret is: output length ... */
+        assert(memcmp(dest, ctx->exp, ctx->expused) == 0);      /* ... and output content match the expectation */
+}

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/1.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/10.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/11.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/13.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/14.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/15.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/16.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/2.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/3.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/4.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/5.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/6.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/7.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/8.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-8859/9.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

 /programs/network/netsurf/libparserutils/test/data/cscodec-8859/INDEX
 ,0 → 1,19
+# Index file for charset codec tests
+#
+# Test                  Description
+1.dat                   ISO-8859-1
+2.dat                   ISO-8859-2
+3.dat                   ISO-8859-3
+4.dat                   ISO-8859-4
+5.dat                   ISO-8859-5
+6.dat                   ISO-8859-6
+7.dat                   ISO-8859-7
+8.dat                   ISO-8859-8
+9.dat                   ISO-8859-9
+10.dat                  ISO-8859-10
+11.dat                  ISO-8859-11
+13.dat                  ISO-8859-13
+14.dat                  ISO-8859-14
+15.dat                  ISO-8859-15
+16.dat                  ISO-8859-16

 /programs/network/netsurf/libparserutils/test/data/cscodec-ext8/INDEX
 ,0 → 1,13
+# Index file for charset codec tests
+#
+# Test                  Description
+cp1250.dat              Windows-1250
+cp1251.dat              Windows-1251
+cp1252.dat              Windows-1252
+cp1253.dat              Windows-1253
+cp1254.dat              Windows-1254
+cp1255.dat              Windows-1255
+cp1256.dat              Windows-1256
+cp1257.dat              Windows-1257
+cp1258.dat              Windows-1258

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1250.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1251.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1252.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1253.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1254.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1255.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1256.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1257.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1258.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/cscodec-utf16/INDEX
0,0 → 1,6
# Index file for UTF-16 charset codec tests
#
# Test Description

simple.dat Simple tests, designed to validate testdriver

 /programs/network/netsurf/libparserutils/test/data/cscodec-utf16/simple.dat
 ,0 → 1,33
+# *** Simple test:
+#data decode STRICT
+&#x0040&#x4142
+#expected PARSERUTILS_OK
+&#x00000040&#x00004142
+#reset
+# *** Surrogate test:
+#data decode STRICT
+&#xD800&#xDF02
+#expected PARSERUTILS_OK
+&#x00010302
+#reset
+# *** Lonely high surrogate:
+# It is a bit strange that the end status is OK here.
+#data decode STRICT
+&#xD805
+#expected PARSERUTILS_OK
+#reset
+# With an extra code point, the status is different.
+#data decode STRICT
+&#xD805&#x4142
+#expected PARSERUTILS_INVALID
+#reset
+# *** Wrong low surrogate start:
+#data decode STRICT
+&#xDC05
+#expected PARSERUTILS_INVALID
+#reset

/programs/network/netsurf/libparserutils/test/data/cscodec-utf8/INDEX
0,0 → 1,6
# Index file for charset codec tests
#
# Test Description

simple.dat Simple tests, designed to validate testdriver
UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file

 /programs/network/netsurf/libparserutils/test/data/cscodec-utf8/UTF-8-test.txt
 ,0 → 1,536
+#data recode LOOSE
+UTF-8 decoder capability and stress test
+----------------------------------------
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribes any particular outcome and therefore there is no way to
+"pass" or "fail" this test file, even though the texts suggests a
+preferable decoder behaviour at some places. The aim is instead to
+help you think about and test the behaviour of your UTF-8 on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder by using this file.
+The test lines below cover boundary conditions, malformed UTF-8
+sequences as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. A quite commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immageately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+Note that as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence by a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+Here come the tests:                                                          |
+                                                                              |
+Some correct UTF-8 text                                                    |
+                                                                              |
+You should see the Greek word 'kosme':       "κόσμε"                          |
+                                                                              |
+Boundary condition test cases                                              |
+                                                                              |
+.1  First possible sequence of a certain length                              |
+                                                                              |
+.1.1  1 byte  (U-00000000):        ""
+.1.2  2 bytes (U-00000080):        ""                                       |
+.1.3  3 bytes (U-00000800):        "ࠀ"                                       |
+.1.4  4 bytes (U-00010000):        "𐀀"                                       |
+.1.5  5 bytes (U-00200000):        "ø"                                       |
+.1.6  6 bytes (U-04000000):        "ü"                                       |
+                                                                              |
+.2  Last possible sequence of a certain length                               |
+                                                                              |
+.2.1  1 byte  (U-0000007F):        ""
+.2.2  2 bytes (U-000007FF):        "߿"                                       |
+.2.3  3 bytes (U-0000FFFF):        ""                                       |
+.2.4  4 bytes (U-001FFFFF):        "÷¿¿¿"                                       |
+.2.5  5 bytes (U-03FFFFFF):        "û¿¿¿¿"                                       |
+.2.6  6 bytes (U-7FFFFFFF):        "ý¿¿¿¿¿"                                       |
+                                                                              |
+.3  Other boundary conditions                                                |
+                                                                              |
+.3.1  U-0000D7FF = ed 9f bf = "퟿"                                            |
+.3.2  U-0000E000 = ee 80 80 = ""                                            |
+.3.3  U-0000FFFD = ef bf bd = "�"                                            |
+.3.4  U-0010FFFF = f4 8f bf bf = "􏿿"                                         |
+.3.5  U-00110000 = f4 90 80 80 = "ô"                                         |
+                                                                              |
+Malformed sequences                                                        |
+                                                                              |
+.1  Unexpected continuation bytes                                            |
+                                                                              |
+Each unexpected continuation byte should be separately signalled as a         |
+malformed sequence of its own.                                                |
+                                                                              |
+.1.1  First continuation byte 0x80: ""                                      |
+.1.2  Last  continuation byte 0xbf: "¿"                                      |
+                                                                              |
+.1.3  2 continuation bytes: "¿"                                             |
+.1.4  3 continuation bytes: "¿"                                            |
+.1.5  4 continuation bytes: "¿¿"                                           |
+.1.6  5 continuation bytes: "¿¿"                                          |
+.1.7  6 continuation bytes: "¿¿¿"                                         |
+.1.8  7 continuation bytes: "¿¿¿"                                        |
+                                                                              |
+.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+                                                                              |
+   "                                                          |
+                                                              |
+     ¡¢£¤¥¦§¨©ª«¬®¯                                                          |
+    °±²³´µ¶·¸¹º»¼½¾¿"                                                         |
+                                                                              |
+.2  Lonely start characters                                                  |
+                                                                              |
+.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï                                           |
+    Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "                                         |
+                                                                              |
+.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "à á â ã ä å æ ç è é ê ë ì í î ï "                                         |
+                                                                              |
+.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ð ñ ò ó ô õ ö ÷ "                                                         |
+                                                                              |
+.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ø ù ú û "                                                                 |
+                                                                              |
+.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ü ý "                                                                     |
+                                                                              |
+.3  Sequences with last continuation byte missing                            |
+                                                                              |
+All bytes of an incomplete sequence should be signalled as a single           |
+malformed sequence, i.e., you should see only a single replacement            |
+character in each of the next 10 tests. (Characters as in section 2)          |
+                                                                              |
+.3.1  2-byte sequence with last byte missing (U+0000):     "À"               |
+.3.2  3-byte sequence with last byte missing (U+0000):     "à"               |
+.3.3  4-byte sequence with last byte missing (U+0000):     "ð"               |
+.3.4  5-byte sequence with last byte missing (U+0000):     "ø"               |
+.3.5  6-byte sequence with last byte missing (U+0000):     "ü"               |
+.3.6  2-byte sequence with last byte missing (U-000007FF): "ß"               |
+.3.7  3-byte sequence with last byte missing (U-0000FFFF): "ï¿"               |
+.3.8  4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿"               |
+.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿"               |
+.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿"               |
+                                                                              |
+.4  Concatenation of incomplete sequences                                    |
+                                                                              |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
+sequences being signalled:                                                    |
+                                                                              |
+   "Ààðøüßï¿÷¿¿û¿¿¿ý¿¿¿¿"                                                               |
+                                                                              |
+.5  Impossible bytes                                                         |
+                                                                              |
+The following two bytes cannot appear in a correct UTF-8 string               |
+                                                                              |
+.5.1  fe = "þ"                                                               |
+.5.2  ff = "ÿ"                                                               |
+.5.3  fe fe ff ff = "þþÿÿ"                                                   |
+                                                                              |
+Overlong sequences                                                         |
+                                                                              |
+The following sequences are not malformed according to the letter of          |
+the Unicode 2.0 standard. However, they are longer then necessary and         |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
+decoder" should reject them just like malformed sequences for two             |
+reasons: (1) It helps to debug applications if overlong sequences are         |
+not treated as valid representations of characters, because this helps        |
+to spot problems more quickly. (2) Overlong sequences provide                 |
+alternative representations of characters, that could maliciously be          |
+used to bypass filters that check only for ASCII characters. For              |
+instance, a 2-byte encoded line feed (LF) would not be caught by a            |
+line counter that counts only 0x0a bytes, but it would still be               |
+processed as a line feed by an unsafe UTF-8 decoder later in the              |
+pipeline. From a security point of view, ASCII compatibility of UTF-8         |
+sequences means also, that ASCII characters are *only* allowed to be          |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
+reject overlong UTF-8 sequences for which a shorter encoding exists.          |
+                                                                              |
+.1  Examples of an overlong ASCII character                                  |
+                                                                              |
+With a safe UTF-8 decoder, all of the following five overlong                 |
+representations of the ASCII character slash ("/") should be rejected         |
+like a malformed UTF-8 sequence, for instance by substituting it with         |
+a replacement character. If you see a slash below, you do not have a          |
+safe UTF-8 decoder!                                                           |
+                                                                              |
+.1.1 U+002F = c0 af             = "À¯"                                        |
+.1.2 U+002F = e0 80 af          = "à¯"                                        |
+.1.3 U+002F = f0 80 80 af       = "ð¯"                                        |
+.1.4 U+002F = f8 80 80 80 af    = "ø¯"                                        |
+.1.5 U+002F = fc 80 80 80 80 af = "ü¯"                                        |
+                                                                              |
+.2  Maximum overlong sequences                                               |
+                                                                              |
+Below you see the highest Unicode value that is still resulting in an         |
+overlong sequence if represented with the given number of bytes. This         |
+is a boundary test for safe UTF-8 decoders. All five characters should        |
+be rejected like malformed UTF-8 sequences.                                   |
+                                                                              |
+.2.1  U-0000007F = c1 bf             = "Á¿"                                   |
+.2.2  U-000007FF = e0 9f bf          = "à¿"                                   |
+.2.3  U-0000FFFF = f0 8f bf bf       = "ð¿¿"                                   |
+.2.4  U-001FFFFF = f8 87 bf bf bf    = "ø¿¿¿"                                   |
+.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "ü¿¿¿¿"                                   |
+                                                                              |
+.3  Overlong representation of the NUL character                             |
+                                                                              |
+The following five sequences should also be rejected like malformed           |
+UTF-8 sequences and should not be treated like the ASCII NUL                  |
+character.                                                                    |
+                                                                              |
+.3.1  U+0000 = c0 80             = "À"                                       |
+.3.2  U+0000 = e0 80 80          = "à"                                       |
+.3.3  U+0000 = f0 80 80 80       = "ð"                                       |
+.3.4  U+0000 = f8 80 80 80 80    = "ø"                                       |
+.3.5  U+0000 = fc 80 80 80 80 80 = "ü"                                       |
+                                                                              |
+Illegal code positions                                                     |
+                                                                              |
+The following UTF-8 sequences should be rejected like malformed               |
+sequences, because they never represent valid ISO 10646 characters and        |
+a UTF-8 decoder that accepts them might introduce security problems           |
+comparable to overlong UTF-8 sequences.                                       |
+                                                                              |
+.1 Single UTF-16 surrogates                                                  |
+                                                                              |
+.1.1  U+D800 = ed a0 80 = "í "                                                |
+.1.2  U+DB7F = ed ad bf = "í¿"                                                |
+.1.3  U+DB80 = ed ae 80 = "í®"                                                |
+.1.4  U+DBFF = ed af bf = "í¯¿"                                                |
+.1.5  U+DC00 = ed b0 80 = "í°"                                                |
+.1.6  U+DF80 = ed be 80 = "í¾"                                                |
+.1.7  U+DFFF = ed bf bf = "í¿¿"                                                |
+                                                                              |
+.2 Paired UTF-16 surrogates                                                  |
+                                                                              |
+.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "í í°"                               |
+.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "í í¿¿"                               |
+.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°"                               |
+.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿"                               |
+.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®í°"                               |
+.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®í¿¿"                               |
+.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°"                               |
+.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "í¯¿í¿¿"                               |
+                                                                              |
+.3 Other illegal code positions                                              |
+                                                                              |
+.3.1  U+FFFE = ef bf be = ""                                                |
+.3.2  U+FFFF = ef bf bf = ""                                                |
+                                                                              |
+THE END                                                                       |
+#expected CHARSET_OK
+UTF-8 decoder capability and stress test
+----------------------------------------
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribes any particular outcome and therefore there is no way to
+"pass" or "fail" this test file, even though the texts suggests a
+preferable decoder behaviour at some places. The aim is instead to
+help you think about and test the behaviour of your UTF-8 on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder by using this file.
+The test lines below cover boundary conditions, malformed UTF-8
+sequences as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. A quite commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immageately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+Note that as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence by a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+Here come the tests:                                                          |
+                                                                              |
+Some correct UTF-8 text                                                    |
+                                                                              |
+You should see the Greek word 'kosme':       "κόσμε"                          |
+                                                                              |
+Boundary condition test cases                                              |
+                                                                              |
+.1  First possible sequence of a certain length                              |
+                                                                              |
+.1.1  1 byte  (U-00000000):        ""
+.1.2  2 bytes (U-00000080):        ""                                       |
+.1.3  3 bytes (U-00000800):        "ࠀ"                                       |
+.1.4  4 bytes (U-00010000):        "𐀀"                                       |
+.1.5  5 bytes (U-00200000):        "ø"                                       |
+.1.6  6 bytes (U-04000000):        "ü"                                       |
+                                                                              |
+.2  Last possible sequence of a certain length                               |
+                                                                              |
+.2.1  1 byte  (U-0000007F):        ""
+.2.2  2 bytes (U-000007FF):        "߿"                                       |
+.2.3  3 bytes (U-0000FFFF):        "�"                                       |
+.2.4  4 bytes (U-001FFFFF):        "÷¿¿¿"                                       |
+.2.5  5 bytes (U-03FFFFFF):        "û¿¿¿¿"                                       |
+.2.6  6 bytes (U-7FFFFFFF):        "ý¿¿¿¿¿"                                       |
+                                                                              |
+.3  Other boundary conditions                                                |
+                                                                              |
+.3.1  U-0000D7FF = ed 9f bf = "퟿"                                            |
+.3.2  U-0000E000 = ee 80 80 = ""                                            |
+.3.3  U-0000FFFD = ef bf bd = "�"                                            |
+.3.4  U-0010FFFF = f4 8f bf bf = "􏿿"                                         |
+.3.5  U-00110000 = f4 90 80 80 = "ô"                                         |
+                                                                              |
+Malformed sequences                                                        |
+                                                                              |
+.1  Unexpected continuation bytes                                            |
+                                                                              |
+Each unexpected continuation byte should be separately signalled as a         |
+malformed sequence of its own.                                                |
+                                                                              |
+.1.1  First continuation byte 0x80: "�"                                      |
+.1.2  Last  continuation byte 0xbf: "�"                                      |
+                                                                              |
+.1.3  2 continuation bytes: "��"                                             |
+.1.4  3 continuation bytes: "���"                                            |
+.1.5  4 continuation bytes: "����"                                           |
+.1.6  5 continuation bytes: "�����"                                          |
+.1.7  6 continuation bytes: "������"                                         |
+.1.8  7 continuation bytes: "�������"                                        |
+                                                                              |
+.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+                                                                              |
+   "����������������                                                          |
+    ����������������                                                          |
+    ����������������                                                          |
+    ����������������"                                                         |
+                                                                              |
+.2  Lonely start characters                                                  |
+                                                                              |
+.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "� � � � � � � � � � � � � � � �                                           |
+    � � � � � � � � � � � � � � � � "                                         |
+                                                                              |
+.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "� � � � � � � � � � � � � � � � "                                         |
+                                                                              |
+.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "� � � � � � � � "                                                         |
+                                                                              |
+.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "� � � � "                                                                 |
+                                                                              |
+.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "� � "                                                                     |
+                                                                              |
+.3  Sequences with last continuation byte missing                            |
+                                                                              |
+All bytes of an incomplete sequence should be signalled as a single           |
+malformed sequence, i.e., you should see only a single replacement            |
+character in each of the next 10 tests. (Characters as in section 2)          |
+                                                                              |
+.3.1  2-byte sequence with last byte missing (U+0000):     "�"               |
+.3.2  3-byte sequence with last byte missing (U+0000):     "�"               |
+.3.3  4-byte sequence with last byte missing (U+0000):     "�"               |
+.3.4  5-byte sequence with last byte missing (U+0000):     "�"               |
+.3.5  6-byte sequence with last byte missing (U+0000):     "�"               |
+.3.6  2-byte sequence with last byte missing (U-000007FF): "�"               |
+.3.7  3-byte sequence with last byte missing (U-0000FFFF): "�"               |
+.3.8  4-byte sequence with last byte missing (U-001FFFFF): "�"               |
+.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "�"               |
+.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�"               |
+                                                                              |
+.4  Concatenation of incomplete sequences                                    |
+                                                                              |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
+sequences being signalled:                                                    |
+                                                                              |
+   "����������"                                                               |
+                                                                              |
+.5  Impossible bytes                                                         |
+                                                                              |
+The following two bytes cannot appear in a correct UTF-8 string               |
+                                                                              |
+.5.1  fe = "�"                                                               |
+.5.2  ff = "�"                                                               |
+.5.3  fe fe ff ff = "����"                                                   |
+                                                                              |
+Overlong sequences                                                         |
+                                                                              |
+The following sequences are not malformed according to the letter of          |
+the Unicode 2.0 standard. However, they are longer then necessary and         |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
+decoder" should reject them just like malformed sequences for two             |
+reasons: (1) It helps to debug applications if overlong sequences are         |
+not treated as valid representations of characters, because this helps        |
+to spot problems more quickly. (2) Overlong sequences provide                 |
+alternative representations of characters, that could maliciously be          |
+used to bypass filters that check only for ASCII characters. For              |
+instance, a 2-byte encoded line feed (LF) would not be caught by a            |
+line counter that counts only 0x0a bytes, but it would still be               |
+processed as a line feed by an unsafe UTF-8 decoder later in the              |
+pipeline. From a security point of view, ASCII compatibility of UTF-8         |
+sequences means also, that ASCII characters are *only* allowed to be          |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
+reject overlong UTF-8 sequences for which a shorter encoding exists.          |
+                                                                              |
+.1  Examples of an overlong ASCII character                                  |
+                                                                              |
+With a safe UTF-8 decoder, all of the following five overlong                 |
+representations of the ASCII character slash ("/") should be rejected         |
+like a malformed UTF-8 sequence, for instance by substituting it with         |
+a replacement character. If you see a slash below, you do not have a          |
+safe UTF-8 decoder!                                                           |
+                                                                              |
+.1.1 U+002F = c0 af             = "�"                                        |
+.1.2 U+002F = e0 80 af          = "�"                                        |
+.1.3 U+002F = f0 80 80 af       = "�"                                        |
+.1.4 U+002F = f8 80 80 80 af    = "�"                                        |
+.1.5 U+002F = fc 80 80 80 80 af = "�"                                        |
+                                                                              |
+.2  Maximum overlong sequences                                               |
+                                                                              |
+Below you see the highest Unicode value that is still resulting in an         |
+overlong sequence if represented with the given number of bytes. This         |
+is a boundary test for safe UTF-8 decoders. All five characters should        |
+be rejected like malformed UTF-8 sequences.                                   |
+                                                                              |
+.2.1  U-0000007F = c1 bf             = "�"                                   |
+.2.2  U-000007FF = e0 9f bf          = "�"                                   |
+.2.3  U-0000FFFF = f0 8f bf bf       = "�"                                   |
+.2.4  U-001FFFFF = f8 87 bf bf bf    = "�"                                   |
+.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "�"                                   |
+                                                                              |
+.3  Overlong representation of the NUL character                             |
+                                                                              |
+The following five sequences should also be rejected like malformed           |
+UTF-8 sequences and should not be treated like the ASCII NUL                  |
+character.                                                                    |
+                                                                              |
+.3.1  U+0000 = c0 80             = "�"                                       |
+.3.2  U+0000 = e0 80 80          = "�"                                       |
+.3.3  U+0000 = f0 80 80 80       = "�"                                       |
+.3.4  U+0000 = f8 80 80 80 80    = "�"                                       |
+.3.5  U+0000 = fc 80 80 80 80 80 = "�"                                       |
+                                                                              |
+Illegal code positions                                                     |
+                                                                              |
+The following UTF-8 sequences should be rejected like malformed               |
+sequences, because they never represent valid ISO 10646 characters and        |
+a UTF-8 decoder that accepts them might introduce security problems           |
+comparable to overlong UTF-8 sequences.                                       |
+                                                                              |
+.1 Single UTF-16 surrogates                                                  |
+                                                                              |
+.1.1  U+D800 = ed a0 80 = "�"                                                |
+.1.2  U+DB7F = ed ad bf = "�"                                                |
+.1.3  U+DB80 = ed ae 80 = "�"                                                |
+.1.4  U+DBFF = ed af bf = "�"                                                |
+.1.5  U+DC00 = ed b0 80 = "�"                                                |
+.1.6  U+DF80 = ed be 80 = "�"                                                |
+.1.7  U+DFFF = ed bf bf = "�"                                                |
+                                                                              |
+.2 Paired UTF-16 surrogates                                                  |
+                                                                              |
+.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "��"                               |
+.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "��"                               |
+.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "��"                               |
+.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "��"                               |
+.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "��"                               |
+.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "��"                               |
+.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "��"                               |
+.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "��"                               |
+                                                                              |
+.3 Other illegal code positions                                              |
+                                                                              |
+.3.1  U+FFFE = ef bf be = "�"                                                |
+.3.2  U+FFFF = ef bf bf = "�"                                                |
+                                                                              |
+THE END                                                                       |
+#reset

/programs/network/netsurf/libparserutils/test/data/cscodec-utf8/simple.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property

/programs/network/netsurf/libparserutils/test/data/input/INDEX
0,0 → 1,5
# Index file for inputstream tests
#
# Test Description

UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file

 /programs/network/netsurf/libparserutils/test/data/input/UTF-8-test.txt
 ,0 → 1,271
+UTF-8 decoder capability and stress test
+----------------------------------------
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
+This test file can help you examine, how your UTF-8 decoder handles
+various types of correct, malformed, or otherwise interesting UTF-8
+sequences. This file is not meant to be a conformance test. It does
+not prescribes any particular outcome and therefore there is no way to
+"pass" or "fail" this test file, even though the texts suggests a
+preferable decoder behaviour at some places. The aim is instead to
+help you think about and test the behaviour of your UTF-8 on a
+systematic collection of unusual inputs. Experience so far suggests
+that most first-time authors of UTF-8 decoders find at least one
+serious problem in their decoder by using this file.
+The test lines below cover boundary conditions, malformed UTF-8
+sequences as well as correctly encoded UTF-8 sequences of Unicode code
+points that should never occur in a correct UTF-8 file.
+According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
+receiving UTF-8 shall interpret a "malformed sequence in the same way
+that it interprets a character that is outside the adopted subset" and
+"characters that are not within the adopted subset shall be indicated
+to the user" by a receiving device. A quite commonly used approach in
+UTF-8 decoders is to replace any malformed UTF-8 sequence by a
+replacement character (U+FFFD), which looks a bit like an inverted
+question mark, or a similar symbol. It might be a good idea to
+visually distinguish a malformed UTF-8 sequence from a correctly
+encoded Unicode character that is just not available in the current
+font but otherwise fully legal, even though ISO 10646-1 doesn't
+mandate this. In any case, just ignoring malformed sequences or
+unavailable characters does not conform to ISO 10646, will make
+debugging more difficult, and can lead to user confusion.
+Please check, whether a malformed UTF-8 sequence is (1) represented at
+all, (2) represented by exactly one single replacement character (or
+equivalent signal), and (3) the following quotation mark after an
+illegal UTF-8 sequence is correctly displayed, i.e. proper
+resynchronization takes place immageately after any malformed
+sequence. This file says "THE END" in the last line, so if you don't
+see that, your decoder crashed somehow before, which should always be
+cause for concern.
+All lines in this file are exactly 79 characters long (plus the line
+feed). In addition, all lines end with "|", except for the two test
+lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
+U+0000 and U+007F. If you display this file with a fixed-width font,
+these "|" characters should all line up in column 79 (right margin).
+This allows you to test quickly, whether your UTF-8 decoder finds the
+correct number of characters in every line, that is whether each
+malformed sequences is replaced by a single replacement character.
+Note that as an alternative to the notion of malformed sequence used
+here, it is also a perfectly acceptable (and in some situations even
+preferable) solution to represent each individual byte of a malformed
+sequence by a replacement character. If you follow this strategy in
+your decoder, then please ignore the "|" column.
+Here come the tests:                                                          |
+                                                                              |
+Some correct UTF-8 text                                                    |
+                                                                              |
+You should see the Greek word 'kosme':       "κόσμε"                          |
+                                                                              |
+Boundary condition test cases                                              |
+                                                                              |
+.1  First possible sequence of a certain length                              |
+                                                                              |
+.1.1  1 byte  (U-00000000):        ""
+.1.2  2 bytes (U-00000080):        ""                                       |
+.1.3  3 bytes (U-00000800):        "ࠀ"                                       |
+.1.4  4 bytes (U-00010000):        "𐀀"                                       |
+.1.5  5 bytes (U-00200000):        "ø"                                       |
+.1.6  6 bytes (U-04000000):        "ü"                                       |
+                                                                              |
+.2  Last possible sequence of a certain length                               |
+                                                                              |
+.2.1  1 byte  (U-0000007F):        ""
+.2.2  2 bytes (U-000007FF):        "߿"                                       |
+.2.3  3 bytes (U-0000FFFF):        ""                                       |
+.2.4  4 bytes (U-001FFFFF):        "÷¿¿¿"                                       |
+.2.5  5 bytes (U-03FFFFFF):        "û¿¿¿¿"                                       |
+.2.6  6 bytes (U-7FFFFFFF):        "ý¿¿¿¿¿"                                       |
+                                                                              |
+.3  Other boundary conditions                                                |
+                                                                              |
+.3.1  U-0000D7FF = ed 9f bf = "퟿"                                            |
+.3.2  U-0000E000 = ee 80 80 = ""                                            |
+.3.3  U-0000FFFD = ef bf bd = "�"                                            |
+.3.4  U-0010FFFF = f4 8f bf bf = "􏿿"                                         |
+.3.5  U-00110000 = f4 90 80 80 = "ô"                                         |
+                                                                              |
+Malformed sequences                                                        |
+                                                                              |
+.1  Unexpected continuation bytes                                            |
+                                                                              |
+Each unexpected continuation byte should be separately signalled as a         |
+malformed sequence of its own.                                                |
+                                                                              |
+.1.1  First continuation byte 0x80: ""                                      |
+.1.2  Last  continuation byte 0xbf: "¿"                                      |
+                                                                              |
+.1.3  2 continuation bytes: "¿"                                             |
+.1.4  3 continuation bytes: "¿"                                            |
+.1.5  4 continuation bytes: "¿¿"                                           |
+.1.6  5 continuation bytes: "¿¿"                                          |
+.1.7  6 continuation bytes: "¿¿¿"                                         |
+.1.8  7 continuation bytes: "¿¿¿"                                        |
+                                                                              |
+.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
+                                                                              |
+   "                                                          |
+                                                              |
+     ¡¢£¤¥¦§¨©ª«¬®¯                                                          |
+    °±²³´µ¶·¸¹º»¼½¾¿"                                                         |
+                                                                              |
+.2  Lonely start characters                                                  |
+                                                                              |
+.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï                                           |
+    Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß "                                         |
+                                                                              |
+.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef),                    |
+       each followed by a space character:                                    |
+                                                                              |
+   "à á â ã ä å æ ç è é ê ë ì í î ï "                                         |
+                                                                              |
+.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ð ñ ò ó ô õ ö ÷ "                                                         |
+                                                                              |
+.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ø ù ú û "                                                                 |
+                                                                              |
+.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd),                     |
+       each followed by a space character:                                    |
+                                                                              |
+   "ü ý "                                                                     |
+                                                                              |
+.3  Sequences with last continuation byte missing                            |
+                                                                              |
+All bytes of an incomplete sequence should be signalled as a single           |
+malformed sequence, i.e., you should see only a single replacement            |
+character in each of the next 10 tests. (Characters as in section 2)          |
+                                                                              |
+.3.1  2-byte sequence with last byte missing (U+0000):     "À"               |
+.3.2  3-byte sequence with last byte missing (U+0000):     "à"               |
+.3.3  4-byte sequence with last byte missing (U+0000):     "ð"               |
+.3.4  5-byte sequence with last byte missing (U+0000):     "ø"               |
+.3.5  6-byte sequence with last byte missing (U+0000):     "ü"               |
+.3.6  2-byte sequence with last byte missing (U-000007FF): "ß"               |
+.3.7  3-byte sequence with last byte missing (U-0000FFFF): "ï¿"               |
+.3.8  4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿"               |
+.3.9  5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿"               |
+.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿"               |
+                                                                              |
+.4  Concatenation of incomplete sequences                                    |
+                                                                              |
+All the 10 sequences of 3.3 concatenated, you should see 10 malformed         |
+sequences being signalled:                                                    |
+                                                                              |
+   "Ààðøüßï¿÷¿¿û¿¿¿ý¿¿¿¿"                                                               |
+                                                                              |
+3.5  Impossible bytes                                                         |
+                                                                              |
+The following two bytes cannot appear in a correct UTF-8 string               |
+                                                                              |
+.5.1  fe = "þ"                                                               |
+.5.2  ff = "ÿ"                                                               |
+.5.3  fe fe ff ff = "þþÿÿ"                                                   |
+                                                                              |
+4  Overlong sequences                                                         |
+                                                                              |
+The following sequences are not malformed according to the letter of          |
+the Unicode 2.0 standard. However, they are longer than necessary and         |
+a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8         |
+decoder" should reject them just like malformed sequences for two             |
+reasons: (1) It helps to debug applications if overlong sequences are         |
+not treated as valid representations of characters, because this helps        |
+to spot problems more quickly. (2) Overlong sequences provide                 |
+alternative representations of characters, that could maliciously be          |
+used to bypass filters that check only for ASCII characters. For              |
+instance, a 2-byte encoded line feed (LF) would not be caught by a            |
+line counter that counts only 0x0a bytes, but it would still be               |
+processed as a line feed by an unsafe UTF-8 decoder later in the              |
+pipeline. From a security point of view, ASCII compatibility of UTF-8         |
+sequences means also, that ASCII characters are *only* allowed to be          |
+represented by ASCII bytes in the range 0x00-0x7f. To ensure this             |
+aspect of ASCII compatibility, use only "safe UTF-8 decoders" that            |
+reject overlong UTF-8 sequences for which a shorter encoding exists.          |
+                                                                              |
+4.1  Examples of an overlong ASCII character                                  |
+                                                                              |
+With a safe UTF-8 decoder, all of the following five overlong                 |
+representations of the ASCII character slash ("/") should be rejected         |
+like a malformed UTF-8 sequence, for instance by substituting it with         |
+a replacement character. If you see a slash below, you do not have a          |
+safe UTF-8 decoder!                                                           |
+                                                                              |
+.1.1 U+002F = c0 af             = "À¯"                                        |
+.1.2 U+002F = e0 80 af          = "à¯"                                        |
+.1.3 U+002F = f0 80 80 af       = "ð¯"                                        |
+.1.4 U+002F = f8 80 80 80 af    = "ø¯"                                        |
+.1.5 U+002F = fc 80 80 80 80 af = "ü¯"                                        |
+                                                                              |
+4.2  Maximum overlong sequences                                               |
+                                                                              |
+Below you see the highest Unicode value that is still resulting in an         |
+overlong sequence if represented with the given number of bytes. This         |
+is a boundary test for safe UTF-8 decoders. All five characters should        |
+be rejected like malformed UTF-8 sequences.                                   |
+                                                                              |
+.2.1  U-0000007F = c1 bf             = "Á¿"                                   |
+.2.2  U-000007FF = e0 9f bf          = "à¿"                                   |
+.2.3  U-0000FFFF = f0 8f bf bf       = "ð¿¿"                                   |
+.2.4  U-001FFFFF = f8 87 bf bf bf    = "ø¿¿¿"                                   |
+.2.5  U-03FFFFFF = fc 83 bf bf bf bf = "ü¿¿¿¿"                                   |
+                                                                              |
+4.3  Overlong representation of the NUL character                             |
+                                                                              |
+The following five sequences should also be rejected like malformed           |
+UTF-8 sequences and should not be treated like the ASCII NUL                  |
+character.                                                                    |
+                                                                              |
+.3.1  U+0000 = c0 80             = "À"                                       |
+.3.2  U+0000 = e0 80 80          = "à"                                       |
+.3.3  U+0000 = f0 80 80 80       = "ð"                                       |
+.3.4  U+0000 = f8 80 80 80 80    = "ø"                                       |
+.3.5  U+0000 = fc 80 80 80 80 80 = "ü"                                       |
+                                                                              |
+5  Illegal code positions                                                     |
+                                                                              |
+The following UTF-8 sequences should be rejected like malformed               |
+sequences, because they never represent valid ISO 10646 characters and        |
+a UTF-8 decoder that accepts them might introduce security problems           |
+comparable to overlong UTF-8 sequences.                                       |
+                                                                              |
+5.1 Single UTF-16 surrogates                                                  |
+                                                                              |
+.1.1  U+D800 = ed a0 80 = "í "                                                |
+.1.2  U+DB7F = ed ad bf = "í¿"                                                |
+.1.3  U+DB80 = ed ae 80 = "í®"                                                |
+.1.4  U+DBFF = ed af bf = "í¯¿"                                                |
+.1.5  U+DC00 = ed b0 80 = "í°"                                                |
+.1.6  U+DF80 = ed be 80 = "í¾"                                                |
+.1.7  U+DFFF = ed bf bf = "í¿¿"                                                |
+                                                                              |
+5.2 Paired UTF-16 surrogates                                                  |
+                                                                              |
+.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = "í í°"                               |
+.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = "í í¿¿"                               |
+.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = "í¿í°"                               |
+.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = "í¿í¿¿"                               |
+.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = "í®í°"                               |
+.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = "í®í¿¿"                               |
+.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = "í¯¿í°"                               |
+.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = "í¯¿í¿¿"                               |
+                                                                              |
+5.3 Other illegal code positions                                              |
+                                                                              |
+.3.1  U+FFFE = ef bf be = ""                                                |
+.3.2  U+FFFF = ef bf bf = ""                                                |
+                                                                              |
+THE END                                                                       |

 /programs/network/netsurf/libparserutils/test/filter.c
 0,0 → 1,349
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <parserutils/parserutils.h>
+#include "utils/utils.h"
+#include "input/filter.h"
+#include "testutils.h"
+/**
+ * Allocator callback handed to the filter: realloc() with an explicit
+ * release path.
+ *
+ * \param ptr  Block to resize or release, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 releases ptr
+ * \param pw   Client-specific private data (unused)
+ * \return Pointer to the (re)allocated block; NULL on failure or if len is 0
+ *
+ * NOTE(review): presumably the library releases memory by calling this
+ * with len == 0 -- confirm against the parserutils allocator contract.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        (void) pw;
+        /* realloc(ptr, 0) is implementation-defined, so release explicitly
+         * rather than depend on what the C library happens to do */
+        if (len == 0) {
+                free(ptr);
+                return NULL;
+        }
+        return realloc(ptr, len);
+}
+int main(int argc, char **argv)
+{       /* Test driver for the input filter: feeds short byte strings through
+         * a filter created for "UTF-8" and compares the converted output
+         * with memcmp(). Every scenario rewinds in/out to the start of
+         * inbuf/outbuf and resets the filter once its output has been
+         * produced. A failed assert() ends the run; "PASS" is printed and 0
+         * returned only when every scenario succeeds. */
+        parserutils_filter_optparams params;
+        parserutils_filter *input;
+        uint8_t inbuf[64], outbuf[64];
+        size_t inlen, outlen;
+        const uint8_t *in = inbuf;
+        uint8_t *out = outbuf;
+        UNUSED(argc);
+        UNUSED(argv);
+        /* Create the input filter, with myrealloc as its allocator and no
+         * private data */
+        assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
+                        PARSERUTILS_OK);
+        /* Select UTF-8 as the filter's encoding; only encoding.name of
+         * params is filled in before it is passed */
+        params.encoding.name = "UTF-8";
+        assert(parserutils__filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING,
+                        (parserutils_filter_optparams *) &params) ==
+                        PARSERUTILS_OK);
+        /* Simple case - valid input & output buffer large enough.
+         * The printf() after each process_chunk call (here and in every
+         * scenario below) traces, in order: the inlen bytes found at in,
+         * inlen itself, the first (out - outbuf) bytes of outbuf, and
+         * outlen. NOTE(review): in/out/inlen/outlen are passed by address
+         * and are presumably updated by the call to reflect what was
+         * consumed and produced - confirm against the filter API. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xa0o!");
+        inlen = strlen((const char *) inbuf);
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xc2\xa0o!",
+                        SLEN("hell\xc2\xa0o!")) == 0);
+        /* Too small an output buffer; no encoding edge cases.
+         * Six input bytes are offered only 5 bytes of output space, so the
+         * first call is expected to report PARSERUTILS_NOMEM. The retry
+         * then grants the rest of the 64-byte buffer: 64 minus the 5
+         * offered at first, plus whatever part of those 5 is still
+         * recorded as free in outlen. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hello!");
+        inlen = strlen((const char *) inbuf);
+        outbuf[0] = '\0';
+        outlen = 5;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_NOMEM);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        outlen = 64 - 5 + outlen;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hello!",
+                        SLEN("hello!")) == 0);
+        /* Illegal input sequence; output buffer large enough.
+         * The byte 0x96 is a UTF-8 continuation byte with no lead byte
+         * before it; the expected output carries ef bf bd (U+FFFD) in its
+         * place. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\x96o!");
+        inlen = strlen((const char *) inbuf);
+        outbuf[0] = '\0';
+        outlen = 64;
+        /* Input does loose decoding, converting to U+FFFD if illegal
+         * input is encountered */
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
+                        SLEN("hell\xef\xbf\xbdo!")) == 0);
+        /* Input ends mid-sequence.
+         * Only the first 5 of the 8 input bytes are offered at first, so
+         * the chunk stops after c2, the lead byte of the two-byte sequence
+         * c2 a0; 3 more bytes are then made available to a second call and
+         * the complete string is expected in outbuf. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xa0o!");
+        inlen = strlen((const char *) inbuf) - 3;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xc2\xa0o!",
+                        SLEN("hell\xc2\xa0o!")) == 0);
+        /* Input ends mid-sequence, but second attempt has too small a
+         * buffer, but large enough to write out the incomplete character.
+         * The second call gets 3 bytes of output space and is expected to
+         * report PARSERUTILS_NOMEM; the third call is given 64 - 7 bytes.
+         * NOTE(review): the 7 presumably stands for the bytes already in
+         * outbuf by then ("hell", c2 a0 and "o") - confirm. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xa0o!");
+        inlen = strlen((const char *) inbuf) - 3;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        outlen = 3;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_NOMEM);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        outlen = 64 - 7;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xc2\xa0o!",
+                        SLEN("hell\xc2\xa0o!")) == 0);
+        /* Input ends mid-sequence, but second attempt has too small a
+         * buffer, not large enough to write out the incomplete character.
+         * The second call gets a single byte of output space, which cannot
+         * hold the two-byte c2 a0, and is expected to report
+         * PARSERUTILS_NOMEM; the third call is given 60 bytes.
+         * NOTE(review): 60 is presumably 64 minus the 4 bytes of "hell"
+         * already written - confirm. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xa0o!");
+        inlen = strlen((const char *) inbuf) - 3;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        outlen = 1;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_NOMEM);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        outlen = 60;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xc2\xa0o!",
+                        SLEN("hell\xc2\xa0o!")) == 0);
+        /* Input ends mid-sequence, but second attempt contains
+         * invalid character.
+         * The first chunk again stops after a c2 lead byte, but the next
+         * byte offered is another c2 rather than a continuation byte; the
+         * expected output holds two U+FFFD (ef bf bd) in a row. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xc2o!");
+        inlen = strlen((const char *) inbuf) - 3;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        /* Input does loose decoding, converting to U+FFFD if illegal
+         * input is encountered */
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!",
+                        SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0);
+        /* Input ends mid-sequence, but second attempt contains another
+         * incomplete character.
+         * The 10 input bytes are released in three steps: 5 first (ending
+         * on the first c2), then 2 more, then 3 more. NOTE(review): if the
+         * first call consumed all it was given, the second step ends on
+         * the second c2 - confirm. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!");
+        inlen = strlen((const char *) inbuf) - 5;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 2;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!",
+                        SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0);
+        /* Input ends mid-sequence, but second attempt contains insufficient
+         * data to complete the incomplete character.
+         * The three-byte sequence e2 80 a2 is split over all three calls:
+         * 5 bytes are offered first (ending on e2), then 1 more, then the
+         * final 3. */
+        in = inbuf;
+        out = outbuf;
+        strcpy((char *) inbuf, "hell\xe2\x80\xa2o!");
+        inlen = strlen((const char *) inbuf) - 4;
+        outbuf[0] = '\0';
+        outlen = 64;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 1;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        inlen += 3;
+        assert(parserutils__filter_process_chunk(input, &in, &inlen,
+                        &out, &outlen) == PARSERUTILS_OK);
+        printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+                        (int) (out - ((uint8_t *) outbuf)),
+                        outbuf, (int) outlen);
+        assert(parserutils__filter_reset(input) == PARSERUTILS_OK);
+        assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
+                        SLEN("hell\xe2\x80\xa2o!")) == 0);
+        /* Clean up: destroy the filter and report success */
+        parserutils__filter_destroy(input);
+        printf("PASS\n");
+        return 0;
+}

 /programs/network/netsurf/libparserutils/test/inputstream.c
 0,0 → 1,102
+#include <inttypes.h>
+#include <stdio.h>
+#include <parserutils/parserutils.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+#include "utils/utils.h"
+#include "testutils.h"
+#ifdef __riscos
+const char * const __dynamic_da_name = "InputStream";
+int __dynamic_da_max_size = 128*1024*1024;
+#endif
+/**
+ * Allocator callback handed to the inputstream: realloc() with an explicit
+ * release path.
+ *
+ * \param ptr  Block to resize or release, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 releases ptr
+ * \param pw   Client-specific private data (unused)
+ * \return Pointer to the (re)allocated block; NULL on failure or if len is 0
+ *
+ * NOTE(review): presumably the library releases memory by calling this
+ * with len == 0 -- confirm against the parserutils allocator contract.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        (void) pw;
+        /* realloc(ptr, 0) is implementation-defined, so release explicitly
+         * rather than depend on what the C library happens to do */
+        if (len == 0) {
+                free(ptr);
+                return NULL;
+        }
+        return realloc(ptr, len);
+}
+/**
+ * Inputstream test driver
+ *
+ * Feeds the file named on the command line to an inputstream created for
+ * "UTF-8" in CHUNK_SIZE pieces. After each full chunk the stream is walked
+ * with peek/advance until it asks for more data, and "hello!!!" is inserted
+ * whenever the first byte of the character just passed is 'a'. Any final
+ * partial chunk is appended, one more "hello!!!" is inserted, and the
+ * stream is then walked until PARSERUTILS_EOF.
+ *
+ * \param argc  Argument count; must be 2
+ * \param argv  argv[1] names the input file
+ * \return 0 after printing "PASS"; 1 on a usage, open or sizing error
+ */
+int main(int argc, char **argv)
+{
+        parserutils_inputstream *stream;
+        FILE *fp;
+        long fsize = -1;
+        size_t len;
+#define CHUNK_SIZE (4096)
+        uint8_t buf[CHUNK_SIZE];
+        const uint8_t *c;
+        size_t clen;
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
+                return 1;
+        }
+        assert(parserutils_inputstream_create("UTF-8", 1, NULL,
+                        myrealloc, NULL, &stream) == PARSERUTILS_OK);
+        fp = fopen(argv[1], "rb");
+        if (fp == NULL) {
+                printf("Failed opening %s\n", argv[1]);
+                /* Do not leak the stream created above */
+                parserutils_inputstream_destroy(stream);
+                return 1;
+        }
+        /* Measure the file. ftell() reports failure as -1, which must not
+         * be allowed to wrap round to a huge size_t and drive the read
+         * loop below */
+        if (fseek(fp, 0, SEEK_END) == 0)
+                fsize = ftell(fp);
+        if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
+                printf("Failed reading size of %s\n", argv[1]);
+                fclose(fp);
+                parserutils_inputstream_destroy(stream);
+                return 1;
+        }
+        len = (size_t) fsize;
+        while (len >= CHUNK_SIZE) {
+                size_t read = fread(buf, 1, CHUNK_SIZE, fp);
+                assert(read == CHUNK_SIZE);
+                assert(parserutils_inputstream_append(stream,
+                                buf, CHUNK_SIZE) == PARSERUTILS_OK);
+                len -= CHUNK_SIZE;
+                /* Walk the stream until it asks for more data */
+                while (parserutils_inputstream_peek(stream, 0, &c, &clen) !=
+                                PARSERUTILS_NEEDDATA) {
+                        /* Sample the byte before moving the cursor, so
+                         * the check does not rely on c staying valid
+                         * across the advance */
+                        const uint8_t first = *c;
+                        parserutils_inputstream_advance(stream, clen);
+                        if (first == 'a') {
+                                assert(parserutils_inputstream_insert(stream,
+                                                (const uint8_t *) "hello!!!",
+                                                SLEN("hello!!!")) == PARSERUTILS_OK);
+                        }
+                }
+        }
+        /* Whatever is left is shorter than one chunk */
+        if (len > 0) {
+                size_t read = fread(buf, 1, len, fp);
+                assert(read == len);
+                assert(parserutils_inputstream_append(stream,
+                                buf, len) == PARSERUTILS_OK);
+                len = 0;
+        }
+        fclose(fp);
+        assert(parserutils_inputstream_insert(stream,
+                        (const uint8_t *) "hello!!!",
+                        SLEN("hello!!!")) == PARSERUTILS_OK);
+        /* NOTE(review): appending NULL/0 presumably marks the end of the
+         * input, which is what lets the loop below reach PARSERUTILS_EOF -
+         * confirm against the inputstream API */
+        assert(parserutils_inputstream_append(stream, NULL, 0) ==
+                        PARSERUTILS_OK);
+        while (parserutils_inputstream_peek(stream, 0, &c, &clen) !=
+                        PARSERUTILS_EOF) {
+                parserutils_inputstream_advance(stream, clen);
+        }
+        parserutils_inputstream_destroy(stream);
+        printf("PASS\n");
+        return 0;
+}

/programs/network/netsurf/libparserutils/test/regression/INDEX
0,0 → 1,7
# Index for testcases
#
# Test Description DataDir

filter-segv Segfault in input filtering
stream-nomem Inputstream buffer expansion
filter-badenc-segv Segfault on resetting bad encoding in filter

/programs/network/netsurf/libparserutils/test/regression/Makefile
0,0 → 1,7
# Tests
DIR_TEST_ITEMS := filter-segv:filter-segv.c \
stream-nomem:stream-nomem.c filter-badenc-segv:filter-badenc-segv.c

CFLAGS := $(CFLAGS) -I$(CURDIR)/test

include $(NSBUILD)/Makefile.subdir

 /programs/network/netsurf/libparserutils/test/regression/filter-badenc-segv.c
 0,0 → 1,50
+#include <stdio.h>
+#include <stdlib.h>
+#include <parserutils/parserutils.h>
+#include "input/filter.h"
+#include "testutils.h"
+/**
+ * Allocator callback handed to the filter: realloc() with an explicit
+ * release path.
+ *
+ * \param ptr  Block to resize or release, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 releases ptr
+ * \param pw   Client-specific private data (unused)
+ * \return Pointer to the (re)allocated block; NULL on failure or if len is 0
+ *
+ * NOTE(review): presumably the library releases memory by calling this
+ * with len == 0 -- confirm against the parserutils allocator contract.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        (void) pw;
+        /* realloc(ptr, 0) is implementation-defined, so release explicitly
+         * rather than depend on what the C library happens to do */
+        if (len == 0) {
+                free(ptr);
+                return NULL;
+        }
+        return realloc(ptr, len);
+}
+/**
+ * Regression test: ask a filter created for "UTF-8" to switch to "GBK"
+ * twice in a row. Both requests must give the same result, which depends
+ * on whether the iconv filter was compiled in.
+ *
+ * \return 0 after printing "PASS"; a failed assert() ends the run early
+ */
+int main(int argc, char **argv)
+{
+        parserutils_filter *input;
+        parserutils_filter_optparams params;
+        int attempt;
+        /* Builds without the iconv filter are expected to refuse "GBK" */
+#ifdef WITHOUT_ICONV_FILTER
+        const parserutils_error expected = PARSERUTILS_BADENCODING;
+#else
+        const parserutils_error expected = PARSERUTILS_OK;
+#endif
+        UNUSED(argv);
+        UNUSED(argc);
+        assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
+                        PARSERUTILS_OK);
+        /* Issue the identical request twice */
+        for (attempt = 0; attempt < 2; attempt++) {
+                params.encoding.name = "GBK";
+                assert(parserutils__filter_setopt(input,
+                                PARSERUTILS_FILTER_SET_ENCODING, &params) ==
+                                expected);
+        }
+        parserutils__filter_destroy(input);
+        printf("PASS\n");
+        return 0;
+}

 /programs/network/netsurf/libparserutils/test/regression/filter-segv.c
 0,0 → 1,32
+#include <stdio.h>
+#include <stdlib.h>
+#include <parserutils/parserutils.h>
+#include "input/filter.h"
+#include "testutils.h"
+/**
+ * Allocator callback handed to the filter: realloc() with an explicit
+ * release path.
+ *
+ * \param ptr  Block to resize or release, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 releases ptr
+ * \param pw   Client-specific private data (unused)
+ * \return Pointer to the (re)allocated block; NULL on failure or if len is 0
+ *
+ * NOTE(review): presumably the library releases memory by calling this
+ * with len == 0 -- confirm against the parserutils allocator contract.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        (void) pw;
+        /* realloc(ptr, 0) is implementation-defined, so release explicitly
+         * rather than depend on what the C library happens to do */
+        if (len == 0) {
+                free(ptr);
+                return NULL;
+        }
+        return realloc(ptr, len);
+}
+int main(int argc, char **argv)
+{       /* Regression test: create a filter for "UTF-8" and destroy it
+         * straight away, with nothing processed in between. Reaching the
+         * end prints "PASS" and returns 0; a failed assert() ends the run
+         * before that. */
+        parserutils_filter *input;
+        UNUSED(argc);
+        UNUSED(argv);
+        assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
+                        PARSERUTILS_OK);
+        parserutils__filter_destroy(input);
+        printf("PASS\n");
+        return 0;
+}

 /programs/network/netsurf/libparserutils/test/regression/stream-nomem.c
 0,0 → 1,86
+#include <stdio.h>
+#include <string.h>
+#include <parserutils/parserutils.h>
+#include <parserutils/input/inputstream.h>
+#include "utils/utils.h"
+#include "testutils.h"
+/**
+ * Allocator callback handed to the inputstream: realloc() with an explicit
+ * release path.
+ *
+ * \param ptr  Block to resize or release, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 releases ptr
+ * \param pw   Client-specific private data (unused)
+ * \return Pointer to the (re)allocated block; NULL on failure or if len is 0
+ *
+ * NOTE(review): presumably the library releases memory by calling this
+ * with len == 0 -- confirm against the parserutils allocator contract.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+        (void) pw;
+        /* realloc(ptr, 0) is implementation-defined, so release explicitly
+         * rather than depend on what the C library happens to do */
+        if (len == 0) {
+                free(ptr);
+                return NULL;
+        }
+        return realloc(ptr, len);
+}
+int main(int argc, char **argv)
+{       /* Regression test: append BUFFER_SIZE bytes to an inputstream
+         * created for "UTF-8" in a single call, with the three bytes of
+         * U+FFFD (ef bf bd) placed so that the first of them sits at
+         * offset 4095, then walk the stream with peek/advance until
+         * PARSERUTILS_EOF. Prints "PASS" and returns 0 on completion; a
+         * failed assert() ends the run before that. */
+        parserutils_inputstream *stream;
+        /* This is specially calculated so that the inputstream is forced to
+         * reallocate (it assumes that the inputstream's buffer chunk size
+         * is 4k) */
+#define BUFFER_SIZE (4096 + 4)
+        uint8_t input_buffer[BUFFER_SIZE];
+//      uint8_t *buffer;
+//      size_t buflen;
+        const uint8_t *c;
+        size_t clen;
+        UNUSED(argc);
+        UNUSED(argv);
+        /* Populate the buffer with something sane */
+        memset(input_buffer, 'a', BUFFER_SIZE);
+        /* Now, set up our test data: the last 8 bytes of the buffer
+         * become "123", ef bf bd, "45" */
+        input_buffer[BUFFER_SIZE - 1] = '5';
+        input_buffer[BUFFER_SIZE - 2] = '4';
+        input_buffer[BUFFER_SIZE - 3] = '\xbd';
+        input_buffer[BUFFER_SIZE - 4] = '\xbf';
+        /* This byte lands at offset 4095 of the buffer (BUFFER_SIZE - 5),
+         * the last position within the first 4k, and is meant to
+         * thus cause the entirety of U+FFFD to be buffered until after
+         * the buffer has been enlarged */
+        input_buffer[BUFFER_SIZE - 5] = '\xef';
+        input_buffer[BUFFER_SIZE - 6] = '3';
+        input_buffer[BUFFER_SIZE - 7] = '2';
+        input_buffer[BUFFER_SIZE - 8] = '1';
+        assert(parserutils_inputstream_create("UTF-8", 0,
+                        NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK);
+        assert(parserutils_inputstream_append(stream,
+                        input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);
+        assert(parserutils_inputstream_append(stream, NULL, 0) ==
+                        PARSERUTILS_OK);        /* NOTE(review): a NULL/0 append
+                                                 * presumably marks end of input;
+                                                 * confirm against the API */
+        while (parserutils_inputstream_peek(stream, 0, &c, &clen) !=
+                        PARSERUTILS_EOF)
+                parserutils_inputstream_advance(stream, clen);
+/* Disabled check, kept for reference. NOTE(review): it calls
+ * css_inputstream_claim_buffer() and compares against CSS_OK, names that
+ * are used nowhere else in this test - presumably left over from an
+ * earlier API; confirm before re-enabling.
+        assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) ==
+                        CSS_OK);
+        assert(buflen == BUFFER_SIZE);
+        printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));
+        assert( buffer[BUFFER_SIZE - 6] == '3' &&
+                buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' &&
+                buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' &&
+                buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' &&
+                buffer[BUFFER_SIZE - 2] == '4');
+        free(buffer);
+*/
+        parserutils_inputstream_destroy(stream);
+        printf("PASS\n");
+        return 0;
+}

 /programs/network/netsurf/libparserutils/test/testutils.h
 0,0 → 1,123
+#ifndef test_testutils_h_
+#define test_testutils_h_
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef UNUSED
+/* Mark a variable as deliberately unused. The cast to void evaluates x and
+ * discards the result, so it also works on const-qualified objects and
+ * does not provoke self-assignment warnings. */
+#define UNUSED(x) ((void) (x))
+#endif
+/* Replacement for the standard assert machinery: testcases keep writing
+ * plain assert(), but a failure is reported on stdout in the form the
+ * testrunner expects and the process exits with a failure status. */
+void __assert2(const char *expr, const char *function,
+                const char *file, int line);
+void __assert2(const char *expr, const char *function,
+                const char *file, int line)
+{
+        /* Only the expression text and the line number are reported */
+        UNUSED(file);
+        UNUSED(function);
+        fprintf(stdout, "FAIL - %s at line %d\n", expr, line);
+        exit(EXIT_FAILURE);
+}
+/* This definition has no NDEBUG switch, so expr is always evaluated,
+ * side effects included. */
+#define assert(expr) \
+  ((expr) ? (void) 0 : __assert2(#expr, __func__, __FILE__, __LINE__))
+typedef bool (*line_func)(const char *data, size_t datalen, void *pw);
+static size_t parse_strlen(const char *str, size_t limit);
+bool parse_testfile(const char *filename, line_func callback, void *pw);
+size_t parse_filesize(const char *filename);
+/**
+ * Testcase datafile parser driver
+ *
+ * Reads the file with fgets() into a 300-byte buffer and passes every piece
+ * that does not start with '\n' to the callback, together with its length
+ * as measured by parse_strlen(). Parsing stops at the first callback that
+ * returns false.
+ *
+ * \param filename  Name of file to parse
+ * \param callback  Pointer to function to handle each line of input data
+ * \param pw        Pointer to client-specific private data
+ * \return true on success, false if the file cannot be opened, a callback
+ *         returns false, or reading fails part-way through.
+ */
+bool parse_testfile(const char *filename, line_func callback, void *pw)
+{
+        FILE *fp;
+        char buf[300];
+        bool ok;
+        fp = fopen(filename, "rb");
+        if (fp == NULL) {
+                printf("Failed opening %s\n", filename);
+                return false;
+        }
+        while (fgets(buf, sizeof buf, fp)) {
+                /* Blank lines are not passed on */
+                if (buf[0] == '\n')
+                        continue;
+                if (!callback(buf, parse_strlen(buf, sizeof buf - 1), pw)) {
+                        fclose(fp);
+                        return false;
+                }
+        }
+        /* fgets() returns NULL for a read error as well as for end of
+         * file; only the latter counts as success */
+        ok = !ferror(fp);
+        fclose(fp);
+        return ok;
+}
+/**
+ * Utility string length measurer; assumes strings are '\n' terminated
+ *
+ * The terminating '\n' is included in the count. If the string's NUL
+ * terminator is met first (e.g. a final line with no newline), counting
+ * stops there and the NUL is not counted, so the scan never runs on into
+ * whatever follows the string in its buffer.
+ *
+ * \param str    String to measure length of
+ * \param limit  Upper bound on string length
+ * \return String length; 0 if str is NULL or limit is 0
+ */
+static size_t parse_strlen(const char *str, size_t limit)
+{
+        size_t len = 0;
+        /* limit - 1 below would wrap round if limit were 0 */
+        if (str == NULL || limit == 0)
+                return 0;
+        while (len < limit - 1 && str[len] != '\n' && str[len] != '\0')
+                len++;
+        /* Stopped on the terminator: nothing more belongs to the string */
+        if (str[len] == '\0')
+                return len;
+        /* Count the newline, or the final byte allowed by limit */
+        return len + 1;
+}
+/**
+ * Read the size of a file
+ *
+ * The size is found by seeking to the end of the file and reading the
+ * resulting position. A message is printed if the file cannot be opened
+ * or measured.
+ *
+ * \param filename  Name of file to read size of
+ * \return File size (in bytes), or 0 on error
+ */
+size_t parse_filesize(const char *filename)
+{
+        FILE *fp;
+        long end = -1;
+        fp = fopen(filename, "rb");
+        if (fp == NULL) {
+                printf("Failed opening %s\n", filename);
+                return 0;
+        }
+        /* ftell() reports failure as -1; converting that to size_t would
+         * yield a huge bogus size rather than the documented 0 */
+        if (fseek(fp, 0, SEEK_END) == 0)
+                end = ftell(fp);
+        fclose(fp);
+        if (end < 0) {
+                printf("Failed reading size of %s\n", filename);
+                return 0;
+        }
+        return (size_t) end;
+}
+#endif

Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 3583 → Rev 3584