Subversion Repositories Kolibri OS

Compare Revisions

No changes between revisions

Regard whitespace Rev 3583 → Rev 3584

/programs/network/netsurf/libparserutils/.gitignore
0,0 → 1,3
build-*
Makefile.config.override
 
/programs/network/netsurf/libparserutils/COPYING
0,0 → 1,19
Copyright (C) 2007-8 J-M Bell
 
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
 
* The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
/programs/network/netsurf/libparserutils/Makefile
0,0 → 1,52
# Component settings
# Name and version of the library built by this Makefile.
COMPONENT := parserutils
COMPONENT_VERSION := 0.1.1
# Default to a static library
# (?= only assigns when COMPONENT_TYPE is not already set, e.g. by
# `make COMPONENT_TYPE=lib-shared`)
COMPONENT_TYPE ?= lib-static
 
# Setup the tooling
# PREFIX and NSSHARED may both be preset by the caller; by default the shared
# build system is looked up beneath PREFIX.
PREFIX ?= /opt/netsurf
NSSHARED ?= $(PREFIX)/share/netsurf-buildsystem
# No leading '-' here, so a missing Makefile.tools is a fatal error.
include $(NSSHARED)/makefiles/Makefile.tools
 
# PERL and NSTESTTOOLS are not defined in this file -- presumably they are
# provided by Makefile.tools above; TODO confirm.
TESTRUNNER := $(PERL) $(NSTESTTOOLS)/testrunner.pl
 
# Warning flags handed to the compiler, assembled one group at a time.
# WARNFLAGS is created simply-expanded, so each += appends immediately.
WARNFLAGS := -Wall -W -Wundef -Wpointer-arith -Wcast-align
WARNFLAGS += -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes
WARNFLAGS += -Wmissing-declarations -Wnested-externs -pedantic
# Warnings are fatal everywhere except BeOS/Haiku, whose standard library
# headers themselves produce warnings.
ifneq ($(TARGET),beos)
WARNFLAGS += -Werror
endif
 
# Compiler flags: feature-test macros, include paths and the warning set are
# placed in front of whatever CFLAGS already holds.
# _DEFAULT_SOURCE accompanies _BSD_SOURCE because glibc >= 2.20 deprecates
# _BSD_SOURCE on its own and emits a #warning for it, which the -Werror in
# WARNFLAGS turns into a build failure. C libraries that do not know
# _DEFAULT_SOURCE simply ignore the extra definition.
CFLAGS := -D_BSD_SOURCE -D_DEFAULT_SOURCE -I$(CURDIR)/include/ \
  -I$(CURDIR)/src $(WARNFLAGS) $(CFLAGS)
# GCCVER is not set in this file -- presumably the included build system
# provides it; TODO confirm. Any value other than 2 selects C99 mode.
ifneq ($(GCCVER),2)
CFLAGS := $(CFLAGS) -std=c99
else
# __inline__ is a GCCism
CFLAGS := $(CFLAGS) -Dinline="__inline__"
endif
 
# NSBUILD is not set in this file -- presumably Makefile.tools defines it;
# TODO confirm.
include $(NSBUILD)/Makefile.top
 
# Extra installation rules
# Each INSTALL_ITEMS entry has the form <dir>:<file>[;<file>...].
# I and Is are scratch variables that are reassigned for every header group:
# I holds the part before the colon (it appears to be the install directory),
# Is the directory of the listed files in this source tree. Because every
# assignment uses :=, $(I) and $(Is) are expanded on the spot, so each entry
# keeps the values that were current on its own line.

# Top-level public headers
Is := include/parserutils
I := /include/parserutils
INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/errors.h;$(Is)/functypes.h;$(Is)/parserutils.h;$(Is)/types.h
 
# Charset headers
Is := include/parserutils/charset
I := /include/parserutils/charset
INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/codec.h;$(Is)/mibenum.h;$(Is)/utf16.h;$(Is)/utf8.h
 
# Input stream header
Is := include/parserutils/input
I := /include/parserutils/input
INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/inputstream.h
 
# Utility data structure headers
Is := include/parserutils/utils
I := /include/parserutils/utils
INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):$(Is)/buffer.h;$(Is)/stack.h;$(Is)/vector.h
 
# pkg-config template and the built library itself. OUTPUT is not set in this
# file -- presumably Makefile.top defines it; TODO confirm.
INSTALL_ITEMS := $(INSTALL_ITEMS) /lib/pkgconfig:lib$(COMPONENT).pc.in
INSTALL_ITEMS := $(INSTALL_ITEMS) /lib:$(OUTPUT)
/programs/network/netsurf/libparserutils/Makefile.config
0,0 → 1,7
# Configuration Makefile fragment
 
# Disable use of iconv in the input filter
# (uncomment the following line to define WITHOUT_ICONV_FILTER for the build)
# CFLAGS := $(CFLAGS) -DWITHOUT_ICONV_FILTER
 
# Cater for local configuration changes
# The leading '-' makes this include optional: make carries on without an
# error when Makefile.config.override does not exist.
-include Makefile.config.override
/programs/network/netsurf/libparserutils/README
0,0 → 1,123
LibParserUtils -- a utility library for parser building
=======================================================
 
Overview
--------
 
LibParserUtils provides various pieces of functionality that are useful
when writing parsers. These are:
 
+ A number of character set convertors
+ Mapping of character set names to/from MIB enum values
+ UTF-8 and UTF-16 (host endian) support functions
+ Various simple data structures (resizeable buffer, stack, vector)
+ A UTF-8 input stream
 
Requirements
------------
 
LibParserUtils requires the following tools:
 
+ A C99 capable C compiler
+ GNU make or compatible
+ Perl (for the testcases)
+ Pkg-config (for the testcases)
+ doxygen (for the API documentation)
 
For enhanced charset support, LibParserUtils requires an iconv()
implementation. If you don't have an implementation of iconv(),
this requirement may be disabled: see the "Disabling iconv()
support" section, below.
 
Compilation
-----------
 
The exact type of build may be configured by passing parameters to make.
Common usage is described below.
 
For a static library:
 
$ make
 
For a shared library:
 
$ make COMPONENT_TYPE=lib-shared
 
For a static library with debug enabled:
 
$ make BUILD=debug
 
To cross-compile a static library:
 
$ make TARGET=<target-platform>
 
Verification
------------
 
The library's functionality may be verified, thus:
 
$ make test
 
If you wish to see test coverage statistics, run:
 
$ make coverage
 
Then open build/coverage/index.html in a web browser.
 
In both cases, ensure that the same parameters to make are passed as when
building the library.
 
(Un)installation
----------------
 
To install the library:
 
$ make install
 
Ensure that the same parameters to make are passed as when building the
library.
 
To specify the installation prefix:
 
$ make install PREFIX=/path/to/prefix
 
To specify a staging directory for packaging:
 
$ make install DESTDIR=/path/to/directory
 
Items will be installed to $(DESTDIR)$(PREFIX)/
 
To uninstall:
 
$ make uninstall
 
API documentation
-----------------
 
Use doxygen to auto-generate API documentation, thus:
 
$ make docs
 
Then open build/docs/html/index.html in a web browser.
 
The test driver code in test/ may also provide some useful pointers.
 
Disabling iconv() support
-------------------------
 
Without iconv() support enabled, libparserutils only supports the
following character sets:
 
+ UTF-16 (platform-native endian)
+ UTF-8
+ ISO-8859-n
+ Windows-125n
+ US-ASCII
 
To disable iconv() support in libparserutils, do the following:
 
$ echo "CFLAGS += -DWITHOUT_ICONV_FILTER" \
>Makefile.config.override
 
Then build libparserutils as normal.
 
/programs/network/netsurf/libparserutils/build/Aliases
0,0 → 1,303
# > Unicode:Files.Aliases
# Mapping of character set encoding names to their canonical form
#
# Lines starting with a '#' are comments, blank lines are ignored.
#
# Based on http://www.iana.org/assignments/character-sets and
# http://www.iana.org/assignments/ianacharset-mib
#
# Canonical Form MIBenum Aliases...
#
US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII
ISO-10646-UTF-1 27 csISO10646UTF1
ISO_646.basic:1983 28 ref csISO646basic1983
INVARIANT 29 csINVARIANT
ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion
BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom
NATS-SEFI 31 iso-ir-8-1 csNATSSEFI
NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD
NATS-DANO 33 iso-ir-9-1 csNATSDANO
NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD
SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish
SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames
KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987
ISO-2022-KR 37 csISO2022KR
EUC-KR 38 csEUCKR EUCKR
ISO-2022-JP 39 csISO2022JP
ISO-2022-JP-2 40 csISO2022JP2
ISO-2022-CN 104
ISO-2022-CN-EXT 105
JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp
JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro
IT 22 iso-ir-15 ISO646-IT csISO15Italian
PT 43 iso-ir-16 ISO646-PT csISO16Portuguese
ES 23 iso-ir-17 ISO646-ES csISO17Spanish
greek7-old 44 iso-ir-18 csISO18Greek7Old
latin-greek 45 iso-ir-19 csISO19LatinGreek
DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German
NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French
Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1
ISO_5427 48 iso-ir-37 csISO5427Cyrillic
JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978
BS_viewdata 50 iso-ir-47 csISO47BSViewdata
INIS 51 iso-ir-49 csISO49INIS
INIS-8 52 iso-ir-50 csISO50INIS8
INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic
ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981
ISO_5428:1980 55 iso-ir-55 csISO5428Greek
GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988
GB_2312-80 57 iso-ir-58 chinese csISO58GB231280
NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1
NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2
NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French
videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1
PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2
ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2
MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian
JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208
greek7 64 iso-ir-88 csISO88Greek7
ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449
iso-ir-90 66 csISO90
JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a
JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b
JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd
JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand
JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd
JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana
ISO_2033-1983 73 iso-ir-98 e13b csISO2033
ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS
ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1
ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2
T.61-7bit 75 iso-ir-102 csISO102T617bit
T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit
ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3
ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4
ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic
CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1
CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2
CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr
ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic
ISO-8859-6-E 81 csISO88596E ISO_8859-6-E
ISO-8859-6-I 82 csISO88596I ISO_8859-6-I
ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7
T.101-G2 83 iso-ir-128 csISO128T101G2
ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8
ISO-8859-8-E 84 csISO88598E ISO_8859-8-E
ISO-8859-8-I 85 csISO88598I ISO_8859-8-I
CSN_369103 86 iso-ir-139 csISO139CSN369103
JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002
ISO_6937-2-add 14 iso-ir-142 csISOTextComm
IEC_P27-1 88 iso-ir-143 csISO143IECP271
ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5
JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian
JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian
ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9
greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT
NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba
ISO_6937-2-25 93 iso-ir-152 csISO6937Add
GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874
ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp
ISO_10367-box 96 iso-ir-155 csISO10367Box
ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10
latin-lap 97 lap iso-ir-158 csISO158Lap
JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990
DS_2089 99 DS2089 ISO646-DK dk csISO646Danish
us-dk 100 csUSDK
dk-us 101 csDKUS
JIS_X0201 15 X0201 csHalfWidthKatakana
KSC5636 102 ISO646-KR csKSC5636
ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2
ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4
DEC-MCS 2008 dec csDECMCS
hp-roman8 2004 roman8 r8 csHPRoman8
macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN
IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
IBM038 2029 EBCDIC-INT cp038 csIBM038
IBM273 2030 CP273 csIBM273
IBM274 2031 EBCDIC-BE CP274 csIBM274
IBM275 2032 EBCDIC-BR cp275 csIBM275
IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
IBM280 2035 CP280 ebcdic-cp-it csIBM280
IBM281 2036 EBCDIC-JP-E cp281 csIBM281
IBM284 2037 CP284 ebcdic-cp-es csIBM284
IBM285 2038 CP285 ebcdic-cp-gb csIBM285
IBM290 2039 cp290 EBCDIC-JP-kana csIBM290
IBM297 2040 cp297 ebcdic-cp-fr csIBM297
IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420
IBM423 2042 cp423 ebcdic-cp-gr csIBM423
IBM424 2043 cp424 ebcdic-cp-he csIBM424
IBM437 2011 cp437 437 csPC8CodePage437
IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500
IBM775 2087 cp775 csPC775Baltic
IBM850 2009 cp850 850 csPC850Multilingual
IBM851 2045 cp851 851 csIBM851
IBM852 2010 cp852 852 csPCp852
IBM855 2046 cp855 855 csIBM855
IBM857 2047 cp857 857 csIBM857
IBM860 2048 cp860 860 csIBM860
IBM861 2049 cp861 861 cp-is csIBM861
IBM862 2013 cp862 862 csPC862LatinHebrew
IBM863 2050 cp863 863 csIBM863
IBM864 2051 cp864 csIBM864
IBM865 2052 cp865 865 csIBM865
IBM866 2086 cp866 866 csIBM866
IBM868 2053 CP868 cp-ar csIBM868
IBM869 2054 cp869 869 cp-gr csIBM869
IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
IBM871 2056 CP871 ebcdic-cp-is csIBM871
IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880
IBM891 2058 cp891 csIBM891
IBM903 2059 cp903 csIBM903
IBM904 2060 cp904 904 csIBBM904
IBM905 2061 CP905 ebcdic-cp-tr csIBM905
IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918
IBM1026 2063 CP1026 csIBM1026
EBCDIC-AT-DE 2064 csIBMEBCDICATDE
EBCDIC-AT-DE-A 2065 csEBCDICATDEA
EBCDIC-CA-FR 2066 csEBCDICCAFR
EBCDIC-DK-NO 2067 csEBCDICDKNO
EBCDIC-DK-NO-A 2068 csEBCDICDKNOA
EBCDIC-FI-SE 2069 csEBCDICFISE
EBCDIC-FI-SE-A 2070 csEBCDICFISEA
EBCDIC-FR 2071 csEBCDICFR
EBCDIC-IT 2072 csEBCDICIT
EBCDIC-PT 2073 csEBCDICPT
EBCDIC-ES 2074 csEBCDICES
EBCDIC-ES-A 2075 csEBCDICESA
EBCDIC-ES-S 2076 csEBCDICESS
EBCDIC-UK 2077 csEBCDICUK
EBCDIC-US 2078 csEBCDICUS
UNKNOWN-8BIT 2079 csUnknown8BiT
MNEMONIC 2080 csMnemonic
MNEM 2081 csMnem
VISCII 2082 csVISCII
VIQR 2083 csVIQR
KOI8-R 2084 csKOI8R
KOI8-U 2088
IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro
IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro
IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro
IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro
IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro
IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro
IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro
IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro
IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro
IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro
Big5-HKSCS 2101
IBM1047 2102 IBM-1047
PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian
Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251
KOI7-switched 2105
UNICODE-1-1 1010 csUnicode11
SCSU 1011
UTF-7 1012
UTF-16BE 1013
UTF-16LE 1014
UTF-16 1015
CESU-8 1016 csCESU-8
UTF-32 1017
UTF-32BE 1018
UTF-32LE 1019
BOCU-1 1020 csBOCU-1
UNICODE-1-1-UTF-7 103 csUnicode11UTF7
UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8
ISO-8859-13 109 8859_13 ISO8859-13
ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14
ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15
ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10
GBK 113 CP936 MS936 windows-936
GB18030 114
OSD_EBCDIC_DF04_15 115
OSD_EBCDIC_DF03_IRV 116
OSD_EBCDIC_DF04_1 117
JIS_Encoding 16 csJISEncoding
Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS
EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP
Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese
ISO-10646-UCS-Basic 1002 csUnicodeASCII
ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646
ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261
ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268
ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276
ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264
ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265
ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1
ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1
ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2
ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5
Adobe-Standard-Encoding 2005 csAdobeStandardEncoding
Ventura-US 2006 csVenturaUS
Ventura-International 2007 csVenturaInternational
PC8-Danish-Norwegian 2012 csPC8DanishNorwegian
PC8-Turkish 2014 csPC8Turkish
IBM-Symbols 2015 csIBMSymbols
IBM-Thai 2016 csIBMThai
HP-Legal 2017 csHPLegal
HP-Pi-font 2018 csHPPiFont
HP-Math8 2019 csHPMath8
Adobe-Symbol-Encoding 2020 csHPPSMath
HP-DeskTop 2021 csHPDesktop
Ventura-Math 2022 csVenturaMath
Microsoft-Publishing 2023 csMicrosoftPublishing
Windows-31J 2024 csWindows31J
GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB
Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE x-x-big5
windows-1250 2250 CP1250 MS-EE
windows-1251 2251 CP1251 MS-CYRL
windows-1252 2252 CP1252 MS-ANSI
windows-1253 2253 CP1253 MS-GREEK
windows-1254 2254 CP1254 MS-TURK
windows-1255 2255
windows-1256 2256 CP1256 MS-ARAB
windows-1257 2257 CP1257 WINBALTRIM
windows-1258 2258
TIS-620 2259
HZ-GB-2312 2085
 
# Additional encodings not defined by IANA
 
# Arbitrary allocations
#CP737 3001
#CP853 3002
#CP856 3003
CP874 3004 WINDOWS-874
#CP922 3005
#CP1046 3006
#CP1124 3007
#CP1125 3008 WINDOWS-1125
#CP1129 3009
#CP1133 3010 IBM-CP1133
#CP1161 3011 IBM-1161 IBM1161 CSIBM1161
#CP1162 3012 IBM-1162 IBM1162 CSIBM1162
#CP1163 3013 IBM-1163 IBM1163 CSIBM1163
#GEORGIAN-ACADEMY 3014
#GEORGIAN-PS 3015
#KOI8-RU 3016
#KOI8-T 3017
#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC
#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN
#MACGREEK 3020 X-MAC-GREEK MAC-GREEK
#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW
#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND
#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA
#MACTHAI 3024 X-MAC-THAI MAC-THAI
#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH
#MULELAO-1 3026
CP949 3027 WINDOWS-949
 
# From Unicode Lib
ISO-IR-182 4000
ISO-IR-197 4002
ISO-2022-JP-1 4008
MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC
MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN
MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN
JOHAB 4012
ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11
X-CURRENT 4999 X-SYSTEM
X-ACORN-LATIN1 5001
X-ACORN-FUZZY 5002
/programs/network/netsurf/libparserutils/build/Doxyfile
0,0 → 1,1237
# Doxyfile 1.4.6
 
# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project
#
# All text after a hash (#) is considered a comment and will be ignored
# The format is:
# TAG = value [value, ...]
# For lists items can also be appended using:
# TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (" ")
 
#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------
 
# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
# by quotes) that should identify the project.
 
PROJECT_NAME = Libparserutils
 
# The PROJECT_NUMBER tag can be used to enter a project or revision number.
# This could be handy for archiving the generated documentation or
# if some version control system is used.
 
PROJECT_NUMBER =
 
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
# If a relative path is entered, it will be relative to the location
# where doxygen was started. If left blank the current directory will be used.
 
OUTPUT_DIRECTORY = build/docs
 
# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
# 4096 sub-directories (in 2 levels) under the output directory of each output
# format and will distribute the generated files over these directories.
# Enabling this option can be useful when feeding doxygen a huge amount of
# source files, where putting all generated files in the same directory would
# otherwise cause performance problems for the file system.
 
CREATE_SUBDIRS = NO
 
# The OUTPUT_LANGUAGE tag is used to specify the language in which all
# documentation generated by doxygen is written. Doxygen will use this
# information to generate all constant output in the proper language.
# The default language is English, other supported languages are:
# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish,
# Dutch, Finnish, French, German, Greek, Hungarian, Italian, Japanese,
# Japanese-en (Japanese with English messages), Korean, Korean-en, Norwegian,
# Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish,
# Swedish, and Ukrainian.
 
OUTPUT_LANGUAGE = English
 
# This tag can be used to specify the encoding used in the generated output.
# The encoding is not always determined by the language that is chosen,
# but also whether or not the output is meant for Windows or non-Windows users.
# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES
# forces the Windows encoding (this is the default for the Windows binary),
# whereas setting the tag to NO uses a Unix-style encoding (the default for
# all platforms other than Windows).
 
USE_WINDOWS_ENCODING = NO
 
# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
# include brief member descriptions after the members that are listed in
# the file and class documentation (similar to JavaDoc).
# Set to NO to disable this.
 
BRIEF_MEMBER_DESC = YES
 
# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
# the brief description of a member or function before the detailed description.
# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
# brief descriptions will be completely suppressed.
 
REPEAT_BRIEF = YES
 
# This tag implements a quasi-intelligent brief description abbreviator
# that is used to form the text in various listings. Each string
# in this list, if found as the leading text of the brief description, will be
# stripped from the text and the result after processing the whole list, is
# used as the annotated text. Otherwise, the brief description is used as-is.
# If left blank, the following values are used ("$name" is automatically
# replaced with the name of the entity): "The $name class" "The $name widget"
# "The $name file" "is" "provides" "specifies" "contains"
# "represents" "a" "an" "the"
 
ABBREVIATE_BRIEF =
 
# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
# Doxygen will generate a detailed section even if there is only a brief
# description.
 
ALWAYS_DETAILED_SEC = NO
 
# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
# inherited members of a class in the documentation of that class as if those
# members were ordinary class members. Constructors, destructors and assignment
# operators of the base classes will not be shown.
 
INLINE_INHERITED_MEMB = NO
 
# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
# path before files name in the file list and in the header files. If set
# to NO the shortest path that makes the file name unique will be used.
 
FULL_PATH_NAMES = YES
 
# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
# can be used to strip a user-defined part of the path. Stripping is
# only done if one of the specified strings matches the left-hand part of
# the path. The tag can be used to show relative paths in the file list.
# If left blank the directory from which doxygen is run is used as the
# path to strip.
 
STRIP_FROM_PATH =
 
# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
# the path mentioned in the documentation of a class, which tells
# the reader which header file to include in order to use a class.
# If left blank only the name of the header file containing the class
# definition is used. Otherwise one should specify the include paths that
# are normally passed to the compiler using the -I flag.
 
STRIP_FROM_INC_PATH =
 
# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
# (but less readable) file names. This can be useful if your file system
# doesn't support long names, as on DOS, Mac, or CD-ROM.
 
SHORT_NAMES = NO
 
# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
# will interpret the first line (until the first dot) of a JavaDoc-style
# comment as the brief description. If set to NO, the JavaDoc
# comments will behave just like the Qt-style comments (thus requiring an
# explicit @brief command for a brief description).
 
JAVADOC_AUTOBRIEF = YES
 
# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
# treat a multi-line C++ special comment block (i.e. a block of //! or ///
# comments) as a brief description. This used to be the default behaviour.
# The new default is to treat a multi-line C++ comment block as a detailed
# description. Set this tag to YES if you prefer the old behaviour instead.
 
MULTILINE_CPP_IS_BRIEF = NO
 
# If the DETAILS_AT_TOP tag is set to YES then Doxygen
# will output the detailed description near the top, like JavaDoc.
# If set to NO, the detailed description appears after the member
# documentation.
 
DETAILS_AT_TOP = NO
 
# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
# member inherits the documentation from any documented member that it
# re-implements.
 
INHERIT_DOCS = YES
 
# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
# a new page for each member. If set to NO, the documentation of a member will
# be part of the file/class/namespace that contains it.
 
SEPARATE_MEMBER_PAGES = NO
 
# The TAB_SIZE tag can be used to set the number of spaces in a tab.
# Doxygen uses this value to replace tabs by spaces in code fragments.
 
TAB_SIZE = 8
 
# This tag can be used to specify a number of aliases that acts
# as commands in the documentation. An alias has the form "name=value".
# For example adding "sideeffect=\par Side Effects:\n" will allow you to
# put the command \sideeffect (or @sideeffect) in the documentation, which
# will result in a user-defined paragraph with heading "Side Effects:".
# You can put \n's in the value part of an alias to insert newlines.
 
ALIASES =
 
# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
# sources only. Doxygen will then generate output that is more tailored for C.
# For instance, some of the names that are used will be different. The list
# of all members will be omitted, etc.
 
OPTIMIZE_OUTPUT_FOR_C = YES
 
# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
# sources only. Doxygen will then generate output that is more tailored for Java.
# For instance, namespaces will be presented as packages, qualified scopes
# will look different, etc.
 
OPTIMIZE_OUTPUT_JAVA = NO
 
# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
# include (a tag file for) the STL sources as input, then you should
# set this tag to YES in order to let doxygen match functions declarations and
# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
# func(std::string) {}). This also makes the inheritance and collaboration
# diagrams that involve STL classes more complete and accurate.
 
BUILTIN_STL_SUPPORT = NO
 
# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
# tag is set to YES, then doxygen will reuse the documentation of the first
# member in the group (if any) for the other members of the group. By default
# all members of a group must be documented explicitly.
 
DISTRIBUTE_GROUP_DOC = NO
 
# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
# the same type (for instance a group of public functions) to be put as a
# subgroup of that type (e.g. under the Public Functions section). Set it to
# NO to prevent subgrouping. Alternatively, this can be done per class using
# the \nosubgrouping command.
 
SUBGROUPING = YES
 
#---------------------------------------------------------------------------
# Build related configuration options
#---------------------------------------------------------------------------
 
# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
# documentation are documented, even if no documentation was available.
# Private class members and static file members will be hidden unless
# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
 
EXTRACT_ALL = YES
 
# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
# will be included in the documentation.
 
EXTRACT_PRIVATE = YES
 
# If the EXTRACT_STATIC tag is set to YES all static members of a file
# will be included in the documentation.
 
EXTRACT_STATIC = YES
 
# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
# defined locally in source files will be included in the documentation.
# If set to NO only classes defined in header files are included.
 
EXTRACT_LOCAL_CLASSES = YES
 
# This flag is only useful for Objective-C code. When set to YES local
# methods, which are defined in the implementation section but not in
# the interface are included in the documentation.
# If set to NO (the default) only methods in the interface are included.
 
EXTRACT_LOCAL_METHODS = NO
 
# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
# undocumented members of documented classes, files or namespaces.
# If set to NO (the default) these members will be included in the
# various overviews, but no documentation section is generated.
# This option has no effect if EXTRACT_ALL is enabled.
 
HIDE_UNDOC_MEMBERS = NO
 
# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
# undocumented classes that are normally visible in the class hierarchy.
# If set to NO (the default) these classes will be included in the various
# overviews. This option has no effect if EXTRACT_ALL is enabled.
 
HIDE_UNDOC_CLASSES = NO
 
# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
# friend (class|struct|union) declarations.
# If set to NO (the default) these declarations will be included in the
# documentation.
 
HIDE_FRIEND_COMPOUNDS = NO
 
# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
# documentation blocks found inside the body of a function.
# If set to NO (the default) these blocks will be appended to the
# function's detailed documentation block.
 
HIDE_IN_BODY_DOCS = NO
 
# The INTERNAL_DOCS tag determines if documentation
# that is typed after a \internal command is included. If the tag is set
# to NO (the default) then the documentation will be excluded.
# Set it to YES to include the internal documentation.
 
INTERNAL_DOCS = NO
 
# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
# file names in lower-case letters. If set to YES upper-case letters are also
# allowed. This is useful if you have classes or files whose names only differ
# in case and if your file system supports case sensitive file names. Windows
# and Mac users are advised to set this option to NO.
 
CASE_SENSE_NAMES = YES
 
# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
# will show members with their full class and namespace scopes in the
# documentation. If set to YES the scope will be hidden.
 
HIDE_SCOPE_NAMES = NO
 
# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
# will put a list of the files that are included by a file in the documentation
# of that file.
 
SHOW_INCLUDE_FILES = YES
 
# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
# is inserted in the documentation for inline members.
 
INLINE_INFO = YES
 
# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
# will sort the (detailed) documentation of file and class members
# alphabetically by member name. If set to NO the members will appear in
# declaration order.
 
SORT_MEMBER_DOCS = YES
 
# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
# brief documentation of file, namespace and class members alphabetically
# by member name. If set to NO (the default) the members will appear in
# declaration order.
 
SORT_BRIEF_DOCS = NO
 
# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
# sorted by fully-qualified names, including namespaces. If set to
# NO (the default), the class list will be sorted only by class name,
# not including the namespace part.
# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
# Note: This option applies only to the class list, not to the
# alphabetical list.
 
SORT_BY_SCOPE_NAME = NO
 
# The GENERATE_TODOLIST tag can be used to enable (YES) or
# disable (NO) the todo list. This list is created by putting \todo
# commands in the documentation.
 
GENERATE_TODOLIST = YES
 
# The GENERATE_TESTLIST tag can be used to enable (YES) or
# disable (NO) the test list. This list is created by putting \test
# commands in the documentation.
 
GENERATE_TESTLIST = YES
 
# The GENERATE_BUGLIST tag can be used to enable (YES) or
# disable (NO) the bug list. This list is created by putting \bug
# commands in the documentation.
 
GENERATE_BUGLIST = YES
 
# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
# disable (NO) the deprecated list. This list is created by putting
# \deprecated commands in the documentation.
 
GENERATE_DEPRECATEDLIST= YES
 
# The ENABLED_SECTIONS tag can be used to enable conditional
# documentation sections, marked by \if sectionname ... \endif.
 
ENABLED_SECTIONS =
 
# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
# the initial value of a variable or define consists of for it to appear in
# the documentation. If the initializer consists of more lines than specified
# here it will be hidden. Use a value of 0 to hide initializers completely.
# The appearance of the initializer of individual variables and defines in the
# documentation can be controlled using \showinitializer or \hideinitializer
# command in the documentation regardless of this setting.
 
MAX_INITIALIZER_LINES = 30
 
# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
# at the bottom of the documentation of classes and structs. If set to YES the
# list will mention the files that were used to generate the documentation.
 
SHOW_USED_FILES = YES
 
# If the sources in your project are distributed over multiple directories
# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
# in the documentation. The default is NO.
 
SHOW_DIRECTORIES = YES
 
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from the
# version control system). Doxygen will invoke the program by executing (via
# popen()) the command <command> <input-file>, where <command> is the value of
# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
# provided by doxygen. Whatever the program writes to standard output
# is used as the file version. See the manual for examples.
 
FILE_VERSION_FILTER =
 
#---------------------------------------------------------------------------
# configuration options related to warning and progress messages
#---------------------------------------------------------------------------
 
# The QUIET tag can be used to turn on/off the messages that are generated
# by doxygen. Possible values are YES and NO. If left blank NO is used.
 
QUIET = NO
 
# The WARNINGS tag can be used to turn on/off the warning messages that are
# generated by doxygen. Possible values are YES and NO. If left blank
# NO is used.
 
WARNINGS = YES
 
# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
# automatically be disabled.
 
WARN_IF_UNDOCUMENTED = YES
 
# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
# potential errors in the documentation, such as not documenting some
# parameters in a documented function, or documenting parameters that
# don't exist or using markup commands wrongly.
 
WARN_IF_DOC_ERROR = YES
 
# The WARN_NO_PARAMDOC option can be enabled to get warnings for
# functions that are documented, but have no documentation for their parameters
# or return value. If set to NO (the default) doxygen will only warn about
# wrong or incomplete parameter documentation, but not about the absence of
# documentation.
 
WARN_NO_PARAMDOC = NO
 
# The WARN_FORMAT tag determines the format of the warning messages that
# doxygen can produce. The string should contain the $file, $line, and $text
# tags, which will be replaced by the file and line number from which the
# warning originated and the warning text. Optionally the format may contain
# $version, which will be replaced by the version of the file (if it could
# be obtained via FILE_VERSION_FILTER)
 
WARN_FORMAT = "$file:$line: $text"
 
# The WARN_LOGFILE tag can be used to specify a file to which warning
# and error messages should be written. If left blank the output is written
# to stderr.
 
WARN_LOGFILE =
 
#---------------------------------------------------------------------------
# configuration options related to the input files
#---------------------------------------------------------------------------
 
# The INPUT tag can be used to specify the files and/or directories that contain
# documented source files. You may enter file names like "myfile.cpp" or
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.
 
INPUT = include src
 
# If the value of the INPUT tag contains directories, you can use the
# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
# and *.h) to filter out the source-files in the directories. If left
# blank the following patterns are tested:
# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py
 
FILE_PATTERNS = *.c *.h
 
# The RECURSIVE tag can be used to specify whether or not subdirectories
# should be searched for input files as well. Possible values are YES and NO.
# If left blank NO is used.
 
RECURSIVE = YES
 
# The EXCLUDE tag can be used to specify files and/or directories that should
# be excluded from the INPUT source files. This way you can easily exclude a
# subdirectory from a directory tree whose root is specified with the INPUT tag.
 
EXCLUDE =
 
# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
# directories that are symbolic links (a Unix filesystem feature) are excluded
# from the input.
 
EXCLUDE_SYMLINKS = NO
 
# If the value of the INPUT tag contains directories, you can use the
# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
# certain files from those directories. Note that the wildcards are matched
# against the file with absolute path, so to exclude all test directories
# for example use the pattern */test/*
 
EXCLUDE_PATTERNS = */.svn/*
 
# The EXAMPLE_PATH tag can be used to specify one or more files or
# directories that contain example code fragments that are included (see
# the \include command).
 
EXAMPLE_PATH =
 
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
# and *.h) to filter out the source-files in the directories. If left
# blank all files are included.
 
EXAMPLE_PATTERNS =
 
# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
# searched for input files to be used with the \include or \dontinclude
# commands irrespective of the value of the RECURSIVE tag.
# Possible values are YES and NO. If left blank NO is used.
 
EXAMPLE_RECURSIVE = NO
 
# The IMAGE_PATH tag can be used to specify one or more files or
# directories that contain images that are included in the documentation (see
# the \image command).
 
IMAGE_PATH =
 
# The INPUT_FILTER tag can be used to specify a program that doxygen should
# invoke to filter for each input file. Doxygen will invoke the filter program
# by executing (via popen()) the command <filter> <input-file>, where <filter>
# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
# input file. Doxygen will then use the output that the filter program writes
# to standard output. If FILTER_PATTERNS is specified, this tag will be
# ignored.
 
INPUT_FILTER =
 
# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
# basis. Doxygen will compare the file name with each pattern and apply the
# filter if there is a match. The filters are a list of the form:
# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
# is applied to all files.
 
FILTER_PATTERNS =
 
# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
# INPUT_FILTER) will be used to filter the input files when producing source
# files to browse (i.e. when SOURCE_BROWSER is set to YES).
 
FILTER_SOURCE_FILES = NO
 
#---------------------------------------------------------------------------
# configuration options related to source browsing
#---------------------------------------------------------------------------
 
# If the SOURCE_BROWSER tag is set to YES then a list of source files will
# be generated. Documented entities will be cross-referenced with these sources.
# Note: To get rid of all source code in the generated output, make sure also
# VERBATIM_HEADERS is set to NO.
 
SOURCE_BROWSER = YES
 
# Setting the INLINE_SOURCES tag to YES will include the body
# of functions and classes directly in the documentation.
 
INLINE_SOURCES = NO
 
# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
# doxygen to hide any special comment blocks from generated source code
# fragments. Normal C and C++ comments will always remain visible.
 
STRIP_CODE_COMMENTS = YES
 
# If the REFERENCED_BY_RELATION tag is set to YES (the default)
# then for each documented function all documented
# functions referencing it will be listed.
 
REFERENCED_BY_RELATION = YES
 
# If the REFERENCES_RELATION tag is set to YES (the default)
# then for each documented function all documented entities
# called/used by that function will be listed.
 
REFERENCES_RELATION = YES
 
# If the USE_HTAGS tag is set to YES then the references to source code
# will point to the HTML generated by the htags(1) tool instead of doxygen
# built-in source browser. The htags tool is part of GNU's global source
# tagging system (see http://www.gnu.org/software/global/global.html). You
# will need version 4.8.6 or higher.
 
USE_HTAGS = NO
 
# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
# will generate a verbatim copy of the header file for each class for
# which an include is specified. Set to NO to disable this.
 
VERBATIM_HEADERS = YES
 
#---------------------------------------------------------------------------
# configuration options related to the alphabetical class index
#---------------------------------------------------------------------------
 
# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
# of all compounds will be generated. Enable this if the project
# contains a lot of classes, structs, unions or interfaces.
 
ALPHABETICAL_INDEX = NO
 
# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
# in which this list will be split (can be a number in the range [1..20])
 
COLS_IN_ALPHA_INDEX = 5
 
# In case all classes in a project start with a common prefix, all
# classes will be put under the same header in the alphabetical index.
# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
# should be ignored while generating the index headers.
 
IGNORE_PREFIX =
 
#---------------------------------------------------------------------------
# configuration options related to the HTML output
#---------------------------------------------------------------------------
 
# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
# generate HTML output.
 
GENERATE_HTML = YES
 
# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `html' will be used as the default path.
 
HTML_OUTPUT = html
 
# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
# doxygen will generate files with .html extension.
 
HTML_FILE_EXTENSION = .html
 
# The HTML_HEADER tag can be used to specify a personal HTML header for
# each generated HTML page. If it is left blank doxygen will generate a
# standard header.
 
HTML_HEADER =
 
# The HTML_FOOTER tag can be used to specify a personal HTML footer for
# each generated HTML page. If it is left blank doxygen will generate a
# standard footer.
 
HTML_FOOTER =
 
# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
# style sheet that is used by each HTML page. It can be used to
# fine-tune the look of the HTML output. If the tag is left blank doxygen
# will generate a default style sheet. Note that doxygen will try to copy
# the style sheet file to the HTML output directory, so don't put your own
# stylesheet in the HTML output directory as well, or it will be erased!
 
HTML_STYLESHEET =
 
# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
# files or namespaces will be aligned in HTML using tables. If set to
# NO a bullet list will be used.
 
HTML_ALIGN_MEMBERS = YES
 
# If the GENERATE_HTMLHELP tag is set to YES, additional index files
# will be generated that can be used as input for tools like the
# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
# of the generated HTML documentation.
 
GENERATE_HTMLHELP = NO
 
# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
# be used to specify the file name of the resulting .chm file. You
# can add a path in front of the file if the result should not be
# written to the html output directory.
 
CHM_FILE =
 
# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
# be used to specify the location (absolute path including file name) of
# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
# the HTML help compiler on the generated index.hhp.
 
HHC_LOCATION =
 
# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
# controls if a separate .chi index file is generated (YES) or that
# it should be included in the master .chm file (NO).
 
GENERATE_CHI = NO
 
# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
# controls whether a binary table of contents is generated (YES) or a
# normal table of contents (NO) in the .chm file.
 
BINARY_TOC = NO
 
# The TOC_EXPAND flag can be set to YES to add extra items for group members
# to the contents of the HTML help documentation and to the tree view.
 
TOC_EXPAND = NO
 
# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
# top of each HTML page. The value NO (the default) enables the index and
# the value YES disables it.
 
DISABLE_INDEX = NO
 
# This tag can be used to set the number of enum values (range [1..20])
# that doxygen will group on one line in the generated HTML documentation.
 
ENUM_VALUES_PER_LINE = 4
 
# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
# generated containing a tree-like index structure (just like the one that
# is generated for HTML Help). For this to work a browser that supports
# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
# probably better off using the HTML help feature.
 
GENERATE_TREEVIEW = NO
 
# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
# used to set the initial width (in pixels) of the frame in which the tree
# is shown.
 
TREEVIEW_WIDTH = 250
 
#---------------------------------------------------------------------------
# configuration options related to the LaTeX output
#---------------------------------------------------------------------------
 
# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
# generate Latex output.
 
GENERATE_LATEX = NO
 
# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `latex' will be used as the default path.
 
LATEX_OUTPUT = latex
 
# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
# invoked. If left blank `latex' will be used as the default command name.
 
LATEX_CMD_NAME = latex
 
# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
# generate index for LaTeX. If left blank `makeindex' will be used as the
# default command name.
 
MAKEINDEX_CMD_NAME = makeindex
 
# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
# LaTeX documents. This may be useful for small projects and may help to
# save some trees in general.
 
COMPACT_LATEX = NO
 
# The PAPER_TYPE tag can be used to set the paper type that is used
# by the printer. Possible values are: a4, a4wide, letter, legal and
# executive. If left blank a4wide will be used.
 
PAPER_TYPE = a4wide
 
# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
# packages that should be included in the LaTeX output.
 
EXTRA_PACKAGES =
 
# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
# the generated latex document. The header should contain everything until
# the first chapter. If it is left blank doxygen will generate a
# standard header. Notice: only use this tag if you know what you are doing!
 
LATEX_HEADER =
 
# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
# is prepared for conversion to pdf (using ps2pdf). The pdf file will
# contain links (just like the HTML output) instead of page references.
# This makes the output suitable for online browsing using a pdf viewer.
 
PDF_HYPERLINKS = NO
 
# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
# plain latex in the generated Makefile. Set this option to YES to get a
# higher quality PDF documentation.
 
USE_PDFLATEX = NO
 
# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
# command to the generated LaTeX files. This will instruct LaTeX to keep
# running if errors occur, instead of asking the user for help.
# This option is also used when generating formulas in HTML.
 
LATEX_BATCHMODE = NO
 
# If LATEX_HIDE_INDICES is set to YES then doxygen will not
# include the index chapters (such as File Index, Compound Index, etc.)
# in the output.
 
LATEX_HIDE_INDICES = NO
 
#---------------------------------------------------------------------------
# configuration options related to the RTF output
#---------------------------------------------------------------------------
 
# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
# The RTF output is optimized for Word 97 and may not look very pretty with
# other RTF readers or editors.
 
GENERATE_RTF = NO
 
# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `rtf' will be used as the default path.
 
RTF_OUTPUT = rtf
 
# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
# RTF documents. This may be useful for small projects and may help to
# save some trees in general.
 
COMPACT_RTF = NO
 
# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
# will contain hyperlink fields. The RTF file will
# contain links (just like the HTML output) instead of page references.
# This makes the output suitable for online browsing using WORD or other
# programs which support those fields.
# Note: wordpad (write) and others do not support links.
 
RTF_HYPERLINKS = NO
 
# Load stylesheet definitions from file. Syntax is similar to doxygen's
# config file, i.e. a series of assignments. You only have to provide
# replacements, missing definitions are set to their default value.
 
RTF_STYLESHEET_FILE =
 
# Set optional variables used in the generation of an rtf document.
# Syntax is similar to doxygen's config file.
 
RTF_EXTENSIONS_FILE =
 
#---------------------------------------------------------------------------
# configuration options related to the man page output
#---------------------------------------------------------------------------
 
# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
# generate man pages
 
GENERATE_MAN = NO
 
# The MAN_OUTPUT tag is used to specify where the man pages will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `man' will be used as the default path.
 
MAN_OUTPUT = man
 
# The MAN_EXTENSION tag determines the extension that is added to
# the generated man pages (default is the subroutine's section .3)
 
MAN_EXTENSION = .3
 
# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
# then it will generate one additional man file for each entity
# documented in the real man page(s). These additional files
# only source the real man page, but without them the man command
# would be unable to find the correct page. The default is NO.
 
MAN_LINKS = NO
 
#---------------------------------------------------------------------------
# configuration options related to the XML output
#---------------------------------------------------------------------------
 
# If the GENERATE_XML tag is set to YES Doxygen will
# generate an XML file that captures the structure of
# the code including all documentation.
 
GENERATE_XML = NO
 
# The XML_OUTPUT tag is used to specify where the XML pages will be put.
# If a relative path is entered the value of OUTPUT_DIRECTORY will be
# put in front of it. If left blank `xml' will be used as the default path.
 
XML_OUTPUT = xml
 
# The XML_SCHEMA tag can be used to specify an XML schema,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
 
XML_SCHEMA =
 
# The XML_DTD tag can be used to specify an XML DTD,
# which can be used by a validating XML parser to check the
# syntax of the XML files.
 
XML_DTD =
 
# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
# dump the program listings (including syntax highlighting
# and cross-referencing information) to the XML output. Note that
# enabling this will significantly increase the size of the XML output.
 
XML_PROGRAMLISTING = YES
 
#---------------------------------------------------------------------------
# configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
 
# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
# generate an AutoGen Definitions (see autogen.sf.net) file
# that captures the structure of the code including all
# documentation. Note that this feature is still experimental
# and incomplete at the moment.
 
GENERATE_AUTOGEN_DEF = NO
 
#---------------------------------------------------------------------------
# configuration options related to the Perl module output
#---------------------------------------------------------------------------
 
# If the GENERATE_PERLMOD tag is set to YES Doxygen will
# generate a Perl module file that captures the structure of
# the code including all documentation. Note that this
# feature is still experimental and incomplete at the
# moment.
 
GENERATE_PERLMOD = NO
 
# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
# the necessary Makefile rules, Perl scripts and LaTeX code to be able
# to generate PDF and DVI output from the Perl module output.
 
PERLMOD_LATEX = NO
 
# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
# nicely formatted so it can be parsed by a human reader. This is useful
# if you want to understand what is going on. On the other hand, if this
# tag is set to NO the size of the Perl module output will be much smaller
# and Perl will parse it just the same.
 
PERLMOD_PRETTY = YES
 
# The names of the make variables in the generated doxyrules.make file
# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
# This is useful so different doxyrules.make files included by the same
# Makefile don't overwrite each other's variables.
 
PERLMOD_MAKEVAR_PREFIX =
 
#---------------------------------------------------------------------------
# Configuration options related to the preprocessor
#---------------------------------------------------------------------------
 
# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
# evaluate all C-preprocessor directives found in the sources and include
# files.
 
ENABLE_PREPROCESSING = YES
 
# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
# names in the source code. If set to NO (the default) only conditional
# compilation will be performed. Macro expansion can be done in a controlled
# way by setting EXPAND_ONLY_PREDEF to YES.
 
MACRO_EXPANSION = NO
 
# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
# then the macro expansion is limited to the macros specified with the
# PREDEFINED and EXPAND_AS_DEFINED tags.
 
EXPAND_ONLY_PREDEF = NO
 
# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
# in the INCLUDE_PATH (see below) will be searched if a #include is found.
 
SEARCH_INCLUDES = YES
 
# The INCLUDE_PATH tag can be used to specify one or more directories that
# contain include files that are not input files but should be processed by
# the preprocessor.
 
INCLUDE_PATH =
 
# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
# patterns (like *.h and *.hpp) to filter out the header-files in the
# directories. If left blank, the patterns specified with FILE_PATTERNS will
# be used.
 
INCLUDE_FILE_PATTERNS =
 
# The PREDEFINED tag can be used to specify one or more macro names that
# are defined before the preprocessor is started (similar to the -D option of
# gcc). The argument of the tag is a list of macros of the form: name
# or name=definition (no spaces). If the definition and the = are
# omitted =1 is assumed. To prevent a macro definition from being
# undefined via #undef or recursively expanded use the := operator
# instead of the = operator.
 
PREDEFINED =
 
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
# this tag can be used to specify a list of macro names that should be expanded.
# The macro definition that is found in the sources will be used.
# Use the PREDEFINED tag if you want to use a different macro definition.
 
EXPAND_AS_DEFINED =
 
# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
# doxygen's preprocessor will remove all function-like macros that are alone
# on a line, have an all uppercase name, and do not end with a semicolon. Such
# function macros are typically used for boiler-plate code, and will confuse
# the parser if not removed.
 
SKIP_FUNCTION_MACROS = YES
 
#---------------------------------------------------------------------------
# Configuration options related to external references
#---------------------------------------------------------------------------
 
# The TAGFILES option can be used to specify one or more tagfiles.
# Optionally an initial location of the external documentation
# can be added for each tagfile. The format of a tag file without
# this location is as follows:
# TAGFILES = file1 file2 ...
# Adding location for the tag files is done as follows:
# TAGFILES = file1=loc1 "file2 = loc2" ...
# where "loc1" and "loc2" can be relative or absolute paths or
# URLs. If a location is present for each tag, the installdox tool
# does not have to be run to correct the links.
# Note that each tag file must have a unique name
# (where the name does NOT include the path)
# If a tag file is not located in the directory in which doxygen
# is run, you must also specify the path to the tagfile here.
 
TAGFILES =
 
# When a file name is specified after GENERATE_TAGFILE, doxygen will create
# a tag file that is based on the input files it reads.
 
GENERATE_TAGFILE =
 
# If the ALLEXTERNALS tag is set to YES all external classes will be listed
# in the class index. If set to NO only the inherited external classes
# will be listed.
 
ALLEXTERNALS = NO
 
# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
# in the modules index. If set to NO, only the current project's groups will
# be listed.
 
EXTERNAL_GROUPS = YES
 
# The PERL_PATH should be the absolute path and name of the perl script
# interpreter (i.e. the result of `which perl').
 
PERL_PATH = /usr/bin/perl
 
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
 
# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
# or super classes. Setting the tag to NO turns the diagrams off. Note that
# this option is superseded by the HAVE_DOT option below. This is only a
# fallback. It is recommended to install and use dot, since it yields more
# powerful graphs.
 
CLASS_DIAGRAMS = YES
 
# If set to YES, the inheritance and collaboration graphs will hide
# inheritance and usage relations if the target is undocumented
# or is not a class.
 
HIDE_UNDOC_RELATIONS = YES
 
# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
# available from the path. This tool is part of Graphviz, a graph visualization
# toolkit from AT&T and Lucent Bell Labs. The other options in this section
# have no effect if this option is set to NO (the default)
 
HAVE_DOT = NO
 
# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for each documented class showing the direct and
# indirect inheritance relations. Setting this tag to YES will force the
# CLASS_DIAGRAMS tag to NO.
 
CLASS_GRAPH = YES
 
# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for each documented class showing the direct and
# indirect implementation dependencies (inheritance, containment, and
# class references variables) of the class with other documented classes.
 
COLLABORATION_GRAPH = YES
 
# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
# will generate a graph for groups, showing the direct group dependencies.
 
GROUP_GRAPHS = YES
 
# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
# collaboration diagrams in a style similar to the OMG's Unified Modeling
# Language.
 
UML_LOOK = NO
 
# If set to YES, the inheritance and collaboration graphs will show the
# relations between templates and their instances.
 
TEMPLATE_RELATIONS = NO
 
# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
# tags are set to YES then doxygen will generate a graph for each documented
# file showing the direct and indirect include dependencies of the file with
# other documented files.
 
INCLUDE_GRAPH = YES
 
# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
# documented header file showing the documented files that directly or
# indirectly include this file.
 
INCLUDED_BY_GRAPH = YES
 
# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will
# generate a call dependency graph for every global function or class method.
# Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable call graphs for selected
# functions only using the \callgraph command.
 
CALL_GRAPH = NO
 
# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
# will generate a graphical hierarchy of all classes instead of a textual one.
 
GRAPHICAL_HIERARCHY = YES
 
# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
# then doxygen will show the dependencies a directory has on other directories
# in a graphical way. The dependency relations are determined by the #include
# relations between the files in the directories.
 
DIRECTORY_GRAPH = YES
 
# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
# generated by dot. Possible values are png, jpg, or gif
# If left blank png will be used.
 
DOT_IMAGE_FORMAT = png
 
# The tag DOT_PATH can be used to specify the path where the dot tool can be
# found. If left blank, it is assumed the dot tool can be found in the path.
 
DOT_PATH =
 
# The DOTFILE_DIRS tag can be used to specify one or more directories that
# contain dot files that are included in the documentation (see the
# \dotfile command).
 
DOTFILE_DIRS =
 
# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width
# (in pixels) of the graphs generated by dot. If a graph becomes larger than
# this value, doxygen will try to truncate the graph, so that it fits within
# the specified constraint. Beware that most browsers cannot cope with very
# large images.
 
MAX_DOT_GRAPH_WIDTH = 1024
 
# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allowed height
# (in pixels) of the graphs generated by dot. If a graph becomes larger than
# this value, doxygen will try to truncate the graph, so that it fits within
# the specified constraint. Beware that most browsers cannot cope with very
# large images.
 
MAX_DOT_GRAPH_HEIGHT = 1024
 
# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
# graphs generated by dot. A depth value of 3 means that only nodes reachable
# from the root by following a path via at most 3 edges will be shown. Nodes
# that lay further from the root node will be omitted. Note that setting this
# option to 1 or 2 may greatly reduce the computation time needed for large
# code bases. Also note that a graph may be further truncated if the graph's
# image dimensions are not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH
# and MAX_DOT_GRAPH_HEIGHT). If 0 is used for the depth value (the default),
# the graph is not depth-constrained.
 
MAX_DOT_GRAPH_DEPTH = 0
 
# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
# background. This is disabled by default, which results in a white background.
# Warning: Depending on the platform used, enabling this option may lead to
# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
# read).
 
DOT_TRANSPARENT = NO
 
# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
# files in one run (i.e. multiple -o and -T options on the command line). This
# makes dot run faster, but since only newer versions of dot (>1.8.10)
# support this, this feature is disabled by default.
 
DOT_MULTI_TARGETS = NO
 
# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
# generate a legend page explaining the meaning of the various boxes and
# arrows in the dot generated graphs.
 
GENERATE_LEGEND = YES
 
# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
# remove the intermediate dot files that are used to generate
# the various graphs.
 
DOT_CLEANUP = YES
 
#---------------------------------------------------------------------------
# Configuration::additions related to the search engine
#---------------------------------------------------------------------------
 
# The SEARCHENGINE tag specifies whether or not a search engine should be
# used. If set to NO the values of all tags below this one will be ignored.
 
SEARCHENGINE = NO
/programs/network/netsurf/libparserutils/build/conv.pl
0,0 → 1,49
#!/usr/bin/perl
 
use warnings;
use strict;
 
# Convert Unicode mapping tables to C structures
# Input files may be found at http://unicode.org/Public/MAPPINGS
#
# Usage: conv.pl <input_file>
 
die "Usage: conv.pl <input_file>\n" if (scalar(@ARGV) != 1);
 
# Table entries collected from the mapping file, in file order. Only lines
# whose first column is 0x80 or above contribute an entry.
my @table;

my $input = $ARGV[0];

# Use the three-argument form of open with a lexical handle, so the file
# name can never be interpreted as part of the open mode.
open(my $map, '<', $input) or die "Failed opening $input: $!\n";

while (my $line = <$map>) {
	# Skip comment lines
	next if ($line =~ /^#/);

	# Skip blank lines: they hold no mapping and would otherwise reach
	# hex() as an undefined value
	next if ($line !~ /\S/);

	my @parts = split(/\s+/, $line);

	# Ignore ASCII part
	next if (hex($parts[0]) < 0x80);

	# Convert undefined entries to U+FFFF. A line with no second column
	# at all is treated the same way, rather than emitting an empty
	# initialiser into the generated C.
	if (!defined($parts[1]) || $parts[1] =~ /^#/) {
		push(@table, "0xFFFF");
	} else {
		push(@table, $parts[1]);
	}
}

close($map);

# You'll have to go through and fix up the structure name
print "static uint32_t ${ARGV[0]}[128] = {\n\t";

# Emit eight entries per row; no row break is started after entry 128
my $count = 0;
foreach my $item (@table) {
	print "$item, ";
	$count++;

	if ($count % 8 == 0 && $count != 128) {
		print "\n\t";
	}
}

print "\n};\n\n";
 
Property changes:
Added: svn:executable
+*
\ No newline at end of property
/programs/network/netsurf/libparserutils/build/make-aliases.pl
0,0 → 1,124
#!/usr/bin/perl -w
# This file is part of LibParserUtils.
# Licensed under the MIT License,
# http://www.opensource.org/licenses/mit-license.php
# Copyright 2010 Daniel Silverstone <dsilvers@netsurf-browser.org>
# John-Mark Bell <jmb@netsurf-browser.org>
 
use strict;
 
# Input: the alias list this script reads (one charset per line).
use constant ALIAS_FILE => 'build/Aliases';
# Output: the generated C include; it is only rewritten when its content
# would change.
use constant ALIAS_INC => 'src/charset/aliases.inc';

# Patterns matched against each canonical charset name. A charset whose
# name matches any of them has its MIB enum added to the generated
# MIBENUM_IS_UNICODE() macro.
use constant UNICODE_CHARSETS =>
[
qr'^ISO-10646-UCS-[24]$',
qr'^UTF-16',
qr'^UTF-8$',
qr'^UTF-32'
];
 
# Read the alias file into %charsets.
#
# Each data line is whitespace separated: canonical name, MIB enum, then
# zero or more aliases. The hash maps the canonical name to
# [ MIB enum, reference to the list of aliases ].
open(my $infile, "<", ALIAS_FILE)
	or die "Unable to open " . ALIAS_FILE . ": $!\n";

my %charsets;

while (my $line = <$infile>) {
	# Comment lines start with '#'
	next if ($line =~ /^#/);
	chomp $line;
	# Skip empty lines, and lines holding only whitespace (including a
	# stray carriage return), which would otherwise yield a charset
	# entry with no name and no MIB enum
	next if ($line !~ /\S/);
	my @elements = split /\s+/, $line;
	my $canon = shift @elements;
	my $mibenum = shift @elements;
	$charsets{$canon} = [$mibenum, \@elements];
}

close($infile);
 
my $unicodeexp = "";
 
my $output = <<'EOH';
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2010 The NetSurf Project.
*
* Note: This file is automatically generated by make-aliases.pl
*
* Do not edit file file, changes will be overwritten during build.
*/
 
static parserutils_charset_aliases_canon canonical_charset_names[] = {
EOH
 
# Maps each normalised name (canonical or alias) to the index of its
# canonical entry in the generated canonical_charset_names[] table.
my %aliases;
my $canonnr = 0;
for my $name (sort keys %charsets) {
	my $mibenum = $charsets{$name}->[0];
	my $others = $charsets{$name}->[1];

	# Field order must match the struct in src/charset/aliases.h
	$output .= "\t{ $mibenum, " . length($name) . ", \"$name\" },\n";

	# Charsets matching any Unicode pattern contribute a term to the
	# MIBENUM_IS_UNICODE() expression
	if (grep { $name =~ $_ } @{UNICODE_CHARSETS()}) {
		$unicodeexp .= "((x) == $mibenum) || ";
	}

	# Register the canonical name first, then its aliases, each reduced
	# to lower-case ASCII letters and digits (normalised in place)
	foreach my $key ($name, @$others) {
		$key =~ tr/A-Z/a-z/;
		$key =~ s/[^a-z0-9]//g;
		$aliases{$key} = $canonnr;
	}

	$canonnr++;
}
 
$output .= "};\n\nstatic const uint16_t charset_aliases_canon_count = ${canonnr};\n\n";
 
$output .= <<'EOT';
typedef struct {
uint16_t name_len;
const char *name;
parserutils_charset_aliases_canon *canon;
} parserutils_charset_aliases_alias;
 
static parserutils_charset_aliases_alias charset_aliases[] = {
EOT
 
# Number of rows emitted into charset_aliases[]
my $aliascount = 0;

# One row per normalised name, in sorted order: name length, name, and a
# pointer to the matching canonical entry
for my $name (sort keys %aliases) {
	$output .= sprintf("\t{ %d, \"%s\", &canonical_charset_names[%d] },\n",
			length($name), $name, $aliases{$name});
	$aliascount++;
}
 
$output .= "};\n\n";
 
# Drop the final " || " separator, if there is one
$unicodeexp =~ s/ \|\| \z//;

# If no charset matched a Unicode pattern the expression is empty, which
# would generate "#define MIBENUM_IS_UNICODE(x) ()" -- not valid C. Emit a
# constant false instead.
$unicodeexp = "0" if ($unicodeexp eq "");
 
$output .= <<"EOS";
static const uint16_t charset_aliases_count = ${aliascount};
 
#define MIBENUM_IS_UNICODE(x) ($unicodeexp)
EOS
 
# Compare against the existing include file (if any). When the content is
# unchanged, $output is undefined so the file is left untouched.
if (open(my $existing, "<", ALIAS_INC)) {
	local $/ = undef;	# slurp the whole file in one read
	my $now = <$existing>;
	close($existing);
	undef($output) if (defined($now) && $output eq $now);
}

# Write the new content. Every step is checked, so that a failure stops
# the build instead of silently leaving a stale or truncated file behind.
if (defined($output)) {
	open(my $outf, ">", ALIAS_INC)
		or die "Unable to open " . ALIAS_INC . " for writing: $!\n";
	print {$outf} $output
		or die "Unable to write " . ALIAS_INC . ": $!\n";
	close($outf)
		or die "Unable to close " . ALIAS_INC . ": $!\n";
}
/programs/network/netsurf/libparserutils/docs/Todo
0,0 → 1,5
Todo list
---------
 
+ Charset conversion should use Unicode Normalisation Form C.
 
/programs/network/netsurf/libparserutils/include/parserutils/charset/codec.h
0,0 → 1,125
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_codec_h_
#define parserutils_charset_codec_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <inttypes.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
typedef struct parserutils_charset_codec parserutils_charset_codec;
 
#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
 
/**
* Charset codec error mode
*
* A codec's error mode determines its behaviour in the face of:
*
* + characters which are unrepresentable in the destination charset (if
* encoding data) or which cannot be converted to UCS-4 (if decoding data).
* + invalid byte sequences (both encoding and decoding)
*
* The options provide a choice between the following approaches:
*
* + draconian, "stop processing" ("strict")
* + "replace the unrepresentable character with something else" ("loose")
* + "attempt to transliterate, or replace if unable" ("translit")
*
* The default error mode is "loose".
*
*
* In the "loose" case, the replacement character will depend upon:
*
* + Whether the operation was encoding or decoding
* + If encoding, what the destination charset is.
*
* If decoding, the replacement character will be:
*
* U+FFFD (REPLACEMENT CHARACTER)
*
* If encoding, the replacement character will be:
*
* U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
* U+FFFD (REPLACEMENT CHARACTER) otherwise.
*
*
* In the "translit" case, the codec will attempt to transliterate into
* the destination charset, if encoding. If decoding, or if transliteration
* fails, this option is identical to "loose".
*/
typedef enum parserutils_charset_codec_errormode {
/** Abort processing if unrepresentable character encountered */
PARSERUTILS_CHARSET_CODEC_ERROR_STRICT = 0,
/** Replace unrepresentable characters with single alternate */
PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE = 1,
/** Transliterate unrepresentable characters, if possible */
PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2
} parserutils_charset_codec_errormode;
 
/**
* Charset codec option types
*/
typedef enum parserutils_charset_codec_opttype {
/** Set codec error mode */
PARSERUTILS_CHARSET_CODEC_ERROR_MODE = 1
} parserutils_charset_codec_opttype;
 
/**
* Charset codec option parameters
*/
typedef union parserutils_charset_codec_optparams {
/** Parameters for error mode setting */
struct {
/** The desired error handling mode */
parserutils_charset_codec_errormode mode;
} error_mode;
} parserutils_charset_codec_optparams;
 
 
/* Create a charset codec */
parserutils_error parserutils_charset_codec_create(const char *charset,
parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
/* Destroy a charset codec */
parserutils_error parserutils_charset_codec_destroy(
parserutils_charset_codec *codec);
 
/* Configure a charset codec */
parserutils_error parserutils_charset_codec_setopt(
parserutils_charset_codec *codec,
parserutils_charset_codec_opttype type,
parserutils_charset_codec_optparams *params);
 
/* Encode a chunk of UCS-4 data into a codec's charset */
parserutils_error parserutils_charset_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
 
/* Decode a chunk of data in a codec's charset into UCS-4 */
parserutils_error parserutils_charset_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
 
/* Reset a charset codec */
parserutils_error parserutils_charset_codec_reset(
parserutils_charset_codec *codec);
 
#ifdef __cplusplus
}
#endif
 
#endif
/programs/network/netsurf/libparserutils/include/parserutils/charset/mibenum.h
0,0 → 1,33
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_mibenum_h_
#define parserutils_charset_mibenum_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <inttypes.h>
#include <stdbool.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
/* Convert an encoding alias to a MIB enum value */
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
/* Convert a MIB enum value into an encoding alias */
const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
/* Determine if a MIB enum value represents a Unicode variant */
bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
 
#ifdef __cplusplus
}
#endif
 
#endif
/programs/network/netsurf/libparserutils/include/parserutils/charset/utf16.h
0,0 → 1,47
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
/** \file
* UTF-16 manipulation functions (interface).
*/
 
#ifndef parserutils_charset_utf16_h_
#define parserutils_charset_utf16_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <inttypes.h>
 
#include <parserutils/errors.h>
 
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
size_t len, uint32_t *ucs4, size_t *clen);
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4,
uint8_t *s, size_t *len);
 
parserutils_error parserutils_charset_utf16_length(const uint8_t *s,
size_t max, size_t *len);
parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
size_t *len);
 
parserutils_error parserutils_charset_utf16_prev(const uint8_t *s,
uint32_t off, uint32_t *prevoff);
parserutils_error parserutils_charset_utf16_next(const uint8_t *s,
uint32_t len, uint32_t off, uint32_t *nextoff);
 
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
uint32_t len, uint32_t off, uint32_t *nextoff);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/charset/utf8.h
0,0 → 1,47
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
/** \file
* UTF-8 manipulation functions (interface).
*/
 
#ifndef parserutils_charset_utf8_h_
#define parserutils_charset_utf8_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <inttypes.h>
 
#include <parserutils/errors.h>
 
parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
uint32_t *ucs4, size_t *clen);
parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
size_t *len);
 
parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
size_t *len);
parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
size_t *len);
 
parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
uint32_t *prevoff);
parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
uint32_t off, uint32_t *nextoff);
 
parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
uint32_t len, uint32_t off, uint32_t *nextoff);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/errors.h
0,0 → 1,40
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_errors_h_
#define parserutils_errors_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stddef.h>
 
/**
 * Error codes returned by parserutils functions.
 *
 * The per-value notes below follow the return documentation of
 * parserutils_inputstream_peek; values without a note are not described
 * by any code visible alongside this header.
 */
typedef enum parserutils_error {
/* Success */
PARSERUTILS_OK = 0,

/* Memory exhaustion */
PARSERUTILS_NOMEM = 1,
/* Bad parameters were passed */
PARSERUTILS_BADPARM = 2,
PARSERUTILS_INVALID = 3,
PARSERUTILS_FILENOTFOUND = 4,
/* The end of the currently available input was reached */
PARSERUTILS_NEEDDATA = 5,
/* The input cannot be decoded */
PARSERUTILS_BADENCODING = 6,
/* The end of all input was reached */
PARSERUTILS_EOF = 7
} parserutils_error;
 
/* Convert a parserutils error value to a string */
const char *parserutils_error_to_string(parserutils_error error);
/* Convert a string to a parserutils error value */
parserutils_error parserutils_error_from_string(const char *str, size_t len);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/functypes.h
0,0 → 1,30
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_functypes_h_
#define parserutils_functypes_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
 
#include <parserutils/types.h>
 
/* Type of allocation function for parserutils */
typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/input/inputstream.h
0,0 → 1,188
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_input_inputstream_h_
#define parserutils_input_inputstream_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stdbool.h>
#ifndef NDEBUG
#include <stdio.h>
#endif
#include <stdlib.h>
#include <inttypes.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
#include <parserutils/types.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/utils/buffer.h>
 
/**
* Type of charset detection function
*/
typedef parserutils_error (*parserutils_charset_detect_func)(
const uint8_t *data, size_t len,
uint16_t *mibenum, uint32_t *source);
 
/**
 * Input stream object
 *
 * The inline functions parserutils_inputstream_peek and
 * parserutils_inputstream_advance in this header read and update these
 * fields directly.
 */
typedef struct parserutils_inputstream
{
parserutils_buffer *utf8; /**< Buffer containing UTF-8 data */

uint32_t cursor; /**< Byte offset of current position within utf8 */

bool had_eof; /**< Whether EOF has been reached */
} parserutils_inputstream;
 
/* Create an input stream */
parserutils_error parserutils_inputstream_create(const char *enc,
uint32_t encsrc, parserutils_charset_detect_func csdetect,
parserutils_alloc alloc, void *pw,
parserutils_inputstream **stream);
/* Destroy an input stream */
parserutils_error parserutils_inputstream_destroy(
parserutils_inputstream *stream);
 
/* Append data to an input stream */
parserutils_error parserutils_inputstream_append(
parserutils_inputstream *stream,
const uint8_t *data, size_t len);
/* Insert data into stream at current location */
parserutils_error parserutils_inputstream_insert(
parserutils_inputstream *stream,
const uint8_t *data, size_t len);
 
/* Slow form of parserutils_inputstream_peek. */
parserutils_error parserutils_inputstream_peek_slow(
parserutils_inputstream *stream,
size_t offset, const uint8_t **ptr, size_t *length);
 
/**
 * Peek at the character that begins a given number of bytes after the
 * stream's cursor
 *
 * \param stream  Stream to look in
 * \param offset  Byte offset, relative to the cursor, of the character start
 * \param ptr     Pointer to location to receive pointer to character data
 * \param length  Pointer to location to receive character length (in bytes)
 * \return PARSERUTILS_OK on success,
 *                    _NEEDDATA on reaching the end of available input,
 *                    _EOF on reaching the end of all input,
 *                    _BADENCODING if the input cannot be decoded,
 *                    _NOMEM on memory exhaustion,
 *                    _BADPARM if bad parameters are passed.
 *
 * The returned pointer refers to data owned by the stream. Once
 * parserutils_inputstream_advance has moved the cursor past the character,
 * the data is no longer guaranteed to be valid, so dereferencing the
 * pointer after that point is a bug.
 */
static inline parserutils_error parserutils_inputstream_peek(
		parserutils_inputstream *stream, size_t offset,
		const uint8_t **ptr, size_t *length)
{
	const parserutils_buffer *buf;
	const uint8_t *bytes;
	size_t avail, pos, clen;
	parserutils_error err = PARSERUTILS_OK;

	if (stream == NULL || ptr == NULL || length == NULL)
		return PARSERUTILS_BADPARM;

#ifndef NDEBUG
#ifdef VERBOSE_INPUTSTREAM
	fprintf(stdout, "Peek: len: %zu cur: %u off: %zu\n",
			stream->utf8->length, stream->cursor, offset);
#endif
#ifdef RANDOMISE_INPUTSTREAM
	parserutils_buffer_randomise(stream->utf8);
#endif
#endif

	buf = stream->utf8;
	bytes = buf->data;
	avail = buf->length;
	pos = stream->cursor + offset;

#define IS_ASCII(x) (((x) & 0x80) == 0)

	/* Fast path: the character starts inside the buffered data */
	if (pos < avail) {
		if (IS_ASCII(bytes[pos])) {
			/* Single-byte character: nothing to decode */
			*length = 1;
			*ptr = bytes + pos;
			return PARSERUTILS_OK;
		}

		err = parserutils_charset_utf8_char_byte_length(
				bytes + pos, &clen);
		if (err == PARSERUTILS_OK) {
			*length = clen;
			*ptr = bytes + pos;
			return PARSERUTILS_OK;
		}

		/* Anything but "need more data" is reported to the caller */
		if (err != PARSERUTILS_NEEDDATA)
			return err;
	}

#undef IS_ASCII

	/* Only two states may reach the slow path: the position is exactly
	 * at the end of the buffered data, or the length lookup asked for
	 * more data. Anything else is treated as fatal. */
	if (pos != avail && err != PARSERUTILS_NEEDDATA)
		abort();

	return parserutils_inputstream_peek_slow(stream, offset, ptr, length);
}
 
/**
 * Move the stream's cursor forwards
 *
 * \param stream  The stream whose position to advance (NULL is ignored)
 * \param bytes   The number of bytes to advance
 *
 * Asking to advance beyond the end of the buffered data aborts the
 * program.
 */
static inline void parserutils_inputstream_advance(
		parserutils_inputstream *stream, size_t bytes)
{
	size_t remaining;

	if (stream == NULL)
		return;

#if !defined(NDEBUG) && defined(VERBOSE_INPUTSTREAM)
	fprintf(stdout, "Advance: len: %zu cur: %u bytes: %zu\n",
			stream->utf8->length, stream->cursor, bytes);
#endif

	/* Bytes between the cursor and the end of the buffered data */
	remaining = stream->utf8->length - stream->cursor;

	if (bytes > remaining)
		abort();

	/* Nothing to do when the cursor already sits at the end */
	if (remaining != 0)
		stream->cursor += bytes;
}
 
/* Read the document charset */
const char *parserutils_inputstream_read_charset(
parserutils_inputstream *stream, uint32_t *source);
/* Change the document charset */
parserutils_error parserutils_inputstream_change_charset(
parserutils_inputstream *stream,
const char *enc, uint32_t source);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/parserutils.h
0,0 → 1,25
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_parserutils_h_
#define parserutils_parserutils_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
#include <parserutils/types.h>
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/types.h
0,0 → 1,24
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_types_h_
#define parserutils_types_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stdbool.h>
#include <inttypes.h>
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/utils/buffer.h
0,0 → 1,50
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_utils_buffer_h_
#define parserutils_utils_buffer_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
/**
 * Byte buffer.
 *
 * The layout is public: the inline parserutils_inputstream_peek reads
 * data and length directly.
 */
struct parserutils_buffer
{
/* Buffer contents; indices below length are read by inputstream peek */
uint8_t *data;
/* Number of bytes of data currently in use */
size_t length;
/* NOTE(review): presumably the allocated size of data in bytes -- confirm
 * against the buffer implementation */
size_t allocated;

/* NOTE(review): presumably the allocator and private word supplied to
 * parserutils_buffer_create -- confirm against the buffer implementation */
parserutils_alloc alloc;
void *pw;
};
typedef struct parserutils_buffer parserutils_buffer;
 
parserutils_error parserutils_buffer_create(parserutils_alloc alloc,
void *pw, parserutils_buffer **buffer);
parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer);
 
parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
const uint8_t *data, size_t len);
parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
size_t offset, const uint8_t *data, size_t len);
parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
size_t offset, size_t len);
 
parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer);
 
parserutils_error parserutils_buffer_randomise(parserutils_buffer *buffer);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/utils/stack.h
0,0 → 1,39
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_utils_stack_h_
#define parserutils_utils_stack_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stddef.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
struct parserutils_stack;
typedef struct parserutils_stack parserutils_stack;
 
parserutils_error parserutils_stack_create(size_t item_size, size_t chunk_size,
parserutils_alloc alloc, void *pw, parserutils_stack **stack);
parserutils_error parserutils_stack_destroy(parserutils_stack *stack);
 
parserutils_error parserutils_stack_push(parserutils_stack *stack,
const void *item);
parserutils_error parserutils_stack_pop(parserutils_stack *stack, void *item);
 
void *parserutils_stack_get_current(parserutils_stack *stack);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/include/parserutils/utils/vector.h
0,0 → 1,45
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_utils_vector_h_
#define parserutils_utils_vector_h_
 
#ifdef __cplusplus
extern "C"
{
#endif
 
#include <stddef.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
struct parserutils_vector;
typedef struct parserutils_vector parserutils_vector;
 
parserutils_error parserutils_vector_create(size_t item_size,
size_t chunk_size, parserutils_alloc alloc, void *pw,
parserutils_vector **vector);
parserutils_error parserutils_vector_destroy(parserutils_vector *vector);
 
parserutils_error parserutils_vector_append(parserutils_vector *vector,
void *item);
parserutils_error parserutils_vector_clear(parserutils_vector *vector);
parserutils_error parserutils_vector_remove_last(parserutils_vector *vector);
parserutils_error parserutils_vector_get_length(parserutils_vector *vector, size_t *length);
 
const void *parserutils_vector_iterate(const parserutils_vector *vector,
int32_t *ctx);
const void *parserutils_vector_peek(const parserutils_vector *vector,
int32_t ctx);
 
#ifdef __cplusplus
}
#endif
 
#endif
 
/programs/network/netsurf/libparserutils/libparserutils.pc.in
0,0 → 1,10
prefix=PREFIX
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
 
Name: libparserutils
Description: Utility library for facilitating parser development
Version: VERSION
Libs: -L${libdir} -lparserutils
Cflags: -I${includedir}
/programs/network/netsurf/libparserutils/src/Makefile
0,0 → 1,2
 
include $(NSBUILD)/Makefile.subdir
/programs/network/netsurf/libparserutils/src/charset/Makefile
0,0 → 1,5
 
# Settings for the shared makefile included at the end of this file.
# NOTE(review): presumably OBJS are built and combined into OUTFILE by
# Makefile_for_o_lib -- confirm against that makefile.
OUTFILE = libo.o
OBJS = aliases.o codec.o
# Header search path: the library's include/ directory, the directory three
# levels above this one, and the parent src/ directory.
CFLAGS += -I ../../include/ -I ../../../ -I ../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib
/programs/network/netsurf/libparserutils/src/charset/aliases.c
0,0 → 1,150
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
#include <assert.h>
 
#include "charset/aliases.h"
#include "utils/utils.h"
 
/* Bring in the aliases tables */
#include "aliases.inc"
 
/* Search key handed to bsearch(): a string with an explicit length.
 * The alias comparison never reads more than slen bytes of s, so the
 * string does not need to be NUL terminated for that use. */
typedef struct {
size_t slen;
const char *s;
} lengthed_string;
 
 
/* True for every character that is not an ASCII letter or digit. Despite
 * the name, this also covers control characters and bytes outside the
 * ASCII range. The argument is evaluated more than once. */
#define IS_PUNCT_OR_SPACE(x) \
(!(((x) >= 'A' && (x) <= 'Z') || \
((x) >= 'a' && (x) <= 'z') || \
((x) >= '0' && (x) <= '9')))
 
 
/**
 * bsearch() comparison callback: compare a lookup key with an alias entry
 *
 * \param a  Pointer to the lengthed_string holding the lookup key
 * \param b  Pointer to the parserutils_charset_aliases_alias to compare with
 * \return Negative, zero or positive if the key orders before, equal to or
 *         after the alias name
 *
 * Characters of the key that are not ASCII letters or digits are skipped,
 * and the remaining ones are lower-cased before comparison. The alias name
 * is compared exactly as stored.
 */
static int parserutils_charset_alias_match(const void *a, const void *b)
{
	const lengthed_string *s = (const lengthed_string *) a;
	const parserutils_charset_aliases_alias *alias =
			(const parserutils_charset_aliases_alias *) b;
	size_t key_left = s->slen;
	size_t alias_left = alias->name_len;
	const char *s_alias = alias->name;
	const char *s_key = s->s;
	int cmpret;

	while ((key_left > 0) && (alias_left > 0)) {
		/* Skip key characters that take no part in the comparison */
		while ((key_left > 0) && IS_PUNCT_OR_SPACE(*s_key)) {
			key_left--;
			s_key++;
		}
		if (key_left == 0)
			break;

		/* *s_key is an ASCII letter or digit at this point */
		cmpret = tolower((unsigned char) *s_key) - *s_alias;
		if (cmpret != 0)
			return cmpret;

		key_left--;
		s_key++;
		alias_left--;
		s_alias++;
	}

	/* Ignorable characters at the end of the key must not count as
	 * unmatched input */
	while ((key_left > 0) && IS_PUNCT_OR_SPACE(*s_key)) {
		key_left--;
		s_key++;
	}

	/* Both counts are size_t. Subtracting them and narrowing to int is
	 * implementation-defined and can yield the wrong sign, so the
	 * ordering is derived from explicit comparisons instead. */
	if (key_left > alias_left)
		return 1;
	if (key_left < alias_left)
		return -1;
	return 0;
}
 
/**
 * Retrieve the canonical form of an alias name
 *
 * \param alias  The alias name (need not be NUL terminated)
 * \param len    The length of the alias name, in bytes
 * \return Pointer to canonical form or NULL if not found
 *
 * The alias table is searched with bsearch(), using a comparison that
 * ignores case and any character that is not an ASCII letter or digit.
 * A NULL alias is treated as "not found".
 */
parserutils_charset_aliases_canon *parserutils__charset_alias_canonicalise(
		const char *alias, size_t len)
{
	parserutils_charset_aliases_alias *c;
	lengthed_string s;

	/* Guard here as well as in parserutils_charset_mibenum_from_name:
	 * the comparison callback dereferences the key string */
	if (alias == NULL)
		return NULL;

	s.slen = len;
	s.s = alias;

	c = (parserutils_charset_aliases_alias *) bsearch(&s,
			&charset_aliases[0],
			charset_aliases_count,
			sizeof(parserutils_charset_aliases_alias),
			parserutils_charset_alias_match);
	if (c == NULL)
		return NULL;

	return c->canon;
}
 
/**
 * Look up the MIB enum value assigned to an encoding name
 *
 * \param alias  The encoding name or alias to look up
 * \param len    The length of the name, in bytes
 * \return The MIB enum value, or 0 if the name is NULL or not found
 */
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
{
	const parserutils_charset_aliases_canon *canon = NULL;

	if (alias != NULL)
		canon = parserutils__charset_alias_canonicalise(alias, len);

	return (canon != NULL) ? canon->mib_enum : 0;
}
 
/**
 * Look up the canonical encoding name for a MIB enum value
 *
 * \param mibenum  The MIB enum value
 * \return Pointer to canonical name, or NULL if not found
 *
 * The canonical name table is scanned in order and the first entry with a
 * matching MIB enum is returned.
 */
const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
{
	const parserutils_charset_aliases_canon *entry =
			&canonical_charset_names[0];
	const parserutils_charset_aliases_canon *end =
			entry + charset_aliases_canon_count;

	for (; entry != end; entry++) {
		if (entry->mib_enum == mibenum)
			return entry->name;
	}

	return NULL;
}
 
/**
 * Determine whether a MIB enum value denotes a Unicode charset
 *
 * \param mibenum  The MIB enum to consider
 * \return true if a Unicode variant, false otherwise
 *
 * The test itself is the MIBENUM_IS_UNICODE() macro from the generated
 * aliases table.
 */
bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
{
	if (MIBENUM_IS_UNICODE(mibenum))
		return true;

	return false;
}
/programs/network/netsurf/libparserutils/src/charset/aliases.h
0,0 → 1,26
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_aliases_h_
#define parserutils_charset_aliases_h_
 
#include <inttypes.h>
 
#include <parserutils/charset/mibenum.h>
 
/* One canonical charset. The table of these entries is generated by
 * make-aliases.pl, which emits the initialisers in this field order. */
typedef struct parserutils_charset_aliases_canon {
/* Do not change the ordering here without changing make-aliases.pl */
/* MIB enum value of the charset */
uint16_t mib_enum;
/* Length of name, in bytes */
uint16_t name_len;
/* Canonical charset name */
const char *name;
} parserutils_charset_aliases_canon;
 
/* Canonicalise an alias name */
parserutils_charset_aliases_canon *parserutils__charset_alias_canonicalise(
const char *alias, size_t len);
 
#endif
/programs/network/netsurf/libparserutils/src/charset/aliases.inc
0,0 → 1,1142
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2010 The NetSurf Project.
*
* Note: This file is automatically generated by make-aliases.pl
*
* Do not edit file file, changes will be overwritten during build.
*/
 
/*
 * Canonical charset table.
 *
 * Each entry is { MIB enum, length of the name, canonical name }, given in
 * the member order of parserutils_charset_aliases_canon.
 *
 * Entries in charset_aliases refer to elements of this array by index, so
 * an element's position must stay in step with those references.
 *
 * NOTE(review): the entries appear to be sorted by name in byte-wise
 * order; whether any lookup depends on that is not visible here -- confirm
 * against make-aliases.pl before reordering.
 */
static parserutils_charset_aliases_canon canonical_charset_names[] = {
{ 74, 16, "ANSI_X3.110-1983" },
{ 65, 8, "ASMO_449" },
{ 2005, 23, "Adobe-Standard-Encoding" },
{ 2020, 21, "Adobe-Symbol-Encoding" },
{ 2104, 10, "Amiga-1251" },
{ 1020, 6, "BOCU-1" },
{ 20, 7, "BS_4730" },
{ 50, 11, "BS_viewdata" },
{ 2026, 4, "Big5" },
{ 2101, 10, "Big5-HKSCS" },
{ 1016, 6, "CESU-8" },
{ 3004, 5, "CP874" },
{ 3027, 5, "CP949" },
{ 78, 17, "CSA_Z243.4-1985-1" },
{ 79, 17, "CSA_Z243.4-1985-2" },
{ 80, 18, "CSA_Z243.4-1985-gr" },
{ 86, 10, "CSN_369103" },
{ 2008, 7, "DEC-MCS" },
{ 24, 9, "DIN_66003" },
{ 99, 7, "DS_2089" },
{ 2064, 12, "EBCDIC-AT-DE" },
{ 2065, 14, "EBCDIC-AT-DE-A" },
{ 2066, 12, "EBCDIC-CA-FR" },
{ 2067, 12, "EBCDIC-DK-NO" },
{ 2068, 14, "EBCDIC-DK-NO-A" },
{ 2074, 9, "EBCDIC-ES" },
{ 2075, 11, "EBCDIC-ES-A" },
{ 2076, 11, "EBCDIC-ES-S" },
{ 2069, 12, "EBCDIC-FI-SE" },
{ 2070, 14, "EBCDIC-FI-SE-A" },
{ 2071, 9, "EBCDIC-FR" },
{ 2072, 9, "EBCDIC-IT" },
{ 2073, 9, "EBCDIC-PT" },
{ 2077, 9, "EBCDIC-UK" },
{ 2078, 9, "EBCDIC-US" },
{ 77, 13, "ECMA-cyrillic" },
{ 23, 2, "ES" },
{ 61, 3, "ES2" },
{ 18, 6, "EUC-JP" },
{ 38, 6, "EUC-KR" },
{ 19, 43, "Extended_UNIX_Code_Fixed_Width_for_Japanese" },
{ 114, 7, "GB18030" },
{ 2025, 6, "GB2312" },
{ 113, 3, "GBK" },
{ 56, 10, "GB_1988-80" },
{ 57, 10, "GB_2312-80" },
{ 94, 13, "GOST_19768-74" },
{ 2021, 10, "HP-DeskTop" },
{ 2017, 8, "HP-Legal" },
{ 2019, 8, "HP-Math8" },
{ 2018, 10, "HP-Pi-font" },
{ 2085, 10, "HZ-GB-2312" },
{ 2015, 11, "IBM-Symbols" },
{ 2016, 8, "IBM-Thai" },
{ 2089, 8, "IBM00858" },
{ 2090, 8, "IBM00924" },
{ 2091, 8, "IBM01140" },
{ 2092, 8, "IBM01141" },
{ 2093, 8, "IBM01142" },
{ 2094, 8, "IBM01143" },
{ 2095, 8, "IBM01144" },
{ 2096, 8, "IBM01145" },
{ 2097, 8, "IBM01146" },
{ 2098, 8, "IBM01147" },
{ 2099, 8, "IBM01148" },
{ 2100, 8, "IBM01149" },
{ 2028, 6, "IBM037" },
{ 2029, 6, "IBM038" },
{ 2063, 7, "IBM1026" },
{ 2102, 7, "IBM1047" },
{ 2030, 6, "IBM273" },
{ 2031, 6, "IBM274" },
{ 2032, 6, "IBM275" },
{ 2033, 6, "IBM277" },
{ 2034, 6, "IBM278" },
{ 2035, 6, "IBM280" },
{ 2036, 6, "IBM281" },
{ 2037, 6, "IBM284" },
{ 2038, 6, "IBM285" },
{ 2039, 6, "IBM290" },
{ 2040, 6, "IBM297" },
{ 2041, 6, "IBM420" },
{ 2042, 6, "IBM423" },
{ 2043, 6, "IBM424" },
{ 2011, 6, "IBM437" },
{ 2044, 6, "IBM500" },
{ 2087, 6, "IBM775" },
{ 2009, 6, "IBM850" },
{ 2045, 6, "IBM851" },
{ 2010, 6, "IBM852" },
{ 2046, 6, "IBM855" },
{ 2047, 6, "IBM857" },
{ 2048, 6, "IBM860" },
{ 2049, 6, "IBM861" },
{ 2013, 6, "IBM862" },
{ 2050, 6, "IBM863" },
{ 2051, 6, "IBM864" },
{ 2052, 6, "IBM865" },
{ 2086, 6, "IBM866" },
{ 2053, 6, "IBM868" },
{ 2054, 6, "IBM869" },
{ 2055, 6, "IBM870" },
{ 2056, 6, "IBM871" },
{ 2057, 6, "IBM880" },
{ 2058, 6, "IBM891" },
{ 2059, 6, "IBM903" },
{ 2060, 6, "IBM904" },
{ 2061, 6, "IBM905" },
{ 2062, 6, "IBM918" },
{ 88, 9, "IEC_P27-1" },
{ 51, 4, "INIS" },
{ 52, 6, "INIS-8" },
{ 53, 13, "INIS-cyrillic" },
{ 29, 9, "INVARIANT" },
{ 1000, 15, "ISO-10646-UCS-2" },
{ 1001, 15, "ISO-10646-UCS-4" },
{ 1002, 19, "ISO-10646-UCS-Basic" },
{ 27, 15, "ISO-10646-UTF-1" },
{ 1003, 24, "ISO-10646-Unicode-Latin1" },
{ 104, 11, "ISO-2022-CN" },
{ 105, 15, "ISO-2022-CN-EXT" },
{ 39, 11, "ISO-2022-JP" },
{ 4008, 13, "ISO-2022-JP-1" },
{ 40, 13, "ISO-2022-JP-2" },
{ 37, 11, "ISO-2022-KR" },
{ 4, 10, "ISO-8859-1" },
{ 2000, 30, "ISO-8859-1-Windows-3.0-Latin-1" },
{ 2001, 30, "ISO-8859-1-Windows-3.1-Latin-1" },
{ 13, 11, "ISO-8859-10" },
{ 4014, 11, "ISO-8859-11" },
{ 109, 11, "ISO-8859-13" },
{ 110, 11, "ISO-8859-14" },
{ 111, 11, "ISO-8859-15" },
{ 112, 11, "ISO-8859-16" },
{ 5, 10, "ISO-8859-2" },
{ 2002, 26, "ISO-8859-2-Windows-Latin-2" },
{ 6, 10, "ISO-8859-3" },
{ 7, 10, "ISO-8859-4" },
{ 8, 10, "ISO-8859-5" },
{ 9, 10, "ISO-8859-6" },
{ 81, 12, "ISO-8859-6-E" },
{ 82, 12, "ISO-8859-6-I" },
{ 10, 10, "ISO-8859-7" },
{ 11, 10, "ISO-8859-8" },
{ 84, 12, "ISO-8859-8-E" },
{ 85, 12, "ISO-8859-8-I" },
{ 12, 10, "ISO-8859-9" },
{ 2003, 26, "ISO-8859-9-Windows-Latin-5" },
{ 4000, 10, "ISO-IR-182" },
{ 4002, 10, "ISO-IR-197" },
{ 1005, 20, "ISO-Unicode-IBM-1261" },
{ 1008, 20, "ISO-Unicode-IBM-1264" },
{ 1009, 20, "ISO-Unicode-IBM-1265" },
{ 1006, 20, "ISO-Unicode-IBM-1268" },
{ 1007, 20, "ISO-Unicode-IBM-1276" },
{ 96, 13, "ISO_10367-box" },
{ 73, 13, "ISO_2033-1983" },
{ 48, 8, "ISO_5427" },
{ 54, 13, "ISO_5427:1981" },
{ 55, 13, "ISO_5428:1980" },
{ 28, 18, "ISO_646.basic:1983" },
{ 30, 16, "ISO_646.irv:1983" },
{ 93, 13, "ISO_6937-2-25" },
{ 14, 14, "ISO_6937-2-add" },
{ 95, 13, "ISO_8859-supp" },
{ 22, 2, "IT" },
{ 41, 17, "JIS_C6220-1969-jp" },
{ 42, 17, "JIS_C6220-1969-ro" },
{ 49, 14, "JIS_C6226-1978" },
{ 63, 14, "JIS_C6226-1983" },
{ 67, 16, "JIS_C6229-1984-a" },
{ 68, 16, "JIS_C6229-1984-b" },
{ 69, 20, "JIS_C6229-1984-b-add" },
{ 70, 19, "JIS_C6229-1984-hand" },
{ 71, 23, "JIS_C6229-1984-hand-add" },
{ 72, 19, "JIS_C6229-1984-kana" },
{ 16, 12, "JIS_Encoding" },
{ 15, 9, "JIS_X0201" },
{ 98, 14, "JIS_X0212-1990" },
{ 4012, 5, "JOHAB" },
{ 87, 12, "JUS_I.B1.002" },
{ 90, 16, "JUS_I.B1.003-mac" },
{ 89, 17, "JUS_I.B1.003-serb" },
{ 2105, 13, "KOI7-switched" },
{ 2084, 6, "KOI8-R" },
{ 2088, 6, "KOI8-U" },
{ 102, 7, "KSC5636" },
{ 36, 14, "KS_C_5601-1987" },
{ 47, 13, "Latin-greek-1" },
{ 4011, 16, "MACCENTRALEUROPE" },
{ 4009, 11, "MACCYRILLIC" },
{ 4010, 10, "MACUKRAINE" },
{ 2081, 4, "MNEM" },
{ 2080, 8, "MNEMONIC" },
{ 62, 10, "MSZ_7795.3" },
{ 2023, 20, "Microsoft-Publishing" },
{ 33, 9, "NATS-DANO" },
{ 34, 13, "NATS-DANO-ADD" },
{ 31, 9, "NATS-SEFI" },
{ 32, 13, "NATS-SEFI-ADD" },
{ 92, 13, "NC_NC00-10:81" },
{ 26, 11, "NF_Z_62-010" },
{ 46, 18, "NF_Z_62-010_(1973)" },
{ 25, 9, "NS_4551-1" },
{ 58, 9, "NS_4551-2" },
{ 116, 19, "OSD_EBCDIC_DF03_IRV" },
{ 117, 17, "OSD_EBCDIC_DF04_1" },
{ 115, 18, "OSD_EBCDIC_DF04_15" },
{ 2012, 20, "PC8-Danish-Norwegian" },
{ 2014, 11, "PC8-Turkish" },
{ 43, 2, "PT" },
{ 60, 3, "PT2" },
{ 2103, 7, "PTCP154" },
{ 1011, 4, "SCSU" },
{ 35, 12, "SEN_850200_B" },
{ 21, 12, "SEN_850200_C" },
{ 17, 9, "Shift_JIS" },
{ 83, 8, "T.101-G2" },
{ 75, 9, "T.61-7bit" },
{ 76, 9, "T.61-8bit" },
{ 2259, 7, "TIS-620" },
{ 1010, 11, "UNICODE-1-1" },
{ 103, 17, "UNICODE-1-1-UTF-7" },
{ 2079, 12, "UNKNOWN-8BIT" },
{ 3, 8, "US-ASCII" },
{ 1015, 6, "UTF-16" },
{ 1013, 8, "UTF-16BE" },
{ 1014, 8, "UTF-16LE" },
{ 1017, 6, "UTF-32" },
{ 1018, 8, "UTF-32BE" },
{ 1019, 8, "UTF-32LE" },
{ 1012, 5, "UTF-7" },
{ 106, 5, "UTF-8" },
{ 2083, 4, "VIQR" },
{ 2082, 6, "VISCII" },
{ 2007, 21, "Ventura-International" },
{ 2022, 12, "Ventura-Math" },
{ 2006, 10, "Ventura-US" },
{ 2024, 11, "Windows-31J" },
{ 5002, 13, "X-ACORN-FUZZY" },
{ 5001, 14, "X-ACORN-LATIN1" },
{ 4999, 9, "X-CURRENT" },
{ 101, 5, "dk-us" },
{ 91, 11, "greek-ccitt" },
{ 64, 6, "greek7" },
{ 44, 10, "greek7-old" },
{ 2004, 9, "hp-roman8" },
{ 66, 9, "iso-ir-90" },
{ 45, 11, "latin-greek" },
{ 97, 9, "latin-lap" },
{ 2027, 9, "macintosh" },
{ 100, 5, "us-dk" },
{ 59, 14, "videotex-suppl" },
{ 2250, 12, "windows-1250" },
{ 2251, 12, "windows-1251" },
{ 2252, 12, "windows-1252" },
{ 2253, 12, "windows-1253" },
{ 2254, 12, "windows-1254" },
{ 2255, 12, "windows-1255" },
{ 2256, 12, "windows-1256" },
{ 2257, 12, "windows-1257" },
{ 2258, 12, "windows-1258" },
};
 
/* Number of entries in canonical_charset_names; must be kept equal to the
 * number of initialisers in that table */
static const uint16_t charset_aliases_canon_count = 262;
 
/*
 * One alias spelling and the canonical charset entry it resolves to.
 *
 * The charset_aliases table initialises these positionally as
 * { name_len, name, canon }, so the member order is significant.
 */
typedef struct {
/* Length of name; in the generated table this equals the number of
 * characters in the string, excluding the terminator */
uint16_t name_len;
/* The alias spelling. NOTE(review): the table entries visible here use
 * only lower-case letters and digits -- presumably a normalised form;
 * confirm against make-aliases.pl */
const char *name;
/* Element of canonical_charset_names that this alias maps to */
parserutils_charset_aliases_canon *canon;
} parserutils_charset_aliases_alias;
 
static parserutils_charset_aliases_alias charset_aliases[] = {
{ 3, "437", &canonical_charset_names[84] },
{ 3, "850", &canonical_charset_names[87] },
{ 3, "851", &canonical_charset_names[88] },
{ 3, "852", &canonical_charset_names[89] },
{ 3, "855", &canonical_charset_names[90] },
{ 3, "857", &canonical_charset_names[91] },
{ 3, "860", &canonical_charset_names[92] },
{ 3, "861", &canonical_charset_names[93] },
{ 3, "862", &canonical_charset_names[94] },
{ 3, "863", &canonical_charset_names[95] },
{ 3, "865", &canonical_charset_names[97] },
{ 3, "866", &canonical_charset_names[98] },
{ 3, "869", &canonical_charset_names[100] },
{ 5, "88591", &canonical_charset_names[125] },
{ 6, "885910", &canonical_charset_names[128] },
{ 6, "885911", &canonical_charset_names[129] },
{ 6, "885913", &canonical_charset_names[130] },
{ 6, "885914", &canonical_charset_names[131] },
{ 6, "885915", &canonical_charset_names[132] },
{ 5, "88592", &canonical_charset_names[134] },
{ 5, "88593", &canonical_charset_names[136] },
{ 5, "88594", &canonical_charset_names[137] },
{ 5, "88595", &canonical_charset_names[138] },
{ 5, "88597", &canonical_charset_names[142] },
{ 5, "88598", &canonical_charset_names[143] },
{ 5, "88599", &canonical_charset_names[146] },
{ 3, "904", &canonical_charset_names[106] },
{ 21, "adobestandardencoding", &canonical_charset_names[2] },
{ 19, "adobesymbolencoding", &canonical_charset_names[3] },
{ 7, "ami1251", &canonical_charset_names[4] },
{ 9, "amiga1251", &canonical_charset_names[4] },
{ 13, "ansix31101983", &canonical_charset_names[0] },
{ 11, "ansix341968", &canonical_charset_names[224] },
{ 11, "ansix341986", &canonical_charset_names[224] },
{ 6, "arabic", &canonical_charset_names[139] },
{ 7, "arabic7", &canonical_charset_names[1] },
{ 5, "ascii", &canonical_charset_names[224] },
{ 7, "asmo449", &canonical_charset_names[1] },
{ 7, "asmo708", &canonical_charset_names[139] },
{ 4, "big5", &canonical_charset_names[8] },
{ 9, "big5hkscs", &canonical_charset_names[9] },
{ 7, "bigfive", &canonical_charset_names[8] },
{ 5, "bocu1", &canonical_charset_names[5] },
{ 6, "bs4730", &canonical_charset_names[6] },
{ 10, "bsviewdata", &canonical_charset_names[7] },
{ 2, "ca", &canonical_charset_names[13] },
{ 10, "ccsid00858", &canonical_charset_names[54] },
{ 10, "ccsid00924", &canonical_charset_names[55] },
{ 10, "ccsid01140", &canonical_charset_names[56] },
{ 10, "ccsid01141", &canonical_charset_names[57] },
{ 10, "ccsid01142", &canonical_charset_names[58] },
{ 10, "ccsid01143", &canonical_charset_names[59] },
{ 10, "ccsid01144", &canonical_charset_names[60] },
{ 10, "ccsid01145", &canonical_charset_names[61] },
{ 10, "ccsid01146", &canonical_charset_names[62] },
{ 10, "ccsid01147", &canonical_charset_names[63] },
{ 10, "ccsid01148", &canonical_charset_names[64] },
{ 10, "ccsid01149", &canonical_charset_names[65] },
{ 5, "cesu8", &canonical_charset_names[10] },
{ 7, "chinese", &canonical_charset_names[45] },
{ 2, "cn", &canonical_charset_names[44] },
{ 6, "cnbig5", &canonical_charset_names[8] },
{ 4, "cngb", &canonical_charset_names[42] },
{ 7, "cp00858", &canonical_charset_names[54] },
{ 7, "cp00924", &canonical_charset_names[55] },
{ 7, "cp01140", &canonical_charset_names[56] },
{ 7, "cp01141", &canonical_charset_names[57] },
{ 7, "cp01142", &canonical_charset_names[58] },
{ 7, "cp01143", &canonical_charset_names[59] },
{ 7, "cp01144", &canonical_charset_names[60] },
{ 7, "cp01145", &canonical_charset_names[61] },
{ 7, "cp01146", &canonical_charset_names[62] },
{ 7, "cp01147", &canonical_charset_names[63] },
{ 7, "cp01148", &canonical_charset_names[64] },
{ 7, "cp01149", &canonical_charset_names[65] },
{ 5, "cp037", &canonical_charset_names[66] },
{ 5, "cp038", &canonical_charset_names[67] },
{ 6, "cp1026", &canonical_charset_names[68] },
{ 6, "cp1250", &canonical_charset_names[253] },
{ 6, "cp1251", &canonical_charset_names[254] },
{ 6, "cp1252", &canonical_charset_names[255] },
{ 6, "cp1253", &canonical_charset_names[256] },
{ 6, "cp1254", &canonical_charset_names[257] },
{ 6, "cp1256", &canonical_charset_names[259] },
{ 6, "cp1257", &canonical_charset_names[260] },
{ 5, "cp154", &canonical_charset_names[212] },
{ 5, "cp273", &canonical_charset_names[70] },
{ 5, "cp274", &canonical_charset_names[71] },
{ 5, "cp275", &canonical_charset_names[72] },
{ 5, "cp278", &canonical_charset_names[74] },
{ 5, "cp280", &canonical_charset_names[75] },
{ 5, "cp281", &canonical_charset_names[76] },
{ 5, "cp284", &canonical_charset_names[77] },
{ 5, "cp285", &canonical_charset_names[78] },
{ 5, "cp290", &canonical_charset_names[79] },
{ 5, "cp297", &canonical_charset_names[80] },
{ 5, "cp367", &canonical_charset_names[224] },
{ 5, "cp420", &canonical_charset_names[81] },
{ 5, "cp423", &canonical_charset_names[82] },
{ 5, "cp424", &canonical_charset_names[83] },
{ 5, "cp437", &canonical_charset_names[84] },
{ 5, "cp500", &canonical_charset_names[85] },
{ 5, "cp775", &canonical_charset_names[86] },
{ 5, "cp819", &canonical_charset_names[125] },
{ 5, "cp850", &canonical_charset_names[87] },
{ 5, "cp851", &canonical_charset_names[88] },
{ 5, "cp852", &canonical_charset_names[89] },
{ 5, "cp855", &canonical_charset_names[90] },
{ 5, "cp857", &canonical_charset_names[91] },
{ 5, "cp860", &canonical_charset_names[92] },
{ 5, "cp861", &canonical_charset_names[93] },
{ 5, "cp862", &canonical_charset_names[94] },
{ 5, "cp863", &canonical_charset_names[95] },
{ 5, "cp864", &canonical_charset_names[96] },
{ 5, "cp865", &canonical_charset_names[97] },
{ 5, "cp866", &canonical_charset_names[98] },
{ 5, "cp868", &canonical_charset_names[99] },
{ 5, "cp869", &canonical_charset_names[100] },
{ 5, "cp870", &canonical_charset_names[101] },
{ 5, "cp871", &canonical_charset_names[102] },
{ 5, "cp874", &canonical_charset_names[11] },
{ 5, "cp880", &canonical_charset_names[103] },
{ 5, "cp891", &canonical_charset_names[104] },
{ 5, "cp903", &canonical_charset_names[105] },
{ 5, "cp904", &canonical_charset_names[106] },
{ 5, "cp905", &canonical_charset_names[107] },
{ 5, "cp918", &canonical_charset_names[108] },
{ 5, "cp936", &canonical_charset_names[43] },
{ 5, "cp949", &canonical_charset_names[12] },
{ 4, "cpar", &canonical_charset_names[99] },
{ 4, "cpgr", &canonical_charset_names[100] },
{ 4, "cpis", &canonical_charset_names[93] },
{ 5, "csa71", &canonical_charset_names[13] },
{ 5, "csa72", &canonical_charset_names[14] },
{ 23, "csadobestandardencoding", &canonical_charset_names[2] },
{ 7, "csascii", &canonical_charset_names[224] },
{ 11, "csat5001983", &canonical_charset_names[0] },
{ 13, "csaz243419851", &canonical_charset_names[13] },
{ 13, "csaz243419852", &canonical_charset_names[14] },
{ 14, "csaz24341985gr", &canonical_charset_names[15] },
{ 6, "csbig5", &canonical_charset_names[8] },
{ 7, "csbocu1", &canonical_charset_names[5] },
{ 7, "cscesu8", &canonical_charset_names[10] },
{ 8, "csdecmcs", &canonical_charset_names[17] },
{ 6, "csdkus", &canonical_charset_names[242] },
{ 13, "csebcdicatdea", &canonical_charset_names[21] },
{ 12, "csebcdiccafr", &canonical_charset_names[22] },
{ 12, "csebcdicdkno", &canonical_charset_names[23] },
{ 13, "csebcdicdknoa", &canonical_charset_names[24] },
{ 10, "csebcdices", &canonical_charset_names[25] },
{ 11, "csebcdicesa", &canonical_charset_names[26] },
{ 11, "csebcdicess", &canonical_charset_names[27] },
{ 12, "csebcdicfise", &canonical_charset_names[28] },
{ 13, "csebcdicfisea", &canonical_charset_names[29] },
{ 10, "csebcdicfr", &canonical_charset_names[30] },
{ 10, "csebcdicit", &canonical_charset_names[31] },
{ 10, "csebcdicpt", &canonical_charset_names[32] },
{ 10, "csebcdicuk", &canonical_charset_names[33] },
{ 10, "csebcdicus", &canonical_charset_names[34] },
{ 19, "cseucfixwidjapanese", &canonical_charset_names[40] },
{ 7, "cseuckr", &canonical_charset_names[39] },
{ 19, "cseucpkdfmtjapanese", &canonical_charset_names[38] },
{ 8, "csgb2312", &canonical_charset_names[42] },
{ 19, "cshalfwidthkatakana", &canonical_charset_names[177] },
{ 11, "cshpdesktop", &canonical_charset_names[47] },
{ 9, "cshplegal", &canonical_charset_names[48] },
{ 9, "cshpmath8", &canonical_charset_names[49] },
{ 10, "cshppifont", &canonical_charset_names[50] },
{ 10, "cshppsmath", &canonical_charset_names[3] },
{ 10, "cshproman8", &canonical_charset_names[246] },
{ 9, "csibbm904", &canonical_charset_names[106] },
{ 8, "csibm037", &canonical_charset_names[66] },
{ 8, "csibm038", &canonical_charset_names[67] },
{ 9, "csibm1026", &canonical_charset_names[68] },
{ 8, "csibm273", &canonical_charset_names[70] },
{ 8, "csibm274", &canonical_charset_names[71] },
{ 8, "csibm275", &canonical_charset_names[72] },
{ 8, "csibm277", &canonical_charset_names[73] },
{ 8, "csibm278", &canonical_charset_names[74] },
{ 8, "csibm280", &canonical_charset_names[75] },
{ 8, "csibm281", &canonical_charset_names[76] },
{ 8, "csibm284", &canonical_charset_names[77] },
{ 8, "csibm285", &canonical_charset_names[78] },
{ 8, "csibm290", &canonical_charset_names[79] },
{ 8, "csibm297", &canonical_charset_names[80] },
{ 8, "csibm420", &canonical_charset_names[81] },
{ 8, "csibm423", &canonical_charset_names[82] },
{ 8, "csibm424", &canonical_charset_names[83] },
{ 8, "csibm500", &canonical_charset_names[85] },
{ 8, "csibm851", &canonical_charset_names[88] },
{ 8, "csibm855", &canonical_charset_names[90] },
{ 8, "csibm857", &canonical_charset_names[91] },
{ 8, "csibm860", &canonical_charset_names[92] },
{ 8, "csibm861", &canonical_charset_names[93] },
{ 8, "csibm863", &canonical_charset_names[95] },
{ 8, "csibm864", &canonical_charset_names[96] },
{ 8, "csibm865", &canonical_charset_names[97] },
{ 8, "csibm866", &canonical_charset_names[98] },
{ 8, "csibm868", &canonical_charset_names[99] },
{ 8, "csibm869", &canonical_charset_names[100] },
{ 8, "csibm870", &canonical_charset_names[101] },
{ 8, "csibm871", &canonical_charset_names[102] },
{ 8, "csibm880", &canonical_charset_names[103] },
{ 8, "csibm891", &canonical_charset_names[104] },
{ 8, "csibm903", &canonical_charset_names[105] },
{ 8, "csibm905", &canonical_charset_names[107] },
{ 8, "csibm918", &canonical_charset_names[108] },
{ 15, "csibmebcdicatde", &canonical_charset_names[20] },
{ 12, "csibmsymbols", &canonical_charset_names[52] },
{ 9, "csibmthai", &canonical_charset_names[53] },
{ 11, "csinvariant", &canonical_charset_names[113] },
{ 15, "csiso102t617bit", &canonical_charset_names[218] },
{ 13, "csiso10367box", &canonical_charset_names[155] },
{ 15, "csiso103t618bit", &canonical_charset_names[219] },
{ 14, "csiso10646utf1", &canonical_charset_names[117] },
{ 14, "csiso10swedish", &canonical_charset_names[214] },
{ 20, "csiso111ecmacyrillic", &canonical_charset_names[35] },
{ 22, "csiso11swedishfornames", &canonical_charset_names[215] },
{ 17, "csiso121canadian1", &canonical_charset_names[13] },
{ 17, "csiso122canadian2", &canonical_charset_names[14] },
{ 22, "csiso123csaz24341985gr", &canonical_charset_names[15] },
{ 14, "csiso128t101g2", &canonical_charset_names[217] },
{ 17, "csiso139csn369103", &canonical_charset_names[16] },
{ 17, "csiso13jisc6220jp", &canonical_charset_names[166] },
{ 17, "csiso141jusib1002", &canonical_charset_names[180] },
{ 15, "csiso143iecp271", &canonical_charset_names[109] },
{ 15, "csiso146serbian", &canonical_charset_names[182] },
{ 18, "csiso147macedonian", &canonical_charset_names[181] },
{ 17, "csiso14jisc6220ro", &canonical_charset_names[167] },
{ 8, "csiso150", &canonical_charset_names[243] },
{ 18, "csiso150greekccitt", &canonical_charset_names[243] },
{ 12, "csiso151cuba", &canonical_charset_names[200] },
{ 19, "csiso153gost1976874", &canonical_charset_names[46] },
{ 11, "csiso158lap", &canonical_charset_names[249] },
{ 20, "csiso159jisx02121990", &canonical_charset_names[178] },
{ 14, "csiso15italian", &canonical_charset_names[165] },
{ 17, "csiso16portuguese", &canonical_charset_names[210] },
{ 14, "csiso17spanish", &canonical_charset_names[36] },
{ 16, "csiso18greek7old", &canonical_charset_names[245] },
{ 17, "csiso19latingreek", &canonical_charset_names[248] },
{ 11, "csiso2022jp", &canonical_charset_names[121] },
{ 12, "csiso2022jp2", &canonical_charset_names[123] },
{ 11, "csiso2022kr", &canonical_charset_names[124] },
{ 9, "csiso2033", &canonical_charset_names[156] },
{ 13, "csiso21german", &canonical_charset_names[18] },
{ 13, "csiso25french", &canonical_charset_names[202] },
{ 18, "csiso27latingreek1", &canonical_charset_names[188] },
{ 20, "csiso2intlrefversion", &canonical_charset_names[161] },
{ 19, "csiso42jisc62261978", &canonical_charset_names[168] },
{ 17, "csiso47bsviewdata", &canonical_charset_names[7] },
{ 11, "csiso49inis", &canonical_charset_names[110] },
{ 19, "csiso4unitedkingdom", &canonical_charset_names[6] },
{ 12, "csiso50inis8", &canonical_charset_names[111] },
{ 19, "csiso51iniscyrillic", &canonical_charset_names[112] },
{ 17, "csiso5427cyrillic", &canonical_charset_names[157] },
{ 14, "csiso5428greek", &canonical_charset_names[159] },
{ 13, "csiso57gb1988", &canonical_charset_names[44] },
{ 15, "csiso58gb231280", &canonical_charset_names[45] },
{ 22, "csiso60danishnorwegian", &canonical_charset_names[203] },
{ 17, "csiso60norwegian1", &canonical_charset_names[203] },
{ 17, "csiso61norwegian2", &canonical_charset_names[204] },
{ 17, "csiso646basic1983", &canonical_charset_names[160] },
{ 14, "csiso646danish", &canonical_charset_names[19] },
{ 12, "csiso6937add", &canonical_charset_names[162] },
{ 13, "csiso69french", &canonical_charset_names[201] },
{ 20, "csiso70videotexsupp1", &canonical_charset_names[252] },
{ 18, "csiso84portuguese2", &canonical_charset_names[211] },
{ 15, "csiso85spanish2", &canonical_charset_names[37] },
{ 16, "csiso86hungarian", &canonical_charset_names[194] },
{ 15, "csiso87jisx0208", &canonical_charset_names[169] },
{ 11, "csiso88596e", &canonical_charset_names[140] },
{ 11, "csiso88596i", &canonical_charset_names[141] },
{ 11, "csiso88598e", &canonical_charset_names[144] },
{ 11, "csiso88598i", &canonical_charset_names[145] },
{ 13, "csiso8859supp", &canonical_charset_names[164] },
{ 13, "csiso88greek7", &canonical_charset_names[244] },
{ 14, "csiso89asmo449", &canonical_charset_names[1] },
{ 7, "csiso90", &canonical_charset_names[247] },
{ 20, "csiso91jisc62291984a", &canonical_charset_names[170] },
{ 20, "csiso92jisc62991984b", &canonical_charset_names[171] },
{ 22, "csiso93jis62291984badd", &canonical_charset_names[172] },
{ 22, "csiso94jis62291984hand", &canonical_charset_names[173] },
{ 25, "csiso95jis62291984handadd", &canonical_charset_names[174] },
{ 23, "csiso96jisc62291984kana", &canonical_charset_names[175] },
{ 13, "csiso99naplps", &canonical_charset_names[0] },
{ 11, "csisolatin1", &canonical_charset_names[125] },
{ 11, "csisolatin2", &canonical_charset_names[134] },
{ 11, "csisolatin3", &canonical_charset_names[136] },
{ 11, "csisolatin4", &canonical_charset_names[137] },
{ 11, "csisolatin5", &canonical_charset_names[146] },
{ 11, "csisolatin6", &canonical_charset_names[128] },
{ 16, "csisolatinarabic", &canonical_charset_names[139] },
{ 18, "csisolatincyrillic", &canonical_charset_names[138] },
{ 15, "csisolatingreek", &canonical_charset_names[142] },
{ 16, "csisolatinhebrew", &canonical_charset_names[143] },
{ 13, "csisotextcomm", &canonical_charset_names[163] },
{ 13, "csjisencoding", &canonical_charset_names[176] },
{ 7, "cskoi8r", &canonical_charset_names[184] },
{ 13, "csksc56011987", &canonical_charset_names[187] },
{ 9, "csksc5636", &canonical_charset_names[186] },
{ 11, "csmacintosh", &canonical_charset_names[250] },
{ 21, "csmicrosoftpublishing", &canonical_charset_names[195] },
{ 6, "csmnem", &canonical_charset_names[192] },
{ 10, "csmnemonic", &canonical_charset_names[193] },
{ 9, "csn369103", &canonical_charset_names[16] },
{ 10, "csnatsdano", &canonical_charset_names[196] },
{ 13, "csnatsdanoadd", &canonical_charset_names[197] },
{ 10, "csnatssefi", &canonical_charset_names[198] },
{ 13, "csnatssefiadd", &canonical_charset_names[199] },
{ 13, "cspc775baltic", &canonical_charset_names[86] },
{ 19, "cspc850multilingual", &canonical_charset_names[87] },
{ 18, "cspc862latinhebrew", &canonical_charset_names[94] },
{ 16, "cspc8codepage437", &canonical_charset_names[84] },
{ 20, "cspc8danishnorwegian", &canonical_charset_names[208] },
{ 12, "cspc8turkish", &canonical_charset_names[209] },
{ 8, "cspcp852", &canonical_charset_names[89] },
{ 9, "csptcp154", &canonical_charset_names[212] },
{ 10, "csshiftjis", &canonical_charset_names[216] },
{ 6, "csucs4", &canonical_charset_names[115] },
{ 9, "csunicode", &canonical_charset_names[114] },
{ 11, "csunicode11", &canonical_charset_names[221] },
{ 15, "csunicode11utf7", &canonical_charset_names[222] },
{ 14, "csunicodeascii", &canonical_charset_names[116] },
{ 16, "csunicodeibm1261", &canonical_charset_names[150] },
{ 16, "csunicodeibm1264", &canonical_charset_names[151] },
{ 16, "csunicodeibm1265", &canonical_charset_names[152] },
{ 16, "csunicodeibm1268", &canonical_charset_names[153] },
{ 16, "csunicodeibm1276", &canonical_charset_names[154] },
{ 15, "csunicodelatin1", &canonical_charset_names[118] },
{ 13, "csunknown8bit", &canonical_charset_names[223] },
{ 6, "csusdk", &canonical_charset_names[251] },
{ 22, "csventurainternational", &canonical_charset_names[235] },
{ 13, "csventuramath", &canonical_charset_names[236] },
{ 11, "csventuraus", &canonical_charset_names[237] },
{ 6, "csviqr", &canonical_charset_names[233] },
{ 8, "csviscii", &canonical_charset_names[234] },
{ 17, "cswindows30latin1", &canonical_charset_names[126] },
{ 12, "cswindows31j", &canonical_charset_names[238] },
{ 17, "cswindows31latin1", &canonical_charset_names[127] },
{ 17, "cswindows31latin2", &canonical_charset_names[135] },
{ 17, "cswindows31latin5", &canonical_charset_names[147] },
{ 4, "cuba", &canonical_charset_names[200] },
{ 8, "cyrillic", &canonical_charset_names[138] },
{ 13, "cyrillicasian", &canonical_charset_names[212] },
{ 2, "de", &canonical_charset_names[18] },
{ 3, "dec", &canonical_charset_names[17] },
{ 6, "decmcs", &canonical_charset_names[17] },
{ 8, "din66003", &canonical_charset_names[18] },
{ 2, "dk", &canonical_charset_names[19] },
{ 4, "dkus", &canonical_charset_names[242] },
{ 6, "ds2089", &canonical_charset_names[19] },
{ 4, "e13b", &canonical_charset_names[156] },
{ 10, "ebcdicatde", &canonical_charset_names[20] },
{ 11, "ebcdicatdea", &canonical_charset_names[21] },
{ 8, "ebcdicbe", &canonical_charset_names[71] },
{ 8, "ebcdicbr", &canonical_charset_names[72] },
{ 10, "ebcdiccafr", &canonical_charset_names[22] },
{ 11, "ebcdiccpar1", &canonical_charset_names[81] },
{ 11, "ebcdiccpar2", &canonical_charset_names[108] },
{ 10, "ebcdiccpbe", &canonical_charset_names[85] },
{ 10, "ebcdiccpca", &canonical_charset_names[66] },
{ 10, "ebcdiccpch", &canonical_charset_names[85] },
{ 10, "ebcdiccpdk", &canonical_charset_names[73] },
{ 10, "ebcdiccpes", &canonical_charset_names[77] },
{ 10, "ebcdiccpfi", &canonical_charset_names[74] },
{ 10, "ebcdiccpfr", &canonical_charset_names[80] },
{ 10, "ebcdiccpgb", &canonical_charset_names[78] },
{ 10, "ebcdiccpgr", &canonical_charset_names[82] },
{ 10, "ebcdiccphe", &canonical_charset_names[83] },
{ 10, "ebcdiccpis", &canonical_charset_names[102] },
{ 10, "ebcdiccpit", &canonical_charset_names[75] },
{ 10, "ebcdiccpnl", &canonical_charset_names[66] },
{ 10, "ebcdiccpno", &canonical_charset_names[73] },
{ 13, "ebcdiccproece", &canonical_charset_names[101] },
{ 10, "ebcdiccpse", &canonical_charset_names[74] },
{ 10, "ebcdiccptr", &canonical_charset_names[107] },
{ 10, "ebcdiccpus", &canonical_charset_names[66] },
{ 10, "ebcdiccpwt", &canonical_charset_names[66] },
{ 10, "ebcdiccpyu", &canonical_charset_names[101] },
{ 14, "ebcdiccyrillic", &canonical_charset_names[103] },
{ 15, "ebcdicde273euro", &canonical_charset_names[57] },
{ 15, "ebcdicdk277euro", &canonical_charset_names[58] },
{ 10, "ebcdicdkno", &canonical_charset_names[23] },
{ 11, "ebcdicdknoa", &canonical_charset_names[24] },
{ 8, "ebcdices", &canonical_charset_names[25] },
{ 15, "ebcdices284euro", &canonical_charset_names[61] },
{ 9, "ebcdicesa", &canonical_charset_names[26] },
{ 9, "ebcdicess", &canonical_charset_names[27] },
{ 15, "ebcdicfi278euro", &canonical_charset_names[59] },
{ 10, "ebcdicfise", &canonical_charset_names[28] },
{ 11, "ebcdicfisea", &canonical_charset_names[29] },
{ 8, "ebcdicfr", &canonical_charset_names[30] },
{ 15, "ebcdicfr297euro", &canonical_charset_names[63] },
{ 15, "ebcdicgb285euro", &canonical_charset_names[62] },
{ 9, "ebcdicint", &canonical_charset_names[67] },
{ 26, "ebcdicinternational500euro", &canonical_charset_names[64] },
{ 15, "ebcdicis871euro", &canonical_charset_names[65] },
{ 8, "ebcdicit", &canonical_charset_names[31] },
{ 15, "ebcdicit280euro", &canonical_charset_names[60] },
{ 9, "ebcdicjpe", &canonical_charset_names[76] },
{ 12, "ebcdicjpkana", &canonical_charset_names[79] },
{ 16, "ebcdiclatin9euro", &canonical_charset_names[55] },
{ 15, "ebcdicno277euro", &canonical_charset_names[58] },
{ 8, "ebcdicpt", &canonical_charset_names[32] },
{ 15, "ebcdicse278euro", &canonical_charset_names[59] },
{ 8, "ebcdicuk", &canonical_charset_names[33] },
{ 8, "ebcdicus", &canonical_charset_names[34] },
{ 14, "ebcdicus37euro", &canonical_charset_names[56] },
{ 7, "ecma114", &canonical_charset_names[139] },
{ 7, "ecma118", &canonical_charset_names[142] },
{ 12, "ecmacyrillic", &canonical_charset_names[35] },
{ 7, "elot928", &canonical_charset_names[142] },
{ 2, "es", &canonical_charset_names[36] },
{ 3, "es2", &canonical_charset_names[37] },
{ 5, "euccn", &canonical_charset_names[42] },
{ 5, "eucjp", &canonical_charset_names[38] },
{ 5, "euckr", &canonical_charset_names[39] },
{ 37, "extendedunixcodefixedwidthforjapanese", &canonical_charset_names[40] },
{ 39, "extendedunixcodepackedformatforjapanese", &canonical_charset_names[38] },
{ 2, "fi", &canonical_charset_names[214] },
{ 2, "fr", &canonical_charset_names[201] },
{ 2, "gb", &canonical_charset_names[6] },
{ 7, "gb18030", &canonical_charset_names[41] },
{ 8, "gb198880", &canonical_charset_names[44] },
{ 6, "gb2312", &canonical_charset_names[42] },
{ 8, "gb231280", &canonical_charset_names[45] },
{ 3, "gbk", &canonical_charset_names[43] },
{ 11, "gost1976874", &canonical_charset_names[46] },
{ 5, "greek", &canonical_charset_names[142] },
{ 6, "greek7", &canonical_charset_names[244] },
{ 9, "greek7old", &canonical_charset_names[245] },
{ 6, "greek8", &canonical_charset_names[142] },
{ 10, "greekccitt", &canonical_charset_names[243] },
{ 6, "hebrew", &canonical_charset_names[143] },
{ 9, "hpdesktop", &canonical_charset_names[47] },
{ 7, "hplegal", &canonical_charset_names[48] },
{ 7, "hpmath8", &canonical_charset_names[49] },
{ 8, "hppifont", &canonical_charset_names[50] },
{ 8, "hproman8", &canonical_charset_names[246] },
{ 2, "hu", &canonical_charset_names[194] },
{ 8, "hzgb2312", &canonical_charset_names[51] },
{ 8, "ibm00858", &canonical_charset_names[54] },
{ 8, "ibm00924", &canonical_charset_names[55] },
{ 8, "ibm01140", &canonical_charset_names[56] },
{ 8, "ibm01141", &canonical_charset_names[57] },
{ 8, "ibm01142", &canonical_charset_names[58] },
{ 8, "ibm01143", &canonical_charset_names[59] },
{ 8, "ibm01144", &canonical_charset_names[60] },
{ 8, "ibm01145", &canonical_charset_names[61] },
{ 8, "ibm01146", &canonical_charset_names[62] },
{ 8, "ibm01147", &canonical_charset_names[63] },
{ 8, "ibm01148", &canonical_charset_names[64] },
{ 8, "ibm01149", &canonical_charset_names[65] },
{ 6, "ibm037", &canonical_charset_names[66] },
{ 6, "ibm038", &canonical_charset_names[67] },
{ 7, "ibm1026", &canonical_charset_names[68] },
{ 7, "ibm1047", &canonical_charset_names[69] },
{ 6, "ibm273", &canonical_charset_names[70] },
{ 6, "ibm274", &canonical_charset_names[71] },
{ 6, "ibm275", &canonical_charset_names[72] },
{ 6, "ibm277", &canonical_charset_names[73] },
{ 6, "ibm278", &canonical_charset_names[74] },
{ 6, "ibm280", &canonical_charset_names[75] },
{ 6, "ibm281", &canonical_charset_names[76] },
{ 6, "ibm284", &canonical_charset_names[77] },
{ 6, "ibm285", &canonical_charset_names[78] },
{ 6, "ibm290", &canonical_charset_names[79] },
{ 6, "ibm297", &canonical_charset_names[80] },
{ 6, "ibm367", &canonical_charset_names[224] },
{ 6, "ibm420", &canonical_charset_names[81] },
{ 6, "ibm423", &canonical_charset_names[82] },
{ 6, "ibm424", &canonical_charset_names[83] },
{ 6, "ibm437", &canonical_charset_names[84] },
{ 6, "ibm500", &canonical_charset_names[85] },
{ 6, "ibm775", &canonical_charset_names[86] },
{ 6, "ibm819", &canonical_charset_names[125] },
{ 6, "ibm850", &canonical_charset_names[87] },
{ 6, "ibm851", &canonical_charset_names[88] },
{ 6, "ibm852", &canonical_charset_names[89] },
{ 6, "ibm855", &canonical_charset_names[90] },
{ 6, "ibm857", &canonical_charset_names[91] },
{ 6, "ibm860", &canonical_charset_names[92] },
{ 6, "ibm861", &canonical_charset_names[93] },
{ 6, "ibm862", &canonical_charset_names[94] },
{ 6, "ibm863", &canonical_charset_names[95] },
{ 6, "ibm864", &canonical_charset_names[96] },
{ 6, "ibm865", &canonical_charset_names[97] },
{ 6, "ibm866", &canonical_charset_names[98] },
{ 6, "ibm868", &canonical_charset_names[99] },
{ 6, "ibm869", &canonical_charset_names[100] },
{ 6, "ibm870", &canonical_charset_names[101] },
{ 6, "ibm871", &canonical_charset_names[102] },
{ 6, "ibm880", &canonical_charset_names[103] },
{ 6, "ibm891", &canonical_charset_names[104] },
{ 6, "ibm903", &canonical_charset_names[105] },
{ 6, "ibm904", &canonical_charset_names[106] },
{ 6, "ibm905", &canonical_charset_names[107] },
{ 6, "ibm918", &canonical_charset_names[108] },
{ 10, "ibmsymbols", &canonical_charset_names[52] },
{ 7, "ibmthai", &canonical_charset_names[53] },
{ 7, "iecp271", &canonical_charset_names[109] },
{ 4, "inis", &canonical_charset_names[110] },
{ 5, "inis8", &canonical_charset_names[111] },
{ 12, "iniscyrillic", &canonical_charset_names[112] },
{ 9, "invariant", &canonical_charset_names[113] },
{ 3, "irv", &canonical_charset_names[161] },
{ 11, "iso10367box", &canonical_charset_names[155] },
{ 8, "iso10646", &canonical_charset_names[118] },
{ 12, "iso10646ucs2", &canonical_charset_names[114] },
{ 12, "iso10646ucs4", &canonical_charset_names[115] },
{ 16, "iso10646ucsbasic", &canonical_charset_names[116] },
{ 21, "iso10646unicodelatin1", &canonical_charset_names[118] },
{ 12, "iso10646utf1", &canonical_charset_names[117] },
{ 9, "iso2022cn", &canonical_charset_names[119] },
{ 12, "iso2022cnext", &canonical_charset_names[120] },
{ 9, "iso2022jp", &canonical_charset_names[121] },
{ 10, "iso2022jp1", &canonical_charset_names[122] },
{ 10, "iso2022jp2", &canonical_charset_names[123] },
{ 9, "iso2022kr", &canonical_charset_names[124] },
{ 11, "iso20331983", &canonical_charset_names[156] },
{ 7, "iso5427", &canonical_charset_names[157] },
{ 11, "iso54271981", &canonical_charset_names[158] },
{ 19, "iso5427cyrillic1981", &canonical_charset_names[158] },
{ 11, "iso54281980", &canonical_charset_names[159] },
{ 15, "iso646basic1983", &canonical_charset_names[160] },
{ 8, "iso646ca", &canonical_charset_names[13] },
{ 9, "iso646ca2", &canonical_charset_names[14] },
{ 8, "iso646cn", &canonical_charset_names[44] },
{ 8, "iso646cu", &canonical_charset_names[200] },
{ 8, "iso646de", &canonical_charset_names[18] },
{ 8, "iso646dk", &canonical_charset_names[19] },
{ 8, "iso646es", &canonical_charset_names[36] },
{ 9, "iso646es2", &canonical_charset_names[37] },
{ 8, "iso646fi", &canonical_charset_names[214] },
{ 8, "iso646fr", &canonical_charset_names[201] },
{ 9, "iso646fr1", &canonical_charset_names[202] },
{ 8, "iso646gb", &canonical_charset_names[6] },
{ 8, "iso646hu", &canonical_charset_names[194] },
{ 13, "iso646irv1983", &canonical_charset_names[161] },
{ 13, "iso646irv1991", &canonical_charset_names[224] },
{ 8, "iso646it", &canonical_charset_names[165] },
{ 8, "iso646jp", &canonical_charset_names[167] },
{ 12, "iso646jpocrb", &canonical_charset_names[171] },
{ 8, "iso646kr", &canonical_charset_names[186] },
{ 8, "iso646no", &canonical_charset_names[203] },
{ 9, "iso646no2", &canonical_charset_names[204] },
{ 8, "iso646pt", &canonical_charset_names[210] },
{ 9, "iso646pt2", &canonical_charset_names[211] },
{ 8, "iso646se", &canonical_charset_names[214] },
{ 9, "iso646se2", &canonical_charset_names[215] },
{ 8, "iso646us", &canonical_charset_names[224] },
{ 8, "iso646yu", &canonical_charset_names[180] },
{ 10, "iso6937225", &canonical_charset_names[162] },
{ 11, "iso69372add", &canonical_charset_names[163] },
{ 8, "iso88591", &canonical_charset_names[125] },
{ 9, "iso885910", &canonical_charset_names[128] },
{ 13, "iso8859101992", &canonical_charset_names[128] },
{ 9, "iso885911", &canonical_charset_names[129] },
{ 12, "iso885911987", &canonical_charset_names[125] },
{ 9, "iso885913", &canonical_charset_names[130] },
{ 9, "iso885914", &canonical_charset_names[131] },
{ 13, "iso8859141998", &canonical_charset_names[131] },
{ 9, "iso885915", &canonical_charset_names[132] },
{ 9, "iso885916", &canonical_charset_names[133] },
{ 13, "iso8859162001", &canonical_charset_names[133] },
{ 23, "iso88591windows30latin1", &canonical_charset_names[126] },
{ 23, "iso88591windows31latin1", &canonical_charset_names[127] },
{ 8, "iso88592", &canonical_charset_names[134] },
{ 12, "iso885921987", &canonical_charset_names[134] },
{ 21, "iso88592windowslatin2", &canonical_charset_names[135] },
{ 8, "iso88593", &canonical_charset_names[136] },
{ 12, "iso885931988", &canonical_charset_names[136] },
{ 8, "iso88594", &canonical_charset_names[137] },
{ 12, "iso885941988", &canonical_charset_names[137] },
{ 8, "iso88595", &canonical_charset_names[138] },
{ 12, "iso885951988", &canonical_charset_names[138] },
{ 8, "iso88596", &canonical_charset_names[139] },
{ 12, "iso885961987", &canonical_charset_names[139] },
{ 9, "iso88596e", &canonical_charset_names[140] },
{ 9, "iso88596i", &canonical_charset_names[141] },
{ 8, "iso88597", &canonical_charset_names[142] },
{ 12, "iso885971987", &canonical_charset_names[142] },
{ 8, "iso88598", &canonical_charset_names[143] },
{ 12, "iso885981988", &canonical_charset_names[143] },
{ 9, "iso88598e", &canonical_charset_names[144] },
{ 9, "iso88598i", &canonical_charset_names[145] },
{ 8, "iso88599", &canonical_charset_names[146] },
{ 12, "iso885991989", &canonical_charset_names[146] },
{ 21, "iso88599windowslatin5", &canonical_charset_names[147] },
{ 11, "iso8859supp", &canonical_charset_names[164] },
{ 7, "iso9036", &canonical_charset_names[1] },
{ 9, "isoceltic", &canonical_charset_names[131] },
{ 7, "isoir10", &canonical_charset_names[214] },
{ 8, "isoir100", &canonical_charset_names[125] },
{ 8, "isoir101", &canonical_charset_names[134] },
{ 8, "isoir102", &canonical_charset_names[218] },
{ 8, "isoir103", &canonical_charset_names[219] },
{ 8, "isoir109", &canonical_charset_names[136] },
{ 7, "isoir11", &canonical_charset_names[215] },
{ 8, "isoir110", &canonical_charset_names[137] },
{ 8, "isoir111", &canonical_charset_names[35] },
{ 8, "isoir121", &canonical_charset_names[13] },
{ 8, "isoir122", &canonical_charset_names[14] },
{ 8, "isoir123", &canonical_charset_names[15] },
{ 8, "isoir126", &canonical_charset_names[142] },
{ 8, "isoir127", &canonical_charset_names[139] },
{ 8, "isoir128", &canonical_charset_names[217] },
{ 7, "isoir13", &canonical_charset_names[166] },
{ 8, "isoir138", &canonical_charset_names[143] },
{ 8, "isoir139", &canonical_charset_names[16] },
{ 7, "isoir14", &canonical_charset_names[167] },
{ 8, "isoir141", &canonical_charset_names[180] },
{ 8, "isoir142", &canonical_charset_names[163] },
{ 8, "isoir143", &canonical_charset_names[109] },
{ 8, "isoir144", &canonical_charset_names[138] },
{ 8, "isoir146", &canonical_charset_names[182] },
{ 8, "isoir147", &canonical_charset_names[181] },
{ 8, "isoir148", &canonical_charset_names[146] },
{ 8, "isoir149", &canonical_charset_names[187] },
{ 7, "isoir15", &canonical_charset_names[165] },
{ 8, "isoir150", &canonical_charset_names[243] },
{ 8, "isoir151", &canonical_charset_names[200] },
{ 8, "isoir152", &canonical_charset_names[162] },
{ 8, "isoir153", &canonical_charset_names[46] },
{ 8, "isoir154", &canonical_charset_names[164] },
{ 8, "isoir155", &canonical_charset_names[155] },
{ 8, "isoir157", &canonical_charset_names[128] },
{ 8, "isoir158", &canonical_charset_names[249] },
{ 8, "isoir159", &canonical_charset_names[178] },
{ 7, "isoir16", &canonical_charset_names[210] },
{ 8, "isoir166", &canonical_charset_names[129] },
{ 7, "isoir17", &canonical_charset_names[36] },
{ 7, "isoir18", &canonical_charset_names[245] },
{ 8, "isoir182", &canonical_charset_names[148] },
{ 7, "isoir19", &canonical_charset_names[248] },
{ 8, "isoir197", &canonical_charset_names[149] },
{ 8, "isoir199", &canonical_charset_names[131] },
{ 6, "isoir2", &canonical_charset_names[161] },
{ 7, "isoir21", &canonical_charset_names[18] },
{ 8, "isoir226", &canonical_charset_names[133] },
{ 7, "isoir25", &canonical_charset_names[202] },
{ 7, "isoir27", &canonical_charset_names[188] },
{ 7, "isoir37", &canonical_charset_names[157] },
{ 6, "isoir4", &canonical_charset_names[6] },
{ 7, "isoir42", &canonical_charset_names[168] },
{ 7, "isoir47", &canonical_charset_names[7] },
{ 7, "isoir49", &canonical_charset_names[110] },
{ 7, "isoir50", &canonical_charset_names[111] },
{ 7, "isoir51", &canonical_charset_names[112] },
{ 7, "isoir54", &canonical_charset_names[158] },
{ 7, "isoir55", &canonical_charset_names[159] },
{ 7, "isoir57", &canonical_charset_names[44] },
{ 7, "isoir58", &canonical_charset_names[45] },
{ 6, "isoir6", &canonical_charset_names[224] },
{ 7, "isoir60", &canonical_charset_names[203] },
{ 7, "isoir61", &canonical_charset_names[204] },
{ 7, "isoir69", &canonical_charset_names[201] },
{ 7, "isoir70", &canonical_charset_names[252] },
{ 7, "isoir81", &canonical_charset_names[198] },
{ 7, "isoir82", &canonical_charset_names[199] },
{ 7, "isoir84", &canonical_charset_names[211] },
{ 7, "isoir85", &canonical_charset_names[37] },
{ 7, "isoir86", &canonical_charset_names[194] },
{ 7, "isoir87", &canonical_charset_names[169] },
{ 7, "isoir88", &canonical_charset_names[244] },
{ 7, "isoir89", &canonical_charset_names[1] },
{ 7, "isoir90", &canonical_charset_names[247] },
{ 7, "isoir91", &canonical_charset_names[196] },
{ 7, "isoir92", &canonical_charset_names[197] },
{ 7, "isoir93", &canonical_charset_names[172] },
{ 7, "isoir94", &canonical_charset_names[173] },
{ 7, "isoir95", &canonical_charset_names[174] },
{ 7, "isoir96", &canonical_charset_names[175] },
{ 7, "isoir98", &canonical_charset_names[156] },
{ 7, "isoir99", &canonical_charset_names[0] },
{ 17, "isounicodeibm1261", &canonical_charset_names[150] },
{ 17, "isounicodeibm1264", &canonical_charset_names[151] },
{ 17, "isounicodeibm1265", &canonical_charset_names[152] },
{ 17, "isounicodeibm1268", &canonical_charset_names[153] },
{ 17, "isounicodeibm1276", &canonical_charset_names[154] },
{ 2, "it", &canonical_charset_names[165] },
{ 12, "jisc62201969", &canonical_charset_names[166] },
{ 14, "jisc62201969jp", &canonical_charset_names[166] },
{ 14, "jisc62201969ro", &canonical_charset_names[167] },
{ 12, "jisc62261978", &canonical_charset_names[168] },
{ 12, "jisc62261983", &canonical_charset_names[169] },
{ 13, "jisc62291984a", &canonical_charset_names[170] },
{ 13, "jisc62291984b", &canonical_charset_names[171] },
{ 16, "jisc62291984badd", &canonical_charset_names[172] },
{ 16, "jisc62291984hand", &canonical_charset_names[173] },
{ 19, "jisc62291984handadd", &canonical_charset_names[174] },
{ 16, "jisc62291984kana", &canonical_charset_names[175] },
{ 11, "jisencoding", &canonical_charset_names[176] },
{ 8, "jisx0201", &canonical_charset_names[177] },
{ 12, "jisx02081983", &canonical_charset_names[169] },
{ 12, "jisx02121990", &canonical_charset_names[178] },
{ 5, "johab", &canonical_charset_names[179] },
{ 2, "jp", &canonical_charset_names[167] },
{ 6, "jpocra", &canonical_charset_names[170] },
{ 6, "jpocrb", &canonical_charset_names[171] },
{ 9, "jpocrbadd", &canonical_charset_names[172] },
{ 9, "jpocrhand", &canonical_charset_names[173] },
{ 12, "jpocrhandadd", &canonical_charset_names[174] },
{ 2, "js", &canonical_charset_names[180] },
{ 9, "jusib1002", &canonical_charset_names[180] },
{ 12, "jusib1003mac", &canonical_charset_names[181] },
{ 13, "jusib1003serb", &canonical_charset_names[182] },
{ 8, "katakana", &canonical_charset_names[166] },
{ 12, "koi7switched", &canonical_charset_names[183] },
{ 5, "koi8e", &canonical_charset_names[35] },
{ 5, "koi8r", &canonical_charset_names[184] },
{ 5, "koi8u", &canonical_charset_names[185] },
{ 6, "korean", &canonical_charset_names[187] },
{ 7, "ksc5601", &canonical_charset_names[187] },
{ 11, "ksc56011987", &canonical_charset_names[187] },
{ 11, "ksc56011989", &canonical_charset_names[187] },
{ 7, "ksc5636", &canonical_charset_names[186] },
{ 2, "l1", &canonical_charset_names[125] },
{ 3, "l10", &canonical_charset_names[133] },
{ 2, "l2", &canonical_charset_names[134] },
{ 2, "l3", &canonical_charset_names[136] },
{ 2, "l4", &canonical_charset_names[137] },
{ 2, "l5", &canonical_charset_names[146] },
{ 2, "l6", &canonical_charset_names[128] },
{ 2, "l8", &canonical_charset_names[131] },
{ 3, "lap", &canonical_charset_names[249] },
{ 6, "latin1", &canonical_charset_names[125] },
{ 7, "latin10", &canonical_charset_names[133] },
{ 8, "latin125", &canonical_charset_names[164] },
{ 6, "latin2", &canonical_charset_names[134] },
{ 6, "latin3", &canonical_charset_names[136] },
{ 6, "latin4", &canonical_charset_names[137] },
{ 6, "latin5", &canonical_charset_names[146] },
{ 6, "latin6", &canonical_charset_names[128] },
{ 6, "latin8", &canonical_charset_names[131] },
{ 6, "latin9", &canonical_charset_names[132] },
{ 10, "latingreek", &canonical_charset_names[248] },
{ 11, "latingreek1", &canonical_charset_names[188] },
{ 8, "latinlap", &canonical_charset_names[249] },
{ 3, "mac", &canonical_charset_names[250] },
{ 16, "maccentraleurope", &canonical_charset_names[189] },
{ 18, "maccentraleurroman", &canonical_charset_names[189] },
{ 11, "maccyrillic", &canonical_charset_names[190] },
{ 10, "macedonian", &canonical_charset_names[181] },
{ 9, "macintosh", &canonical_charset_names[250] },
{ 8, "macroman", &canonical_charset_names[250] },
{ 10, "macukraine", &canonical_charset_names[191] },
{ 12, "macukrainian", &canonical_charset_names[191] },
{ 19, "microsoftpublishing", &canonical_charset_names[195] },
{ 4, "mnem", &canonical_charset_names[192] },
{ 8, "mnemonic", &canonical_charset_names[193] },
{ 5, "ms936", &canonical_charset_names[43] },
{ 6, "msansi", &canonical_charset_names[255] },
{ 6, "msarab", &canonical_charset_names[259] },
{ 6, "mscyrl", &canonical_charset_names[254] },
{ 4, "msee", &canonical_charset_names[253] },
{ 7, "msgreek", &canonical_charset_names[256] },
{ 7, "mskanji", &canonical_charset_names[216] },
{ 6, "msturk", &canonical_charset_names[257] },
{ 8, "msz77953", &canonical_charset_names[194] },
{ 6, "naplps", &canonical_charset_names[0] },
{ 8, "natsdano", &canonical_charset_names[196] },
{ 11, "natsdanoadd", &canonical_charset_names[197] },
{ 8, "natssefi", &canonical_charset_names[198] },
{ 11, "natssefiadd", &canonical_charset_names[199] },
{ 10, "ncnc001081", &canonical_charset_names[200] },
{ 8, "nfz62010", &canonical_charset_names[201] },
{ 12, "nfz620101973", &canonical_charset_names[202] },
{ 2, "no", &canonical_charset_names[203] },
{ 3, "no2", &canonical_charset_names[204] },
{ 7, "ns45511", &canonical_charset_names[203] },
{ 7, "ns45512", &canonical_charset_names[204] },
{ 16, "osdebcdicdf03irv", &canonical_charset_names[205] },
{ 14, "osdebcdicdf041", &canonical_charset_names[206] },
{ 15, "osdebcdicdf0415", &canonical_charset_names[207] },
{ 18, "pc8danishnorwegian", &canonical_charset_names[208] },
{ 10, "pc8turkish", &canonical_charset_names[209] },
{ 21, "pcmultilingual850euro", &canonical_charset_names[54] },
{ 2, "pt", &canonical_charset_names[210] },
{ 5, "pt154", &canonical_charset_names[212] },
{ 3, "pt2", &canonical_charset_names[211] },
{ 7, "ptcp154", &canonical_charset_names[212] },
{ 2, "r8", &canonical_charset_names[246] },
{ 3, "ref", &canonical_charset_names[160] },
{ 6, "roman8", &canonical_charset_names[246] },
{ 4, "scsu", &canonical_charset_names[213] },
{ 2, "se", &canonical_charset_names[214] },
{ 3, "se2", &canonical_charset_names[215] },
{ 10, "sen850200b", &canonical_charset_names[214] },
{ 10, "sen850200c", &canonical_charset_names[215] },
{ 7, "serbian", &canonical_charset_names[182] },
{ 8, "shiftjis", &canonical_charset_names[216] },
{ 10, "stsev35888", &canonical_charset_names[46] },
{ 6, "t101g2", &canonical_charset_names[217] },
{ 3, "t61", &canonical_charset_names[219] },
{ 7, "t617bit", &canonical_charset_names[218] },
{ 7, "t618bit", &canonical_charset_names[219] },
{ 6, "tis620", &canonical_charset_names[220] },
{ 4, "ucs2", &canonical_charset_names[114] },
{ 4, "ucs4", &canonical_charset_names[115] },
{ 2, "uk", &canonical_charset_names[6] },
{ 9, "unicode11", &canonical_charset_names[221] },
{ 13, "unicode11utf7", &canonical_charset_names[222] },
{ 13, "unicode11utf8", &canonical_charset_names[232] },
{ 13, "unicode20utf8", &canonical_charset_names[232] },
{ 11, "unknown8bit", &canonical_charset_names[223] },
{ 2, "us", &canonical_charset_names[224] },
{ 7, "usascii", &canonical_charset_names[224] },
{ 4, "usdk", &canonical_charset_names[251] },
{ 5, "utf16", &canonical_charset_names[225] },
{ 7, "utf16be", &canonical_charset_names[226] },
{ 7, "utf16le", &canonical_charset_names[227] },
{ 5, "utf32", &canonical_charset_names[228] },
{ 7, "utf32be", &canonical_charset_names[229] },
{ 7, "utf32le", &canonical_charset_names[230] },
{ 4, "utf7", &canonical_charset_names[231] },
{ 4, "utf8", &canonical_charset_names[232] },
{ 20, "venturainternational", &canonical_charset_names[235] },
{ 11, "venturamath", &canonical_charset_names[236] },
{ 9, "venturaus", &canonical_charset_names[237] },
{ 13, "videotexsuppl", &canonical_charset_names[252] },
{ 4, "viqr", &canonical_charset_names[233] },
{ 6, "viscii", &canonical_charset_names[234] },
{ 10, "winbaltrim", &canonical_charset_names[260] },
{ 11, "windows1250", &canonical_charset_names[253] },
{ 11, "windows1251", &canonical_charset_names[254] },
{ 11, "windows1252", &canonical_charset_names[255] },
{ 11, "windows1253", &canonical_charset_names[256] },
{ 11, "windows1254", &canonical_charset_names[257] },
{ 11, "windows1255", &canonical_charset_names[258] },
{ 11, "windows1256", &canonical_charset_names[259] },
{ 11, "windows1257", &canonical_charset_names[260] },
{ 11, "windows1258", &canonical_charset_names[261] },
{ 10, "windows31j", &canonical_charset_names[238] },
{ 10, "windows874", &canonical_charset_names[11] },
{ 10, "windows936", &canonical_charset_names[43] },
{ 10, "windows949", &canonical_charset_names[12] },
{ 5, "x0201", &canonical_charset_names[177] },
{ 6, "x02017", &canonical_charset_names[166] },
{ 5, "x0208", &canonical_charset_names[169] },
{ 5, "x0212", &canonical_charset_names[178] },
{ 11, "xacornfuzzy", &canonical_charset_names[239] },
{ 12, "xacornlatin1", &canonical_charset_names[240] },
{ 8, "xcurrent", &canonical_charset_names[241] },
{ 19, "xmaccentraleurroman", &canonical_charset_names[189] },
{ 12, "xmaccyrillic", &canonical_charset_names[190] },
{ 9, "xmacroman", &canonical_charset_names[250] },
{ 13, "xmacukrainian", &canonical_charset_names[191] },
{ 5, "xsjis", &canonical_charset_names[216] },
{ 7, "xsystem", &canonical_charset_names[241] },
{ 6, "xxbig5", &canonical_charset_names[8] },
{ 2, "yu", &canonical_charset_names[180] },
};
 
/* Alias count used alongside the alias table that ends just above.
 * NOTE(review): presumably this must equal the number of initialisers in
 * that table (the table's start is not visible here) -- confirm whenever
 * the table is regenerated. */
static const uint16_t charset_aliases_count = 852;
 
/* Evaluates to non-zero when x equals one of the MIB enum values listed
 * below, zero otherwise.  In the IANA character-sets registry these values
 * are UCS-2 (1000), UCS-4 (1001), UTF-16 (1015), UTF-16BE (1013),
 * UTF-16LE (1014), UTF-32 (1017), UTF-32BE (1018), UTF-32LE (1019) and
 * UTF-8 (106).
 *
 * The argument is expanded once per comparison, so it must not have side
 * effects. */
#define MIBENUM_IS_UNICODE(x) \
	(((x) == 1000) || ((x) == 1001) || ((x) == 1015) || \
	 ((x) == 1013) || ((x) == 1014) || ((x) == 1017) || \
	 ((x) == 1018) || ((x) == 1019) || ((x) == 106))
/programs/network/netsurf/libparserutils/src/charset/codec.c
0,0 → 1,196
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <string.h>
 
#include "charset/aliases.h"
#include "charset/codecs/codec_impl.h"
 
/* Codec handler objects, declared extern here (their definitions are not in
 * this file).  Their addresses are what handler_table below is built from. */
extern parserutils_charset_handler charset_ascii_codec_handler;
extern parserutils_charset_handler charset_8859_codec_handler;
extern parserutils_charset_handler charset_ext8_codec_handler;
extern parserutils_charset_handler charset_utf8_codec_handler;
extern parserutils_charset_handler charset_utf16_codec_handler;
 
/* NULL-terminated list of the available codec handlers.
 * parserutils_charset_codec_create() walks this list from the top and uses
 * the first entry whose handles_charset() accepts the canonical charset
 * name, so the order of entries is the order in which handlers are tried.
 * The trailing NULL is the loop's stop marker and must stay last. */
static parserutils_charset_handler *handler_table[] = {
&charset_utf8_codec_handler,
&charset_utf16_codec_handler,
&charset_8859_codec_handler,
&charset_ext8_codec_handler,
&charset_ascii_codec_handler,
NULL,
};
 
/**
 * Create a charset codec
 *
 * The supplied charset name is first resolved to its canonical alias entry;
 * the handler table is then scanned in order and the first handler that
 * claims the canonical name is asked to build the codec instance. On
 * success the common codec fields (MIB enum, error mode, allocator) are
 * filled in here.
 *
 * \param charset  Target charset
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param codec    Pointer to location to receive codec instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion,
 *         PARSERUTILS_BADENCODING on unsupported charset
 *
 * Any non-OK result from the chosen handler's create routine is returned
 * to the caller unchanged; *codec is only written on success.
 */
parserutils_error parserutils_charset_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	const parserutils_charset_aliases_canon *canonical;
	parserutils_charset_handler **entry = handler_table;
	parserutils_charset_codec *instance;
	parserutils_error status;

	if (charset == NULL || alloc == NULL || codec == NULL)
		return PARSERUTILS_BADPARM;

	/* Map whatever alias the caller used onto its canonical entry */
	canonical = parserutils__charset_alias_canonicalise(charset,
			strlen(charset));
	if (canonical == NULL)
		return PARSERUTILS_BADENCODING;

	/* Advance to the first handler that accepts this charset, or to the
	 * NULL terminator if none does */
	while (*entry != NULL &&
			!(*entry)->handles_charset(canonical->name))
		entry++;

	if (*entry == NULL)
		return PARSERUTILS_BADENCODING;

	/* Let the handler construct its codec object */
	status = (*entry)->create(canonical->name, alloc, pw, &instance);
	if (status != PARSERUTILS_OK)
		return status;

	/* Fill in the fields common to every codec */
	instance->mibenum = canonical->mib_enum;
	instance->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
	instance->alloc = alloc;
	instance->alloc_pw = pw;

	*codec = instance;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy a charset codec
 *
 * Runs the codec's own destroy hook, then hands the codec object itself
 * back to the allocator stored on it.
 *
 * \param codec  The codec to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if codec is NULL
 */
parserutils_error parserutils_charset_codec_destroy(
		parserutils_charset_codec *codec)
{
	if (codec != NULL) {
		/* Handler-specific teardown comes first, while the object
		 * is still valid */
		codec->handler.destroy(codec);

		/* Size-0 request to the codec's allocator -- presumably the
		 * "free" form of the (de)allocation callback */
		codec->alloc(codec, 0, codec->alloc_pw);

		return PARSERUTILS_OK;
	}

	return PARSERUTILS_BADPARM;
}
 
/**
 * Configure a charset codec
 *
 * The only option acted upon is PARSERUTILS_CHARSET_CODEC_ERROR_MODE, which
 * copies params->error_mode.mode into the codec. Any other option type is
 * ignored and still reports success.
 *
 * \param codec   The codec to configure
 * \param type    The codec option type to configure
 * \param params  Option-specific parameters
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if codec or params is NULL
 */
parserutils_error parserutils_charset_codec_setopt(
		parserutils_charset_codec *codec,
		parserutils_charset_codec_opttype type,
		parserutils_charset_codec_optparams *params)
{
	if (codec == NULL || params == NULL)
		return PARSERUTILS_BADPARM;

	if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
		codec->errormode = params->error_mode.mode;

	return PARSERUTILS_OK;
}
 
/**
 * Encode a chunk of UCS-4 data into a codec's charset
 *
 * After validating its arguments this simply forwards to the codec's
 * encode hook.
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success, appropriate error otherwise;
 *         PARSERUTILS_BADPARM if any pointer argument, *source or *dest
 *         is NULL.
 *
 * source, sourcelen, dest and destlen will be updated appropriately on exit
 */
parserutils_error parserutils_charset_codec_encode(
		parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	if (codec == NULL)
		return PARSERUTILS_BADPARM;

	/* Input side: both the pointer and what it points at must be set */
	if (source == NULL || *source == NULL || sourcelen == NULL)
		return PARSERUTILS_BADPARM;

	/* Output side: same requirement */
	if (dest == NULL || *dest == NULL || destlen == NULL)
		return PARSERUTILS_BADPARM;

	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
}
 
/**
 * Decode a chunk of data in a codec's charset into UCS-4
 *
 * After validating its arguments this simply forwards to the codec's
 * decode hook.
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success, appropriate error otherwise;
 *         PARSERUTILS_BADPARM if any pointer argument, *source or *dest
 *         is NULL.
 *
 * source, sourcelen, dest and destlen will be updated appropriately on exit
 *
 * Call this with a source length of 0 to flush any buffers.
 */
parserutils_error parserutils_charset_codec_decode(
		parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	if (codec == NULL)
		return PARSERUTILS_BADPARM;

	/* Input side: both the pointer and what it points at must be set */
	if (source == NULL || *source == NULL || sourcelen == NULL)
		return PARSERUTILS_BADPARM;

	/* Output side: same requirement */
	if (dest == NULL || *dest == NULL || destlen == NULL)
		return PARSERUTILS_BADPARM;

	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
}
 
/**
 * Clear a charset codec's encoding state
 *
 * Delegates to the codec's reset hook and returns its result.
 *
 * \param codec  The codec to reset
 * \return PARSERUTILS_OK on success, appropriate error otherwise;
 *         PARSERUTILS_BADPARM if codec is NULL
 */
parserutils_error parserutils_charset_codec_reset(
		parserutils_charset_codec *codec)
{
	return (codec != NULL) ? codec->handler.reset(codec)
			       : PARSERUTILS_BADPARM;
}
 
/programs/network/netsurf/libparserutils/src/charset/codecs/8859_tables.h
0,0 → 1,241
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_codecs_8859tables_h_
#define parserutils_charset_codecs_8859tables_h_
 
/* Mapping tables for ISO-8859-n -> UCS4.
* Undefined characters are mapped to U+FFFF,
* which is a guaranteed non-character
*/
 
/* Table for n = 1 (presumably ISO-8859-1, going by the tN naming -- confirm
 * against the codec that indexes these tables).  96 entries; entry i holds
 * 0x00A0 + i, i.e. an identity mapping onto U+00A0..U+00FF.  No entry is
 * the 0xFFFF "undefined" marker. */
static uint32_t t1[96] = {
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
};
 
/* Table for n = 2 (presumably ISO-8859-2 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; no entry is
 * the 0xFFFF "undefined" marker. */
static uint32_t t2[96] = {
0x00A0, 0x0104, 0x02D8, 0x0141, 0x00A4, 0x013D, 0x015A, 0x00A7,
0x00A8, 0x0160, 0x015E, 0x0164, 0x0179, 0x00AD, 0x017D, 0x017B,
0x00B0, 0x0105, 0x02DB, 0x0142, 0x00B4, 0x013E, 0x015B, 0x02C7,
0x00B8, 0x0161, 0x015F, 0x0165, 0x017A, 0x02DD, 0x017E, 0x017C,
0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9,
};
 
/* Table for n = 3 (presumably ISO-8859-3 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; seven
 * entries are 0xFFFF, the marker for an undefined character. */
static uint32_t t3[96] = {
0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFF, 0x0124, 0x00A7,
0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFF, 0x017B,
0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7,
0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFF, 0x017C,
0x00C0, 0x00C1, 0x00C2, 0xFFFF, 0x00C4, 0x010A, 0x0108, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0xFFFF, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7,
0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0xFFFF, 0x00E4, 0x010B, 0x0109, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0xFFFF, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7,
0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9,
};
 
/* Table for n = 4 (presumably ISO-8859-4 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; no entry is
 * the 0xFFFF "undefined" marker. */
static uint32_t t4[96] = {
0x00A0, 0x0104, 0x0138, 0x0156, 0x00A4, 0x0128, 0x013B, 0x00A7,
0x00A8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00AD, 0x017D, 0x00AF,
0x00B0, 0x0105, 0x02DB, 0x0157, 0x00B4, 0x0129, 0x013C, 0x02C7,
0x00B8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014A, 0x017E, 0x014B,
0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x012A,
0x0110, 0x0145, 0x014C, 0x0136, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x0168, 0x016A, 0x00DF,
0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x012B,
0x0111, 0x0146, 0x014D, 0x0137, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x0169, 0x016B, 0x02D9,
};
 
/* Table for n = 5 (presumably ISO-8859-5 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; most
 * entries lie in the U+04xx range and none is the 0xFFFF "undefined"
 * marker. */
static uint32_t t5[96] = {
0x00A0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x00AD, 0x040E, 0x040F,
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,
0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x00A7, 0x045E, 0x045F,
};
 
/* Table for n = 6 (presumably ISO-8859-6 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; defined
 * entries are mostly in the U+06xx range, and a large share of the table
 * is 0xFFFF, the marker for an undefined character. */
static uint32_t t6[96] = {
0x00A0, 0xFFFF, 0xFFFF, 0xFFFF, 0x00A4, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x060C, 0x00AD, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0x061B, 0xFFFF, 0xFFFF, 0xFFFF, 0x061F,
0xFFFF, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,
0x0638, 0x0639, 0x063A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,
0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
0x0650, 0x0651, 0x0652, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
};
 
/* Table for n = 7 (presumably ISO-8859-7 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; most
 * entries lie in the U+03xx range and three are 0xFFFF, the marker for an
 * undefined character. */
static uint32_t t7[96] = {
0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFF, 0x2015,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7,
0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
0x03A0, 0x03A1, 0xFFFF, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFF,
};
 
/* Table for n = 8 (presumably ISO-8859-8 -- confirm against the codec that
 * indexes these tables).  96 UCS-4 values, first entry U+00A0; the letter
 * block is in the U+05xx range, and many entries are 0xFFFF, the marker
 * for an undefined character. */
static uint32_t t8[96] = {
0x00A0, 0xFFFF, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x2017,
0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
0x05E8, 0x05E9, 0x05EA, 0xFFFF, 0xFFFF, 0x200E, 0x200F, 0xFFFF,
};
 
/* ISO-8859-9 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t9[96] = {
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF,
};
 
/* ISO-8859-10 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t10[96] = {
0x00A0, 0x0104, 0x0112, 0x0122, 0x012A, 0x0128, 0x0136, 0x00A7,
0x013B, 0x0110, 0x0160, 0x0166, 0x017D, 0x00AD, 0x016A, 0x014A,
0x00B0, 0x0105, 0x0113, 0x0123, 0x012B, 0x0129, 0x0137, 0x00B7,
0x013C, 0x0111, 0x0161, 0x0167, 0x017E, 0x2015, 0x016B, 0x014B,
0x0100, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x012E,
0x010C, 0x00C9, 0x0118, 0x00CB, 0x0116, 0x00CD, 0x00CE, 0x00CF,
0x00D0, 0x0145, 0x014C, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x0168,
0x00D8, 0x0172, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
0x0101, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x012F,
0x010D, 0x00E9, 0x0119, 0x00EB, 0x0117, 0x00ED, 0x00EE, 0x00EF,
0x00F0, 0x0146, 0x014D, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x0169,
0x00F8, 0x0173, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x0138,
};
 
/* ISO-8859-11 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * 0xFFFF marks a byte with no mapping; the decoder reports it as invalid. */
static uint32_t t11[96] = {
0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07,
0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F,
0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17,
0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F,
0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27,
0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F,
0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37,
0x0E38, 0x0E39, 0x0E3A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0E3F,
0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47,
0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57,
0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
};
 
/* ISO-8859-13 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t13[96] = {
0x00A0, 0x201D, 0x00A2, 0x00A3, 0x00A4, 0x201E, 0x00A6, 0x00A7,
0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x201C, 0x00B5, 0x00B6, 0x00B7,
0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x2019,
};
 
/* ISO-8859-14 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t14[96] = {
0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF,
};
 
/* ISO-8859-15 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t15[96] = {
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AC, 0x00A5, 0x0160, 0x00A7,
0x0161, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x017D, 0x00B5, 0x00B6, 0x00B7,
0x017E, 0x00B9, 0x00BA, 0x00BB, 0x0152, 0x0153, 0x0178, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
};
 
/* ISO-8859-16 (paired with that name in codec_8859.c's known_charsets):
 * UCS-4 code points for bytes 0xA0-0xFF, indexed by (byte - 0xA0).
 * Every slot in this table carries a mapping (no 0xFFFF markers). */
static uint32_t t16[96] = {
0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF,
};
 
#endif
/programs/network/netsurf/libparserutils/src/charset/codecs/Makefile
0,0 → 1,5
# Build settings for the charset codec objects in this directory.
# NOTE(review): OUTFILE, OBJS and CFLAGS are presumably consumed by the
# included Makefile_for_o_lib (not visible here) -- confirm against that file.
OUTFILE = libo.o
OBJS = codec_ascii.o codec_8859.o codec_ext8.o \
codec_utf8.o codec_utf16.o
# Header search paths, given relative to this directory.
CFLAGS += -I ../../../include/ -I ../../../../ -I ../../
# MENUETDEV is not set in this file; it has to come from the environment or
# from the invoking make.
include $(MENUETDEV)/makefiles/Makefile_for_o_lib
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_8859.c
0,0 → 1,596
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <assert.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
 
#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
 
#include "charset/codecs/8859_tables.h"
 
/* Charsets handled by this codec, each paired with its 0xA0-0xFF mapping
 * table. Every mib field starts out as 0 and is filled in by
 * charset_8859_codec_handles_charset() the first time that function runs.
 * There is no ISO-8859-12 entry. */
static struct {
uint16_t mib; /* MIB enum looked up from name; 0 until resolved */
const char *name; /* Charset name used for the MIB enum lookup */
size_t len; /* Length passed with name to the lookup (from SLEN) */
uint32_t *table; /* Mapping table from 8859_tables.h */
} known_charsets[] = {
{ 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
{ 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
{ 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
{ 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
{ 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
{ 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
{ 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
{ 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
{ 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
{ 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
{ 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
{ 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
{ 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
{ 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
{ 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
};
 
/**
 * ISO-8859-n charset codec
 *
 * Extends the base codec with the mapping table chosen at creation time and
 * with two small save areas for characters that could not be written because
 * the caller's output buffer was full.
 */
typedef struct charset_8859_codec {
parserutils_charset_codec base; /**< Base class */

uint32_t *table; /**< Mapping table for 0xA0-0xFF */

#define READ_BUFSIZE (8)
uint32_t read_buf[READ_BUFSIZE]; /**< Decoded characters (host-endian)
 * that did not fit in the output
 * buffer; written out at the start
 * of the next decode call */
size_t read_len; /**< Character length of read_buf */

#define WRITE_BUFSIZE (8)
uint32_t write_buf[WRITE_BUFSIZE]; /**< UCS-4 characters (host-endian)
 * consumed from the input but not
 * yet encoded; retried at the start
 * of the next encode call */
size_t write_len; /**< Character length of write_buf */

} charset_8859_codec;
 
/* Forward declarations. The first two are referenced by
 * charset_8859_codec_handler at the end of this file; destroy, encode, decode
 * and reset are installed in the codec's handler table by
 * charset_8859_codec_create(); the static inline functions are helpers used
 * by encode/decode. */
static bool charset_8859_codec_handles_charset(const char *charset);
static parserutils_error charset_8859_codec_create(const char *charset,
parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
static parserutils_error charset_8859_codec_destroy(
parserutils_charset_codec *codec);
static parserutils_error charset_8859_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_8859_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_8859_codec_reset(
parserutils_charset_codec *codec);
static inline parserutils_error charset_8859_codec_read_char(
charset_8859_codec *c,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_8859_codec_output_decoded_char(
charset_8859_codec *c,
uint32_t ucs4, uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
uint32_t ucs4, uint8_t **s, size_t *len);
static inline parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
const uint8_t *s, size_t len, uint32_t *ucs4);
 
/**
 * Determine whether this codec handles a specific charset
 *
 * The MIB enum values in known_charsets are resolved on first use. The
 * resolution pass is repeated on later calls for as long as the first entry
 * is still 0.
 *
 * \param charset  Charset to test (NUL-terminated)
 * \return true if handleable, false otherwise
 */
bool charset_8859_codec_handles_charset(const char *charset)
{
	uint32_t i;
	uint16_t match = parserutils_charset_mibenum_from_name(charset,
			strlen(charset));

	if (known_charsets[0].mib == 0) {
		for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
			known_charsets[i].mib =
				parserutils_charset_mibenum_from_name(
						known_charsets[i].name,
						known_charsets[i].len);
		}
	}

	/* 0 is the "not resolved" marker in known_charsets, so a requested
	 * name that itself yields 0 must never be reported as handled: it
	 * would otherwise match any entry whose own lookup produced 0. The
	 * US-ASCII codec applies the same guard. */
	if (match == 0)
		return false;

	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
		if (known_charsets[i].mib == match)
			return true;
	}

	return false;
}
 
/**
 * Create an ISO-8859-n codec
 *
 * The mapping table is selected by comparing the MIB enum of ::charset with
 * the values stored in known_charsets; those values are filled in by
 * charset_8859_codec_handles_charset(), so that function must have run first.
 *
 * \param charset  The charset to read from / write to
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param codec    Pointer to location to receive codec
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::charset is not one this codec knows,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error charset_8859_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	uint32_t i;
	charset_8859_codec *c;
	uint16_t match = parserutils_charset_mibenum_from_name(
			charset, strlen(charset));
	uint32_t *table = NULL;

	/* 0 marks an unresolved known_charsets entry, so a lookup result of
	 * 0 must not be allowed to select a table. */
	if (match != 0) {
		for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
			if (known_charsets[i].mib == match) {
				table = known_charsets[i].table;
				break;
			}
		}
	}

	/* Debug builds still trap here; builds with NDEBUG report the
	 * documented error instead of storing a NULL table that would be
	 * dereferenced on the first non-ASCII character. */
	assert(table != NULL);
	if (table == NULL)
		return PARSERUTILS_BADPARM;

	c = alloc(NULL, sizeof(charset_8859_codec), pw);
	if (c == NULL)
		return PARSERUTILS_NOMEM;

	c->table = table;

	c->read_buf[0] = 0;
	c->read_len = 0;

	c->write_buf[0] = 0;
	c->write_len = 0;

	/* Finally, populate vtable */
	c->base.handler.destroy = charset_8859_codec_destroy;
	c->base.handler.encode = charset_8859_codec_encode;
	c->base.handler.decode = charset_8859_codec_decode;
	c->base.handler.reset = charset_8859_codec_reset;

	*codec = (parserutils_charset_codec *) c;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy an ISO-8859-n codec
 *
 * The only allocation made by charset_8859_codec_create() is the codec
 * structure itself, and this function does not free it, so there is nothing
 * to do here. NOTE(review): the structure is presumably released by the
 * caller -- confirm against the generic codec destroy path.
 *
 * \param codec  The codec to destroy
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error charset_8859_codec_destroy (parserutils_charset_codec *codec)
{
UNUSED(codec);

return PARSERUTILS_OK;
}
 
/**
 * Encode a chunk of UCS-4 (big endian) data into ISO-8859-n
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if a character cannot be represented and the
 *         codec's error handling mode is set to STRICT,
 *         PARSERUTILS_NEEDDATA if the input ends with fewer than four bytes
 *         of a UCS-4 character (those bytes are left unconsumed).
 *
 * On exit, ::source will point immediately _after_ the last input character
 * read. If the output buffer fills up, the character that could not be
 * written is buffered by the codec and retried on the next call.
 *
 * Note that, if failure occurs whilst attempting to write any output
 * buffered by the last call, then ::source and ::sourcelen will remain
 * unchanged (as nothing more has been read).
 *
 * ::sourcelen will be reduced appropriately on exit.
 *
 * ::dest will point immediately _after_ the last character written.
 *
 * ::destlen will be reduced appropriately on exit.
 */
parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	charset_8859_codec *c = (charset_8859_codec *) codec;
	uint32_t ucs4;
	uint32_t *towrite;
	size_t towritelen;
	parserutils_error error;

	/* Process any outstanding characters from the previous call */
	if (c->write_len > 0) {
		uint32_t *pwrite = c->write_buf;

		while (c->write_len > 0) {
			error = charset_8859_from_ucs4(c, pwrite[0],
					dest, destlen);
			if (error != PARSERUTILS_OK) {
				uint32_t len;
				assert(error == PARSERUTILS_NOMEM);

				/* Shuffle the unwritten characters down to
				 * the start of the save area */
				for (len = 0; len < c->write_len; len++) {
					c->write_buf[len] = pwrite[len];
				}

				return error;
			}

			pwrite++;
			c->write_len--;
		}
	}

	/* Now process the characters for this call. Only whole 4-byte units
	 * are consumed: reading a shorter tail would run past the end of
	 * the input and wrap *sourcelen below zero. */
	while (*sourcelen >= 4) {
		/* *source is a byte pointer with no alignment guarantee, so
		 * copy the unit out rather than dereferencing it as a
		 * uint32_t. */
		memcpy(&ucs4, *source, sizeof(ucs4));
		ucs4 = endian_big_to_host(ucs4);
		towrite = &ucs4;
		towritelen = 1;

		/* Output current characters */
		while (towritelen > 0) {
			error = charset_8859_from_ucs4(c, towrite[0], dest,
					destlen);
			if (error != PARSERUTILS_OK) {
				uint32_t len;
				if (error != PARSERUTILS_NOMEM) {
					return error;
				}

				/* Insufficient output space */
				if (towritelen >= WRITE_BUFSIZE)
					abort();

				c->write_len = towritelen;

				/* Copy pending chars to save area, for
				 * processing next call. */
				for (len = 0; len < towritelen; len++)
					c->write_buf[len] = towrite[len];

				/* Claim character we've just buffered,
				 * so it's not reprocessed */
				*source += 4;
				*sourcelen -= 4;

				return PARSERUTILS_NOMEM;
			}

			towrite++;
			towritelen--;
		}

		*source += 4;
		*sourcelen -= 4;
	}

	/* A tail of 1-3 bytes is an incomplete UCS-4 character */
	if (*sourcelen > 0)
		return PARSERUTILS_NEEDDATA;

	return PARSERUTILS_OK;
}
 
/**
 * Decode a chunk of ISO-8859-n data into UCS-4 (big endian)
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if an input byte has no mapping and the
 *         codec's error handling mode is set to STRICT,
 *
 * Output saved by a previous call is written first. If it does not all fit,
 * _NOMEM is returned and ::source / ::sourcelen are left untouched.
 *
 * For _OK and _NOMEM, ::source points immediately _after_ the last input
 * byte read; a character that could not be written is saved by the codec
 * for the next call. For _INVALID, ::source points _at_ the offending byte
 * and nothing is written or saved for it.
 *
 * ::sourcelen, ::dest and ::destlen are updated to match on exit.
 *
 * Call this with a source length of 0 to flush the output buffer.
 */
parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	charset_8859_codec *self = (charset_8859_codec *) codec;
	uint32_t *pending = self->read_buf;
	parserutils_error status = PARSERUTILS_OK;
	size_t k;

	/* Emit saved output, but only while everything still saved fits */
	for (; self->read_len > 0 && *destlen >= self->read_len * 4;
			pending++, self->read_len--) {
		*((uint32_t *) (void *) *dest) = endian_host_to_big(*pending);

		*dest += 4;
		*destlen -= 4;
	}

	if (*destlen < self->read_len * 4) {
		/* Out of room: move what is left to the front of the save
		 * area and report it */
		for (k = 0; k < self->read_len; k++)
			self->read_buf[k] = pending[k];

		return PARSERUTILS_NOMEM;
	}

	/* Convert input one byte at a time until it is used up or a
	 * conversion reports anything other than success */
	while (status == PARSERUTILS_OK && *sourcelen > 0) {
		status = charset_8859_codec_read_char(self,
				source, sourcelen, dest, destlen);
	}

	return status;
}
 
/**
 * Clear an ISO-8859-n codec's encoding state
 *
 * Discards anything held in the decode and encode save areas.
 *
 * \param codec  The codec to reset
 * \return PARSERUTILS_OK (this function cannot fail)
 */
parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
{
	charset_8859_codec *self = (charset_8859_codec *) codec;

	/* Forget saved decode output */
	self->read_len = 0;
	self->read_buf[0] = 0;

	/* Forget saved encode input */
	self->write_len = 0;
	self->write_buf[0] = 0;

	return PARSERUTILS_OK;
}
 
 
/**
 * Read one ISO-8859-n character and output it as UCS-4 (big endian)
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (updated on exit)
 * \param sourcelen  Pointer to length of source buffer (updated on exit)
 * \param dest       Pointer to pointer to output buffer (updated on exit)
 * \param destlen    Pointer to length of output buffer (updated on exit)
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_NEEDDATA if there is no input,
 *         PARSERUTILS_INVALID if the byte has no mapping and the codec's
 *         error handling mode is set to STRICT,
 *
 * For _OK and _NOMEM one input byte is consumed; with _NOMEM the converted
 * character is saved by the codec for the next call. For _INVALID and
 * _NEEDDATA nothing is consumed, written or saved.
 *
 * When the error handling mode is not STRICT, a byte with no mapping is
 * output as U+FFFD and consumed like any other.
 */
parserutils_error charset_8859_codec_read_char(charset_8859_codec *c,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	uint32_t ch;
	parserutils_error status;

	status = charset_8859_to_ucs4(c, *source, *sourcelen, &ch);
	if (status != PARSERUTILS_OK) {
		/* No input available: hand the condition straight back */
		if (status == PARSERUTILS_NEEDDATA)
			return status;

		/* Any result other than the three handled here is treated
		 * as a no-op success */
		if (status != PARSERUTILS_INVALID)
			return PARSERUTILS_OK;

		/* Unmapped byte: fatal in strict mode, otherwise replaced */
		if (c->base.errormode ==
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
			return PARSERUTILS_INVALID;

		ch = 0xFFFD;
	}

	status = charset_8859_codec_output_decoded_char(c, ch, dest, destlen);
	if (status == PARSERUTILS_OK || status == PARSERUTILS_NOMEM) {
		/* The character was written or saved, so the input byte
		 * is used up either way */
		*source += 1;
		*sourcelen -= 1;
	}

	return status;
}
 
/**
 * Output a UCS-4 character (big endian)
 *
 * If fewer than four bytes of output space remain, the character is stored
 * in the codec's read buffer instead (replacing its contents) so that the
 * next decode call can write it.
 *
 * \param c        Codec to use
 * \param ucs4     UCS-4 character (host endian)
 * \param dest     Pointer to pointer to output buffer
 * \param destlen  Pointer to output buffer length
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 */
parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c,
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
{
	if (*destlen >= 4) {
		/* Room for one character: write it and advance */
		*((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
		*dest += 4;
		*destlen -= 4;

		return PARSERUTILS_OK;
	}

	/* Output buffer exhausted: keep the character for the next call */
	c->read_buf[0] = ucs4;
	c->read_len = 1;

	return PARSERUTILS_NOMEM;
}
 
/**
 * Convert a UCS4 (host endian) character to ISO-8859-n
 *
 * Characters below 0x80 are output unchanged. Anything else is looked up in
 * the codec's mapping table and output as 0xA0 plus its table index.
 *
 * \param c     The codec instance
 * \param ucs4  The UCS4 character to convert
 * \param s     Pointer to pointer to destination buffer
 * \param len   Pointer to destination buffer length
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
 *         PARSERUTILS_INVALID if the character cannot be represented
 *
 * _INVALID will only be returned if the codec's conversion mode is STRICT.
 * Otherwise, '?' will be output.
 *
 * On successful conversion, *s and *len will be updated.
 */
parserutils_error charset_8859_from_ucs4(charset_8859_codec *c,
		uint32_t ucs4, uint8_t **s, size_t *len)
{
	uint8_t out = 0;

	if (*len < 1)
		return PARSERUTILS_NOMEM;

	if (ucs4 < 0x80) {
		/* ASCII */
		out = ucs4;
	} else {
		uint32_t i = 96;

		/* 0xFFFF is the table's "no mapping" marker (the decoder
		 * rejects such slots), so an input of U+FFFF must not be
		 * searched for: it would match an undefined slot and be
		 * emitted as a byte the charset does not define. */
		if (ucs4 != 0xFFFF) {
			for (i = 0; i < 96; i++) {
				if (ucs4 == c->table[i])
					break;
			}
		}

		if (i == 96) {
			/* Not representable in this charset */
			if (c->base.errormode ==
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
				return PARSERUTILS_INVALID;
			else
				out = '?';
		} else {
			out = 0xA0 + i;
		}
	}

	*(*s) = out;
	(*s)++;
	(*len)--;

	return PARSERUTILS_OK;
}
 
/**
 * Convert an ISO-8859-n character to UCS4 (host endian)
 *
 * Bytes below 0x80 map to themselves, bytes 0x80-0x9F are rejected, and
 * bytes from 0xA0 upwards are looked up in the codec's mapping table.
 *
 * \param c     The codec instance
 * \param s     Pointer to source buffer
 * \param len   Source buffer length
 * \param ucs4  Pointer to destination buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NEEDDATA if there's insufficient input data
 *         PARSERUTILS_INVALID if the byte has no mapping
 */
parserutils_error charset_8859_to_ucs4(charset_8859_codec *c,
		const uint8_t *s, size_t len, uint32_t *ucs4)
{
	uint8_t byte;
	uint32_t mapped;

	if (len < 1)
		return PARSERUTILS_NEEDDATA;

	byte = *s;

	if (byte < 0x80) {
		/* ASCII range passes straight through */
		*ucs4 = byte;
		return PARSERUTILS_OK;
	}

	if (byte < 0xA0)
		return PARSERUTILS_INVALID;

	/* Table slots holding 0xFFFF carry no mapping */
	mapped = c->table[byte - 0xA0];
	if (mapped == 0xFFFF)
		return PARSERUTILS_INVALID;

	*ucs4 = mapped;

	return PARSERUTILS_OK;
}
 
/* Handler for the ISO-8859-n family, visible outside this file: the charset
 * test followed by the codec constructor. */
const parserutils_charset_handler charset_8859_codec_handler = {
charset_8859_codec_handles_charset,
charset_8859_codec_create
};
 
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_ascii.c
0,0 → 1,536
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <assert.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
 
#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
 
/**
 * US-ASCII charset codec
 *
 * Extends the base codec with two small save areas for characters that could
 * not be written because the caller's output buffer was full.
 */
typedef struct charset_ascii_codec {
parserutils_charset_codec base; /**< Base class */

#define READ_BUFSIZE (8)
uint32_t read_buf[READ_BUFSIZE]; /**< Decoded characters (host-endian)
 * that did not fit in the output
 * buffer; written out at the start
 * of the next decode call */
size_t read_len; /**< Character length of read_buf */

#define WRITE_BUFSIZE (8)
uint32_t write_buf[WRITE_BUFSIZE]; /**< UCS-4 characters (host-endian)
 * consumed from the input but not
 * yet encoded; retried at the start
 * of the next encode call */
size_t write_len; /**< Character length of write_buf */

} charset_ascii_codec;
 
/* Forward declarations. destroy, encode, decode and reset are installed in
 * the codec's handler table by charset_ascii_codec_create(); the static
 * inline functions are helpers used by encode/decode. */
static bool charset_ascii_codec_handles_charset(const char *charset);
static parserutils_error charset_ascii_codec_create(
const char *charset, parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
static parserutils_error charset_ascii_codec_destroy(
parserutils_charset_codec *codec);
static parserutils_error charset_ascii_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_ascii_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_ascii_codec_reset(
parserutils_charset_codec *codec);
static inline parserutils_error charset_ascii_codec_read_char(
charset_ascii_codec *c,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_ascii_codec_output_decoded_char(
charset_ascii_codec *c,
uint32_t ucs4, uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
uint32_t ucs4, uint8_t **s, size_t *len);
static inline parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
const uint8_t *s, size_t len, uint32_t *ucs4);
 
/**
 * Determine whether this codec handles a specific charset
 *
 * The MIB enum for "US-ASCII" is looked up on first use and cached; the
 * lookup is retried on later calls while the cached value is still 0.
 *
 * \param charset  Charset to test (NUL-terminated)
 * \return true if handleable, false otherwise
 */
bool charset_ascii_codec_handles_charset(const char *charset)
{
	static uint16_t ascii_mib;
	uint16_t wanted = parserutils_charset_mibenum_from_name(charset,
			strlen(charset));

	if (ascii_mib == 0) {
		ascii_mib = parserutils_charset_mibenum_from_name(
				"US-ASCII", SLEN("US-ASCII"));
	}

	/* A cached value of 0 never counts as a match */
	return ascii_mib != 0 && ascii_mib == wanted;
}
 
/**
 * Create a US-ASCII codec
 *
 * \param charset  The charset to read from / write to (not examined)
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param codec    Pointer to location to receive codec
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error charset_ascii_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	charset_ascii_codec *self;

	UNUSED(charset);

	self = alloc(NULL, sizeof(*self), pw);
	if (self == NULL)
		return PARSERUTILS_NOMEM;

	/* Both save areas start out empty */
	self->read_len = 0;
	self->read_buf[0] = 0;

	self->write_len = 0;
	self->write_buf[0] = 0;

	/* Install this codec's entry points in the handler table */
	self->base.handler.destroy = charset_ascii_codec_destroy;
	self->base.handler.encode = charset_ascii_codec_encode;
	self->base.handler.decode = charset_ascii_codec_decode;
	self->base.handler.reset = charset_ascii_codec_reset;

	/* base is the first member, so this is the codec's own address */
	*codec = &self->base;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy a US-ASCII codec
 *
 * The only allocation made by charset_ascii_codec_create() is the codec
 * structure itself, and this function does not free it, so there is nothing
 * to do here. NOTE(review): the structure is presumably released by the
 * caller -- confirm against the generic codec destroy path.
 *
 * \param codec  The codec to destroy
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error charset_ascii_codec_destroy (parserutils_charset_codec *codec)
{
UNUSED(codec);

return PARSERUTILS_OK;
}
 
/**
 * Encode a chunk of UCS-4 (big endian) data into US-ASCII
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if a character cannot be represented and the
 *         codec's error handling mode is set to STRICT,
 *         PARSERUTILS_NEEDDATA if the input ends with fewer than four bytes
 *         of a UCS-4 character (those bytes are left unconsumed).
 *
 * On exit, ::source will point immediately _after_ the last input character
 * read. If the output buffer fills up, the character that could not be
 * written is buffered by the codec and retried on the next call.
 *
 * Note that, if failure occurs whilst attempting to write any output
 * buffered by the last call, then ::source and ::sourcelen will remain
 * unchanged (as nothing more has been read).
 *
 * ::sourcelen will be reduced appropriately on exit.
 *
 * ::dest will point immediately _after_ the last character written.
 *
 * ::destlen will be reduced appropriately on exit.
 */
parserutils_error charset_ascii_codec_encode(parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	charset_ascii_codec *c = (charset_ascii_codec *) codec;
	uint32_t ucs4;
	uint32_t *towrite;
	size_t towritelen;
	parserutils_error error;

	/* Process any outstanding characters from the previous call */
	if (c->write_len > 0) {
		uint32_t *pwrite = c->write_buf;

		while (c->write_len > 0) {
			error = charset_ascii_from_ucs4(c, pwrite[0],
					dest, destlen);
			if (error != PARSERUTILS_OK) {
				uint32_t len;
				assert(error == PARSERUTILS_NOMEM);

				/* Shuffle the unwritten characters down to
				 * the start of the save area */
				for (len = 0; len < c->write_len; len++) {
					c->write_buf[len] = pwrite[len];
				}

				return error;
			}

			pwrite++;
			c->write_len--;
		}
	}

	/* Now process the characters for this call. Only whole 4-byte units
	 * are consumed: reading a shorter tail would run past the end of
	 * the input and wrap *sourcelen below zero. */
	while (*sourcelen >= 4) {
		/* *source is a byte pointer with no alignment guarantee, so
		 * copy the unit out rather than dereferencing it as a
		 * uint32_t. */
		memcpy(&ucs4, *source, sizeof(ucs4));
		ucs4 = endian_big_to_host(ucs4);
		towrite = &ucs4;
		towritelen = 1;

		/* Output current characters */
		while (towritelen > 0) {
			error = charset_ascii_from_ucs4(c, towrite[0], dest,
					destlen);
			if (error != PARSERUTILS_OK) {
				uint32_t len;
				if (error != PARSERUTILS_NOMEM) {
					return error;
				}

				/* Insufficient output space */
				if (towritelen >= WRITE_BUFSIZE)
					abort();

				c->write_len = towritelen;

				/* Copy pending chars to save area, for
				 * processing next call. */
				for (len = 0; len < towritelen; len++)
					c->write_buf[len] = towrite[len];

				/* Claim character we've just buffered,
				 * so it's not reprocessed */
				*source += 4;
				*sourcelen -= 4;

				return PARSERUTILS_NOMEM;
			}

			towrite++;
			towritelen--;
		}

		*source += 4;
		*sourcelen -= 4;
	}

	/* A tail of 1-3 bytes is an incomplete UCS-4 character */
	if (*sourcelen > 0)
		return PARSERUTILS_NEEDDATA;

	return PARSERUTILS_OK;
}
 
/**
 * Decode a chunk of US-ASCII data into UCS-4 (big endian)
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to source data
 * \param sourcelen  Pointer to length (in bytes) of source data
 * \param dest       Pointer to pointer to output buffer
 * \param destlen    Pointer to length (in bytes) of output buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if an input byte cannot be converted and the
 *         codec's error handling mode is set to STRICT,
 *
 * Output saved by a previous call is written first. If it does not all fit,
 * _NOMEM is returned and ::source / ::sourcelen are left untouched.
 *
 * For _OK and _NOMEM, ::source points immediately _after_ the last input
 * byte read; a character that could not be written is saved by the codec
 * for the next call. For _INVALID, ::source points _at_ the offending byte
 * and nothing is written or saved for it.
 *
 * ::sourcelen, ::dest and ::destlen are updated to match on exit.
 *
 * Call this with a source length of 0 to flush the output buffer.
 */
parserutils_error charset_ascii_codec_decode(parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	charset_ascii_codec *self = (charset_ascii_codec *) codec;
	uint32_t *pending = self->read_buf;
	parserutils_error status = PARSERUTILS_OK;
	size_t k;

	/* Emit saved output, but only while everything still saved fits */
	for (; self->read_len > 0 && *destlen >= self->read_len * 4;
			pending++, self->read_len--) {
		*((uint32_t *) (void *) *dest) = endian_host_to_big(*pending);

		*dest += 4;
		*destlen -= 4;
	}

	if (*destlen < self->read_len * 4) {
		/* Out of room: move what is left to the front of the save
		 * area and report it */
		for (k = 0; k < self->read_len; k++)
			self->read_buf[k] = pending[k];

		return PARSERUTILS_NOMEM;
	}

	/* Convert input one byte at a time until it is used up or a
	 * conversion reports anything other than success */
	while (status == PARSERUTILS_OK && *sourcelen > 0) {
		status = charset_ascii_codec_read_char(self,
				source, sourcelen, dest, destlen);
	}

	return status;
}
 
/**
 * Clear a US-ASCII codec's encoding state
 *
 * Discards anything held in the decode and encode save areas.
 *
 * \param codec  The codec to reset
 * \return PARSERUTILS_OK (this function cannot fail)
 */
parserutils_error charset_ascii_codec_reset(parserutils_charset_codec *codec)
{
	charset_ascii_codec *self = (charset_ascii_codec *) codec;

	/* Forget saved decode output */
	self->read_len = 0;
	self->read_buf[0] = 0;

	/* Forget saved encode input */
	self->write_len = 0;
	self->write_buf[0] = 0;

	return PARSERUTILS_OK;
}
 
 
/**
 * Read one US-ASCII character and output it as UCS-4 (big endian)
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (updated on exit)
 * \param sourcelen  Pointer to length of source buffer (updated on exit)
 * \param dest       Pointer to pointer to output buffer (updated on exit)
 * \param destlen    Pointer to length of output buffer (updated on exit)
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_NEEDDATA if the conversion helper reports that it
 *         needs more input,
 *         PARSERUTILS_INVALID if the byte cannot be converted and the
 *         codec's error handling mode is set to STRICT,
 *
 * For _OK and _NOMEM one input byte is consumed; with _NOMEM the converted
 * character is saved by the codec for the next call. For _INVALID and
 * _NEEDDATA nothing is consumed, written or saved.
 *
 * When the error handling mode is not STRICT, a byte that cannot be
 * converted is output as U+FFFD and consumed like any other.
 */
parserutils_error charset_ascii_codec_read_char(charset_ascii_codec *c,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	uint32_t ch;
	parserutils_error status;

	status = charset_ascii_to_ucs4(c, *source, *sourcelen, &ch);
	if (status != PARSERUTILS_OK) {
		/* More input needed: hand the condition straight back */
		if (status == PARSERUTILS_NEEDDATA)
			return status;

		/* Any result other than the three handled here is treated
		 * as a no-op success */
		if (status != PARSERUTILS_INVALID)
			return PARSERUTILS_OK;

		/* Bad byte: fatal in strict mode, otherwise replaced */
		if (c->base.errormode ==
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
			return PARSERUTILS_INVALID;

		ch = 0xFFFD;
	}

	status = charset_ascii_codec_output_decoded_char(c, ch, dest, destlen);
	if (status == PARSERUTILS_OK || status == PARSERUTILS_NOMEM) {
		/* The character was written or saved, so the input byte
		 * is used up either way */
		*source += 1;
		*sourcelen -= 1;
	}

	return status;
}
 
/**
* Output a UCS-4 character (big endian)
*
* \param c Codec to use
* \param ucs4 UCS-4 character (host endian)
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to output buffer length
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
*/
parserutils_error charset_ascii_codec_output_decoded_char(
		charset_ascii_codec *c,
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
{
	uint32_t *slot;

	if (*destlen >= 4) {
		/* Room for one big-endian UCS-4 unit: store it and advance */
		slot = (uint32_t *) (void *) *dest;
		*slot = endian_host_to_big(ucs4);

		*dest += 4;
		*destlen -= 4;

		return PARSERUTILS_OK;
	}

	/* Output exhausted: park the (host-endian) character in the codec
	 * so that a later decode call can write it out */
	c->read_buf[0] = ucs4;
	c->read_len = 1;

	return PARSERUTILS_NOMEM;
}
 
/**
* Convert a UCS4 (host endian) character to US-ASCII
*
* \param c The codec instance
* \param ucs4 The UCS4 character to convert
* \param s Pointer to pointer to destination buffer
* \param len Pointer to destination buffer length
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
* PARSERUTILS_INVALID if the character cannot be represented
*
* _INVALID will only be returned if the codec's conversion mode is STRICT.
* Otherwise, '?' will be output.
*
* On successful conversion, *s and *len will be updated.
*/
parserutils_error charset_ascii_from_ucs4(charset_ascii_codec *c,
		uint32_t ucs4, uint8_t **s, size_t *len)
{
	uint8_t byte;

	/* Need space for exactly one output byte */
	if (*len == 0)
		return PARSERUTILS_NOMEM;

	/* Anything at or above 0x80 has no US-ASCII representation;
	 * STRICT mode reports that instead of substituting */
	if (ucs4 >= 0x80 && c->base.errormode ==
			PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
		return PARSERUTILS_INVALID;

	byte = (ucs4 < 0x80) ? (uint8_t) ucs4 : '?';

	/* Store the byte and consume one unit of output space */
	**s = byte;
	*s += 1;
	*len -= 1;

	return PARSERUTILS_OK;
}
 
/**
* Convert a US-ASCII character to UCS4 (host endian)
*
* \param c The codec instance
* \param s Pointer to source buffer
* \param len Source buffer length
* \param ucs4 Pointer to destination buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NEEDDATA if there's insufficient input data
* PARSERUTILS_INVALID if the character cannot be represented
*/
parserutils_error charset_ascii_to_ucs4(charset_ascii_codec *c,
		const uint8_t *s, size_t len, uint32_t *ucs4)
{
	/* The conversion is stateless; the codec is not consulted */
	UNUSED(c);

	if (len == 0)
		return PARSERUTILS_NEEDDATA;

	/* Bytes with the top bit set are not US-ASCII */
	if (*s >= 0x80)
		return PARSERUTILS_INVALID;

	/* US-ASCII maps one-to-one onto the first 128 code points */
	*ucs4 = *s;

	return PARSERUTILS_OK;
}
 
const parserutils_charset_handler charset_ascii_codec_handler = {
charset_ascii_codec_handles_charset,
charset_ascii_codec_create
};
 
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_ext8.c
0,0 → 1,588
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <assert.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
 
#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
 
#include "charset/codecs/ext8_tables.h"
 
/* Charsets served by this codec, one row per charset.
 * The mib column starts out as 0 and is filled in on first use by
 * charset_ext8_codec_handles_charset(); rows are matched by MIB enum. */
static struct {
uint16_t mib; /* MIB enum of the charset; 0 until resolved */
const char *name; /* Charset name handed to the MIB enum lookup */
size_t len; /* Length of name, taken with SLEN() */
uint32_t *table; /* Mapping table, indexed by (byte - 0x80) by the
* converters; presumably declared in ext8_tables.h
* -- TODO confirm */
} known_charsets[] = {
{ 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
{ 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
{ 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
{ 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
{ 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
{ 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
{ 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
{ 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
{ 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
};
 
/**
* Windows charset codec
*/
typedef struct charset_ext8_codec {
parserutils_charset_codec base; /**< Base class; kept as the first
* member because the codec is cast
* to and from the base type */

uint32_t *table; /**< Mapping table for 0x80-0xFF; entry i holds the
* value for byte 0x80 + i, with 0xFFFF treated as
* "no mapping" by the decoder */

/* Capacity, in characters, of read_buf */
#define READ_BUFSIZE (8)
uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
* output sequences (decode)
* (host-endian) */
size_t read_len; /**< Character length of read_buf */

/* Capacity, in characters, of write_buf */
#define WRITE_BUFSIZE (8)
uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
* output sequences (encode)
* (host-endian) */
size_t write_len; /**< Character length of write_buf */

} charset_ext8_codec;
 
/* Forward declarations. Everything here has internal linkage; the only
 * symbol this file exports is charset_ext8_codec_handler. */

/* Factory entry points (referenced from charset_ext8_codec_handler) */
static bool charset_ext8_codec_handles_charset(const char *charset);
static parserutils_error charset_ext8_codec_create(const char *charset,
parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
/* Vtable entries installed by charset_ext8_codec_create() */
static parserutils_error charset_ext8_codec_destroy(
parserutils_charset_codec *codec);
static parserutils_error charset_ext8_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_ext8_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_ext8_codec_reset(
parserutils_charset_codec *codec);
/* Per-character helpers used by the encode/decode loops */
static inline parserutils_error charset_ext8_codec_read_char(
charset_ext8_codec *c,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_ext8_codec_output_decoded_char(
charset_ext8_codec *c,
uint32_t ucs4, uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
uint32_t ucs4, uint8_t **s, size_t *len);
static inline parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
const uint8_t *s, size_t len, uint32_t *ucs4);
 
/**
* Determine whether this codec handles a specific charset
*
* \param charset Charset to test
* \return true if handleable, false otherwise
*/
bool charset_ext8_codec_handles_charset(const char *charset)
{
	const uint16_t wanted = parserutils_charset_mibenum_from_name(
			charset, strlen(charset));
	const size_t count = N_ELEMENTS(known_charsets);
	bool found = false;
	size_t idx;

	/* A zero MIB enum in the first row means the table has not been
	 * resolved yet: look up every known name once */
	if (known_charsets[0].mib == 0) {
		for (idx = 0; idx < count; idx++) {
			known_charsets[idx].mib =
				parserutils_charset_mibenum_from_name(
						known_charsets[idx].name,
						known_charsets[idx].len);
		}
	}

	/* The charset is handled iff its MIB enum appears in the table */
	for (idx = 0; idx < count && !found; idx++)
		found = (known_charsets[idx].mib == wanted);

	return found;
}
 
/**
 * Create an extended 8bit codec
 *
 * \param charset  The charset to read from / write to
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param codec    Pointer to location to receive codec
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if the charset is not one of known_charsets,
 *         PARSERUTILS_NOMEM on memory exhaustion
 *
 * Only the members owned by this implementation (mapping table, the
 * read/write buffers and the handler vtable) are initialised here.
 * On failure, ::codec is left untouched.
 */
parserutils_error charset_ext8_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	uint32_t i;
	charset_ext8_codec *c;
	uint16_t match = parserutils_charset_mibenum_from_name(
			charset, strlen(charset));
	uint32_t *table = NULL;

	/* Resolve the MIB enums of the known charsets if that has not been
	 * done yet, so that creation does not depend on
	 * charset_ext8_codec_handles_charset() having been called first */
	if (known_charsets[0].mib == 0) {
		for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
			known_charsets[i].mib =
				parserutils_charset_mibenum_from_name(
						known_charsets[i].name,
						known_charsets[i].len);
		}
	}

	/* Find the mapping table belonging to the requested charset */
	for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
		if (known_charsets[i].mib == match) {
			table = known_charsets[i].table;
			break;
		}
	}

	/* Unknown charset: report it rather than creating a codec whose
	 * NULL table would be dereferenced by the converters */
	if (table == NULL)
		return PARSERUTILS_BADPARM;

	c = alloc(NULL, sizeof(charset_ext8_codec), pw);
	if (c == NULL)
		return PARSERUTILS_NOMEM;

	c->table = table;

	/* Start with no buffered output in either direction */
	c->read_buf[0] = 0;
	c->read_len = 0;

	c->write_buf[0] = 0;
	c->write_len = 0;

	/* Finally, populate vtable */
	c->base.handler.destroy = charset_ext8_codec_destroy;
	c->base.handler.encode = charset_ext8_codec_encode;
	c->base.handler.decode = charset_ext8_codec_decode;
	c->base.handler.reset = charset_ext8_codec_reset;

	*codec = (parserutils_charset_codec *) c;

	return PARSERUTILS_OK;
}
 
/**
* Destroy an extended 8bit codec
*
* \param codec The codec to destroy
* \return PARSERUTILS_OK on success, appropriate error otherwise
*/
parserutils_error charset_ext8_codec_destroy (parserutils_charset_codec *codec)
{
/* Nothing to release here: the codec structure itself is not freed by
 * this function (presumably the caller does that -- TODO confirm) and
 * the mapping table it points at is a static array. */
UNUSED(codec);

return PARSERUTILS_OK;
}
 
/**
* Encode a chunk of UCS-4 (big endian) data into extended 8bit
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read. Any remaining output for the character will be buffered by the
* codec for writing on the next call.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*/
parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen)
{
charset_ext8_codec *c = (charset_ext8_codec *) codec;
uint32_t ucs4;
uint32_t *towrite;
size_t towritelen;
parserutils_error error;

/* Process any outstanding characters from the previous call */
if (c->write_len > 0) {
uint32_t *pwrite = c->write_buf;

while (c->write_len > 0) {
error = charset_ext8_from_ucs4(c, pwrite[0],
dest, destlen);
if (error != PARSERUTILS_OK) {
uint32_t len;
/* NOTE(review): charset_ext8_from_ucs4() can also
 * return PARSERUTILS_INVALID in STRICT mode, and it
 * checks for output space before representability,
 * so an unrepresentable character may have been
 * buffered. With assertions disabled the error is
 * returned with the character still buffered, so it
 * is retried on the next call -- confirm intent. */
assert(error == PARSERUTILS_NOMEM);

/* Move the unwritten characters to the front of
 * write_buf so the next call resumes with them */
for (len = 0; len < c->write_len; len++) {
c->write_buf[len] = pwrite[len];
}

return error;
}

pwrite++;
c->write_len--;
}
}

/* Now process the characters for this call */
/* NOTE(review): each iteration reads and consumes 4 bytes, so this
 * assumes *sourcelen is a multiple of 4 -- confirm against callers. */
while (*sourcelen > 0) {
/* Fetch one big-endian UCS-4 unit and convert it to host order */
ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
towrite = &ucs4;
towritelen = 1;

/* Output current characters */
while (towritelen > 0) {
error = charset_ext8_from_ucs4(c, towrite[0], dest,
destlen);
if (error != PARSERUTILS_OK) {
uint32_t len;
/* Anything other than lack of space (i.e. an
 * unrepresentable character in STRICT mode) is
 * reported with ::source left at the character */
if (error != PARSERUTILS_NOMEM) {
return error;
}

/* Insufficient output space */
if (towritelen >= WRITE_BUFSIZE)
abort();

c->write_len = towritelen;

/* Copy pending chars to save area, for
* processing next call. */
for (len = 0; len < towritelen; len++)
c->write_buf[len] = towrite[len];

/* Claim character we've just buffered,
* so it's not reprocessed */
*source += 4;
*sourcelen -= 4;

return PARSERUTILS_NOMEM;
}

towrite++;
towritelen--;
}

/* Character fully written; move on to the next input unit */
*source += 4;
*sourcelen -= 4;
}

return PARSERUTILS_OK;
}
 
/**
* Decode a chunk of extended 8bit data into UCS-4 (big endian)
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read, if the result is _OK or _NOMEM. Any remaining output for the
* character will be buffered by the codec for writing on the next call.
*
* In the case of the result being _INVALID, ::source will point _at_ the
* last input character read; nothing will be written or buffered for the
* failed character. It is up to the client to fix the cause of the failure
* and retry the decoding process.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* If STRICT error handling is configured and an illegal sequence is split
* over two calls, then _INVALID will be returned from the second call,
* but ::source will point mid-way through the invalid sequence (i.e. it
* will be unmodified over the second call). In addition, the internal
* incomplete-sequence buffer will be emptied, such that subsequent calls
* will progress, rather than re-evaluating the same invalid sequence.
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*
* Call this with a source length of 0 to flush the output buffer.
*/
parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	charset_ext8_codec *ext8 = (charset_ext8_codec *) codec;
	parserutils_error status = PARSERUTILS_OK;

	if (ext8->read_len > 0) {
		/* Characters decoded by an earlier call are still waiting
		 * to be written; they go out before any new input */
		size_t flushed = 0;
		size_t k;

		while (ext8->read_len > 0 &&
				*destlen >= ext8->read_len * 4) {
			uint32_t *slot = (uint32_t *) (void *) *dest;

			*slot = endian_host_to_big(ext8->read_buf[flushed]);

			*dest += 4;
			*destlen -= 4;

			flushed++;
			ext8->read_len--;
		}

		if (*destlen < ext8->read_len * 4) {
			/* Still not enough room: keep what is left at the
			 * front of the buffer and ask for more space */
			for (k = 0; k < ext8->read_len; k++)
				ext8->read_buf[k] =
						ext8->read_buf[flushed + k];

			return PARSERUTILS_NOMEM;
		}
	}

	/* Decode the input one character at a time, stopping at the
	 * first result that is not PARSERUTILS_OK */
	while (status == PARSERUTILS_OK && *sourcelen > 0) {
		status = charset_ext8_codec_read_char(ext8,
				source, sourcelen, dest, destlen);
	}

	return status;
}
 
/**
* Clear an extended 8bit codec's encoding state
*
* \param codec The codec to reset
* \return PARSERUTILS_OK on success, appropriate error otherwise
*/
parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
{
	charset_ext8_codec *ext8 = (charset_ext8_codec *) codec;

	/* Discard buffered encode output */
	ext8->write_len = 0;
	ext8->write_buf[0] = 0;

	/* Discard buffered decode output */
	ext8->read_len = 0;
	ext8->read_buf[0] = 0;

	return PARSERUTILS_OK;
}
 
 
/**
* Read a character from the extended 8bit to UCS-4 (big endian)
*
* \param c The codec
* \param source Pointer to pointer to source buffer (updated on exit)
* \param sourcelen Pointer to length of source buffer (updated on exit)
* \param dest Pointer to pointer to output buffer (updated on exit)
* \param destlen Pointer to length of output buffer (updated on exit)
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read, if the result is _OK or _NOMEM. Any remaining output for the
* character will be buffered by the codec for writing on the next call.
*
* In the case of the result being _INVALID, ::source will point _at_ the
* last input character read; nothing will be written or buffered for the
* failed character. It is up to the client to fix the cause of the failure
* and retry the decoding process.
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*/
parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	uint32_t decoded;
	parserutils_error status;

	/* Map the next input byte through the charset's table */
	status = charset_ext8_to_ucs4(c, *source, *sourcelen, &decoded);

	if (status == PARSERUTILS_NEEDDATA) {
		/* No input available; report it untouched */
		return status;
	}

	if (status == PARSERUTILS_INVALID) {
		/* Byte has no mapping in this charset. In STRICT mode the
		 * caller is told, and nothing is consumed or emitted. */
		if (c->base.errormode ==
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
			return PARSERUTILS_INVALID;

		/* Otherwise substitute U+FFFD for the offending byte */
		decoded = 0xFFFD;
	} else if (status != PARSERUTILS_OK) {
		/* Any other conversion result is treated as a no-op */
		return PARSERUTILS_OK;
	}

	/* Emit the character (real or replacement) */
	status = charset_ext8_codec_output_decoded_char(c,
			decoded, dest, destlen);
	if (status == PARSERUTILS_OK || status == PARSERUTILS_NOMEM) {
		/* Written or buffered by the codec: the input byte has
		 * been claimed either way */
		*source += 1;
		*sourcelen -= 1;
	}

	return status;
}
 
/**
* Output a UCS-4 character (big endian)
*
* \param c Codec to use
* \param ucs4 UCS-4 character (host endian)
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to output buffer length
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
*/
parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c,
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
{
	uint32_t *slot;

	if (*destlen >= 4) {
		/* Room for one big-endian UCS-4 unit: store it and advance */
		slot = (uint32_t *) (void *) *dest;
		*slot = endian_host_to_big(ucs4);

		*dest += 4;
		*destlen -= 4;

		return PARSERUTILS_OK;
	}

	/* Output exhausted: park the (host-endian) character in the codec
	 * so that the next decode call can write it out first */
	c->read_buf[0] = ucs4;
	c->read_len = 1;

	return PARSERUTILS_NOMEM;
}
 
/**
 * Convert a UCS4 (host endian) character to extended 8bit
 *
 * \param c     The codec instance
 * \param ucs4  The UCS4 character to convert
 * \param s     Pointer to pointer to destination buffer
 * \param len   Pointer to destination buffer length
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if there's insufficient space in the output buffer,
 *         PARSERUTILS_INVALID if the character cannot be represented
 *
 * _INVALID will only be returned if the codec's conversion mode is STRICT.
 * Otherwise, '?' will be output.
 *
 * Characters below 0x80 are passed through unchanged; anything else is
 * looked up in the codec's mapping table. The value 0xFFFF is the table's
 * "no mapping" marker (see charset_ext8_to_ucs4), so U+FFFF itself is
 * always treated as unrepresentable.
 *
 * On successful conversion, *s and *len will be updated.
 */
parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c,
		uint32_t ucs4, uint8_t **s, size_t *len)
{
	uint8_t out = 0;

	if (*len < 1)
		return PARSERUTILS_NOMEM;

	if (ucs4 < 0x80) {
		/* ASCII */
		out = ucs4;
	} else {
		/* 128 doubles as the "not found" result of the search */
		uint32_t i = 128;

		/* Never search for the sentinel: it would match the first
		 * unassigned slot and emit a byte the decoder rejects */
		if (ucs4 != 0xFFFF) {
			for (i = 0; i < 128; i++) {
				if (ucs4 == c->table[i])
					break;
			}
		}

		if (i == 128) {
			if (c->base.errormode ==
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
				return PARSERUTILS_INVALID;
			else
				out = '?';
		} else {
			/* Table slot i corresponds to byte 0x80 + i */
			out = 0x80 + i;
		}
	}

	*(*s) = out;
	(*s)++;
	(*len)--;

	return PARSERUTILS_OK;
}
 
/**
* Convert an extended 8bit character to UCS4 (host endian)
*
* \param c The codec instance
* \param s Pointer to source buffer
* \param len Source buffer length
* \param ucs4 Pointer to destination buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NEEDDATA if there's insufficient input data
* PARSERUTILS_INVALID if the character cannot be represented
*/
parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c,
		const uint8_t *s, size_t len, uint32_t *ucs4)
{
	uint32_t mapped;

	if (len == 0)
		return PARSERUTILS_NEEDDATA;

	if (*s < 0x80) {
		/* Lower half is passed through unchanged */
		*ucs4 = *s;
		return PARSERUTILS_OK;
	}

	/* Upper half goes through the charset's table; 0xFFFF marks a
	 * byte with no mapping */
	mapped = c->table[*s - 0x80];
	if (mapped == 0xFFFF)
		return PARSERUTILS_INVALID;

	*ucs4 = mapped;

	return PARSERUTILS_OK;
}
 
const parserutils_charset_handler charset_ext8_codec_handler = {
charset_ext8_codec_handles_charset,
charset_ext8_codec_create
};
 
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_impl.h
0,0 → 1,49
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_codecs_codecimpl_h_
#define parserutils_charset_codecs_codecimpl_h_
 
#include <stdbool.h>
#include <inttypes.h>
 
#include <parserutils/charset/codec.h>
 
/**
* Core charset codec definition; implementations extend this
*/
struct parserutils_charset_codec {
uint16_t mibenum; /**< MIB enum for charset */

parserutils_charset_codec_errormode errormode; /**< error mode; the
* codecs compare it against
* PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
* to decide between failing
* and substituting */

parserutils_alloc alloc; /**< allocation function */
void *alloc_pw; /**< private word (presumably passed back to alloc
* -- TODO confirm) */

/* Per-implementation operations; each codec's create function fills
 * these in with its own static functions */
struct {
/* Tear down implementation-specific state */
parserutils_error (*destroy)(parserutils_charset_codec *codec);
/* Convert UCS-4 (big endian) input into the codec's charset */
parserutils_error (*encode)(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
/* Convert input in the codec's charset into UCS-4 (big endian) */
parserutils_error (*decode)(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
/* Clear any buffered encode/decode state */
parserutils_error (*reset)(parserutils_charset_codec *codec);
} handler; /**< Vtable for handler code */
};
 
/**
* Codec factory component definition
*/
typedef struct parserutils_charset_handler {
/* Report whether this implementation can serve the named charset */
bool (*handles_charset)(const char *charset);
/* Allocate (via alloc/pw) and initialise a codec for the named
 * charset, storing it in *codec on success */
parserutils_error (*create)(const char *charset,
parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
} parserutils_charset_handler;
 
#endif
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_utf16.c
0,0 → 1,552
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/utf16.h>
 
#include "charset/codecs/codec_impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
 
/**
* UTF-16 charset codec
*/
typedef struct charset_utf16_codec {
parserutils_charset_codec base; /**< Base class; kept as the first
* member because the codec is cast
* to and from the base type */

/* Capacity, in bytes, of inval_buf */
#define INVAL_BUFSIZE (32)
uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
* incomplete input
* sequences */
size_t inval_len; /**< Byte length of inval_buf */

/* Capacity, in characters, of read_buf */
#define READ_BUFSIZE (8)
uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
* output sequences (decode)
* (host-endian) */
size_t read_len; /**< Character length of read_buf */

/* Capacity, in characters, of write_buf */
#define WRITE_BUFSIZE (8)
uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
* output sequences (encode)
* (host-endian) */
size_t write_len; /**< Character length of write_buf */

} charset_utf16_codec;
 
/* Forward declarations; everything declared here has internal linkage. */

/* Factory entry points */
static bool charset_utf16_codec_handles_charset(const char *charset);
static parserutils_error charset_utf16_codec_create(
const char *charset, parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
/* Vtable entries installed by charset_utf16_codec_create() */
static parserutils_error charset_utf16_codec_destroy(
parserutils_charset_codec *codec);
static parserutils_error charset_utf16_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_utf16_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_utf16_codec_reset(
parserutils_charset_codec *codec);
/* Per-character helpers used by the decode loop */
static inline parserutils_error charset_utf16_codec_read_char(
charset_utf16_codec *c,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_utf16_codec_output_decoded_char(
charset_utf16_codec *c,
uint32_t ucs4, uint8_t **dest, size_t *destlen);
 
/**
* Determine whether this codec handles a specific charset
*
* \param charset Charset to test
* \return true if handleable, false otherwise
*/
bool charset_utf16_codec_handles_charset(const char *charset)
{
	/* Compare by MIB enum rather than by spelling of the name */
	const uint16_t requested = parserutils_charset_mibenum_from_name(
			charset, strlen(charset));
	const uint16_t utf16 = parserutils_charset_mibenum_from_name(
			"UTF-16", SLEN("UTF-16"));

	return requested == utf16;
}
 
/**
* Create a UTF-16 codec
*
* \param charset The charset to read from / write to
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data (may be NULL)
* \param codec Pointer to location to receive codec
* \return PARSERUTILS_OK on success,
* PARSERUTILS_BADPARM on bad parameters,
* PARSERUTILS_NOMEM on memory exhaustion
*/
parserutils_error charset_utf16_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	charset_utf16_codec *utf16;

	/* This codec serves a single charset, so the name is not needed */
	UNUSED(charset);

	utf16 = alloc(NULL, sizeof(*utf16), pw);
	if (utf16 == NULL)
		return PARSERUTILS_NOMEM;

	/* Install this implementation's operations */
	utf16->base.handler.destroy = charset_utf16_codec_destroy;
	utf16->base.handler.encode = charset_utf16_codec_encode;
	utf16->base.handler.decode = charset_utf16_codec_decode;
	utf16->base.handler.reset = charset_utf16_codec_reset;

	/* No incomplete input sequence pending */
	utf16->inval_len = 0;
	utf16->inval_buf[0] = '\0';

	/* No buffered output in either direction */
	utf16->read_len = 0;
	utf16->read_buf[0] = 0;

	utf16->write_len = 0;
	utf16->write_buf[0] = 0;

	/* Hand the codec back through its base-class view */
	*codec = &utf16->base;

	return PARSERUTILS_OK;
}
 
/**
* Destroy a UTF-16 codec
*
* \param codec The codec to destroy
* \return PARSERUTILS_OK on success, appropriate error otherwise
*/
parserutils_error charset_utf16_codec_destroy (parserutils_charset_codec *codec)
{
/* Nothing to release here: all codec state lives inside the codec
 * structure, which this function does not free (presumably the caller
 * does -- TODO confirm). */
UNUSED(codec);

return PARSERUTILS_OK;
}
 
/**
* Encode a chunk of UCS-4 (big endian) data into UTF-16
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read. Any remaining output for the character will be buffered by the
* codec for writing on the next call.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*/
parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen)
{
charset_utf16_codec *c = (charset_utf16_codec *) codec;
uint32_t ucs4;
uint32_t *towrite;
size_t towritelen;
parserutils_error error;

/* Process any outstanding characters from the previous call */
if (c->write_len > 0) {
uint32_t *pwrite = c->write_buf;
uint8_t buf[4]; /* scratch space for one encoded character */
size_t len; /* byte length of the encoding held in buf */

while (c->write_len > 0) {
error = parserutils_charset_utf16_from_ucs4(
pwrite[0], buf, &len);
/* A character that cannot be converted is treated as fatal */
if (error != PARSERUTILS_OK)
abort();

if (*destlen < len) {
/* Insufficient output buffer space */
/* len is reused as the loop index here: move the
 * unwritten characters to the front of write_buf */
for (len = 0; len < c->write_len; len++)
c->write_buf[len] = pwrite[len];

return PARSERUTILS_NOMEM;
}

memcpy(*dest, buf, len);

*dest += len;
*destlen -= len;

pwrite++;
c->write_len--;
}
}

/* Now process the characters for this call */
/* NOTE(review): each iteration reads and consumes 4 bytes, so this
 * assumes *sourcelen is a multiple of 4 -- confirm against callers. */
while (*sourcelen > 0) {
/* Fetch one big-endian UCS-4 unit and convert it to host order */
ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
towrite = &ucs4;
towritelen = 1;

/* Output current characters */
while (towritelen > 0) {
uint8_t buf[4];
size_t len;

error = parserutils_charset_utf16_from_ucs4(
towrite[0], buf, &len);
/* A character that cannot be converted is treated as fatal */
if (error != PARSERUTILS_OK)
abort();

if (*destlen < len) {
/* Insufficient output space */
if (towritelen >= WRITE_BUFSIZE)
abort();

c->write_len = towritelen;

/* Copy pending chars to save area, for
* processing next call. */
for (len = 0; len < towritelen; len++)
c->write_buf[len] = towrite[len];

/* Claim character we've just buffered,
* so it's not reprocessed */
*source += 4;
*sourcelen -= 4;

return PARSERUTILS_NOMEM;
}

memcpy(*dest, buf, len);

*dest += len;
*destlen -= len;

towrite++;
towritelen--;
}

/* Character fully written; move on to the next input unit */
*source += 4;
*sourcelen -= 4;
}

return PARSERUTILS_OK;
}
 
/**
* Decode a chunk of UTF-16 data into UCS-4 (big endian)
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read, if the result is _OK or _NOMEM. Any remaining output for the
* character will be buffered by the codec for writing on the next call.
*
* In the case of the result being _INVALID, ::source will point _at_ the
* last input character read; nothing will be written or buffered for the
* failed character. It is up to the client to fix the cause of the failure
* and retry the decoding process.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* If STRICT error handling is configured and an illegal sequence is split
* over two calls, then _INVALID will be returned from the second call,
* but ::source will point mid-way through the invalid sequence (i.e. it
* will be unmodified over the second call). In addition, the internal
* incomplete-sequence buffer will be emptied, such that subsequent calls
* will progress, rather than re-evaluating the same invalid sequence.
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*
* Call this with a source length of 0 to flush the output buffer.
*/
parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen)
{
charset_utf16_codec *c = (charset_utf16_codec *) codec;
parserutils_error error;

if (c->read_len > 0) {
/* Output left over from last decode */
uint32_t *pread = c->read_buf;

/* Only starts writing when the whole backlog fits in the output */
while (c->read_len > 0 && *destlen >= c->read_len * 4) {
*((uint32_t *) (void *) *dest) =
endian_host_to_big(pread[0]);

*dest += 4;
*destlen -= 4;

pread++;
c->read_len--;
}

if (*destlen < c->read_len * 4) {
/* Ran out of output buffer */
size_t i;

/* Shuffle remaining output down */
for (i = 0; i < c->read_len; i++)
c->read_buf[i] = pread[i];

return PARSERUTILS_NOMEM;
}
}

if (c->inval_len > 0) {
/* The last decode ended in an incomplete sequence.
* Fill up inval_buf with data from the start of the
* new chunk and process it. */
uint8_t *in = c->inval_buf;
size_t ol = c->inval_len; /* bytes already held in inval_buf */
/* Number of new source bytes that are appended to inval_buf */
size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
size_t orig_l = l; /* remembered for the fix-up arithmetic below */

memcpy(c->inval_buf + ol, *source, l);

/* l now counts every byte available in inval_buf (old + new) */
l += c->inval_len;

/* Decode one character from inval_buf; on return l is the number
 * of bytes of inval_buf that were left unconsumed */
error = charset_utf16_codec_read_char(c,
(const uint8_t **) &in, &l, dest, destlen);
if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
/* ::source and ::sourcelen are left unmodified in this case */
return error;
}

/* And now, fix up source pointers */
/* orig_l - l is the count of newly appended bytes that were
 * consumed; a negative result is clamped to zero */
*source += max((signed) (orig_l - l), 0);
*sourcelen -= max((signed) (orig_l - l), 0);

/* Failed to resolve an incomplete character and
* ran out of buffer space. No recovery strategy
* possible, so explode everywhere. */
if ((orig_l + ol) - l == 0)
abort();

/* Report memory exhaustion case from above */
if (error != PARSERUTILS_OK)
return error;
}

/* Finally, the "normal" case; process all outstanding characters */
while (*sourcelen > 0) {
error = charset_utf16_codec_read_char(c,
source, sourcelen, dest, destlen);
if (error != PARSERUTILS_OK) {
return error;
}
}

return PARSERUTILS_OK;
}
 
/**
* Clear a UTF-16 codec's encoding state
*
* \param codec The codec to reset
* \return PARSERUTILS_OK on success, appropriate error otherwise
*/
parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
{
	charset_utf16_codec *utf16 = (charset_utf16_codec *) codec;

	/* Discard buffered encode output */
	utf16->write_len = 0;
	utf16->write_buf[0] = 0;

	/* Discard buffered decode output */
	utf16->read_len = 0;
	utf16->read_buf[0] = 0;

	/* Discard any incomplete input sequence */
	utf16->inval_len = 0;
	utf16->inval_buf[0] = '\0';

	return PARSERUTILS_OK;
}
 
 
/**
 * Stash the remaining source bytes in the codec's incomplete-input buffer
 *
 * Copies all *sourcelen bytes into c->inval_buf, NUL-terminates the copy,
 * records its length and marks the whole source as consumed. Aborts if the
 * bytes plus the terminator do not fit into the buffer.
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (advanced on exit)
 * \param sourcelen  Pointer to length of source buffer (zero on exit)
 */
static void charset_utf16_codec_stash_incomplete(charset_utf16_codec *c,
		const uint8_t **source, size_t *sourcelen)
{
	/* ">=" (not ">"): the terminator written below needs a slot too,
	 * so at most INVAL_BUFSIZE - 1 bytes may be stored. */
	if (*sourcelen >= INVAL_BUFSIZE)
		abort();

	/* memmove tolerates overlap.
	 * NOTE(review): presumably needed because the caller may pass
	 * inval_buf itself as the source -- confirm against the decoder. */
	memmove(c->inval_buf, *source, *sourcelen);
	c->inval_buf[*sourcelen] = '\0';
	c->inval_len = *sourcelen;

	*source += *sourcelen;
	*sourcelen = 0;
}

/**
 * Read one UTF-16 character and output it as UCS-4 (big endian)
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (updated on exit)
 * \param sourcelen  Pointer to length of source buffer (updated on exit)
 * \param dest       Pointer to pointer to output buffer (updated on exit)
 * \param destlen    Pointer to length of output buffer (updated on exit)
 * \return PARSERUTILS_OK on success (including the case where the input
 *                           ends mid-character and has been buffered),
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if the input is illegal and the codec's
 *                           error handling mode is set to STRICT.
 *
 * On exit, ::source will point immediately _after_ the last input character
 * read, if the result is _OK or _NOMEM. Any remaining output for the
 * character will be buffered by the codec for writing on the next call.
 *
 * In the case of the result being _INVALID, ::source will point _at_ the
 * last input character read; nothing will be written or buffered for the
 * failed character. It is up to the client to fix the cause of the failure
 * and retry the decoding process.
 *
 * ::sourcelen will be reduced appropriately on exit.
 *
 * ::dest will point immediately _after_ the last character written.
 *
 * ::destlen will be reduced appropriately on exit.
 */
parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	uint32_t ucs4;
	size_t sucs4;
	parserutils_error error;

	/* Convert a single character */
	error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
			&ucs4, &sucs4);
	if (error == PARSERUTILS_OK) {
		/* Read a character */
		error = charset_utf16_codec_output_decoded_char(c,
				ucs4, dest, destlen);
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
			/* output succeeded (or was buffered by the output
			 * routine); update source pointers */
			*source += sucs4;
			*sourcelen -= sucs4;
		}

		/* Clear inval buffer */
		c->inval_buf[0] = '\0';
		c->inval_len = 0;

		return error;
	} else if (error == PARSERUTILS_NEEDDATA) {
		/* Incomplete input sequence: keep it for the next call */
		charset_utf16_codec_stash_incomplete(c, source, sourcelen);

		return PARSERUTILS_OK;
	} else if (error == PARSERUTILS_INVALID) {
		/* Illegal input sequence */
		uint32_t nextchar;

		/* Clear inval buffer */
		c->inval_buf[0] = '\0';
		c->inval_len = 0;

		/* Strict errormode; simply flag invalid character */
		if (c->base.errormode ==
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
			return PARSERUTILS_INVALID;
		}

		/* Find next valid UTF-16 sequence.
		 * We're processing client-provided data, so let's
		 * be paranoid about its validity. */
		error = parserutils_charset_utf16_next_paranoid(
				*source, *sourcelen, 0, &nextchar);
		if (error != PARSERUTILS_OK) {
			if (error == PARSERUTILS_NEEDDATA) {
				/* Need more data to be sure */
				charset_utf16_codec_stash_incomplete(c,
						source, sourcelen);

				/* Everything was consumed by the stash, so
				 * there is nothing further to skip below */
				nextchar = 0;
			} else {
				return error;
			}
		}

		/* output U+FFFD and continue processing. */
		error = charset_utf16_codec_output_decoded_char(c,
				0xFFFD, dest, destlen);
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
			/* output succeeded; update source pointers */
			*source += nextchar;
			*sourcelen -= nextchar;
		}

		return error;
	}

	/* Any other conversion result is not reported to the caller */
	return PARSERUTILS_OK;
}
 
/**
 * Emit one decoded character as big-endian UCS-4
 *
 * When fewer than four bytes of output space remain, the character is
 * parked in slot 0 of the codec's read buffer (read_len becomes 1) and
 * nothing is written to ::dest.
 *
 * \param c        Codec to use
 * \param ucs4     UCS-4 character (host endian)
 * \param dest     Pointer to pointer to output buffer (advanced by 4 on
 *                 success)
 * \param destlen  Pointer to output buffer length (reduced by 4 on success)
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small.
 */
parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
{
	uint32_t *slot;

	if (*destlen >= 4) {
		/* Room for a whole UCS-4 unit: store it and advance */
		slot = (uint32_t *) (void *) *dest;
		*slot = endian_host_to_big(ucs4);

		*dest += 4;
		*destlen -= 4;

		return PARSERUTILS_OK;
	}

	/* Run out of output buffer; remember the character for later */
	c->read_buf[0] = ucs4;
	c->read_len = 1;

	return PARSERUTILS_NOMEM;
}
 
 
/* Handler record for the UTF-16 codec: the charset-support predicate
 * followed by the codec constructor.
 * NOTE(review): the initialiser is positional, so its order must match the
 * field order of parserutils_charset_handler, which is declared outside
 * this file -- confirm against that definition. */
const parserutils_charset_handler charset_utf16_codec_handler = {
charset_utf16_codec_handles_charset,
charset_utf16_codec_create
};
/programs/network/netsurf/libparserutils/src/charset/codecs/codec_utf8.c
0,0 → 1,555
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
 
#include "charset/codecs/codec_impl.h"
#include "charset/encodings/utf8impl.h"
#include "utils/endian.h"
#include "utils/utils.h"
 
/**
* UTF-8 charset codec
*
* Holds the generic codec base plus three small buffers, each paired with
* a length field, that carry data over between successive calls.
*/
typedef struct charset_utf8_codec {
parserutils_charset_codec base; /**< Base class */

#define INVAL_BUFSIZE (32)
uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
* incomplete input
* sequences */
size_t inval_len; /**< Byte length of inval_buf */

#define READ_BUFSIZE (8)
uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
* output sequences (decode)
* (host-endian) */
size_t read_len; /**< Character length of read_buf */

#define WRITE_BUFSIZE (8)
uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
* output sequences (encode)
* (host-endian) */
size_t write_len; /**< Character length of write_buf */

} charset_utf8_codec;
 
/* Forward declarations of the file-local UTF-8 codec functions; they are
 * declared static here, so the definitions below have internal linkage. */
static bool charset_utf8_codec_handles_charset(const char *charset);
static parserutils_error charset_utf8_codec_create(const char *charset,
parserutils_alloc alloc, void *pw,
parserutils_charset_codec **codec);
static parserutils_error charset_utf8_codec_destroy(
parserutils_charset_codec *codec);
static parserutils_error charset_utf8_codec_encode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_utf8_codec_decode(
parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static parserutils_error charset_utf8_codec_reset(
parserutils_charset_codec *codec);
/* Per-character helpers used by the decoder */
static inline parserutils_error charset_utf8_codec_read_char(
charset_utf8_codec *c,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen);
static inline parserutils_error charset_utf8_codec_output_decoded_char(
charset_utf8_codec *c,
uint32_t ucs4, uint8_t **dest, size_t *destlen);
 
/**
 * Report whether a charset name refers to UTF-8
 *
 * The name is resolved to a MIB enum and compared with the MIB enum
 * obtained for the literal name "UTF-8".
 *
 * \param charset  Charset name to test (must be a NUL-terminated string)
 * \return true if the name resolves to the same MIB enum as "UTF-8",
 *         false otherwise
 */
bool charset_utf8_codec_handles_charset(const char *charset)
{
	if (parserutils_charset_mibenum_from_name(charset,
			strlen(charset)) !=
			parserutils_charset_mibenum_from_name("UTF-8",
			SLEN("UTF-8")))
		return false;

	return true;
}
 
/**
 * Allocate and initialise a UTF-8 codec
 *
 * \param charset  The charset to read from / write to (ignored here)
 * \param alloc    Memory (de)allocation function
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param codec    Pointer to location to receive codec
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM on memory exhaustion.
 *
 * Only the buffers, their lengths and the handler function pointers are
 * set up here; no other member of the base structure is touched.
 */
parserutils_error charset_utf8_codec_create(const char *charset,
		parserutils_alloc alloc, void *pw,
		parserutils_charset_codec **codec)
{
	charset_utf8_codec *utf8;

	UNUSED(charset);

	utf8 = alloc(NULL, sizeof(*utf8), pw);
	if (utf8 != NULL) {
		/* Start with all three carry-over buffers empty */
		utf8->inval_len = 0;
		utf8->read_len = 0;
		utf8->write_len = 0;

		utf8->inval_buf[0] = '\0';
		utf8->read_buf[0] = 0;
		utf8->write_buf[0] = 0;

		/* Hook up this codec's implementation functions */
		utf8->base.handler.reset = charset_utf8_codec_reset;
		utf8->base.handler.decode = charset_utf8_codec_decode;
		utf8->base.handler.encode = charset_utf8_codec_encode;
		utf8->base.handler.destroy = charset_utf8_codec_destroy;

		*codec = (parserutils_charset_codec *) utf8;

		return PARSERUTILS_OK;
	}

	return PARSERUTILS_NOMEM;
}
 
/**
 * Tear down a UTF-8 codec
 *
 * This implementation releases nothing itself.
 * NOTE(review): presumably the generic codec layer frees the structure
 * obtained in the create function -- confirm against the caller.
 *
 * \param codec  The codec to destroy (unused)
 * \return PARSERUTILS_OK always
 */
parserutils_error charset_utf8_codec_destroy(parserutils_charset_codec *codec)
{
	const parserutils_error status = PARSERUTILS_OK;

	UNUSED(codec);

	return status;
}
 
/**
* Encode a chunk of UCS-4 (big endian) data into UTF-8
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read. Any remaining output for the character will be buffered by the
* codec for writing on the next call.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*/
parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen)
{
charset_utf8_codec *c = (charset_utf8_codec *) codec;
uint32_t ucs4;
uint32_t *towrite;
size_t towritelen;
parserutils_error error;

/* Process any outstanding characters from the previous call */
if (c->write_len > 0) {
uint32_t *pwrite = c->write_buf;

while (c->write_len > 0) {
UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
if (error != PARSERUTILS_OK) {
uint32_t len;
/* Anything other than a full output buffer is treated as fatal */
if (error != PARSERUTILS_NOMEM)
abort();

/* Insufficient output buffer space */
/* Slide the still-unwritten characters to the front of
 * write_buf so the next call resumes from index 0 */
for (len = 0; len < c->write_len; len++) {
c->write_buf[len] = pwrite[len];
}

return PARSERUTILS_NOMEM;
}

pwrite++;
c->write_len--;
}
}

/* Now process the characters for this call */
while (*sourcelen > 0) {
/* NOTE(review): four bytes are read (and later subtracted from
 * *sourcelen) without checking that *sourcelen >= 4; this assumes
 * the input length is a multiple of four -- confirm with callers */
ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
towrite = &ucs4;
towritelen = 1;

/* Output current characters */
while (towritelen > 0) {
UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
if (error != PARSERUTILS_OK) {
uint32_t len;
/* Anything other than a full output buffer is treated as fatal */
if (error != PARSERUTILS_NOMEM)
abort();

/* Insufficient output space */
/* Refuse to buffer more characters than write_buf can hold */
if (towritelen >= WRITE_BUFSIZE)
abort();

c->write_len = towritelen;

/* Copy pending chars to save area, for
* processing next call. */
for (len = 0; len < towritelen; len++)
c->write_buf[len] = towrite[len];

/* Claim character we've just buffered,
* so it's not reprocessed */
*source += 4;
*sourcelen -= 4;

return PARSERUTILS_NOMEM;
}

towrite++;
towritelen--;
}

/* Character fully written; step over its four source bytes */
*source += 4;
*sourcelen -= 4;
}

return PARSERUTILS_OK;
}
 
/**
* Decode a chunk of UTF-8 data into UCS-4 (big endian)
*
* \param codec The codec to use
* \param source Pointer to pointer to source data
* \param sourcelen Pointer to length (in bytes) of source data
* \param dest Pointer to pointer to output buffer
* \param destlen Pointer to length (in bytes) of output buffer
* \return PARSERUTILS_OK on success,
* PARSERUTILS_NOMEM if output buffer is too small,
* PARSERUTILS_INVALID if a character cannot be represented and the
* codec's error handling mode is set to STRICT,
*
* On exit, ::source will point immediately _after_ the last input character
* read, if the result is _OK or _NOMEM. Any remaining output for the
* character will be buffered by the codec for writing on the next call.
*
* In the case of the result being _INVALID, ::source will point _at_ the
* last input character read; nothing will be written or buffered for the
* failed character. It is up to the client to fix the cause of the failure
* and retry the decoding process.
*
* Note that, if failure occurs whilst attempting to write any output
* buffered by the last call, then ::source and ::sourcelen will remain
* unchanged (as nothing more has been read).
*
* If STRICT error handling is configured and an illegal sequence is split
* over two calls, then _INVALID will be returned from the second call,
* but ::source will point mid-way through the invalid sequence (i.e. it
* will be unmodified over the second call). In addition, the internal
* incomplete-sequence buffer will be emptied, such that subsequent calls
* will progress, rather than re-evaluating the same invalid sequence.
*
* ::sourcelen will be reduced appropriately on exit.
*
* ::dest will point immediately _after_ the last character written.
*
* ::destlen will be reduced appropriately on exit.
*
* Call this with a source length of 0 to flush the output buffer.
*/
parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
const uint8_t **source, size_t *sourcelen,
uint8_t **dest, size_t *destlen)
{
charset_utf8_codec *c = (charset_utf8_codec *) codec;
parserutils_error error;

/* Stage 1: flush characters that an earlier call could not output */
if (c->read_len > 0) {
/* Output left over from last decode */
uint32_t *pread = c->read_buf;

/* A character is written only while there is room for every
 * character still pending (4 bytes each) */
while (c->read_len > 0 && *destlen >= c->read_len * 4) {
*((uint32_t *) (void *) *dest) =
endian_host_to_big(pread[0]);

*dest += 4;
*destlen -= 4;

pread++;
c->read_len--;
}

if (*destlen < c->read_len * 4) {
/* Ran out of output buffer */
size_t i;

/* Shuffle remaining output down */
for (i = 0; i < c->read_len; i++)
c->read_buf[i] = pread[i];

return PARSERUTILS_NOMEM;
}
}

/* Stage 2: complete a sequence that was cut off by the previous call */
if (c->inval_len > 0) {
/* The last decode ended in an incomplete sequence.
* Fill up inval_buf with data from the start of the
* new chunk and process it. */
uint8_t *in = c->inval_buf;
size_t ol = c->inval_len;
/* NOTE(review): the "- 1" presumably keeps one byte free for the
 * terminator that the read routine may append -- confirm */
size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
size_t orig_l = l;

memcpy(c->inval_buf + ol, *source, l);

/* l now counts the old bytes plus the newly appended ones */
l += c->inval_len;

error = charset_utf8_codec_read_char(c,
(const uint8_t **) &in, &l, dest, destlen);
if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
return error;
}

/* And now, fix up source pointers */
/* On return l is the count of unconsumed bytes in the combined
 * buffer, so orig_l - l is how many bytes of the new chunk were
 * used; the value is clamped at zero */
*source += max((signed) (orig_l - l), 0);
*sourcelen -= max((signed) (orig_l - l), 0);

/* Failed to resolve an incomplete character and
* ran out of buffer space. No recovery strategy
* possible, so explode everywhere. */
if ((orig_l + ol) - l == 0)
abort();

/* Report memory exhaustion case from above */
if (error != PARSERUTILS_OK)
return error;
}

/* Finally, the "normal" case; process all outstanding characters */
while (*sourcelen > 0) {
error = charset_utf8_codec_read_char(c,
source, sourcelen, dest, destlen);
if (error != PARSERUTILS_OK) {
return error;
}
}

return PARSERUTILS_OK;
}
 
/**
 * Return a UTF-8 codec to its empty state
 *
 * Zeroes the length counter of the incomplete-input, decode-output and
 * encode-output buffers and blanks the first slot of each, so nothing
 * buffered by an earlier call is carried over.
 *
 * \param codec  The codec to reset
 * \return PARSERUTILS_OK (no failure path exists in this implementation)
 */
parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
{
	charset_utf8_codec *utf8 = (charset_utf8_codec *) codec;

	/* Forget how much data each buffer held ... */
	utf8->inval_len = 0;
	utf8->read_len = 0;
	utf8->write_len = 0;

	/* ... and blank the leading element of each buffer */
	utf8->inval_buf[0] = '\0';
	utf8->read_buf[0] = 0;
	utf8->write_buf[0] = 0;

	return PARSERUTILS_OK;
}
 
 
/**
 * Stash the remaining source bytes in the codec's incomplete-input buffer
 *
 * Copies all *sourcelen bytes into c->inval_buf, NUL-terminates the copy,
 * records its length and marks the whole source as consumed. Aborts if the
 * bytes plus the terminator do not fit into the buffer.
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (advanced on exit)
 * \param sourcelen  Pointer to length of source buffer (zero on exit)
 */
static void charset_utf8_codec_stash_incomplete(charset_utf8_codec *c,
		const uint8_t **source, size_t *sourcelen)
{
	/* ">=" (not ">"): the terminator written below needs a slot too,
	 * so at most INVAL_BUFSIZE - 1 bytes may be stored. */
	if (*sourcelen >= INVAL_BUFSIZE)
		abort();

	/* memmove, not memcpy: the decoder passes inval_buf itself as the
	 * source when resuming a split sequence, so the regions can overlap */
	memmove(c->inval_buf, *source, *sourcelen);
	c->inval_buf[*sourcelen] = '\0';
	c->inval_len = *sourcelen;

	*source += *sourcelen;
	*sourcelen = 0;
}

/**
 * Read one UTF-8 character and output it as UCS-4 (big endian)
 *
 * \param c          The codec
 * \param source     Pointer to pointer to source buffer (updated on exit)
 * \param sourcelen  Pointer to length of source buffer (updated on exit)
 * \param dest       Pointer to pointer to output buffer (updated on exit)
 * \param destlen    Pointer to length of output buffer (updated on exit)
 * \return PARSERUTILS_OK on success (including the case where the input
 *                           ends mid-character and has been buffered),
 *         PARSERUTILS_NOMEM if output buffer is too small,
 *         PARSERUTILS_INVALID if the input is illegal and the codec's
 *                           error handling mode is set to STRICT.
 *
 * On exit, ::source will point immediately _after_ the last input character
 * read, if the result is _OK or _NOMEM. Any remaining output for the
 * character will be buffered by the codec for writing on the next call.
 *
 * In the case of the result being _INVALID, ::source will point _at_ the
 * last input character read; nothing will be written or buffered for the
 * failed character. It is up to the client to fix the cause of the failure
 * and retry the decoding process.
 *
 * ::sourcelen will be reduced appropriately on exit.
 *
 * ::dest will point immediately _after_ the last character written.
 *
 * ::destlen will be reduced appropriately on exit.
 */
parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen)
{
	uint32_t ucs4;
	size_t sucs4;
	parserutils_error error;

	/* Convert a single character */
	{
		/* The macro is handed plain local variables rather than
		 * the caller's pointer expressions */
		const uint8_t *src = *source;
		size_t srclen = *sourcelen;
		uint32_t *uptr = &ucs4;
		size_t *usptr = &sucs4;
		UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
	}
	if (error == PARSERUTILS_OK) {
		/* Read a character */
		error = charset_utf8_codec_output_decoded_char(c,
				ucs4, dest, destlen);
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
			/* output succeeded (or was buffered by the output
			 * routine); update source pointers */
			*source += sucs4;
			*sourcelen -= sucs4;
		}

		/* Clear inval buffer */
		c->inval_buf[0] = '\0';
		c->inval_len = 0;

		return error;
	} else if (error == PARSERUTILS_NEEDDATA) {
		/* Incomplete input sequence: keep it for the next call */
		charset_utf8_codec_stash_incomplete(c, source, sourcelen);

		return PARSERUTILS_OK;
	} else if (error == PARSERUTILS_INVALID) {
		/* Illegal input sequence */
		uint32_t nextchar;

		/* Strict errormode; simply flag invalid character */
		if (c->base.errormode ==
				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
			/* Clear inval buffer */
			c->inval_buf[0] = '\0';
			c->inval_len = 0;

			return PARSERUTILS_INVALID;
		}

		/* Find next valid UTF-8 sequence.
		 * We're processing client-provided data, so let's
		 * be paranoid about its validity. */
		{
			const uint8_t *src = *source;
			size_t srclen = *sourcelen;
			uint32_t off = 0;
			uint32_t *ncptr = &nextchar;

			UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
		}
		if (error != PARSERUTILS_OK) {
			if (error == PARSERUTILS_NEEDDATA) {
				/* Need more data to be sure */
				charset_utf8_codec_stash_incomplete(c,
						source, sourcelen);

				/* Everything was consumed by the stash, so
				 * there is nothing further to skip below */
				nextchar = 0;
			} else {
				return error;
			}
		}

		/* Clear inval buffer. Note that this also discards bytes
		 * stashed just above; the source stays fully consumed. */
		c->inval_buf[0] = '\0';
		c->inval_len = 0;

		/* output U+FFFD and continue processing. */
		error = charset_utf8_codec_output_decoded_char(c,
				0xFFFD, dest, destlen);
		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
			/* output succeeded; update source pointers */
			*source += nextchar;
			*sourcelen -= nextchar;
		}

		return error;
	}

	/* Any other conversion result is not reported to the caller */
	return PARSERUTILS_OK;
}
 
/**
 * Emit one decoded character as big-endian UCS-4
 *
 * When fewer than four bytes of output space remain, the character is
 * parked in slot 0 of the codec's read buffer (read_len becomes 1) and
 * nothing is written to ::dest.
 *
 * \param c        Codec to use
 * \param ucs4     UCS-4 character (host endian)
 * \param dest     Pointer to pointer to output buffer (advanced by 4 on
 *                 success)
 * \param destlen  Pointer to output buffer length (reduced by 4 on success)
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if output buffer is too small.
 */
parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
		uint32_t ucs4, uint8_t **dest, size_t *destlen)
{
	uint32_t *slot;

	if (*destlen >= 4) {
		/* Room for a whole UCS-4 unit: store it and advance */
		slot = (uint32_t *) (void *) *dest;
		*slot = endian_host_to_big(ucs4);

		*dest += 4;
		*destlen -= 4;

		return PARSERUTILS_OK;
	}

	/* Run out of output buffer; remember the character for later */
	c->read_buf[0] = ucs4;
	c->read_len = 1;

	return PARSERUTILS_NOMEM;
}
 
 
/* Handler record for the UTF-8 codec: the charset-support predicate
 * followed by the codec constructor.
 * NOTE(review): the initialiser is positional, so its order must match the
 * field order of parserutils_charset_handler, which is declared outside
 * this file -- confirm against that definition. */
const parserutils_charset_handler charset_utf8_codec_handler = {
charset_utf8_codec_handles_charset,
charset_utf8_codec_create
};
 
/programs/network/netsurf/libparserutils/src/charset/codecs/ext8_tables.h
0,0 → 1,187
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_codecs_ext8tables_h_
#define parserutils_charset_codecs_ext8tables_h_
 
/* Mapping tables for extended 8bit -> UCS4.
* Undefined characters are mapped to U+FFFF,
* which is a guaranteed non-character
*/
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1250, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1250[128] = {
0x20AC, 0xFFFF, 0x201A, 0xFFFF, 0x201E, 0x2026, 0x2020, 0x2021,
0xFFFF, 0x2030, 0x0160, 0x2039, 0x015A, 0x0164, 0x017D, 0x0179,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0xFFFF, 0x2122, 0x0161, 0x203A, 0x015B, 0x0165, 0x017E, 0x017A,
0x00A0, 0x02C7, 0x02D8, 0x0141, 0x00A4, 0x0104, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x015E, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x017B,
0x00B0, 0x00B1, 0x02DB, 0x0142, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x0105, 0x015F, 0x00BB, 0x013D, 0x02DD, 0x013E, 0x017C,
0x0154, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0139, 0x0106, 0x00C7,
0x010C, 0x00C9, 0x0118, 0x00CB, 0x011A, 0x00CD, 0x00CE, 0x010E,
0x0110, 0x0143, 0x0147, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x00D7,
0x0158, 0x016E, 0x00DA, 0x0170, 0x00DC, 0x00DD, 0x0162, 0x00DF,
0x0155, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x013A, 0x0107, 0x00E7,
0x010D, 0x00E9, 0x0119, 0x00EB, 0x011B, 0x00ED, 0x00EE, 0x010F,
0x0111, 0x0144, 0x0148, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x00F7,
0x0159, 0x016F, 0x00FA, 0x0171, 0x00FC, 0x00FD, 0x0163, 0x02D9,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1251, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1251[128] = {
0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0xFFFF, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457,
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1252, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1252[128] = {
0x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFF, 0x017D, 0xFFFF,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFF, 0x017E, 0x0178,
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1253, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1253[128] = {
0x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0xFFFF, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0xFFFF, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0xFFFF, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x2015,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7,
0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
0x03A0, 0x03A1, 0xFFFF, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7,
0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7,
0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7,
0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFF,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1254, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1254[128] = {
0x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFF, 0xFFFF, 0x0178,
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x011E, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0130, 0x015E, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x011F, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0131, 0x015F, 0x00FF,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1255, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1255[128] = {
0x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7,
0x05B8, 0x05B9, 0xFFFF, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3,
0x05F4, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7,
0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7,
0x05E8, 0x05E9, 0x05EA, 0xFFFF, 0xFFFF, 0x200E, 0x200F, 0xFFFF,
};
 
/* 128-entry extended-8bit -> UCS-4 table; this one contains no 0xFFFF
 * (undefined) entries.
 * NOTE(review): presumably Windows-1256, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1256[128] = {
0x20AC, 0x067E, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0679, 0x2039, 0x0152, 0x0686, 0x0698, 0x0688,
0x06AF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x06A9, 0x2122, 0x0691, 0x203A, 0x0153, 0x200C, 0x200D, 0x06BA,
0x00A0, 0x060C, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x06BE, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x061B, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x061F,
0x06C1, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,
0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x00D7,
0x0637, 0x0638, 0x0639, 0x063A, 0x0640, 0x0641, 0x0642, 0x0643,
0x00E0, 0x0644, 0x00E2, 0x0645, 0x0646, 0x0647, 0x0648, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0649, 0x064A, 0x00EE, 0x00EF,
0x064B, 0x064C, 0x064D, 0x064E, 0x00F4, 0x064F, 0x0650, 0x00F7,
0x0651, 0x00F9, 0x0652, 0x00FB, 0x00FC, 0x200E, 0x200F, 0x06D2,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1257, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1257[128] = {
0x20AC, 0xFFFF, 0x201A, 0xFFFF, 0x201E, 0x2026, 0x2020, 0x2021,
0xFFFF, 0x2030, 0xFFFF, 0x2039, 0xFFFF, 0x00A8, 0x02C7, 0x00B8,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0xFFFF, 0x2122, 0xFFFF, 0x203A, 0xFFFF, 0x00AF, 0x02DB, 0xFFFF,
0x00A0, 0xFFFF, 0x00A2, 0x00A3, 0x00A4, 0xFFFF, 0x00A6, 0x00A7,
0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112,
0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7,
0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113,
0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7,
0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x02D9,
};
 
/* 128-entry extended-8bit -> UCS-4 table; 0xFFFF marks an undefined entry.
 * NOTE(review): presumably Windows-1258, with entry i standing for byte
 * 0x80 + i -- confirm against the codec that indexes this table. */
static uint32_t w1258[128] = {
0x20AC, 0xFFFF, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0xFFFF, 0x2039, 0x0152, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0xFFFF, 0x203A, 0x0153, 0xFFFF, 0xFFFF, 0x0178,
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x0300, 0x00CD, 0x00CE, 0x00CF,
0x0110, 0x00D1, 0x0309, 0x00D3, 0x00D4, 0x01A0, 0x00D6, 0x00D7,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x01AF, 0x0303, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0301, 0x00ED, 0x00EE, 0x00EF,
0x0111, 0x00F1, 0x0323, 0x00F3, 0x00F4, 0x01A1, 0x00F6, 0x00F7,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x01B0, 0x20AB, 0x00FF,
};
 
#endif
/programs/network/netsurf/libparserutils/src/charset/encodings/Makefile
0,0 → 1,5
 
# Build settings for the charset encoding sources (utf8, utf16).
# NOTE(review): OUTFILE, OBJS and CFLAGS are presumably consumed by the
# included Makefile_for_o_lib, which also supplies the rules -- confirm there.
OUTFILE = libo.o
OBJS = utf8.o utf16.o
# Header search paths, relative to this directory
CFLAGS += -I ../../../include/ -I ../../../../ -I ../../
# MENUETDEV must point at the toolchain tree that provides the makefiles
include $(MENUETDEV)/makefiles/Makefile_for_o_lib
/programs/network/netsurf/libparserutils/src/charset/encodings/utf16.c
0,0 → 1,245
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
/** \file
* UTF-16 manipulation functions (implementation).
*/
 
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
 
#include <parserutils/charset/utf16.h>
 
/**
 * Decode the UTF-16 sequence at the start of a buffer into UCS-4
 *
 * Code units are read as native uint16_t values.
 *
 * \param s     The sequence to process
 * \param len   Length of sequence in bytes
 * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
 * \param clen  Pointer to location to receive byte length of UTF-16 sequence
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if any pointer argument is NULL,
 *         PARSERUTILS_NEEDDATA if the buffer ends before the sequence does,
 *         PARSERUTILS_INVALID for a lone low surrogate or a high surrogate
 *                             not followed by a low surrogate.
 */
parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
		size_t len, uint32_t *ucs4, size_t *clen)
{
	const uint16_t *units = (const uint16_t *) (const void *) s;
	uint16_t lead, trail;

	if (s == NULL || ucs4 == NULL || clen == NULL)
		return PARSERUTILS_BADPARM;

	if (len < 2)
		return PARSERUTILS_NEEDDATA;

	lead = units[0];

	/* Anything outside the surrogate block stands for itself */
	if (lead < 0xD800 || lead > 0xDFFF) {
		*ucs4 = lead;
		*clen = 2;
		return PARSERUTILS_OK;
	}

	/* A low surrogate may not start a sequence */
	if (lead >= 0xDC00)
		return PARSERUTILS_INVALID;

	/* High surrogate: a second code unit is required */
	if (len < 4)
		return PARSERUTILS_NEEDDATA;

	trail = units[1];
	if (trail < 0xDC00 || trail > 0xDFFF)
		return PARSERUTILS_INVALID;

	/* Valid pair: ten payload bits from each unit, offset by 0x10000 */
	*ucs4 = (((lead & 0x3FF) << 10) | (trail & 0x3FF)) + (1<<16);
	*clen = 4;

	return PARSERUTILS_OK;
}
 
/**
 * Convert a single UCS-4 character into a UTF-16 sequence
 *
 * Code units are written as native uint16_t values. Characters below
 * 0x10000 are stored as a single code unit without further checks (so a
 * surrogate code point passed in is written out unchanged); characters
 * from 0x10000 to 0x10FFFF become a surrogate pair.
 *
 * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
 * \param s     Pointer to 4 byte long output buffer
 * \param len   Pointer to location to receive length of multibyte sequence
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::s or ::len is NULL,
 *         PARSERUTILS_INVALID if ::ucs4 is 0x110000 or greater.
 */
parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
		size_t *len)
{
	uint16_t *ss = (uint16_t *) (void *) s;
	uint32_t l = 0;

	if (s == NULL || len == NULL)
		return PARSERUTILS_BADPARM;
	else if (ucs4 < 0x10000) {
		*ss = (uint16_t) ucs4;
		l = 2;
	} else if (ucs4 < 0x110000) {
		/* The pair encodes the 20-bit value (ucs4 - 0x10000):
		 * top ten bits in the high surrogate, bottom ten in the
		 * low surrogate. E.g. U+10000 -> D800 DC00 and
		 * U+10FFFF -> DBFF DFFF. */
		const uint32_t v = ucs4 - 0x10000;

		ss[0] = (uint16_t) (0xD800 | (v >> 10));
		ss[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
		l = 4;
	} else {
		return PARSERUTILS_INVALID;
	}

	*len = l;

	return PARSERUTILS_OK;
}
 
/**
 * Calculate the length (in characters) of a bounded UTF-16 string
 *
 * Every code unit in the surrogate range is counted together with the
 * code unit that follows it as one character; no validation is done.
 * Only whole code units are examined: if ::max is odd, the trailing byte
 * is ignored rather than being read as half of a code unit.
 *
 * \param s    The string
 * \param max  Maximum length, in bytes
 * \param len  Pointer to location to receive length of string
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::s or ::len is NULL.
 */
parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
		size_t *len)
{
	const uint16_t *ss;
	size_t nunits;
	size_t i = 0;
	size_t l = 0;

	/* Validate before doing any pointer arithmetic on s */
	if (s == NULL || len == NULL)
		return PARSERUTILS_BADPARM;

	ss = (const uint16_t *) (const void *) s;
	nunits = max / 2;

	while (i < nunits) {
		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
			i++;
		else
			i += 2;

		l++;
	}

	*len = l;

	return PARSERUTILS_OK;
}
 
/**
 * Report how many bytes the UTF-16 character at ::s occupies
 *
 * Only the first code unit is inspected: any value in the surrogate range
 * (0xD800 - 0xDFFF) yields 4, everything else yields 2.
 *
 * \param s    Pointer to start of character
 * \param len  Pointer to location to receive length
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::s or ::len is NULL.
 */
parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
		size_t *len)
{
	uint16_t first;

	if (s == NULL || len == NULL)
		return PARSERUTILS_BADPARM;

	first = *((const uint16_t *) (const void *) s);

	*len = (first >= 0xD800 && first <= 0xDFFF) ? 4 : 2;

	return PARSERUTILS_OK;
}
 
/**
 * Find previous legal UTF-16 char in string
 *
 * Inspects the code unit that ends at byte offset ::off: if it is a low
 * surrogate the previous character is taken to be a surrogate pair (four
 * bytes), otherwise a single code unit (two bytes).
 *
 * \param s        The string
 * \param off      Offset in the string to start at
 * \param prevoff  Pointer to location to receive offset of first byte of
 *                 previous legal character
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if ::s or ::prevoff is NULL.
 */
parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
		uint32_t *prevoff)
{
	const uint16_t *ss;

	if (s == NULL || prevoff == NULL)
		return PARSERUTILS_BADPARM;

	if (off < 2) {
		/* No complete code unit precedes the offset */
		*prevoff = 0;
		return PARSERUTILS_OK;
	}

	/* Address the code unit at ::off, so that ss[-1] is the unit just
	 * before it (and not the two bytes before the string itself). */
	ss = (const uint16_t *) (const void *) (s + off);

	if (ss[-1] < 0xDC00 || ss[-1] > 0xDFFF)
		*prevoff = off - 2;
	else
		*prevoff = (off < 4) ? 0 : off - 4;

	return PARSERUTILS_OK;
}
 
/**
 * Find next legal UTF-16 char in string
 *
 * \param s        The string (assumed valid)
 * \param len      Maximum offset in string, in bytes
 * \param off      Offset in the string to start at, in bytes
 * \param nextoff  Pointer to location to receive offset of first byte of
 *                 next legal character
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len
 */
parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
		uint32_t off, uint32_t *nextoff)
{
	const uint16_t *ss = (const uint16_t *) (const void *) s;

	if (s == NULL || off >= len || nextoff == NULL)
		return PARSERUTILS_BADPARM;

	if (len - off < 4) {
		/* Fewer than two code units remain, so the character at off
		 * is the last one */
		*nextoff = len;
	} else {
		/* Decide from the code unit at off itself, not from a fixed
		 * position at the start of the string */
		const uint16_t cur = ss[off / 2];

		if (cur >= 0xD800 && cur <= 0xDBFF) {
			/* High surrogate: character is a pair. At least four
			 * bytes remain, so off + 4 cannot exceed len. */
			*nextoff = off + 4;
		} else {
			*nextoff = off + 2;
		}
	}

	return PARSERUTILS_OK;
}
 
/**
 * Find next legal UTF-16 char in string
 *
 * \param s        The string (assumed to be of dubious validity)
 * \param len      Maximum offset in string, in bytes
 * \param off      Offset in the string to start at, in bytes
 * \param nextoff  Pointer to location to receive offset of first byte of
 *                 next legal character
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len,
 *         PARSERUTILS_NEEDDATA if the input ends before a result is found
 */
parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
		uint32_t len, uint32_t off, uint32_t *nextoff)
{
	/* NOTE(review): ss is derived from s without adding off, yet it is
	 * advanced in step with off below. This looks like it expects s to
	 * already point at the current position -- confirm against callers. */
	const uint16_t *ss = (const uint16_t *) (const void *) s;

	if (s == NULL || off >= len || nextoff == NULL)
		return PARSERUTILS_BADPARM;

	while (1) {
		if (len - off < 4)
			return PARSERUTILS_NEEDDATA;

		if (ss[1] < 0xD800 || ss[1] > 0xDFFF) {
			/* Not a surrogate */
			*nextoff = off + 2;
			break;
		}

		if (ss[1] <= 0xDBFF) {
			/* High surrogate: a low surrogate must follow */
			if (len - off < 6)
				return PARSERUTILS_NEEDDATA;

			if (ss[2] >= 0xDC00 && ss[2] <= 0xDFFF) {
				*nextoff = off + 4;
				break;
			}
		}

		/* Unpaired surrogate: either a high surrogate without its
		 * partner or a stray low surrogate. Step over it. The stray
		 * low surrogate case previously matched no branch, so the
		 * loop never advanced and never terminated. */
		ss++;
		off += 2;
	}

	return PARSERUTILS_OK;
}
 
/programs/network/netsurf/libparserutils/src/charset/encodings/utf8.c
0,0 → 1,175
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
/** \file
* UTF-8 manipulation functions (implementation).
*/
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/utf8.h>
#include "charset/encodings/utf8impl.h"
 
/**
 * Number of continuation bytes for a given start byte
 *
 * Indexed by the byte value. Each row below holds sixteen entries:
 *   0x00-0xBF -> 0, 0xC0-0xDF -> 1, 0xE0-0xEF -> 2,
 *   0xF0-0xF7 -> 3, 0xF8-0xFB -> 4, 0xFC-0xFF -> 5.
 */
const uint8_t numContinuations[256] = {
/* 0x00 - 0xBF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0 - 0xDF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xE0 - 0xEF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xF0 - 0xFF */
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
};
 
/**
 * Decode one UTF-8 multibyte sequence into a UCS-4 character
 *
 * Function form of the UTF8_TO_UCS4 macro. Values outside the UTF-16
 * plane are no longer permitted by RFC3629; this follows RFC2279.
 *
 * \param s     The sequence to process
 * \param len   Length of sequence
 * \param ucs4  Pointer to location to receive UCS-4 character (host endian)
 * \param clen  Pointer to location to receive byte length of UTF-8 sequence
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
		uint32_t *ucs4, size_t *clen)
{
	parserutils_error status;

	UTF8_TO_UCS4(s, len, ucs4, clen, status);

	return status;
}
 
/**
 * Encode one UCS-4 character as a UTF-8 multibyte sequence
 *
 * Function form of the UTF8_FROM_UCS4 macro. Values outside the UTF-16
 * plane are no longer permitted by RFC3629; this follows RFC2279.
 *
 * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
 * \param s     Pointer to pointer to output buffer, updated on exit
 * \param len   Pointer to length, in bytes, of output buffer, updated on exit
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
		uint8_t **s, size_t *len)
{
	parserutils_error status;

	UTF8_FROM_UCS4(ucs4, s, len, status);

	return status;
}
 
/**
 * Count the characters in a bounded UTF-8 string
 *
 * Function form of the UTF8_LENGTH macro.
 *
 * \param s    The string
 * \param max  Maximum length
 * \param len  Pointer to location to receive length of string
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
		size_t *len)
{
	parserutils_error status;

	UTF8_LENGTH(s, max, len, status);

	return status;
}
 
/**
 * Report the length, in bytes, of a UTF-8 character
 *
 * Function form of the UTF8_CHAR_BYTE_LENGTH macro.
 *
 * \param s    Pointer to start of character
 * \param len  Pointer to location to receive length
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
		size_t *len)
{
	parserutils_error status;

	UTF8_CHAR_BYTE_LENGTH(s, len, status);

	return status;
}
 
/**
 * Locate the previous legal UTF-8 character in a string
 *
 * Function form of the UTF8_PREV macro.
 *
 * \param s        The string
 * \param off      Offset in the string to start at
 * \param prevoff  Pointer to location to receive offset of first byte of
 *                 previous legal character
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
		uint32_t *prevoff)
{
	parserutils_error status;

	UTF8_PREV(s, off, prevoff, status);

	return status;
}
 
/**
 * Locate the next legal UTF-8 character in a string
 *
 * Function form of the UTF8_NEXT macro.
 *
 * \param s        The string (assumed valid)
 * \param len      Maximum offset in string
 * \param off      Offset in the string to start at
 * \param nextoff  Pointer to location to receive offset of first byte of
 *                 next legal character
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
		uint32_t off, uint32_t *nextoff)
{
	parserutils_error status;

	UTF8_NEXT(s, len, off, nextoff, status);

	return status;
}
 
/**
 * Locate the next legal UTF-8 character in a string of dubious validity
 *
 * Function form of the UTF8_NEXT_PARANOID macro.
 *
 * \param s        The string (assumed to be of dubious validity)
 * \param len      Maximum offset in string
 * \param off      Offset in the string to start at
 * \param nextoff  Pointer to location to receive offset of first byte of
 *                 next legal character
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
		uint32_t len, uint32_t off, uint32_t *nextoff)
{
	parserutils_error status;

	UTF8_NEXT_PARANOID(s, len, off, nextoff, status);

	return status;
}
 
/programs/network/netsurf/libparserutils/src/charset/encodings/utf8impl.h
0,0 → 1,342
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_charset_encodings_utf8impl_h_
#define parserutils_charset_encodings_utf8impl_h_
 
/** \file
* UTF-8 manipulation macros (implementation).
*/
#include <stdint.h>
 
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
 
/** Number of continuation bytes for a given start byte */
extern const uint8_t numContinuations[256];
 
/**
 * Convert a UTF-8 multibyte sequence into a single UCS-4 character
 *
 * Encoding of UCS values outside the UTF-16 plane has been removed from
 * RFC3629. This macro conforms to RFC2279, however.
 *
 * Sequences of one to six bytes are accepted. Overlong encodings,
 * surrogates (0xD800-0xDFFF), 0xFFFE and 0xFFFF are rejected.
 * *ucs4 and *clen are written only when the result is PARSERUTILS_OK.
 *
 * The expansion is a single do { } while(0) statement that declares locals
 * named c, min, n, i and t, and substitutes its arguments unparenthesised;
 * pass plain identifiers that do not clash with those names.
 *
 * \param s The sequence to process
 * \param len Length of sequence
 * \param ucs4 Pointer to location to receive UCS-4 character (host endian)
 * \param clen Pointer to location to receive byte length of UTF-8 sequence
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s, ucs4 or clen is NULL,
 *              PARSERUTILS_NEEDDATA if len is 0 or shorter than the
 *              sequence announced by the first byte,
 *              PARSERUTILS_INVALID for a malformed or rejected sequence
 */
#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
do { \
uint32_t c, min; \
uint8_t n; \
uint8_t i; \
\
error = PARSERUTILS_OK; \
\
if (s == NULL || ucs4 == NULL || clen == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
if (len == 0) { \
error = PARSERUTILS_NEEDDATA; \
break; \
} \
\
c = s[0]; \
\
if (c < 0x80) { \
n = 1; \
min = 0; \
} else if ((c & 0xE0) == 0xC0) { \
c &= 0x1F; \
n = 2; \
min = 0x80; \
} else if ((c & 0xF0) == 0xE0) { \
c &= 0x0F; \
n = 3; \
min = 0x800; \
} else if ((c & 0xF8) == 0xF0) { \
c &= 0x07; \
n = 4; \
min = 0x10000; \
} else if ((c & 0xFC) == 0xF8) { \
c &= 0x03; \
n = 5; \
min = 0x200000; \
} else if ((c & 0xFE) == 0xFC) { \
c &= 0x01; \
n = 6; \
min = 0x4000000; \
} else { \
error = PARSERUTILS_INVALID; \
break; \
} \
\
if (len < n) { \
error = PARSERUTILS_NEEDDATA; \
break; \
} \
\
for (i = 1; i < n; i++) { \
uint32_t t = s[i]; \
\
if ((t & 0xC0) != 0x80) { \
error = PARSERUTILS_INVALID; \
break; \
} \
\
c <<= 6; \
c |= t & 0x3F; \
} \
\
if (error == PARSERUTILS_OK) { \
/* Detect overlong sequences, surrogates and fffe/ffff */ \
if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
c == 0xFFFE || c == 0xFFFF) { \
error = PARSERUTILS_INVALID; \
break; \
} \
\
*ucs4 = c; \
*clen = n; \
} \
} while(0)
 
/**
 * Convert a single UCS-4 character into a UTF-8 multibyte sequence
 *
 * Encoding of UCS values outside the UTF-16 plane has been removed from
 * RFC3629. This macro conforms to RFC2279, however.
 *
 * On success *s is advanced past the bytes written and *len is reduced by
 * the same amount. Nothing is written on failure.
 *
 * The expansion declares locals named buf, l and i, and shifts the ucs4
 * argument in place while encoding multibyte sequences, so ucs4 must be a
 * modifiable lvalue whose value the caller no longer needs.
 *
 * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
 * \param s Pointer to pointer to output buffer, updated on exit
 * \param len Pointer to length, in bytes, of output buffer, updated on exit
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s, *s or len is NULL,
 *              PARSERUTILS_INVALID if ucs4 exceeds 0x7FFFFFFF,
 *              PARSERUTILS_NOMEM if the output buffer is too small
 */
#define UTF8_FROM_UCS4(ucs4, s, len, error) \
do { \
uint8_t *buf; \
uint8_t l = 0; \
\
error = PARSERUTILS_OK; \
\
if (s == NULL || *s == NULL || len == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
if (ucs4 < 0x80) { \
l = 1; \
} else if (ucs4 < 0x800) { \
l = 2; \
} else if (ucs4 < 0x10000) { \
l = 3; \
} else if (ucs4 < 0x200000) { \
l = 4; \
} else if (ucs4 < 0x4000000) { \
l = 5; \
} else if (ucs4 <= 0x7FFFFFFF) { \
l = 6; \
} else { \
error = PARSERUTILS_INVALID; \
break; \
} \
\
if (l > *len) { \
error = PARSERUTILS_NOMEM; \
break; \
} \
\
buf = *s; \
\
if (l == 1) { \
buf[0] = (uint8_t) ucs4; \
} else { \
uint8_t i; \
for (i = l; i > 1; i--) { \
buf[i - 1] = 0x80 | (ucs4 & 0x3F); \
ucs4 >>= 6; \
} \
buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \
} \
\
*s += l; \
*len -= l; \
} while(0)
 
/**
 * Calculate the length (in characters) of a bounded UTF-8 string
 *
 * Only the first byte of each sequence is inspected; continuation bytes
 * are skipped according to the length it announces, without validation.
 * *len is written only when the result is PARSERUTILS_OK.
 *
 * The expansion declares locals named end, l and c, and advances the s
 * argument in place, so s must be a modifiable pointer the caller no
 * longer needs at its original value.
 *
 * \param s The string
 * \param max Maximum length
 * \param len Pointer to location to receive length of string
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s or len is NULL,
 *              PARSERUTILS_INVALID if a byte that cannot start a sequence
 *              is found where a start byte is expected
 */
#define UTF8_LENGTH(s, max, len, error) \
do { \
const uint8_t *end = s + max; \
int l = 0; \
\
error = PARSERUTILS_OK; \
\
if (s == NULL || len == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
while (s < end) { \
uint32_t c = s[0]; \
\
if ((c & 0x80) == 0x00) \
s += 1; \
else if ((c & 0xE0) == 0xC0) \
s += 2; \
else if ((c & 0xF0) == 0xE0) \
s += 3; \
else if ((c & 0xF8) == 0xF0) \
s += 4; \
else if ((c & 0xFC) == 0xF8) \
s += 5; \
else if ((c & 0xFE) == 0xFC) \
s += 6; \
else { \
error = PARSERUTILS_INVALID; \
break; \
} \
\
l++; \
} \
\
if (error == PARSERUTILS_OK) \
*len = l; \
} while(0)
 
/**
 * Calculate the length (in bytes) of a UTF-8 character
 *
 * The result is numContinuations[first byte] + 1; only s[0] is read.
 *
 * \param s Pointer to start of character
 * \param len Pointer to location to receive length
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s or len is NULL
 */
#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
do { \
if (s == NULL || len == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
*len = numContinuations[s[0]] + 1 /* Start byte */; \
\
error = PARSERUTILS_OK; \
} while(0)
 
/**
 * Find previous legal UTF-8 char in string
 *
 * Steps backwards from off over continuation bytes (10xxxxxx) until a
 * byte that is not a continuation byte, or offset 0, is reached.
 *
 * The off argument is decremented in place, so it must be a modifiable
 * lvalue; on success it ends up equal to *prevoff.
 *
 * \param s The string
 * \param off Offset in the string to start at
 * \param prevoff Pointer to location to receive offset of first byte of
 * previous legal character
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s or prevoff is NULL
 */
#define UTF8_PREV(s, off, prevoff, error) \
do { \
if (s == NULL || prevoff == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
while (off != 0 && (s[--off] & 0xC0) == 0x80) \
/* do nothing */; \
\
*prevoff = off; \
\
error = PARSERUTILS_OK; \
} while(0)
 
/**
 * Find next legal UTF-8 char in string
 *
 * Skips the byte at off if it is a start byte, then skips any run of
 * continuation bytes (10xxxxxx), stopping at len.
 *
 * The off argument is incremented in place, so it must be a modifiable
 * lvalue; on success it ends up equal to *nextoff.
 *
 * \param s The string (assumed valid)
 * \param len Maximum offset in string
 * \param off Offset in the string to start at
 * \param nextoff Pointer to location to receive offset of first byte of
 * next legal character
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len
 */
#define UTF8_NEXT(s, len, off, nextoff, error) \
do { \
if (s == NULL || off >= len || nextoff == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
/* Skip current start byte (if present - may be mid-sequence) */\
if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \
off++; \
\
while (off < len && (s[off] & 0xC0) == 0x80) \
off++; \
\
*nextoff = off; \
\
error = PARSERUTILS_OK; \
} while(0)
 
/**
 * Skip to start of next sequence in UTF-8 input
 *
 * If the byte at off is a continuation byte, off advances by one. If it
 * is a start byte, the continuation bytes it announces (per
 * numContinuations) are checked and only the start byte plus the valid
 * continuation bytes are skipped. PARSERUTILS_NEEDDATA is reported unless
 * at least one byte exists beyond the announced sequence
 * (off + nCont + 1 >= len fails the check).
 *
 * The expansion declares locals named c, nCont and nToSkip, and modifies
 * the off argument in place. *nextoff is not written on failure.
 *
 * \param s The string (assumed to be of dubious validity)
 * \param len Maximum offset in string
 * \param off Offset in the string to start at
 * \param nextoff Pointer to location to receive offset of first byte of
 * next legal character
 * \param error Location to receive error code: PARSERUTILS_OK on success,
 *              PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len,
 *              PARSERUTILS_NEEDDATA if more input is required
 */
#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
do { \
uint8_t c; \
\
error = PARSERUTILS_OK; \
\
if (s == NULL || off >= len || nextoff == NULL) { \
error = PARSERUTILS_BADPARM; \
break; \
} \
\
c = s[off]; \
\
/* If we're mid-sequence, simply advance to next byte */ \
if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
off++; \
} else { \
uint32_t nCont = numContinuations[c]; \
uint32_t nToSkip; \
\
if (off + nCont + 1 >= len) { \
error = PARSERUTILS_NEEDDATA; \
break; \
} \
\
/* Verify continuation bytes */ \
for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
if ((s[off + nToSkip] & 0xC0) != 0x80) \
break; \
} \
\
/* Skip over the valid bytes */ \
off += nToSkip; \
} \
\
*nextoff = off; \
} while(0)
 
#endif
/programs/network/netsurf/libparserutils/src/input/Makefile
0,0 → 1,6
 
 
# Settings for building the sources in this directory.
# The variables are defined before the shared makefile is included;
# presumably Makefile_for_o_lib consumes OUTFILE, OBJS and CFLAGS --
# confirm against $(MENUETDEV)/makefiles.
OUTFILE = libo.o
OBJS = filter.o inputstream.o
# Header search paths, relative to this directory
CFLAGS += -I ../../include/ -I ../../../ -I ../
include $(MENUETDEV)/makefiles/Makefile_for_o_lib
/programs/network/netsurf/libparserutils/src/input/filter.c
0,0 → 1,419
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
 
#define WITHOUT_ICONV_FILTER
#ifndef WITHOUT_ICONV_FILTER
#include <iconv.h>
#endif
 
#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/codec.h>
 
#include "input/filter.h"
#include "utils/utils.h"
 
/**
 * Input filter
 *
 * Two layouts exist, selected at compile time by WITHOUT_ICONV_FILTER:
 * an iconv descriptor, or a pair of codecs with a pivot buffer between
 * them. This file defines WITHOUT_ICONV_FILTER before the check, so the
 * codec layout is the one compiled here.
 */
struct parserutils_filter {
#ifndef WITHOUT_ICONV_FILTER
iconv_t cd; /**< Iconv conversion descriptor */
uint16_t int_enc; /**< The internal encoding */
#else
parserutils_charset_codec *read_codec; /**< Read codec */
parserutils_charset_codec *write_codec; /**< Write codec */

uint32_t pivot_buf[64]; /**< Conversion pivot buffer */

bool leftover; /**< Data remains from last call */
uint8_t *pivot_left; /**< Remaining pivot to write */
size_t pivot_len; /**< Length of pivot remaining */
#endif

struct {
uint16_t encoding; /**< Input encoding */
} settings; /**< Filter settings */

parserutils_alloc alloc; /**< Memory (de)allocation function */
void *pw; /**< Client private data */
};
 
static parserutils_error filter_set_defaults(parserutils_filter *input);
static parserutils_error filter_set_encoding(parserutils_filter *input,
const char *enc);
 
/**
 * Create an input filter
 *
 * The new filter starts with its input encoding set to UTF-8 (via
 * filter_set_defaults) and converts to the encoding named by int_enc.
 *
 * \param int_enc  Desired encoding of document
 * \param alloc    Function used to (de)allocate data
 * \param pw       Pointer to client-specific private data (may be NULL)
 * \param filter   Pointer to location to receive filter instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion,
 *         PARSERUTILS_BADENCODING if the encoding is unsupported
 */
parserutils_error parserutils__filter_create(const char *int_enc,
		parserutils_alloc alloc, void *pw, parserutils_filter **filter)
{
	parserutils_filter *created;
	parserutils_error status;

	if (int_enc == NULL || alloc == NULL || filter == NULL)
		return PARSERUTILS_BADPARM;

	created = alloc(NULL, sizeof(parserutils_filter), pw);
	if (created == NULL)
		return PARSERUTILS_NOMEM;

#ifndef WITHOUT_ICONV_FILTER
	/* No descriptor yet; resolve the internal encoding up front */
	created->cd = (iconv_t) -1;
	created->int_enc = parserutils_charset_mibenum_from_name(
			int_enc, strlen(int_enc));
	if (created->int_enc == 0) {
		alloc(created, 0, pw);
		return PARSERUTILS_BADENCODING;
	}
#else
	/* Nothing pending in the pivot buffer */
	created->leftover = false;
	created->pivot_left = NULL;
	created->pivot_len = 0;
#endif

	created->alloc = alloc;
	created->pw = pw;

	status = filter_set_defaults(created);
	if (status != PARSERUTILS_OK) {
		created->alloc(created, 0, pw);
		return status;
	}

#ifdef WITHOUT_ICONV_FILTER
	status = parserutils_charset_codec_create(int_enc, alloc, pw,
			&created->write_codec);
	if (status != PARSERUTILS_OK) {
		/* Undo the read codec set up by filter_set_defaults */
		if (created->read_codec != NULL) {
			parserutils_charset_codec_destroy(created->read_codec);
			created->read_codec = NULL;
		}
		created->alloc(created, 0, pw);
		return status;
	}
#endif

	*filter = created;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy an input filter
 *
 * Releases the conversion resources held by the filter and then frees
 * the filter itself through its own allocator.
 *
 * \param input  Pointer to filter instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if input is NULL
 */
parserutils_error parserutils__filter_destroy(parserutils_filter *input)
{
	if (input == NULL)
		return PARSERUTILS_BADPARM;

#ifndef WITHOUT_ICONV_FILTER
	if (input->cd != (iconv_t) -1) {
		iconv_close(input->cd);
		input->cd = (iconv_t) -1;
	}
#else
	if (input->read_codec != NULL) {
		parserutils_charset_codec_destroy(input->read_codec);
		input->read_codec = NULL;
	}

	if (input->write_codec != NULL) {
		parserutils_charset_codec_destroy(input->write_codec);
		input->write_codec = NULL;
	}
#endif

	/* A zero-size request to the allocator frees the block */
	input->alloc(input, 0, input->pw);

	return PARSERUTILS_OK;
}
 
/**
 * Configure an input filter
 *
 * The only option handled is PARSERUTILS_FILTER_SET_ENCODING, which
 * switches the input encoding to params->encoding.name. Any other type
 * value is accepted and ignored.
 *
 * \param input   Pointer to filter instance
 * \param type    Input option type to configure
 * \param params  Option-specific parameters
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils__filter_setopt(parserutils_filter *input,
		parserutils_filter_opttype type,
		parserutils_filter_optparams *params)
{
	if (input == NULL || params == NULL)
		return PARSERUTILS_BADPARM;

	if (type == PARSERUTILS_FILTER_SET_ENCODING)
		return filter_set_encoding(input, params->encoding.name);

	return PARSERUTILS_OK;
}
 
/**
 * Process a chunk of data
 *
 * In the codec build, input is decoded into the filter's pivot buffer and
 * then encoded from there into the output buffer. If the encode step
 * fails, the unwritten part of the pivot buffer is remembered in the
 * filter and retried first on the next call.
 *
 * \param input Pointer to filter instance
 * \param data Pointer to pointer to input buffer
 * \param len Pointer to length of input buffer
 * \param output Pointer to pointer to output buffer
 * \param outlen Pointer to length of output buffer
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 *
 * Call this with an input buffer length of 0 to flush any buffers.
 */
parserutils_error parserutils__filter_process_chunk(parserutils_filter *input,
const uint8_t **data, size_t *len,
uint8_t **output, size_t *outlen)
{
if (input == NULL || data == NULL || *data == NULL || len == NULL ||
output == NULL || *output == NULL || outlen == NULL)
return PARSERUTILS_BADPARM;

#ifndef WITHOUT_ICONV_FILTER
if (iconv(input->cd, (void *) data, len,
(char **) output, outlen) == (size_t) -1) {
switch (errno) {
case E2BIG:
return PARSERUTILS_NOMEM;
case EILSEQ:
if (*outlen < 3)
return PARSERUTILS_NOMEM;

/* Emit EF BF BD (U+FFFD in UTF-8) in place of the bad byte */
(*output)[0] = 0xef;
(*output)[1] = 0xbf;
(*output)[2] = 0xbd;

*output += 3;
*outlen -= 3;

/* Step over the offending input byte */
(*data)++;
(*len)--;

/* Keep converting, substituting for each further illegal sequence */
while (*len > 0) {
size_t ret;
ret = iconv(input->cd, (void *) data, len,
(char **) output, outlen);
if (ret != (size_t) -1 || errno != EILSEQ)
break;

if (*outlen < 3)
return PARSERUTILS_NOMEM;

(*output)[0] = 0xef;
(*output)[1] = 0xbf;
(*output)[2] = 0xbd;

*output += 3;
*outlen -= 3;

(*data)++;
(*len)--;
}

return errno == E2BIG ? PARSERUTILS_NOMEM
: PARSERUTILS_OK;
}
}

return PARSERUTILS_OK;
#else
if (input->leftover) {
parserutils_error write_error;

/* Some data left to be written from last call */

/* Attempt to flush the remaining data. */
write_error = parserutils_charset_codec_encode(
input->write_codec,
(const uint8_t **) &input->pivot_left,
&input->pivot_len,
output, outlen);

if (write_error != PARSERUTILS_OK)
return write_error;


/* And clear leftover */
input->pivot_left = NULL;
input->pivot_len = 0;
input->leftover = false;
}

while (*len > 0) {
parserutils_error read_error, write_error;
size_t pivot_len = sizeof(input->pivot_buf);
uint8_t *pivot = (uint8_t *) input->pivot_buf;

read_error = parserutils_charset_codec_decode(input->read_codec,
data, len,
(uint8_t **) &pivot, &pivot_len);

/* Rewind to the start of the pivot buffer; pivot_len becomes the
 * buffer size minus what the decoder reported as remaining */
pivot = (uint8_t *) input->pivot_buf;
pivot_len = sizeof(input->pivot_buf) - pivot_len;

if (pivot_len > 0) {
write_error = parserutils_charset_codec_encode(
input->write_codec,
(const uint8_t **) &pivot,
&pivot_len,
output, outlen);

if (write_error != PARSERUTILS_OK) {
/* Remember the pivot position and length for the next call */
input->leftover = true;
input->pivot_left = pivot;
input->pivot_len = pivot_len;

return write_error;
}
}

/* NOMEM from the decoder does not abort the loop; presumably it
 * means the pivot buffer filled up -- TODO confirm against codec */
if (read_error != PARSERUTILS_OK &&
read_error != PARSERUTILS_NOMEM)
return read_error;
}

return PARSERUTILS_OK;
#endif
}
 
/**
 * Reset an input filter's state
 *
 * In the codec build this discards any pending pivot data and resets the
 * read codec followed by the write codec, stopping at the first failure.
 *
 * \param input  The input filter to reset
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils__filter_reset(parserutils_filter *input)
{
#ifdef WITHOUT_ICONV_FILTER
	parserutils_error status;
#endif

	if (input == NULL)
		return PARSERUTILS_BADPARM;

#ifndef WITHOUT_ICONV_FILTER
	iconv(input->cd, NULL, 0, NULL, 0);

	return PARSERUTILS_OK;
#else
	/* Forget anything left over in the pivot buffer */
	input->leftover = false;
	input->pivot_left = NULL;
	input->pivot_len = 0;

	status = parserutils_charset_codec_reset(input->read_codec);
	if (status != PARSERUTILS_OK)
		return status;

	return parserutils_charset_codec_reset(input->write_codec);
#endif
}
 
/**
 * Set an input filter's default settings
 *
 * Clears the codec pointers (codec build), clears the recorded input
 * encoding and then selects UTF-8 as the input encoding.
 *
 * \param input  Input filter to configure
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error filter_set_defaults(parserutils_filter *input)
{
	if (input == NULL)
		return PARSERUTILS_BADPARM;

#ifdef WITHOUT_ICONV_FILTER
	input->read_codec = NULL;
	input->write_codec = NULL;
#endif

	/* Zero never matches a real encoding, so the call below always
	 * goes through the full set-up path */
	input->settings.encoding = 0;

	return filter_set_encoding(input, "UTF-8");
}
 
/**
 * Set an input filter's encoding
 *
 * The replacement decoder is created before the current one is released,
 * so a failed switch leaves the filter working with its previous
 * encoding and settings.encoding still describes the decoder in use.
 *
 * \param input  Input filter to configure
 * \param enc    Encoding name
 * \return PARSERUTILS_OK on success (including when enc is already the
 *         current encoding),
 *         PARSERUTILS_BADPARM if input or enc is NULL,
 *         PARSERUTILS_BADENCODING if enc is not recognised,
 *         otherwise the error from creating the decoder
 */
parserutils_error filter_set_encoding(parserutils_filter *input,
		const char *enc)
{
	parserutils_error error = PARSERUTILS_OK;
	uint16_t mibenum;
#ifndef WITHOUT_ICONV_FILTER
	iconv_t cd;
#else
	parserutils_charset_codec *codec = NULL;
#endif

	if (input == NULL || enc == NULL)
		return PARSERUTILS_BADPARM;

	mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
	if (mibenum == 0)
		return PARSERUTILS_BADENCODING;

	/* Exit early if we're already using this encoding */
	if (input->settings.encoding == mibenum)
		return PARSERUTILS_OK;

#ifndef WITHOUT_ICONV_FILTER
	/* Open the new descriptor first; only replace the old one once
	 * the new one is known to be usable */
	cd = iconv_open(
		parserutils_charset_mibenum_to_name(input->int_enc),
		parserutils_charset_mibenum_to_name(mibenum));
	if (cd == (iconv_t) -1) {
		return (errno == EINVAL) ? PARSERUTILS_BADENCODING
					 : PARSERUTILS_NOMEM;
	}

	if (input->cd != (iconv_t) -1)
		iconv_close(input->cd);

	input->cd = cd;
#else
	/* Create the new read codec first; only replace the old one once
	 * the new one is known to be usable */
	error = parserutils_charset_codec_create(enc, input->alloc,
			input->pw, &codec);
	if (error != PARSERUTILS_OK)
		return error;

	if (input->read_codec != NULL)
		parserutils_charset_codec_destroy(input->read_codec);

	input->read_codec = codec;
#endif

	input->settings.encoding = mibenum;

	return error;
}
/programs/network/netsurf/libparserutils/src/input/filter.h
0,0 → 1,57
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_input_filter_h_
#define parserutils_input_filter_h_
 
#include <inttypes.h>
 
#include <parserutils/errors.h>
#include <parserutils/functypes.h>
 
typedef struct parserutils_filter parserutils_filter;
 
/**
 * Input filter option types
 *
 * Passed as the type argument of parserutils__filter_setopt.
 */
typedef enum parserutils_filter_opttype {
/* Set the input encoding; parameters are read from optparams.encoding */
PARSERUTILS_FILTER_SET_ENCODING = 0
} parserutils_filter_opttype;
 
/**
 * Input filter option parameters
 *
 * One member per option type; the member to fill in is determined by the
 * parserutils_filter_opttype value passed alongside.
 */
typedef union parserutils_filter_optparams {
/** Parameters for encoding setting */
struct {
/** Encoding name */
const char *name;
} encoding;
} parserutils_filter_optparams;
 
 
/* Create an input filter */
parserutils_error parserutils__filter_create(const char *int_enc,
parserutils_alloc alloc, void *pw, parserutils_filter **filter);
/* Destroy an input filter */
parserutils_error parserutils__filter_destroy(parserutils_filter *input);
 
/* Configure an input filter */
parserutils_error parserutils__filter_setopt(parserutils_filter *input,
parserutils_filter_opttype type,
parserutils_filter_optparams *params);
 
/* Process a chunk of data */
parserutils_error parserutils__filter_process_chunk(parserutils_filter *input,
const uint8_t **data, size_t *len,
uint8_t **output, size_t *outlen);
 
/* Reset an input filter's state */
parserutils_error parserutils__filter_reset(parserutils_filter *input);
 
#endif
 
/programs/network/netsurf/libparserutils/src/input/inputstream.c
0,0 → 1,615
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <assert.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/input/inputstream.h>
 
#include "input/filter.h"
#include "utils/utils.h"
 
/**
 * Private input stream definition
 *
 * Functions in this file cast between parserutils_inputstream * and a
 * pointer to this structure, which is why the public part has to be the
 * first member.
 */
typedef struct parserutils_inputstream_private {
parserutils_inputstream public; /**< Public part. Must be first */

parserutils_buffer *raw; /**< Buffer containing raw data */

bool done_first_chunk; /**< Whether the first chunk has
* been processed */

uint16_t mibenum; /**< MIB enum for charset, or 0 */
uint32_t encsrc; /**< Charset source */

parserutils_filter *input; /**< Charset conversion filter */

parserutils_charset_detect_func csdetect; /**< Charset detection func.*/

parserutils_alloc alloc; /**< Memory (de)allocation function */
void *pw; /**< Client private data */
} parserutils_inputstream_private;
 
static inline parserutils_error parserutils_inputstream_refill_buffer(
parserutils_inputstream_private *stream);
static inline parserutils_error parserutils_inputstream_strip_bom(
uint16_t *mibenum, parserutils_buffer *buffer);
 
/**
 * Create an input stream
 *
 * \param enc       Document charset, or NULL to autodetect
 * \param encsrc    Value for encoding source, if specified, or 0
 * \param csdetect  Charset detection function, or NULL
 * \param alloc     Memory (de)allocation function
 * \param pw        Pointer to client-specific private data (may be NULL)
 * \param stream    Pointer to location to receive stream instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion,
 *         PARSERUTILS_BADENCODING on unsupported encoding
 *
 * The value 0 is defined as being the lowest priority encoding source
 * (i.e. the default fallback encoding). Beyond this, no further
 * interpretation is made upon the encoding source.
 *
 * On any failure everything allocated so far is released and *stream is
 * left untouched.
 */
parserutils_error parserutils_inputstream_create(const char *enc,
		uint32_t encsrc, parserutils_charset_detect_func csdetect,
		parserutils_alloc alloc, void *pw,
		parserutils_inputstream **stream)
{
	parserutils_inputstream_private *s;
	parserutils_error error;

	if (alloc == NULL || stream == NULL)
		return PARSERUTILS_BADPARM;

	s = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
	if (s == NULL)
		return PARSERUTILS_NOMEM;

	error = parserutils_buffer_create(alloc, pw, &s->raw);
	if (error != PARSERUTILS_OK)
		goto cleanup_stream;

	error = parserutils_buffer_create(alloc, pw, &s->public.utf8);
	if (error != PARSERUTILS_OK)
		goto cleanup_raw;

	s->public.cursor = 0;
	s->public.had_eof = false;
	s->done_first_chunk = false;

	error = parserutils__filter_create("UTF-8", alloc, pw, &s->input);
	if (error != PARSERUTILS_OK)
		goto cleanup_utf8;

	if (enc != NULL) {
		parserutils_filter_optparams params;

		s->mibenum =
			parserutils_charset_mibenum_from_name(enc, strlen(enc));

		if (s->mibenum == 0) {
			/* Unknown charset name: release everything built so
			 * far instead of returning with it still allocated */
			error = PARSERUTILS_BADENCODING;
			goto cleanup_filter;
		}

		params.encoding.name = enc;

		error = parserutils__filter_setopt(s->input,
				PARSERUTILS_FILTER_SET_ENCODING,
				&params);
		if (error != PARSERUTILS_OK)
			goto cleanup_filter;

		s->encsrc = encsrc;
	} else {
		s->mibenum = 0;
		s->encsrc = 0;
	}

	s->csdetect = csdetect;

	s->alloc = alloc;
	s->pw = pw;

	*stream = (parserutils_inputstream *) s;

	return PARSERUTILS_OK;

	/* Failure paths: unwind in reverse order of construction */
cleanup_filter:
	parserutils__filter_destroy(s->input);
cleanup_utf8:
	parserutils_buffer_destroy(s->public.utf8);
cleanup_raw:
	parserutils_buffer_destroy(s->raw);
cleanup_stream:
	alloc(s, 0, pw);

	return error;
}
 
/**
 * Destroy an input stream
 *
 * Tears down the conversion filter, the UTF-8 buffer and the raw buffer,
 * then frees the stream through the allocator it was created with.
 *
 * \param stream  Input stream to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if stream is NULL
 */
parserutils_error parserutils_inputstream_destroy(
		parserutils_inputstream *stream)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	parserutils__filter_destroy(priv->input);
	parserutils_buffer_destroy(priv->public.utf8);
	parserutils_buffer_destroy(priv->raw);
	priv->alloc(priv, 0, priv->pw);

	return PARSERUTILS_OK;
}
 
/**
 * Append data to an input stream
 *
 * Data is added to the stream's raw buffer. Passing NULL for data marks
 * the end of input instead; len is not looked at in that case.
 *
 * \param stream  Input stream to append data to
 * \param data    Data to append (in document charset), or NULL to flag EOF
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 */
parserutils_error parserutils_inputstream_append(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	if (data != NULL)
		return parserutils_buffer_append(priv->raw, data, len);

	/* NULL data is the end-of-input marker */
	priv->public.had_eof = true;

	return PARSERUTILS_OK;
}
 
/**
 * Insert data into stream at current location
 *
 * The bytes go into the stream's UTF-8 buffer at the cursor position.
 *
 * \param stream  Input stream to insert into
 * \param data    Data to insert (UTF-8 encoded)
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if stream or data is NULL,
 *         otherwise the error from the buffer insertion
 */
parserutils_error parserutils_inputstream_insert(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL || data == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	return parserutils_buffer_insert(priv->public.utf8,
			priv->public.cursor, data, len);
}
 
#define IS_ASCII(x) (((x) & 0x80) == 0)

/**
 * Look at the character in the stream that starts at
 * offset bytes from the cursor (slow version)
 *
 * \param stream  Stream to look in
 * \param offset  Byte offset of start of character
 * \param ptr     Pointer to location to receive pointer to character data
 * \param length  Pointer to location to receive character length (in bytes)
 * \return PARSERUTILS_OK on success,
 *                    _NEEDDATA on reaching the end of available input,
 *                    _EOF on reaching the end of all input,
 *                    _BADENCODING if the input cannot be decoded,
 *                    _NOMEM on memory exhaustion,
 *                    _BADPARM if bad parameters are passed.
 *
 * Once the character pointed to by the result of this call has been advanced
 * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
 * pass over the character), then no guarantee is made as to the validity of
 * the data pointed to. Thus, any attempt to dereference the pointer after
 * advancing past the data it points to is a bug.
 */
parserutils_error parserutils_inputstream_peek_slow(
		parserutils_inputstream *stream,
		size_t offset, const uint8_t **ptr, size_t *length)
{
	parserutils_inputstream_private *s =
			(parserutils_inputstream_private *) stream;
	parserutils_error error = PARSERUTILS_OK;
	size_t len;

	if (stream == NULL || ptr == NULL || length == NULL)
		return PARSERUTILS_BADPARM;

	/* There's insufficient data in the buffer, so read some more */
	if (s->raw->length == 0) {
		/* No more data to be had */
		return s->public.had_eof ? PARSERUTILS_EOF
					 : PARSERUTILS_NEEDDATA;
	}

	/* Refill utf8 buffer from raw buffer */
	error = parserutils_inputstream_refill_buffer(s);
	if (error != PARSERUTILS_OK)
		return error;

	/* Refill may have succeeded, but not actually produced enough new
	 * data to reach the requested offset. Use >= rather than == so that
	 * an offset lying beyond the decoded data is reported as "need more
	 * data" instead of being read from past the end of the buffer. */
	if (s->public.cursor + offset >= s->public.utf8->length)
		return PARSERUTILS_NEEDDATA;

	/* Now try the read */
	if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
		len = 1;
	} else {
		error = parserutils_charset_utf8_char_byte_length(
			s->public.utf8->data + s->public.cursor + offset,
			&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
			return error;

		if (error == PARSERUTILS_NEEDDATA) {
			return s->public.had_eof ? PARSERUTILS_EOF
						 : PARSERUTILS_NEEDDATA;
		}
	}

	(*length) = len;
	(*ptr) = (s->public.utf8->data + s->public.cursor + offset);

	return PARSERUTILS_OK;
}

#undef IS_ASCII
 
/**
 * Report the charset the input stream is currently using, and where that
 * choice came from
 *
 * \param stream  Input stream to query
 * \param source  Pointer to location to receive charset source identifier
 * \return Pointer to charset name (constant; do not free),
 *         or NULL if \a stream or \a source is NULL
 */
const char *parserutils_inputstream_read_charset(
		parserutils_inputstream *stream, uint32_t *source)
{
	const parserutils_inputstream_private *priv;

	if (stream == NULL || source == NULL)
		return NULL;

	priv = (parserutils_inputstream_private *) stream;

	*source = priv->encsrc;

	/* A source identifier of zero is always reported as UTF-8 */
	return (priv->encsrc == 0)
			? "UTF-8"
			: parserutils_charset_mibenum_to_name(priv->mibenum);
}
 
/**
 * Switch the input stream to a different source charset
 *
 * The stream's recorded charset and source are only updated once the
 * filter has accepted the new encoding.
 *
 * \param stream  Input stream to modify
 * \param enc     Charset name
 * \param source  Charset source identifier
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on invalid parameters,
 *         PARSERUTILS_INVALID if called after data has been read from stream,
 *         PARSERUTILS_BADENCODING if the encoding is unsupported,
 *         PARSERUTILS_NOMEM on memory exhaustion.
 */
parserutils_error parserutils_inputstream_change_charset(
		parserutils_inputstream *stream,
		const char *enc, uint32_t source)
{
	parserutils_inputstream_private *priv;
	parserutils_filter_optparams opt;
	parserutils_error status;
	uint16_t mib;

	if (stream == NULL || enc == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	/* Too late once the first chunk has been processed */
	if (priv->done_first_chunk)
		return PARSERUTILS_INVALID;

	/* A MIB enum of zero means the name was not recognised */
	mib = parserutils_charset_mibenum_from_name(enc, strlen(enc));
	if (mib == 0)
		return PARSERUTILS_BADENCODING;

	/* Point the filter at the requested encoding */
	opt.encoding.name = enc;
	status = parserutils__filter_setopt(priv->input,
			PARSERUTILS_FILTER_SET_ENCODING, &opt);
	if (status != PARSERUTILS_OK)
		return status;

	/* The filter accepted it, so record the new settings */
	priv->mibenum = mib;
	priv->encsrc = source;

	return PARSERUTILS_OK;
}
 
/******************************************************************************
******************************************************************************/
 
/**
 * Refill the UTF-8 buffer from the raw buffer
 *
 * On the first call for a stream this also settles the input encoding
 * (charset detection callback, UTF-8 default, BOM stripping) and configures
 * the filter accordingly. Any unread UTF-8 data is moved to the start of the
 * buffer and the cursor is reset to zero.
 *
 * \param stream  The inputstream to operate on
 * \return PARSERUTILS_OK on success, otherwise the error reported by the
 *         charset detector, BOM stripping, the filter or the buffers
 */
parserutils_error parserutils_inputstream_refill_buffer(
		parserutils_inputstream_private *stream)
{
	const uint8_t *raw;
	uint8_t *utf8;
	size_t raw_length, utf8_space;
	parserutils_error error;

	/* If this is the first chunk of data, we must detect the charset and
	 * strip the BOM, if one exists */
	if (stream->done_first_chunk == false) {
		parserutils_filter_optparams params;

		/* If there is a charset detection routine, give it an
		 * opportunity to override any charset specified when the
		 * inputstream was created */
		if (stream->csdetect != NULL) {
			error = stream->csdetect(stream->raw->data,
				stream->raw->length,
				&stream->mibenum, &stream->encsrc);
			if (error != PARSERUTILS_OK) {
				if (error != PARSERUTILS_NEEDDATA ||
						stream->public.had_eof == false)
					return error;

				/* We don't have enough data to detect the
				 * input encoding, but we're not going to get
				 * any more as we've been notified of EOF.
				 * Therefore, leave the encoding alone
				 * so that any charset specified when the
				 * inputstream was created will be preserved.
				 * If there was no charset specified, then
				 * we'll default to UTF-8, below */
			}
		}

		/* Default to UTF-8 if there is still no encoding information
		 * We'll do this if there was no encoding specified up-front
		 * and:
		 *    1) there was no charset detection routine
		 * or 2) there was insufficient data for the charset
		 *       detection routine to detect an encoding
		 */
		if (stream->mibenum == 0) {
			stream->mibenum =
				parserutils_charset_mibenum_from_name("UTF-8",
					SLEN("UTF-8"));
			stream->encsrc = 0;
		}

		/* UTF-8 itself could not be resolved: nothing sane to do */
		if (stream->mibenum == 0)
			abort();

		/* Strip any BOM, and update encoding as appropriate */
		error = parserutils_inputstream_strip_bom(&stream->mibenum,
				stream->raw);
		if (error != PARSERUTILS_OK)
			return error;

		/* Ensure filter is using the correct encoding */
		params.encoding.name =
			parserutils_charset_mibenum_to_name(stream->mibenum);

		error = parserutils__filter_setopt(stream->input,
				PARSERUTILS_FILTER_SET_ENCODING,
				&params);
		if (error != PARSERUTILS_OK)
			return error;

		stream->done_first_chunk = true;
	}

	/* Work out how to perform the buffer fill */
	if (stream->public.cursor == stream->public.utf8->length) {
		/* Cursor's at the end, so simply reuse the entire buffer */
		utf8 = stream->public.utf8->data;
		utf8_space = stream->public.utf8->allocated;
	} else {
		/* Cursor's not at the end, so shift data after cursor to the
		 * bottom of the buffer. If the buffer's still over half full,
		 * extend it. */
		memmove(stream->public.utf8->data,
			stream->public.utf8->data + stream->public.cursor,
			stream->public.utf8->length - stream->public.cursor);

		stream->public.utf8->length -= stream->public.cursor;

		/* The unread data now starts at offset zero. Reset the cursor
		 * immediately so that cursor and length stay consistent even
		 * if one of the steps below returns early with an error. */
		stream->public.cursor = 0;

		if (stream->public.utf8->length >
				stream->public.utf8->allocated / 2) {
			error = parserutils_buffer_grow(stream->public.utf8);
			if (error != PARSERUTILS_OK)
				return error;
		}

		utf8 = stream->public.utf8->data + stream->public.utf8->length;
		utf8_space = stream->public.utf8->allocated -
				stream->public.utf8->length;
	}

	raw = stream->raw->data;
	raw_length = stream->raw->length;

	/* Try to fill utf8 buffer from the raw data */
	error = parserutils__filter_process_chunk(stream->input,
			&raw, &raw_length, &utf8, &utf8_space);
	/* _NOMEM implies that there's more input to read than available space
	 * in the utf8 buffer. That's fine, so we'll ignore that error. */
	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
		return error;

	/* Remove the raw data we've processed from the raw buffer */
	error = parserutils_buffer_discard(stream->raw, 0,
			stream->raw->length - raw_length);
	if (error != PARSERUTILS_OK)
		return error;

	/* Fix up the utf8 buffer information */
	stream->public.utf8->length =
			stream->public.utf8->allocated - utf8_space;

	/* Finally, fix up the cursor */
	stream->public.cursor = 0;

	return PARSERUTILS_OK;
}
 
/**
 * Remove a leading byte order mark from a buffer, given its encoding
 *
 * Only a BOM that matches the stated encoding is removed. For the
 * endian-neutral names (UTF-16 and UTF-32) the encoding is first narrowed
 * to the big-endian variant and then switched to little-endian if a
 * little-endian BOM is present. Encodings other than the UTF-8/16/32
 * family leave the buffer untouched.
 *
 * \param mibenum  Pointer to the character set of the buffer, updated on exit
 * \param buffer   The buffer to process
 * \return PARSERUTILS_OK if there was nothing to strip, otherwise the
 *         result of discarding the BOM bytes from the buffer
 */
parserutils_error parserutils_inputstream_strip_bom(uint16_t *mibenum,
		parserutils_buffer *buffer)
{
	/* MIB enums are looked up once and cached across calls */
	static uint16_t utf8;
	static uint16_t utf16;
	static uint16_t utf16be;
	static uint16_t utf16le;
	static uint16_t utf32;
	static uint16_t utf32be;
	static uint16_t utf32le;

	/* Number of leading bytes to drop; zero means no BOM was found */
	size_t bom_len = 0;

	if (utf8 == 0) {
		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
				SLEN("UTF-8"));
		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
				SLEN("UTF-16"));
		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
				SLEN("UTF-16BE"));
		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
				SLEN("UTF-16LE"));
		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
				SLEN("UTF-32"));
		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
				SLEN("UTF-32BE"));
		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
				SLEN("UTF-32LE"));
	}

/* Each test checks the length before touching any data byte */
#define BOM2(b0, b1) \
	(buffer->length >= 2 && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1))
#define BOM3(b0, b1, b2) \
	(buffer->length >= 3 && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1) && \
	 buffer->data[2] == (b2))
#define BOM4(b0, b1, b2, b3) \
	(buffer->length >= 4 && \
	 buffer->data[0] == (b0) && buffer->data[1] == (b1) && \
	 buffer->data[2] == (b2) && buffer->data[3] == (b3))

	if (*mibenum == utf8) {
		if (BOM3(0xEF, 0xBB, 0xBF))
			bom_len = 3;
	} else if (*mibenum == utf16be) {
		if (BOM2(0xFE, 0xFF))
			bom_len = 2;
	} else if (*mibenum == utf16le) {
		if (BOM2(0xFF, 0xFE))
			bom_len = 2;
	} else if (*mibenum == utf16) {
		/* Big-endian unless a little-endian BOM says otherwise */
		*mibenum = utf16be;

		if (BOM2(0xFE, 0xFF)) {
			bom_len = 2;
		} else if (BOM2(0xFF, 0xFE)) {
			*mibenum = utf16le;
			bom_len = 2;
		}
	} else if (*mibenum == utf32be) {
		if (BOM4(0x00, 0x00, 0xFE, 0xFF))
			bom_len = 4;
	} else if (*mibenum == utf32le) {
		if (BOM4(0xFF, 0xFE, 0x00, 0x00))
			bom_len = 4;
	} else if (*mibenum == utf32) {
		/* Big-endian unless a little-endian BOM says otherwise */
		*mibenum = utf32be;

		if (BOM4(0x00, 0x00, 0xFE, 0xFF)) {
			bom_len = 4;
		} else if (BOM4(0xFF, 0xFE, 0x00, 0x00)) {
			*mibenum = utf32le;
			bom_len = 4;
		}
	}

#undef BOM4
#undef BOM3
#undef BOM2

	if (bom_len == 0)
		return PARSERUTILS_OK;

	return parserutils_buffer_discard(buffer, 0, bom_len);
}
 
/programs/network/netsurf/libparserutils/src/utils/Makefile
0,0 → 1,5
 
# Build settings for the src/utils directory.
# NOTE(review): OUTFILE and OBJS are presumably consumed by the included
# Makefile_for_o_lib; its rules are not visible here - confirm.

# Name of the object file produced for this directory.
OUTFILE = libo.o
# Object files that make up this directory (one per utility source file).
OBJS = buffer.o errors.o stack.o vector.o
# Extra header search paths, relative to this directory.
CFLAGS += -I ../../include/ -I ../../../ -I ../
# MENUETDEV is not defined in this file, so it has to be supplied by the
# environment or by whatever invokes this makefile.
include $(MENUETDEV)/makefiles/Makefile_for_o_lib
/programs/network/netsurf/libparserutils/src/utils/buffer.c
0,0 → 1,196
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <string.h>
 
#include <parserutils/utils/buffer.h>
 
#define DEFAULT_SIZE (4096)
 
/**
 * Create an empty memory buffer with DEFAULT_SIZE bytes of backing storage
 *
 * \param alloc   Memory (de)allocation function
 * \param pw      Pointer to client-specific private data
 * \param buffer  Pointer to location to receive memory buffer
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error parserutils_buffer_create(parserutils_alloc alloc, void *pw,
		parserutils_buffer **buffer)
{
	parserutils_buffer *created;
	uint8_t *storage;

	if (alloc == NULL || buffer == NULL)
		return PARSERUTILS_BADPARM;

	created = alloc(NULL, sizeof(parserutils_buffer), pw);
	if (created == NULL)
		return PARSERUTILS_NOMEM;

	storage = alloc(NULL, DEFAULT_SIZE, pw);
	if (storage == NULL) {
		/* Don't leave the half-built descriptor behind */
		alloc(created, 0, pw);
		return PARSERUTILS_NOMEM;
	}

	created->data = storage;
	created->length = 0;
	created->allocated = DEFAULT_SIZE;

	/* Remember the allocator for later growth and destruction */
	created->alloc = alloc;
	created->pw = pw;

	*buffer = created;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy a memory buffer, handing its storage and then the buffer object
 * back to the buffer's own allocator
 *
 * \param buffer  The buffer to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a buffer is NULL
 */
parserutils_error parserutils_buffer_destroy(parserutils_buffer *buffer)
{
	parserutils_alloc release;
	void *pw;

	if (buffer == NULL)
		return PARSERUTILS_BADPARM;

	/* Take copies first: the descriptor itself is about to go away */
	release = buffer->alloc;
	pw = buffer->pw;

	release(buffer->data, 0, pw);
	release(buffer, 0, pw);

	return PARSERUTILS_OK;
}
 
/**
 * Append data to the end of a memory buffer, growing it as required
 *
 * \param buffer  The buffer to append to
 * \param data    The data to append
 * \param len     The length, in bytes, of the data to append
 * \return PARSERUTILS_OK on success, or the error from growing the buffer.
 */
parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
		const uint8_t *data, size_t len)
{
	/* Grow until strictly more than len bytes are free */
	for (;;) {
		parserutils_error error;

		if (len < buffer->allocated - buffer->length)
			break;

		error = parserutils_buffer_grow(buffer);
		if (error != PARSERUTILS_OK)
			return error;
	}

	memcpy(buffer->data + buffer->length, data, len);
	buffer->length += len;

	return PARSERUTILS_OK;
}
 
/**
 * Insert data into a memory buffer at a given offset, growing it as required
 *
 * Inserting at the very end of the buffer is delegated to the append routine.
 *
 * \param buffer  The buffer to insert into
 * \param offset  The offset into the buffer to insert at
 * \param data    The data to insert
 * \param len     The length, in bytes, of the data to insert
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a offset lies beyond the buffer contents,
 *         or the error from growing the buffer
 */
parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
		size_t offset, const uint8_t *data, size_t len)
{
	uint8_t *gap;

	if (offset > buffer->length)
		return PARSERUTILS_BADPARM;

	if (offset == buffer->length)
		return parserutils_buffer_append(buffer, data, len);

	/* Grow until strictly more than len bytes are free */
	for (;;) {
		parserutils_error error;

		if (len < buffer->allocated - buffer->length)
			break;

		error = parserutils_buffer_grow(buffer);
		if (error != PARSERUTILS_OK)
			return error;
	}

	/* Only take the address now: growing may have moved the storage */
	gap = buffer->data + offset;

	/* Open a len-byte hole at the insertion point, then fill it */
	memmove(gap + len, gap, buffer->length - offset);
	memcpy(gap, data, len);

	buffer->length += len;

	return PARSERUTILS_OK;
}
 
/**
 * Discard a section of a memory buffer
 *
 * The bytes following the discarded section are moved down to close the gap
 * and the buffer length is reduced by \a len. The allocation is not shrunk.
 *
 * \param buffer  The buffer to discard data from
 * \param offset  The offset into the buffer of the start of the section
 * \param len     The number of bytes to discard
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if the section does not lie within the
 *         buffer's current contents.
 */
parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
		size_t offset, size_t len)
{
	/* Written as a subtraction so that a huge len cannot wrap offset + len
	 * around and slip past the bounds check. */
	if (offset >= buffer->length || len > buffer->length - offset)
		return PARSERUTILS_BADPARM;

	/* Only the bytes that follow the discarded section need to move:
	 * that is length - (offset + len), not length - len. Using the
	 * latter reads offset bytes past the end of the valid data. */
	memmove(buffer->data + offset, buffer->data + offset + len,
			buffer->length - (offset + len));

	buffer->length -= len;

	return PARSERUTILS_OK;
}
 
/**
 * Double the amount of space allocated for a memory buffer
 *
 * The buffer is left unchanged if the allocator fails.
 *
 * \param buffer  The buffer to extend
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_NOMEM if the allocator returns NULL.
 */
parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
{
	const size_t doubled = buffer->allocated * 2;
	uint8_t *bigger = buffer->alloc(buffer->data, doubled, buffer->pw);

	if (bigger == NULL)
		return PARSERUTILS_NOMEM;

	buffer->data = bigger;
	buffer->allocated = doubled;

	return PARSERUTILS_OK;
}
 
/**
 * Move a buffer's contents to a freshly allocated block (debug builds only)
 *
 * When NDEBUG is not defined, a new block of the buffer's allocated size is
 * obtained, the current contents are copied into it, the old block is
 * overwritten with 0xff bytes and the buffer is pointed at the new block.
 * The old block is deliberately not freed.
 * When NDEBUG is defined, only the parameter check is performed.
 *
 * \param buffer  The buffer to operate on
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a buffer is NULL,
 *         PARSERUTILS_NOMEM if the replacement block cannot be allocated
 */
parserutils_error parserutils_buffer_randomise(parserutils_buffer *buffer)
{
#ifndef NDEBUG
uint8_t *temp;
#endif

if (buffer == NULL)
return PARSERUTILS_BADPARM;

#ifndef NDEBUG
/* New block has the same capacity as the old one */
temp = buffer->alloc(NULL, buffer->allocated, buffer->pw);
if (temp == NULL)
return PARSERUTILS_NOMEM;

/* Only the bytes in use are carried over */
memcpy(temp, buffer->data, buffer->length);

/* Overwrite the old copy with 0xff bytes */
memset(buffer->data, 0xff, buffer->length);

/* Leak the buffer's current data, so we don't reuse it */
/* buffer->alloc(buffer->data, 0, buffer->pw); */

buffer->data = temp;
#endif


return PARSERUTILS_OK;
}
 
/programs/network/netsurf/libparserutils/src/utils/endian.h
0,0 → 1,40
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2009 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_endian_h_
#define parserutils_endian_h_
 
/**
 * Determine whether the host stores the least significant byte first
 *
 * \return true if the host is little-endian, false otherwise
 */
static inline bool endian_host_is_le(void)
{
	/* The lowest byte (0x02) sits first in memory only on LE hosts */
	static uint32_t probe = 0x10000002;
	const uint8_t *first_byte = (uint8_t *) &probe;

	return *first_byte == 0x02;
}
 
/**
 * Reverse the byte order of a 32-bit value
 *
 * \param val  Value to swap
 * \return \a val with its four bytes in the opposite order
 */
static inline uint32_t endian_swap(uint32_t val)
{
	const uint32_t top_to_bottom = (val >> 24) & 0x000000ff;
	const uint32_t upper_mid_down = (val >> 8) & 0x0000ff00;
	const uint32_t lower_mid_up = (val << 8) & 0x00ff0000;
	const uint32_t bottom_to_top = (val << 24) & 0xff000000;

	return top_to_bottom | upper_mid_down | lower_mid_up | bottom_to_top;
}
 
/**
 * Convert a 32-bit value from host byte order to big-endian
 *
 * \param host  Value in host byte order
 * \return The value in big-endian byte order
 */
static inline uint32_t endian_host_to_big(uint32_t host)
{
	/* Big-endian hosts already hold the value in the wanted order */
	return endian_host_is_le() ? endian_swap(host) : host;
}
 
/**
 * Convert a 32-bit value from big-endian to host byte order
 *
 * \param big  Value in big-endian byte order
 * \return The value in host byte order
 */
static inline uint32_t endian_big_to_host(uint32_t big)
{
	/* Big-endian hosts need no conversion */
	return endian_host_is_le() ? endian_swap(big) : big;
}
 
#endif
/programs/network/netsurf/libparserutils/src/utils/errors.c
0,0 → 1,80
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <string.h>
 
#include <parserutils/errors.h>
 
/**
 * Convert a parserutils error code to a human-readable string
 *
 * \param error  The error code to convert
 * \return Pointer to string representation of error, or NULL if unknown.
 */
const char *parserutils_error_to_string(parserutils_error error)
{
	switch (error) {
	case PARSERUTILS_OK:
		return "No error";
	case PARSERUTILS_NOMEM:
		return "Insufficient memory";
	case PARSERUTILS_BADPARM:
		return "Bad parameter";
	case PARSERUTILS_INVALID:
		return "Invalid input";
	case PARSERUTILS_FILENOTFOUND:
		return "File not found";
	case PARSERUTILS_NEEDDATA:
		return "Insufficient data";
	case PARSERUTILS_BADENCODING:
		return "Unsupported encoding";
	case PARSERUTILS_EOF:
		return "EOF";
	}

	/* Not one of the codes handled above */
	return NULL;
}
 
/**
 * Convert a string representation of an error name to a parserutils error code
 *
 * The name must match one of the known error names exactly. A string that
 * is merely a prefix of a known name (for example "PARSERUTILS_BAD") is
 * treated as unknown. If the string contains a NUL within the first
 * \a len bytes, only the part before it is considered.
 *
 * \param str  String containing error name
 * \param len  Length of string (bytes)
 * \return Error code, or PARSERUTILS_OK if unknown
 */
parserutils_error parserutils_error_from_string(const char *str, size_t len)
{
	/* Every recognised name, with its length worked out at compile time */
	static const struct {
		const char *name;
		size_t name_len;
		parserutils_error code;
	} known[] = {
#define NAMED_ERROR(e) { #e, sizeof(#e) - 1, e }
		NAMED_ERROR(PARSERUTILS_OK),
		NAMED_ERROR(PARSERUTILS_NOMEM),
		NAMED_ERROR(PARSERUTILS_BADPARM),
		NAMED_ERROR(PARSERUTILS_INVALID),
		NAMED_ERROR(PARSERUTILS_FILENOTFOUND),
		NAMED_ERROR(PARSERUTILS_NEEDDATA),
		NAMED_ERROR(PARSERUTILS_BADENCODING),
		NAMED_ERROR(PARSERUTILS_EOF)
#undef NAMED_ERROR
	};
	const char *nul;
	size_t i;

	/* As with the strncmp-based matching this replaces, an embedded NUL
	 * ends the input early, so an over-long len still matches. */
	nul = (len > 0) ? memchr(str, '\0', len) : NULL;
	if (nul != NULL)
		len = (size_t) (nul - str);

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (len == known[i].name_len &&
				memcmp(str, known[i].name, len) == 0)
			return known[i].code;
	}

	return PARSERUTILS_OK;
}
/programs/network/netsurf/libparserutils/src/utils/stack.c
0,0 → 1,190
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <inttypes.h>
#include <string.h>
 
#include <parserutils/utils/stack.h>
 
/**
 * Stack object
 *
 * Items are stored by value, back to back, in a single allocation.
 */
struct parserutils_stack
{
	/** Size of an item in the stack */
	size_t item_size;
	/** Size of a stack chunk */
	size_t chunk_size;
	/** Number of slots allocated */
	size_t items_allocated;
	/** Index of current item (-1 when the stack is empty) */
	int32_t current_item;
	/** Items in stack */
	void *items;

	/** Memory (de)allocation function */
	parserutils_alloc alloc;
	/** Client-specific data */
	void *pw;
};
 
/**
 * Create an empty stack with room for one chunk of items
 *
 * \param item_size   Length, in bytes, of an item in the stack
 * \param chunk_size  Number of stack slots in a chunk
 * \param alloc       Memory (de)allocation function
 * \param pw          Pointer to client-specific private data
 * \param stack       Pointer to location to receive stack instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error parserutils_stack_create(size_t item_size, size_t chunk_size,
		parserutils_alloc alloc, void *pw, parserutils_stack **stack)
{
	parserutils_stack *created;
	void *slots;

	if (item_size == 0 || chunk_size == 0)
		return PARSERUTILS_BADPARM;

	if (alloc == NULL || stack == NULL)
		return PARSERUTILS_BADPARM;

	created = alloc(NULL, sizeof(parserutils_stack), pw);
	if (created == NULL)
		return PARSERUTILS_NOMEM;

	slots = alloc(NULL, item_size * chunk_size, pw);
	if (slots == NULL) {
		/* Don't leave the half-built stack behind */
		alloc(created, 0, pw);
		return PARSERUTILS_NOMEM;
	}

	created->items = slots;
	created->item_size = item_size;
	created->chunk_size = chunk_size;
	created->items_allocated = chunk_size;
	/* -1 marks the stack as empty */
	created->current_item = -1;

	created->alloc = alloc;
	created->pw = pw;

	*stack = created;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy a stack instance, handing its item storage and then the stack
 * object back to the stack's own allocator
 *
 * \param stack  The stack to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stack is NULL.
 */
parserutils_error parserutils_stack_destroy(parserutils_stack *stack)
{
	parserutils_alloc release;
	void *pw;

	if (stack == NULL)
		return PARSERUTILS_BADPARM;

	/* Take copies first: the stack object itself is about to go away */
	release = stack->alloc;
	pw = stack->pw;

	release(stack->items, 0, pw);
	release(stack, 0, pw);

	return PARSERUTILS_OK;
}
 
/**
 * Push a copy of an item onto the stack, extending the storage by one
 * chunk when it is full
 *
 * \param stack  The stack to push onto
 * \param item   The item to push
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stack or \a item is NULL,
 *         PARSERUTILS_INVALID if the current index cannot be advanced,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error parserutils_stack_push(parserutils_stack *stack,
		const void *item)
{
	int32_t slot;

	if (stack == NULL || item == NULL)
		return PARSERUTILS_BADPARM;

	/* Ensure we'll get a valid slot */
	if (stack->current_item < -1 || stack->current_item == INT32_MAX)
		return PARSERUTILS_INVALID;

	slot = stack->current_item + 1;

	if ((size_t) slot >= stack->items_allocated) {
		/* Out of slots: extend the storage by one chunk */
		const size_t new_count =
				stack->items_allocated + stack->chunk_size;
		void *grown = stack->alloc(stack->items,
				new_count * stack->item_size, stack->pw);

		if (grown == NULL)
			return PARSERUTILS_NOMEM;

		stack->items = grown;
		stack->items_allocated = new_count;
	}

	memcpy((uint8_t *) stack->items + (slot * stack->item_size),
			item, stack->item_size);
	stack->current_item = slot;

	return PARSERUTILS_OK;
}
 
/**
 * Pop the top item off a stack, optionally copying it out first
 *
 * \param stack  The stack to pop from
 * \param item   Pointer to location to receive popped item, or NULL
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a stack is NULL,
 *         PARSERUTILS_INVALID if the stack is empty.
 */
parserutils_error parserutils_stack_pop(parserutils_stack *stack, void *item)
{
	if (stack == NULL)
		return PARSERUTILS_BADPARM;

	if (stack->current_item < 0)
		return PARSERUTILS_INVALID;

	if (item != NULL) {
		const uint8_t *top = (uint8_t *) stack->items +
				(stack->current_item * stack->item_size);

		memcpy(item, top, stack->item_size);
	}

	/* The slot is simply abandoned; storage is never shrunk here */
	stack->current_item--;

	return PARSERUTILS_OK;
}
 
/**
 * Retrieve a pointer to the item on top of the stack
 *
 * \param stack  The stack to inspect
 * \return Pointer to item on stack, or NULL if none
 */
void *parserutils_stack_get_current(parserutils_stack *stack)
{
	uint8_t *base;

	if (stack == NULL)
		return NULL;

	/* A negative index means the stack is empty */
	if (stack->current_item < 0)
		return NULL;

	base = (uint8_t *) stack->items;

	return base + (stack->current_item * stack->item_size);
}
 
#ifndef NDEBUG
#include <stdio.h>
 
extern void parserutils_stack_dump(parserutils_stack *stack, const char *prefix,
void (*printer)(void *item));
 
/**
 * Print every item on the stack to stdout, bottom first (debug builds only)
 *
 * Each item is written on its own line as "<prefix> <index>: " followed by
 * whatever \a printer emits for it.
 *
 * \param stack    The stack to dump (nothing is printed if NULL)
 * \param prefix   Text to print before each index, or NULL for none
 * \param printer  Callback that prints one item (nothing is printed if NULL)
 */
void parserutils_stack_dump(parserutils_stack *stack, const char *prefix,
		void (*printer)(void *item))
{
	int32_t i;

	if (stack == NULL || printer == NULL)
		return;

	for (i = 0; i <= stack->current_item; i++) {
		/* PRId32 rather than %d: int32_t is not int on every target */
		printf("%s %" PRId32 ": ", prefix != NULL ? prefix : "", i);
		printer((uint8_t *) stack->items + (i * stack->item_size));
		printf("\n");
	}
}
 
#endif
 
/programs/network/netsurf/libparserutils/src/utils/utils.h
0,0 → 1,36
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#ifndef parserutils_utils_h_
#define parserutils_utils_h_
 
/* Each helper is only defined if the name is not already a macro.
 * max and min are plain macros, so each argument may be evaluated more
 * than once: do not pass expressions with side effects. */

#ifndef max
/* Larger of two values */
#define max(a,b) ((a)>(b)?(a):(b))
#endif

#ifndef min
/* Smaller of two values */
#define min(a,b) ((a)<(b)?(a):(b))
#endif

#ifndef SLEN
/* Calculate length of a string constant */
#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
#endif

#ifndef UNUSED
/* Mark a variable or parameter as deliberately unused.
 * A cast to void is used rather than a self-assignment so that this also
 * works on const objects and does not provoke self-assignment warnings. */
#define UNUSED(x) ((void) (x))
#endif

#ifndef N_ELEMENTS
/* Number of elements in an array (not valid for a pointer) */
#define N_ELEMENTS(s) (sizeof((s)) / sizeof((s)[0]))
#endif

#ifndef ALIGN
/* Round a value up to the next multiple of 4 */
#define ALIGN(val) (((val) + 3) & ~(3))
#endif
 
#endif
/programs/network/netsurf/libparserutils/src/utils/vector.c
0,0 → 1,257
/*
* This file is part of LibParserUtils.
* Licensed under the MIT License,
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
*/
 
#include <inttypes.h>
#include <string.h>
 
#include <parserutils/utils/vector.h>
 
/**
 * Vector object
 *
 * Items are stored by value, back to back, in a single allocation.
 */
struct parserutils_vector
{
	/** Size of an item in the vector */
	size_t item_size;
	/** Size of a vector chunk */
	size_t chunk_size;
	/** Number of slots allocated */
	size_t items_allocated;
	/** Index of current item (-1 when the vector is empty) */
	int32_t current_item;
	/** Items in vector */
	void *items;

	/** Memory (de)allocation function */
	parserutils_alloc alloc;
	/** Client-specific data */
	void *pw;
};
 
/**
 * Create an empty vector with room for one chunk of items
 *
 * \param item_size   Length, in bytes, of an item in the vector
 * \param chunk_size  Number of vector slots in a chunk
 * \param alloc       Memory (de)allocation function
 * \param pw          Pointer to client-specific private data
 * \param vector      Pointer to location to receive vector instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error parserutils_vector_create(size_t item_size,
		size_t chunk_size, parserutils_alloc alloc, void *pw,
		parserutils_vector **vector)
{
	parserutils_vector *created;
	void *slots;

	if (item_size == 0 || chunk_size == 0)
		return PARSERUTILS_BADPARM;

	if (alloc == NULL || vector == NULL)
		return PARSERUTILS_BADPARM;

	created = alloc(NULL, sizeof(parserutils_vector), pw);
	if (created == NULL)
		return PARSERUTILS_NOMEM;

	slots = alloc(NULL, item_size * chunk_size, pw);
	if (slots == NULL) {
		/* Don't leave the half-built vector behind */
		alloc(created, 0, pw);
		return PARSERUTILS_NOMEM;
	}

	created->items = slots;
	created->item_size = item_size;
	created->chunk_size = chunk_size;
	created->items_allocated = chunk_size;
	/* -1 marks the vector as empty */
	created->current_item = -1;

	created->alloc = alloc;
	created->pw = pw;

	*vector = created;

	return PARSERUTILS_OK;
}
 
/**
 * Destroy a vector instance, handing its item storage and then the vector
 * object back to the vector's own allocator
 *
 * \param vector  The vector to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a vector is NULL.
 */
parserutils_error parserutils_vector_destroy(parserutils_vector *vector)
{
	parserutils_alloc release;
	void *pw;

	if (vector == NULL)
		return PARSERUTILS_BADPARM;

	/* Take copies first: the vector object itself is about to go away */
	release = vector->alloc;
	pw = vector->pw;

	release(vector->items, 0, pw);
	release(vector, 0, pw);

	return PARSERUTILS_OK;
}
 
/**
 * Append a copy of an item to the vector, extending the storage by one
 * chunk when it is full
 *
 * \param vector  The vector to append to
 * \param item    The item to append
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a vector or \a item is NULL,
 *         PARSERUTILS_INVALID if the current index cannot be advanced,
 *         PARSERUTILS_NOMEM on memory exhaustion
 */
parserutils_error parserutils_vector_append(parserutils_vector *vector,
		void *item)
{
	int32_t slot;

	if (vector == NULL || item == NULL)
		return PARSERUTILS_BADPARM;

	/* Ensure we'll get a valid slot */
	if (vector->current_item < -1 || vector->current_item == INT32_MAX)
		return PARSERUTILS_INVALID;

	slot = vector->current_item + 1;

	if ((size_t) slot >= vector->items_allocated) {
		/* Out of slots: extend the storage by one chunk */
		const size_t new_count =
				vector->items_allocated + vector->chunk_size;
		void *grown = vector->alloc(vector->items,
				new_count * vector->item_size, vector->pw);

		if (grown == NULL)
			return PARSERUTILS_NOMEM;

		vector->items = grown;
		vector->items_allocated = new_count;
	}

	memcpy((uint8_t *) vector->items + (slot * vector->item_size),
			item, vector->item_size);
	vector->current_item = slot;

	return PARSERUTILS_OK;
}
 
/**
 * Remove every item from a vector; the allocated storage is kept
 *
 * \param vector  The vector to clear
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a vector is NULL,
 *         PARSERUTILS_INVALID if the vector is already empty.
 */
parserutils_error parserutils_vector_clear(parserutils_vector *vector)
{
	if (vector == NULL) {
		return PARSERUTILS_BADPARM;
	} else if (vector->current_item < 0) {
		return PARSERUTILS_INVALID;
	}

	/* -1 is the "empty" marker */
	vector->current_item = -1;

	return PARSERUTILS_OK;
}
 
/**
 * Drop the last item from a vector
 *
 * \param vector  The vector to remove from
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a vector is NULL,
 *         PARSERUTILS_INVALID if the vector is empty
 */
parserutils_error parserutils_vector_remove_last(parserutils_vector *vector)
{
	if (vector == NULL) {
		return PARSERUTILS_BADPARM;
	} else if (vector->current_item < 0) {
		return PARSERUTILS_INVALID;
	}

	vector->current_item -= 1;

	return PARSERUTILS_OK;
}
 
/**
 * Acquire the length (in items) of the vector.
 *
 * \param vector  The vector to interrogate.
 * \param length  Pointer to location to receive length information.
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if \a vector or \a length is NULL
 */
parserutils_error parserutils_vector_get_length(parserutils_vector *vector,
		size_t *length)
{
	if (vector == NULL || length == NULL)
		return PARSERUTILS_BADPARM;

	/* current_item is the index of the last item, so count is one more */
	*length = vector->current_item + 1;

	return PARSERUTILS_OK;
}
 
/**
 * Iterate over a vector
 *
 * Each successful call returns the item at index \a *ctx and then advances
 * \a *ctx by one.
 *
 * \param vector  The vector to iterate over
 * \param ctx     Pointer to an integer for the iterator to use as context.
 * \return Pointer to current item, or NULL if no more (also NULL if
 *         \a vector or \a ctx is NULL, or \a *ctx is out of range)
 *
 * \note The value pointed to by \a ctx must be zero to begin the iteration.
 */
const void *parserutils_vector_iterate(const parserutils_vector *vector,
		int32_t *ctx)
{
	void *item;

	if (vector == NULL || ctx == NULL || vector->current_item < 0)
		return NULL;

	/* Reject indices on either side of the stored items. Without the
	 * lower bound, a negative context would be turned into a pointer
	 * outside the item storage. */
	if ((*ctx) < 0 || (*ctx) > vector->current_item)
		return NULL;

	item = (uint8_t *) vector->items + ((*ctx) * vector->item_size);

	(*ctx)++;

	return item;
}
 
/**
 * Peek at an item in a vector
 *
 * \param vector  The vector to iterate over
 * \param ctx     Integer for the iterator to use as context.
 * \return Pointer to item, or NULL if no more (also NULL if \a vector is
 *         NULL or \a ctx is out of range)
 */
const void *parserutils_vector_peek(const parserutils_vector *vector,
		int32_t ctx)
{
	if (vector == NULL || vector->current_item < 0)
		return NULL;

	/* Reject indices on either side of the stored items. Without the
	 * lower bound, a negative index would be turned into a pointer
	 * outside the item storage. */
	if (ctx < 0 || ctx > vector->current_item)
		return NULL;

	return (uint8_t *) vector->items + (ctx * vector->item_size);
}
 
 
#ifndef NDEBUG
#include <stdio.h>
 
extern void parserutils_vector_dump(parserutils_vector *vector,
const char *prefix, void (*printer)(void *item));
 
/**
 * Print every item in the vector to stdout, in order (debug builds only)
 *
 * Each item is written on its own line as "<prefix> <index>: " followed by
 * whatever \a printer emits for it.
 *
 * \param vector   The vector to dump (nothing is printed if NULL)
 * \param prefix   Text to print before each index, or NULL for none
 * \param printer  Callback that prints one item (nothing is printed if NULL)
 */
void parserutils_vector_dump(parserutils_vector *vector, const char *prefix,
		void (*printer)(void *item))
{
	int32_t i;

	if (vector == NULL || printer == NULL)
		return;

	for (i = 0; i <= vector->current_item; i++) {
		/* PRId32 rather than %d: int32_t is not int on every target */
		printf("%s %" PRId32 ": ", prefix != NULL ? prefix : "", i);
		printer((uint8_t *) vector->items + (i * vector->item_size));
		printf("\n");
	}
}
 
#endif
 
/programs/network/netsurf/libparserutils/test/INDEX
0,0 → 1,11
# Index for testcases
#
# Test Description DataDir
 
aliases Encoding alias handling
cscodec-utf8 UTF-8 charset codec implementation cscodec-utf8
cscodec-utf16 UTF-16 charset codec implementation cscodec-utf16
cscodec-ext8 Extended 8bit charset codec cscodec-ext8
cscodec-8859 ISO-8859-n codec cscodec-8859
filter Input stream filtering
inputstream Inputstream handling input
/programs/network/netsurf/libparserutils/test/Makefile
0,0 → 1,7
# Tests
#
# Each DIR_TEST_ITEMS entry is written as <test name>:<source file>.
# NOTE(review): presumably the included Makefile.subdir turns every entry
# into one test binary -- confirm against the shared build system.
DIR_TEST_ITEMS := aliases:aliases.c cscodec-8859:cscodec-8859.c \
cscodec-ext8:cscodec-ext8.c cscodec-utf8:cscodec-utf8.c \
cscodec-utf16:cscodec-utf16.c filter:filter.c \
inputstream:inputstream.c

include $(NSBUILD)/Makefile.subdir
/programs/network/netsurf/libparserutils/test/README
0,0 → 1,84
Libcharset testcases
====================
 
Testcases for Libcharset are self-contained binaries which test various parts
of the charset library. These may make use of external data files to drive
the testing.
 
Testcase command lines
----------------------
 
Testcase command lines are in a unified format, thus:
 
<aliases_file> [ <data_file> ]
 
The aliases file parameter will always be specified (as it is required for
the library to work at all).
 
The data file parameter is optional and may be provided on a test-by-test
basis.
 
Testcase output
---------------
 
Testcases may output anything at all to stdout. The final line of the
output must begin with either PASS or FAIL (case sensitive), indicating
the success status of the test.
 
Test Index
----------
 
The test sources directory contains a file, named INDEX, which provides an
index of all available test binaries. Any new test applications should be
added to this index as they are created.
 
The test index file format is as follows:
 
file = *line
 
line = ( entry / comment / blank ) LF
 
entry = testname 1*HTAB description [ 1*HTAB datadir ]
comment = "#" *non-newline
blank = 0<OCTET>
 
testname = 1*non-reserved
description = 1*non-reserved
datadir = 1*non-reserved
 
non-newline = VCHAR / WSP
non-reserved = VCHAR / SP
 
Each entry contains a mandatory binary name and description followed by
an optional data directory specifier. The data directory specifier names
the directory containing the data files for that test. This directory
will be searched for within the "data" directory in the source tree.
 
If a data directory is specified, the test binary will be invoked for
each data file listed within the data directory INDEX, passing the
filename as the second parameter (<data_file>, above).
 
Data Index
----------
 
Each test data directory contains a file, named INDEX, which provides an
index of all available test data files.
 
The data index file format is as follows:
 
file = *line
 
line = ( entry / comment / blank ) LF
 
entry = dataname 1*HTAB description
comment = "#" *non-newline
blank = 0<OCTET>
 
dataname = 1*non-reserved
description = 1*non-reserved
 
non-newline = VCHAR / WSP
non-reserved = VCHAR / SP
 
Each entry contains a mandatory data file name and description.
/programs/network/netsurf/libparserutils/test/aliases.c
0,0 → 1,62
#include <stdio.h>
#include <string.h>
 
#include "charset/aliases.h"
 
#include "testutils.h"
 
/**
 * Exercise charset alias canonicalisation and the MIB enum lookups.
 *
 * Prints a line starting with FAIL and returns 1 as soon as a lookup gives
 * the wrong outcome; otherwise prints PASS as the final line and returns 0.
 */
int main (int argc, char **argv)
{
	parserutils_charset_aliases_canon *c;

	UNUSED(argc);
	UNUSED(argv);

	/* An unknown name must not resolve to anything */
	c = parserutils__charset_alias_canonicalise("moose", 5);
	if (c != NULL) {
		printf("FAIL - found invalid encoding 'moose'\n");
		return 1;
	}

	/* Every remaining lookup is expected to succeed */
	c = parserutils__charset_alias_canonicalise("csinvariant", 11);
	if (c == NULL) {
		printf("FAIL - failed finding encoding 'csinvariant'\n");
		return 1;
	}
	printf("%s %d\n", c->name, c->mib_enum);

	/* Same alias again, this time followed by a double quote */
	c = parserutils__charset_alias_canonicalise("csinvariant\"", 12);
	if (c == NULL) {
		printf("FAIL - failed finding encoding 'csinvariant'\n");
		return 1;
	}
	printf("%s %d\n", c->name, c->mib_enum);

	c = parserutils__charset_alias_canonicalise("nats-sefi-add", 13);
	if (c == NULL) {
		printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
		return 1;
	}
	printf("%s %d\n", c->name, c->mib_enum);

	/* Map the canonical name to its MIB enum and back again */
	printf("%d\n", parserutils_charset_mibenum_from_name(c->name,
			strlen(c->name)));

	printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum));


	c = parserutils__charset_alias_canonicalise("u.t.f.8", 7);
	if (c == NULL) {
		printf("FAIL - failed finding encoding 'u.t.f.8'\n");
		return 1;
	}
	printf("%s %d\n", c->name, c->mib_enum);

	printf("PASS\n");

	return 0;
}
/programs/network/netsurf/libparserutils/test/cscodec-8859.c
0,0 → 1,263
#include <ctype.h>
#include <stdio.h>
#include <string.h>
 
#include <parserutils/charset/codec.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* State shared between main(), handle_line() and run_test() while a test
 * data file is being processed. */
typedef struct line_ctx {
/* Codec under test; created when an "#enc" line is handled */
parserutils_charset_codec *codec;

/* Size of the test file, as reported by parse_filesize() */
size_t buflen;
/* Number of input bytes currently stored in buf */
size_t bufused;
/* Input data of the current test case */
uint8_t *buf;
/* Size of the expected-output area (set equal to buflen) */
size_t explen;
/* Number of expected-output bytes currently stored in exp */
size_t expused;
/* Expected output of the current test case */
uint8_t *exp;

/* Set to true once an "#enc" line has created the codec */
bool hadenc;
/* True while the lines being read belong to a "#data" section */
bool indata;
/* True while the lines being read belong to an "#expected" section */
bool inexp;

/* Result code the codec call is expected to return */
parserutils_error exp_ret;

/* Operation to run: encode, decode, or decode followed by encode */
enum { ENCODE, DECODE, BOTH } dir;
} line_ctx;
 
static bool handle_line(const char *data, size_t datalen, void *pw);
static void run_test(line_ctx *ctx);
 
/**
 * Allocation callback handed to the codec: a plain wrapper around realloc().
 *
 * \param ptr  Block to resize
 * \param len  Requested size, in bytes
 * \param pw   Private word (unused)
 * \return Whatever realloc() returns for \a ptr and \a len
 */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *resized;

	UNUSED(pw);

	resized = realloc(ptr, len);

	return resized;
}
 
int main(int argc, char **argv)
{
parserutils_charset_codec *codec;
line_ctx ctx;
 
if (argc != 2) {
printf("Usage: %s <filename>\n", argv[0]);
return 1;
}
 
assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
 
ctx.buflen = parse_filesize(argv[1]);
if (ctx.buflen == 0)
return 1;
 
ctx.buf = malloc(2 * ctx.buflen);
if (ctx.buf == NULL) {
printf("Failed allocating %u bytes\n",
(unsigned int) ctx.buflen);
return 1;
}
 
ctx.exp = ctx.buf + ctx.buflen;
ctx.explen = ctx.buflen;
 
ctx.buf[0] = '\0';
ctx.exp[0] = '\0';
ctx.bufused = 0;
ctx.expused = 0;
ctx.hadenc = false;
ctx.indata = false;
ctx.inexp = false;
ctx.exp_ret = PARSERUTILS_OK;
 
assert(parse_testfile(argv[1], handle_line, &ctx) == true);
 
/* and run final test */
if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
ctx.bufused -= 1;
 
if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
ctx.expused -= 1;
 
run_test(&ctx);
 
free(ctx.buf);
 
parserutils_charset_codec_destroy(ctx.codec);
 
printf("PASS\n");
 
return 0;
}
 
/**
 * Process one line of the test data file.
 *
 * Lines starting with '#' are directives; a directive seen while expected
 * data is being collected first runs the test case gathered so far.
 *  - "#data":     start collecting input; the text after it selects the
 *                 direction ("decode", "encode", anything else means both)
 *                 and the error mode ("LOOSE", "STRICT", anything else
 *                 means transliteration)
 *  - "#expected": start collecting expected output; the rest of the line
 *                 names the expected result code
 *  - "#reset":    reset the codec
 *  - "#enc":      create the codec for the encoding named on the line
 * Any other line is appended to the input or expected buffer, whichever
 * is currently being collected.
 *
 * \param data     Line data
 * \param datalen  Length of line, in bytes
 * \param pw       Pointer to the line_ctx of this run
 * \return true, always
 */
bool handle_line(const char *data, size_t datalen, void *pw)
{
	line_ctx *ctx = (line_ctx *) pw;

	if (data[0] == '#') {
		if (ctx->inexp) {
			/* This marks end of testcase, so run it */

			/* Either buffer may be empty, so check before
			 * looking at its last byte (as main() does). */
			if (ctx->bufused > 0 &&
					ctx->buf[ctx->bufused - 1] == '\n')
				ctx->bufused -= 1;

			if (ctx->expused > 0 &&
					ctx->exp[ctx->expused - 1] == '\n')
				ctx->expused -= 1;

			run_test(ctx);

			ctx->buf[0] = '\0';
			ctx->exp[0] = '\0';
			ctx->bufused = 0;
			ctx->expused = 0;
			ctx->exp_ret = PARSERUTILS_OK;
		}

		if (strncasecmp(data+1, "data", 4) == 0) {
			parserutils_charset_codec_optparams params;
			const char *ptr = data + 6;

			ctx->indata = true;
			ctx->inexp = false;

			if (strncasecmp(ptr, "decode", 6) == 0)
				ctx->dir = DECODE;
			else if (strncasecmp(ptr, "encode", 6) == 0)
				ctx->dir = ENCODE;
			else
				ctx->dir = BOTH;

			/* Step over the direction keyword and separator */
			ptr += 7;

			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
			} else {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
			}

			assert(parserutils_charset_codec_setopt(ctx->codec,
				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
				(parserutils_charset_codec_optparams *) &params)
				== PARSERUTILS_OK);
		} else if (strncasecmp(data+1, "expected", 8) == 0) {
			ctx->indata = false;
			ctx->inexp = true;

			ctx->exp_ret = parserutils_error_from_string(data + 10,
					datalen - 10 - 1 /* \n */);
		} else if (strncasecmp(data+1, "reset", 5) == 0) {
			ctx->indata = false;
			ctx->inexp = false;

			parserutils_charset_codec_reset(ctx->codec);
		} else if (strncasecmp(data+1, "enc", 3) == 0) {
			const char *enc = data + 5;
			const char *limit = data + datalen;
			const char *end;
			char *enc_name;

			/* The name runs up to the first whitespace, but never
			 * past the end of the line. isspace() needs a value
			 * in the unsigned char range. */
			for (end = enc; end < limit &&
					!isspace((unsigned char) *end); end++)
				;

			enc_name = alloca(end - enc + 1);
			memcpy(enc_name, enc, end - enc);
			enc_name[end - enc] = 0;

			/* Release the codec made by an earlier "#enc" line
			 * before replacing it. */
			if (ctx->hadenc) {
				parserutils_charset_codec_destroy(ctx->codec);
				ctx->hadenc = false;
			}

			assert(parserutils_charset_codec_create(enc_name,
					myrealloc, NULL, &ctx->codec) ==
					PARSERUTILS_OK);

			ctx->hadenc = true;
		}
	} else {
		if (ctx->indata) {
			memcpy(ctx->buf + ctx->bufused, data, datalen);
			ctx->bufused += datalen;
		}
		if (ctx->inexp) {
			memcpy(ctx->exp + ctx->expused, data, datalen);
			ctx->expused += datalen;
		}
	}

	return true;
}
 
/**
 * Run the collected test case through the codec and check the outcome.
 *
 * Depending on ctx->dir the input is decoded, encoded, or decoded and then
 * encoded again. Every codec call must return ctx->exp_ret, and the bytes
 * produced must be exactly the ctx->expused bytes held in ctx->exp; any
 * difference trips an assert.
 *
 * \param ctx  Test context: codec, input data and expected output
 */
void run_test(line_ctx *ctx)
{
	static int testnum;
	size_t destlen = ctx->bufused * 4;
	uint8_t *dest = alloca(destlen);
	uint8_t *pdest = dest;
	const uint8_t *psrc = ctx->buf;
	size_t srclen = ctx->bufused;
	size_t produced;
	size_t i;

	if (ctx->dir == DECODE) {
		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else if (ctx->dir == ENCODE) {
		assert(parserutils_charset_codec_encode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else {
		size_t templen = ctx->bufused * 4;
		uint8_t *temp = alloca(templen);
		uint8_t *ptemp = temp;
		const uint8_t *ptemp2;
		size_t templen2;

		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&ptemp, &templen) == ctx->exp_ret);
		/* \todo currently there is no way to specify the number of
		   consumed & produced data in case of a deliberate bad input
		   data set. */
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(temp + (ctx->bufused * 4 - templen) == ptemp);
		}

		/* Feed the decoded data straight back into the encoder */
		ptemp2 = temp;
		templen2 = ctx->bufused * 4 - templen;
		assert(parserutils_charset_codec_encode(ctx->codec,
				&ptemp2, &templen2,
				&pdest, &destlen) == ctx->exp_ret);
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(templen2 == 0);
			assert(temp + (ctx->bufused * 4 - templen) == ptemp2);
		}
	}
	if (ctx->exp_ret == PARSERUTILS_OK) {
		assert(srclen == 0);
		assert(ctx->buf + ctx->bufused == psrc);
		assert(dest + (ctx->bufused * 4 - destlen) == pdest);
		assert(ctx->bufused * 4 - destlen == ctx->expused);
	}

	/* Only the bytes the codec actually wrote are defined, so dump
	 * those rather than assuming expused bytes were produced. */
	produced = (size_t) (pdest - dest);

	printf("%d: Read '", ++testnum);
	for (i = 0; i < produced; i++) {
		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
				"0123456789abcdef"[dest[i] & 0xf]);
	}
	printf("' Expected '");
	for (i = 0; i < ctx->expused; i++) {
		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
				"0123456789abcdef"[ctx->exp[i] & 0xf]);
	}
	printf("'\n");

	assert(pdest == dest + ctx->expused);
	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
}
 
/programs/network/netsurf/libparserutils/test/cscodec-ext8.c
0,0 → 1,263
#include <ctype.h>
#include <stdio.h>
#include <string.h>
 
#include <parserutils/charset/codec.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* State shared between main(), handle_line() and run_test() while a test
 * data file is being processed. */
typedef struct line_ctx {
/* Codec under test; created when an "#enc" line is handled */
parserutils_charset_codec *codec;

/* Size of the test file, as reported by parse_filesize() */
size_t buflen;
/* Number of input bytes currently stored in buf */
size_t bufused;
/* Input data of the current test case */
uint8_t *buf;
/* Size of the expected-output area (set equal to buflen) */
size_t explen;
/* Number of expected-output bytes currently stored in exp */
size_t expused;
/* Expected output of the current test case */
uint8_t *exp;

/* Set to true once an "#enc" line has created the codec */
bool hadenc;
/* True while the lines being read belong to a "#data" section */
bool indata;
/* True while the lines being read belong to an "#expected" section */
bool inexp;

/* Result code the codec call is expected to return */
parserutils_error exp_ret;

/* Operation to run: encode, decode, or decode followed by encode */
enum { ENCODE, DECODE, BOTH } dir;
} line_ctx;
 
static bool handle_line(const char *data, size_t datalen, void *pw);
static void run_test(line_ctx *ctx);
 
/**
 * Allocation callback handed to the codec: a plain wrapper around realloc().
 *
 * \param ptr  Block to resize
 * \param len  Requested size, in bytes
 * \param pw   Private word (unused)
 * \return Whatever realloc() returns for \a ptr and \a len
 */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *resized;

	UNUSED(pw);

	resized = realloc(ptr, len);

	return resized;
}
 
int main(int argc, char **argv)
{
parserutils_charset_codec *codec;
line_ctx ctx;
 
if (argc != 2) {
printf("Usage: %s <filename>\n", argv[0]);
return 1;
}
 
assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
 
ctx.buflen = parse_filesize(argv[1]);
if (ctx.buflen == 0)
return 1;
 
ctx.buf = malloc(2 * ctx.buflen);
if (ctx.buf == NULL) {
printf("Failed allocating %u bytes\n",
(unsigned int) ctx.buflen);
return 1;
}
 
ctx.exp = ctx.buf + ctx.buflen;
ctx.explen = ctx.buflen;
 
ctx.buf[0] = '\0';
ctx.exp[0] = '\0';
ctx.bufused = 0;
ctx.expused = 0;
ctx.hadenc = false;
ctx.indata = false;
ctx.inexp = false;
ctx.exp_ret = PARSERUTILS_OK;
 
assert(parse_testfile(argv[1], handle_line, &ctx) == true);
 
/* and run final test */
if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
ctx.bufused -= 1;
 
if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
ctx.expused -= 1;
 
run_test(&ctx);
 
free(ctx.buf);
 
parserutils_charset_codec_destroy(ctx.codec);
 
printf("PASS\n");
 
return 0;
}
 
/**
 * Process one line of the test data file.
 *
 * Lines starting with '#' are directives; a directive seen while expected
 * data is being collected first runs the test case gathered so far.
 *  - "#data":     start collecting input; the text after it selects the
 *                 direction ("decode", "encode", anything else means both)
 *                 and the error mode ("LOOSE", "STRICT", anything else
 *                 means transliteration)
 *  - "#expected": start collecting expected output; the rest of the line
 *                 names the expected result code
 *  - "#reset":    reset the codec
 *  - "#enc":      create the codec for the encoding named on the line
 * Any other line is appended to the input or expected buffer, whichever
 * is currently being collected.
 *
 * \param data     Line data
 * \param datalen  Length of line, in bytes
 * \param pw       Pointer to the line_ctx of this run
 * \return true, always
 */
bool handle_line(const char *data, size_t datalen, void *pw)
{
	line_ctx *ctx = (line_ctx *) pw;

	if (data[0] == '#') {
		if (ctx->inexp) {
			/* This marks end of testcase, so run it */

			/* Either buffer may be empty, so check before
			 * looking at its last byte (as main() does). */
			if (ctx->bufused > 0 &&
					ctx->buf[ctx->bufused - 1] == '\n')
				ctx->bufused -= 1;

			if (ctx->expused > 0 &&
					ctx->exp[ctx->expused - 1] == '\n')
				ctx->expused -= 1;

			run_test(ctx);

			ctx->buf[0] = '\0';
			ctx->exp[0] = '\0';
			ctx->bufused = 0;
			ctx->expused = 0;
			ctx->exp_ret = PARSERUTILS_OK;
		}

		if (strncasecmp(data+1, "data", 4) == 0) {
			parserutils_charset_codec_optparams params;
			const char *ptr = data + 6;

			ctx->indata = true;
			ctx->inexp = false;

			if (strncasecmp(ptr, "decode", 6) == 0)
				ctx->dir = DECODE;
			else if (strncasecmp(ptr, "encode", 6) == 0)
				ctx->dir = ENCODE;
			else
				ctx->dir = BOTH;

			/* Step over the direction keyword and separator */
			ptr += 7;

			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
			} else {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
			}

			assert(parserutils_charset_codec_setopt(ctx->codec,
				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
				(parserutils_charset_codec_optparams *) &params)
				== PARSERUTILS_OK);
		} else if (strncasecmp(data+1, "expected", 8) == 0) {
			ctx->indata = false;
			ctx->inexp = true;

			ctx->exp_ret = parserutils_error_from_string(data + 10,
					datalen - 10 - 1 /* \n */);
		} else if (strncasecmp(data+1, "reset", 5) == 0) {
			ctx->indata = false;
			ctx->inexp = false;

			parserutils_charset_codec_reset(ctx->codec);
		} else if (strncasecmp(data+1, "enc", 3) == 0) {
			const char *enc = data + 5;
			const char *limit = data + datalen;
			const char *end;
			char *enc_name;

			/* The name runs up to the first whitespace, but never
			 * past the end of the line. isspace() needs a value
			 * in the unsigned char range. */
			for (end = enc; end < limit &&
					!isspace((unsigned char) *end); end++)
				;

			enc_name = alloca(end - enc + 1);
			memcpy(enc_name, enc, end - enc);
			enc_name[end - enc] = 0;

			/* Release the codec made by an earlier "#enc" line
			 * before replacing it. */
			if (ctx->hadenc) {
				parserutils_charset_codec_destroy(ctx->codec);
				ctx->hadenc = false;
			}

			assert(parserutils_charset_codec_create(enc_name,
					myrealloc, NULL, &ctx->codec) ==
					PARSERUTILS_OK);

			ctx->hadenc = true;
		}
	} else {
		if (ctx->indata) {
			memcpy(ctx->buf + ctx->bufused, data, datalen);
			ctx->bufused += datalen;
		}
		if (ctx->inexp) {
			memcpy(ctx->exp + ctx->expused, data, datalen);
			ctx->expused += datalen;
		}
	}

	return true;
}
 
/**
 * Run the collected test case through the codec and check the outcome.
 *
 * Depending on ctx->dir the input is decoded, encoded, or decoded and then
 * encoded again. Every codec call must return ctx->exp_ret, and the bytes
 * produced must be exactly the ctx->expused bytes held in ctx->exp; any
 * difference trips an assert.
 *
 * \param ctx  Test context: codec, input data and expected output
 */
void run_test(line_ctx *ctx)
{
	static int testnum;
	size_t destlen = ctx->bufused * 4;
	uint8_t *dest = alloca(destlen);
	uint8_t *pdest = dest;
	const uint8_t *psrc = ctx->buf;
	size_t srclen = ctx->bufused;
	size_t produced;
	size_t i;

	if (ctx->dir == DECODE) {
		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else if (ctx->dir == ENCODE) {
		assert(parserutils_charset_codec_encode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else {
		size_t templen = ctx->bufused * 4;
		uint8_t *temp = alloca(templen);
		uint8_t *ptemp = temp;
		const uint8_t *ptemp2;
		size_t templen2;

		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&ptemp, &templen) == ctx->exp_ret);
		/* \todo currently there is no way to specify the number of
		   consumed & produced data in case of a deliberate bad input
		   data set. */
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(temp + (ctx->bufused * 4 - templen) == ptemp);
		}

		/* Feed the decoded data straight back into the encoder */
		ptemp2 = temp;
		templen2 = ctx->bufused * 4 - templen;
		assert(parserutils_charset_codec_encode(ctx->codec,
				&ptemp2, &templen2,
				&pdest, &destlen) == ctx->exp_ret);
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(templen2 == 0);
			assert(temp + (ctx->bufused * 4 - templen) == ptemp2);
		}
	}
	if (ctx->exp_ret == PARSERUTILS_OK) {
		assert(srclen == 0);
		assert(ctx->buf + ctx->bufused == psrc);
		assert(dest + (ctx->bufused * 4 - destlen) == pdest);
		assert(ctx->bufused * 4 - destlen == ctx->expused);
	}

	/* Only the bytes the codec actually wrote are defined, so dump
	 * those rather than assuming expused bytes were produced. */
	produced = (size_t) (pdest - dest);

	printf("%d: Read '", ++testnum);
	for (i = 0; i < produced; i++) {
		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
				"0123456789abcdef"[dest[i] & 0xf]);
	}
	printf("' Expected '");
	for (i = 0; i < ctx->expused; i++) {
		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
				"0123456789abcdef"[ctx->exp[i] & 0xf]);
	}
	printf("'\n");

	assert(pdest == dest + ctx->expused);
	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
}
 
/programs/network/netsurf/libparserutils/test/cscodec-utf16.c
0,0 → 1,321
#include <ctype.h>
#include <stdio.h>
#include <string.h>
 
/* These two are for htonl / ntohl */
#include <arpa/inet.h>
#include <netinet/in.h>
 
#include <parserutils/charset/codec.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* State shared between main(), handle_line() and run_test() while a test
 * data file is being processed. */
typedef struct line_ctx {
/* Codec under test (created in main()) */
parserutils_charset_codec *codec;

/* Size of the test file, as reported by parse_filesize() */
size_t buflen;
/* Number of input bytes currently stored in buf */
size_t bufused;
/* Input data of the current test case */
uint8_t *buf;
/* Size of the exp allocation (set equal to buflen) */
size_t explen;
/* Number of expected-output bytes currently stored in exp */
size_t expused;
/* Expected output of the current test case */
uint8_t *exp;

/* True while the lines being read belong to a "#data" section */
bool indata;
/* True while the lines being read belong to an "#expected" section */
bool inexp;

/* Result code the codec call is expected to return */
parserutils_error exp_ret;

/* Operation to run: encode, decode, or decode followed by encode */
enum { ENCODE, DECODE, BOTH } dir;
} line_ctx;
 
static bool handle_line(const char *data, size_t datalen, void *pw);
static void run_test(line_ctx *ctx);
 
/**
 * Allocation callback handed to the codec: a plain wrapper around realloc().
 *
 * \param ptr  Block to resize
 * \param len  Requested size, in bytes
 * \param pw   Private word (unused)
 * \return Whatever realloc() returns for \a ptr and \a len
 */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *resized;

	UNUSED(pw);

	resized = realloc(ptr, len);

	return resized;
}
 
int main(int argc, char **argv)
{
parserutils_charset_codec *codec;
line_ctx ctx;
 
if (argc != 2) {
printf("Usage: %s <filename>\n", argv[0]);
return 1;
}
 
assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
 
assert(parserutils_charset_codec_create("UTF-16", myrealloc, NULL,
&ctx.codec) == PARSERUTILS_OK);
 
ctx.buflen = parse_filesize(argv[1]);
if (ctx.buflen == 0)
return 1;
 
ctx.buf = malloc(ctx.buflen);
if (ctx.buf == NULL) {
printf("Failed allocating %u bytes\n", (int) ctx.buflen);
return 1;
}
 
ctx.exp = malloc(ctx.buflen);
if (ctx.exp == NULL) {
printf("Failed allocating %u bytes\n", (int) ctx.buflen);
free(ctx.buf);
return 1;
}
ctx.explen = ctx.buflen;
 
ctx.buf[0] = '\0';
ctx.exp[0] = '\0';
ctx.bufused = 0;
ctx.expused = 0;
ctx.indata = false;
ctx.inexp = false;
ctx.exp_ret = PARSERUTILS_OK;
 
assert(parse_testfile(argv[1], handle_line, &ctx) == true);
 
/* and run final test */
if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
ctx.bufused -= 1;
 
if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
ctx.expused -= 1;
 
run_test(&ctx);
 
free(ctx.buf);
 
parserutils_charset_codec_destroy(ctx.codec);
 
printf("PASS\n");
 
return 0;
}
 
/**
 * Converts hex character ('0' ... '9' or 'a' ... 'f' or 'A' ... 'F') to
 * digit value.
 *
 * \param hex  Valid hex character
 * \return Corresponding digit value.
 */
static inline int hex2digit(char hex)
{
	/* Everything up to '9' is treated as a decimal digit */
	if (hex <= '9')
		return hex - '0';

	/* Setting bit 0x20 maps 'A'..'F' onto 'a'..'f' */
	return (hex | 0x20) - 'a' + 10;
}
 
/**
 * Process one line of the test data file.
 *
 * Lines starting with '#' are directives; a directive seen while expected
 * data is being collected first runs the test case gathered so far.
 *  - "#data":     start collecting input; the text after it selects the
 *                 direction ("decode", "encode", anything else means both)
 *                 and the error mode ("LOOSE", "STRICT", anything else
 *                 means transliteration)
 *  - "#expected": start collecting expected output; the rest of the line
 *                 names the expected result code
 *  - "#reset":    reset the codec
 * Any other line holds test data written as character references: input
 * lines as "&#xNNNN" (stored as 16-bit units in host byte order), expected
 * lines as "&#xXXXXYYYY" (stored as 32-bit units in network byte order).
 * Newline characters are copied through unchanged.
 *
 * \param data     Line data
 * \param datalen  Length of line, in bytes
 * \param pw       Pointer to the line_ctx of this run
 * \return true, always
 */
bool handle_line(const char *data, size_t datalen, void *pw)
{
	line_ctx *ctx = (line_ctx *) pw;

	if (data[0] == '#') {
		if (ctx->inexp) {
			/* This marks end of testcase, so run it */

			/* Either buffer may be empty, so check before
			 * looking at its last byte (as main() does). */
			if (ctx->bufused > 0 &&
					ctx->buf[ctx->bufused - 1] == '\n')
				ctx->bufused -= 1;

			if (ctx->expused > 0 &&
					ctx->exp[ctx->expused - 1] == '\n')
				ctx->expused -= 1;

			run_test(ctx);

			ctx->buf[0] = '\0';
			ctx->exp[0] = '\0';
			ctx->bufused = 0;
			ctx->expused = 0;
			ctx->exp_ret = PARSERUTILS_OK;
		}

		if (strncasecmp(data+1, "data", 4) == 0) {
			parserutils_charset_codec_optparams params;
			const char *ptr = data + 6;

			ctx->indata = true;
			ctx->inexp = false;

			if (strncasecmp(ptr, "decode", 6) == 0)
				ctx->dir = DECODE;
			else if (strncasecmp(ptr, "encode", 6) == 0)
				ctx->dir = ENCODE;
			else
				ctx->dir = BOTH;

			/* Step over the direction keyword and separator */
			ptr += 7;

			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
			} else {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
			}

			assert(parserutils_charset_codec_setopt(ctx->codec,
				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
				(parserutils_charset_codec_optparams *) &params)
				== PARSERUTILS_OK);
		} else if (strncasecmp(data+1, "expected", 8) == 0) {
			ctx->indata = false;
			ctx->inexp = true;

			ctx->exp_ret = parserutils_error_from_string(data + 10,
					datalen - 10 - 1 /* \n */);
		} else if (strncasecmp(data+1, "reset", 5) == 0) {
			ctx->indata = false;
			ctx->inexp = false;

			parserutils_charset_codec_reset(ctx->codec);
		}
	} else {
		if (ctx->indata) {
			/* Process "&#xNNNN" as 16-bit code units. */
			while (datalen) {
				uint16_t nCodePoint;

				if (data[0] == '\n') {
					ctx->buf[ctx->bufused++] = *data++;
					--datalen;
					continue;
				}
				assert(datalen >= sizeof ("&#xNNNN")-1
					&& data[0] == '&' && data[1] == '#'
					&& data[2] == 'x'
					&& isxdigit((unsigned char) data[3])
					&& isxdigit((unsigned char) data[4])
					&& isxdigit((unsigned char) data[5])
					&& isxdigit((unsigned char) data[6]));
				/* UTF-16 code is always host endian (different
				   than UCS-32 !). */
				nCodePoint = (uint16_t)
					((hex2digit(data[3]) << 12) |
					 (hex2digit(data[4]) << 8) |
					 (hex2digit(data[5]) << 4) |
					 hex2digit(data[6]));
				/* Copy bytewise: a stored newline leaves the
				 * write offset odd, so a direct 16-bit store
				 * could be misaligned. */
				memcpy(ctx->buf + ctx->bufused, &nCodePoint,
						sizeof(nCodePoint));
				ctx->bufused += sizeof(nCodePoint);
				data += sizeof ("&#xNNNN")-1;
				datalen -= sizeof ("&#xNNNN")-1;
			}
		}
		if (ctx->inexp) {
			/* Process "&#xXXXXYYYY as 32-bit code units. */
			while (datalen) {
				uint32_t nCodePoint;

				if (data[0] == '\n') {
					ctx->exp[ctx->expused++] = *data++;
					--datalen;
					continue;
				}
				assert(datalen >= sizeof ("&#xXXXXYYYY")-1
					&& data[0] == '&' && data[1] == '#'
					&& data[2] == 'x'
					&& isxdigit((unsigned char) data[3])
					&& isxdigit((unsigned char) data[4])
					&& isxdigit((unsigned char) data[5])
					&& isxdigit((unsigned char) data[6])
					&& isxdigit((unsigned char) data[7])
					&& isxdigit((unsigned char) data[8])
					&& isxdigit((unsigned char) data[9])
					&& isxdigit((unsigned char) data[10]));
				/* UCS-4 code is always big endian, so convert
				   host endian to big endian. The digits are
				   widened to unsigned before shifting so the
				   top nibble cannot overflow a signed int. */
				nCodePoint = htonl(
					((uint32_t) hex2digit(data[3]) << 28)
					| ((uint32_t) hex2digit(data[4]) << 24)
					| ((uint32_t) hex2digit(data[5]) << 20)
					| ((uint32_t) hex2digit(data[6]) << 16)
					| ((uint32_t) hex2digit(data[7]) << 12)
					| ((uint32_t) hex2digit(data[8]) << 8)
					| ((uint32_t) hex2digit(data[9]) << 4)
					| (uint32_t) hex2digit(data[10]));
				/* Bytewise copy, for the same alignment
				 * reason as the input data above. */
				memcpy(ctx->exp + ctx->expused, &nCodePoint,
						sizeof(nCodePoint));
				ctx->expused += sizeof(nCodePoint);
				data += sizeof ("&#xXXXXYYYY")-1;
				datalen -= sizeof ("&#xXXXXYYYY")-1;
			}
		}
	}

	return true;
}
 
/**
 * Run the collected test case through the codec and check the outcome.
 *
 * Depending on ctx->dir the input is decoded, encoded, or decoded and then
 * encoded again. Every codec call must return ctx->exp_ret, and the bytes
 * produced must be exactly the ctx->expused bytes held in ctx->exp; any
 * difference trips an assert.
 *
 * \param ctx  Test context: codec, input data and expected output
 */
void run_test(line_ctx *ctx)
{
	static int testnum;
	size_t destlen = ctx->bufused * 4;
	uint8_t *dest = alloca(destlen);
	uint8_t *pdest = dest;
	const uint8_t *psrc = ctx->buf;
	size_t srclen = ctx->bufused;
	size_t produced;
	size_t i;

	if (ctx->dir == DECODE) {
		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else if (ctx->dir == ENCODE) {
		assert(parserutils_charset_codec_encode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else {
		size_t templen = ctx->bufused * 4;
		uint8_t *temp = alloca(templen);
		uint8_t *ptemp = temp;
		const uint8_t *ptemp2;
		size_t templen2;

		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&ptemp, &templen) == ctx->exp_ret);
		/* \todo currently there is no way to specify the number of
		   consumed & produced data in case of a deliberate bad input
		   data set. */
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(temp + (ctx->bufused * 4 - templen) == ptemp);
		}

		/* Feed the decoded data straight back into the encoder */
		ptemp2 = temp;
		templen2 = ctx->bufused * 4 - templen;
		assert(parserutils_charset_codec_encode(ctx->codec,
				&ptemp2, &templen2,
				&pdest, &destlen) == ctx->exp_ret);
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(templen2 == 0);
			assert(temp + (ctx->bufused * 4 - templen) == ptemp2);
		}
	}
	if (ctx->exp_ret == PARSERUTILS_OK) {
		assert(srclen == 0);
		assert(ctx->buf + ctx->bufused == psrc);
		assert(dest + (ctx->bufused * 4 - destlen) == pdest);
		assert(ctx->bufused * 4 - destlen == ctx->expused);
	}

	/* Only the bytes the codec actually wrote are defined, so dump
	 * those rather than assuming expused bytes were produced. */
	produced = (size_t) (pdest - dest);

	printf("%d: Read '", ++testnum);
	for (i = 0; i < produced; i++) {
		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
				"0123456789abcdef"[dest[i] & 0xf]);
	}
	printf("' Expected '");
	for (i = 0; i < ctx->expused; i++) {
		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
				"0123456789abcdef"[ctx->exp[i] & 0xf]);
	}
	printf("'\n");

	assert(pdest == dest + ctx->expused);
	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
}
 
/programs/network/netsurf/libparserutils/test/cscodec-utf8.c
0,0 → 1,246
#include <stdio.h>
#include <string.h>
 
#include <parserutils/charset/codec.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* State shared between main(), handle_line() and run_test() while a test
 * data file is being processed. */
typedef struct line_ctx {
/* Codec under test (created in main()) */
parserutils_charset_codec *codec;

/* Size of the test file, as reported by parse_filesize() */
size_t buflen;
/* Number of input bytes currently stored in buf */
size_t bufused;
/* Input data of the current test case */
uint8_t *buf;
/* Size of the expected-output area (set equal to buflen) */
size_t explen;
/* Number of expected-output bytes currently stored in exp */
size_t expused;
/* Expected output of the current test case */
uint8_t *exp;

/* True while the lines being read belong to a "#data" section */
bool indata;
/* True while the lines being read belong to an "#expected" section */
bool inexp;

/* Result code the codec call is expected to return */
parserutils_error exp_ret;

/* Operation to run: encode, decode, or decode followed by encode */
enum { ENCODE, DECODE, BOTH } dir;
} line_ctx;
 
static bool handle_line(const char *data, size_t datalen, void *pw);
static void run_test(line_ctx *ctx);
 
/**
 * Allocation callback handed to the codec: a plain wrapper around realloc().
 *
 * \param ptr  Block to resize
 * \param len  Requested size, in bytes
 * \param pw   Private word (unused)
 * \return Whatever realloc() returns for \a ptr and \a len
 */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *resized;

	UNUSED(pw);

	resized = realloc(ptr, len);

	return resized;
}
 
int main(int argc, char **argv)
{
parserutils_charset_codec *codec;
line_ctx ctx;
 
if (argc != 2) {
printf("Usage: %s <filename>\n", argv[0]);
return 1;
}
 
assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);
 
assert(parserutils_charset_codec_create("UTF-8", myrealloc, NULL,
&ctx.codec) == PARSERUTILS_OK);
 
ctx.buflen = parse_filesize(argv[1]);
if (ctx.buflen == 0)
return 1;
 
ctx.buf = malloc(2 * ctx.buflen);
if (ctx.buf == NULL) {
printf("Failed allocating %u bytes\n",
(unsigned int) ctx.buflen);
return 1;
}
 
ctx.exp = ctx.buf + ctx.buflen;
ctx.explen = ctx.buflen;
 
ctx.buf[0] = '\0';
ctx.exp[0] = '\0';
ctx.bufused = 0;
ctx.expused = 0;
ctx.indata = false;
ctx.inexp = false;
ctx.exp_ret = PARSERUTILS_OK;
 
assert(parse_testfile(argv[1], handle_line, &ctx) == true);
 
/* and run final test */
if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
ctx.bufused -= 1;
 
if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
ctx.expused -= 1;
 
run_test(&ctx);
 
free(ctx.buf);
 
parserutils_charset_codec_destroy(ctx.codec);
 
printf("PASS\n");
 
return 0;
}
 
/**
 * Process one line of the test data file.
 *
 * Lines starting with '#' are directives; a directive seen while expected
 * data is being collected first runs the test case gathered so far.
 *  - "#data":     start collecting input; the text after it selects the
 *                 direction ("decode", "encode", anything else means both)
 *                 and the error mode ("LOOSE", "STRICT", anything else
 *                 means transliteration)
 *  - "#expected": start collecting expected output; the rest of the line
 *                 names the expected result code
 *  - "#reset":    reset the codec
 * Any other line is appended to the input or expected buffer, whichever
 * is currently being collected.
 *
 * \param data     Line data
 * \param datalen  Length of line, in bytes
 * \param pw       Pointer to the line_ctx of this run
 * \return true, always
 */
bool handle_line(const char *data, size_t datalen, void *pw)
{
	line_ctx *ctx = (line_ctx *) pw;

	if (data[0] == '#') {
		if (ctx->inexp) {
			/* This marks end of testcase, so run it */

			/* Either buffer may be empty, so check before
			 * looking at its last byte (as main() does). */
			if (ctx->bufused > 0 &&
					ctx->buf[ctx->bufused - 1] == '\n')
				ctx->bufused -= 1;

			if (ctx->expused > 0 &&
					ctx->exp[ctx->expused - 1] == '\n')
				ctx->expused -= 1;

			run_test(ctx);

			ctx->buf[0] = '\0';
			ctx->exp[0] = '\0';
			ctx->bufused = 0;
			ctx->expused = 0;
			ctx->exp_ret = PARSERUTILS_OK;
		}

		if (strncasecmp(data+1, "data", 4) == 0) {
			parserutils_charset_codec_optparams params;
			const char *ptr = data + 6;

			ctx->indata = true;
			ctx->inexp = false;

			if (strncasecmp(ptr, "decode", 6) == 0)
				ctx->dir = DECODE;
			else if (strncasecmp(ptr, "encode", 6) == 0)
				ctx->dir = ENCODE;
			else
				ctx->dir = BOTH;

			/* Step over the direction keyword and separator */
			ptr += 7;

			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
			} else {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
			}

			assert(parserutils_charset_codec_setopt(ctx->codec,
				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
				(parserutils_charset_codec_optparams *) &params)
				== PARSERUTILS_OK);
		} else if (strncasecmp(data+1, "expected", 8) == 0) {
			ctx->indata = false;
			ctx->inexp = true;

			ctx->exp_ret = parserutils_error_from_string(data + 10,
					datalen - 10 - 1 /* \n */);
		} else if (strncasecmp(data+1, "reset", 5) == 0) {
			ctx->indata = false;
			ctx->inexp = false;

			parserutils_charset_codec_reset(ctx->codec);
		}
	} else {
		if (ctx->indata) {
			memcpy(ctx->buf + ctx->bufused, data, datalen);
			ctx->bufused += datalen;
		}
		if (ctx->inexp) {
			memcpy(ctx->exp + ctx->expused, data, datalen);
			ctx->expused += datalen;
		}
	}

	return true;
}
 
/**
 * Run the collected test case through the codec and check the outcome.
 *
 * Depending on ctx->dir the input is decoded, encoded, or decoded and then
 * encoded again. Every codec call must return ctx->exp_ret, and the bytes
 * produced must be exactly the ctx->expused bytes held in ctx->exp; any
 * difference trips an assert.
 *
 * \param ctx  Test context: codec, input data and expected output
 */
void run_test(line_ctx *ctx)
{
	static int testnum;
	size_t destlen = ctx->bufused * 4;
	uint8_t *dest = alloca(destlen);
	uint8_t *pdest = dest;
	const uint8_t *psrc = ctx->buf;
	size_t srclen = ctx->bufused;
	size_t produced;
	size_t i;

	if (ctx->dir == DECODE) {
		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else if (ctx->dir == ENCODE) {
		assert(parserutils_charset_codec_encode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else {
		size_t templen = ctx->bufused * 4;
		uint8_t *temp = alloca(templen);
		uint8_t *ptemp = temp;
		const uint8_t *ptemp2;
		size_t templen2;

		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&ptemp, &templen) == ctx->exp_ret);
		/* \todo currently there is no way to specify the number of
		   consumed & produced data in case of a deliberate bad input
		   data set. */
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(temp + (ctx->bufused * 4 - templen) == ptemp);
		}

		/* Feed the decoded data straight back into the encoder */
		ptemp2 = temp;
		templen2 = ctx->bufused * 4 - templen;
		assert(parserutils_charset_codec_encode(ctx->codec,
				&ptemp2, &templen2,
				&pdest, &destlen) == ctx->exp_ret);
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(templen2 == 0);
			assert(temp + (ctx->bufused * 4 - templen) == ptemp2);
		}
	}
	if (ctx->exp_ret == PARSERUTILS_OK) {
		assert(srclen == 0);
		assert(ctx->buf + ctx->bufused == psrc);
		assert(dest + (ctx->bufused * 4 - destlen) == pdest);
		assert(ctx->bufused * 4 - destlen == ctx->expused);
	}

	/* Only the bytes the codec actually wrote are defined, so dump
	 * those rather than assuming expused bytes were produced. */
	produced = (size_t) (pdest - dest);

	printf("%d: Read '", ++testnum);
	for (i = 0; i < produced; i++) {
		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
				"0123456789abcdef"[dest[i] & 0xf]);
	}
	printf("' Expected '");
	for (i = 0; i < ctx->expused; i++) {
		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
				"0123456789abcdef"[ctx->exp[i] & 0xf]);
	}
	printf("'\n");

	assert(pdest == dest + ctx->expused);
	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
}
 
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/1.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/10.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/11.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/13.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/14.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/15.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/16.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/2.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/3.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/4.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/5.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/6.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/7.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/8.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/9.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-8859/INDEX
0,0 → 1,19
# Index file for charset codec tests
#
# Test Description
 
1.dat ISO-8859-1
2.dat ISO-8859-2
3.dat ISO-8859-3
4.dat ISO-8859-4
5.dat ISO-8859-5
6.dat ISO-8859-6
7.dat ISO-8859-7
8.dat ISO-8859-8
9.dat ISO-8859-9
10.dat ISO-8859-10
11.dat ISO-8859-11
13.dat ISO-8859-13
14.dat ISO-8859-14
15.dat ISO-8859-15
16.dat ISO-8859-16
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/INDEX
0,0 → 1,13
# Index file for charset codec tests
#
# Test Description
 
cp1250.dat Windows-1250
cp1251.dat Windows-1251
cp1252.dat Windows-1252
cp1253.dat Windows-1253
cp1254.dat Windows-1254
cp1255.dat Windows-1255
cp1256.dat Windows-1256
cp1257.dat Windows-1257
cp1258.dat Windows-1258
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1250.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1251.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1252.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1253.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1254.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1255.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1256.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1257.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-ext8/cp1258.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/cscodec-utf16/INDEX
0,0 → 1,6
# Index file for UTF-16 charset codec tests
#
# Test Description
 
simple.dat Simple tests, designed to validate testdriver
 
/programs/network/netsurf/libparserutils/test/data/cscodec-utf16/simple.dat
0,0 → 1,33
# *** Simple test:
#data decode STRICT
&#x0040&#x4142
#expected PARSERUTILS_OK
&#x00000040&#x00004142
#reset
 
# *** Surrogate test:
#data decode STRICT
&#xD800&#xDF02
#expected PARSERUTILS_OK
&#x00010302
#reset
 
# *** Lonely high surrogate:
# It is a bit strange that the end status is OK here.
#data decode STRICT
&#xD805
#expected PARSERUTILS_OK
#reset
 
# With an extra code point, the status is different.
#data decode STRICT
&#xD805&#x4142
#expected PARSERUTILS_INVALID
#reset
 
# *** Wrong low surrogate start:
#data decode STRICT
&#xDC05
#expected PARSERUTILS_INVALID
#reset
 
/programs/network/netsurf/libparserutils/test/data/cscodec-utf8/INDEX
0,0 → 1,6
# Index file for charset codec tests
#
# Test Description
 
simple.dat Simple tests, designed to validate testdriver
UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file
/programs/network/netsurf/libparserutils/test/data/cscodec-utf8/UTF-8-test.txt
0,0 → 1,536
#data recode LOOSE
UTF-8 decoder capability and stress test
----------------------------------------
 
Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
 
This test file can help you examine, how your UTF-8 decoder handles
various types of correct, malformed, or otherwise interesting UTF-8
sequences. This file is not meant to be a conformance test. It does
not prescribes any particular outcome and therefore there is no way to
"pass" or "fail" this test file, even though the texts suggests a
preferable decoder behaviour at some places. The aim is instead to
help you think about and test the behaviour of your UTF-8 on a
systematic collection of unusual inputs. Experience so far suggests
that most first-time authors of UTF-8 decoders find at least one
serious problem in their decoder by using this file.
 
The test lines below cover boundary conditions, malformed UTF-8
sequences as well as correctly encoded UTF-8 sequences of Unicode code
points that should never occur in a correct UTF-8 file.
 
According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
receiving UTF-8 shall interpret a "malformed sequence in the same way
that it interprets a character that is outside the adopted subset" and
"characters that are not within the adopted subset shall be indicated
to the user" by a receiving device. A quite commonly used approach in
UTF-8 decoders is to replace any malformed UTF-8 sequence by a
replacement character (U+FFFD), which looks a bit like an inverted
question mark, or a similar symbol. It might be a good idea to
visually distinguish a malformed UTF-8 sequence from a correctly
encoded Unicode character that is just not available in the current
font but otherwise fully legal, even though ISO 10646-1 doesn't
mandate this. In any case, just ignoring malformed sequences or
unavailable characters does not conform to ISO 10646, will make
debugging more difficult, and can lead to user confusion.
 
Please check, whether a malformed UTF-8 sequence is (1) represented at
all, (2) represented by exactly one single replacement character (or
equivalent signal), and (3) the following quotation mark after an
illegal UTF-8 sequence is correctly displayed, i.e. proper
resynchronization takes place immageately after any malformed
sequence. This file says "THE END" in the last line, so if you don't
see that, your decoder crashed somehow before, which should always be
cause for concern.
 
All lines in this file are exactly 79 characters long (plus the line
feed). In addition, all lines end with "|", except for the two test
lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
U+0000 and U+007F. If you display this file with a fixed-width font,
these "|" characters should all line up in column 79 (right margin).
This allows you to test quickly, whether your UTF-8 decoder finds the
correct number of characters in every line, that is whether each
malformed sequences is replaced by a single replacement character.
 
Note that as an alternative to the notion of malformed sequence used
here, it is also a perfectly acceptable (and in some situations even
preferable) solution to represent each individual byte of a malformed
sequence by a replacement character. If you follow this strategy in
your decoder, then please ignore the "|" column.
 
 
Here come the tests: |
|
1 Some correct UTF-8 text |
|
You should see the Greek word 'kosme': "κόσμε" |
|
2 Boundary condition test cases |
|
2.1 First possible sequence of a certain length |
|
2.1.1 1 byte (U-00000000): ""
2.1.2 2 bytes (U-00000080): "€" |
2.1.3 3 bytes (U-00000800): "ࠀ" |
2.1.4 4 bytes (U-00010000): "𐀀" |
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
|
2.2 Last possible sequence of a certain length |
|
2.2.1 1 byte (U-0000007F): ""
2.2.2 2 bytes (U-000007FF): "߿" |
2.2.3 3 bytes (U-0000FFFF): "￿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
|
2.3 Other boundary conditions |
|
2.3.1 U-0000D7FF = ed 9f bf = "퟿" |
2.3.2 U-0000E000 = ee 80 80 = "" |
2.3.3 U-0000FFFD = ef bf bd = "�" |
2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" |
2.3.5 U-00110000 = f4 90 80 80 = "ô€€" |
|
3 Malformed sequences |
|
3.1 Unexpected continuation bytes |
|
Each unexpected continuation byte should be separately signalled as a |
malformed sequence of its own. |
|
3.1.1 First continuation byte 0x80: "€" |
3.1.2 Last continuation byte 0xbf: "¿" |
|
3.1.3 2 continuation bytes: "€¿" |
3.1.4 3 continuation bytes: "€¿€" |
3.1.5 4 continuation bytes: "€¿€¿" |
3.1.6 5 continuation bytes: "€¿€¿€" |
3.1.7 6 continuation bytes: "€¿€¿€¿" |
3.1.8 7 continuation bytes: "€¿€¿€¿€" |
|
3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
|
"€‚ƒ„…†‡ˆ‰Š‹ŒŽ |
‘’“”•–—˜™š›œžŸ |
 ¡¢£¤¥¦§¨©ª«¬­®¯ |
°±²³´µ¶·¸¹º»¼½¾¿" |
|
3.2 Lonely start characters |
|
3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
each followed by a space character: |
|
"À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï |
Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß " |
|
3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
each followed by a space character: |
|
"à á â ã ä å æ ç è é ê ë ì í î ï " |
|
3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
each followed by a space character: |
|
"ð ñ ò ó ô õ ö ÷ " |
|
3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
each followed by a space character: |
|
"ø ù ú û " |
|
3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
each followed by a space character: |
|
"ü ý " |
|
3.3 Sequences with last continuation byte missing |
|
All bytes of an incomplete sequence should be signalled as a single |
malformed sequence, i.e., you should see only a single replacement |
character in each of the next 10 tests. (Characters as in section 2) |
|
3.3.1 2-byte sequence with last byte missing (U+0000): "À" |
3.3.2 3-byte sequence with last byte missing (U+0000): "à€" |
3.3.3 4-byte sequence with last byte missing (U+0000): "ð€€" |
3.3.4 5-byte sequence with last byte missing (U+0000): "ø€€€" |
3.3.5 6-byte sequence with last byte missing (U+0000): "ü€€€€" |
3.3.6 2-byte sequence with last byte missing (U-000007FF): "ß" |
3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "ï¿" |
3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿" |
3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿" |
3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿" |
|
3.4 Concatenation of incomplete sequences |
|
All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
sequences being signalled: |
|
"Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿" |
|
3.5 Impossible bytes |
|
The following two bytes cannot appear in a correct UTF-8 string |
|
3.5.1 fe = "þ" |
3.5.2 ff = "ÿ" |
3.5.3 fe fe ff ff = "þþÿÿ" |
|
4 Overlong sequences |
|
The following sequences are not malformed according to the letter of |
the Unicode 2.0 standard. However, they are longer then necessary and |
a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
decoder" should reject them just like malformed sequences for two |
reasons: (1) It helps to debug applications if overlong sequences are |
not treated as valid representations of characters, because this helps |
to spot problems more quickly. (2) Overlong sequences provide |
alternative representations of characters, that could maliciously be |
used to bypass filters that check only for ASCII characters. For |
instance, a 2-byte encoded line feed (LF) would not be caught by a |
line counter that counts only 0x0a bytes, but it would still be |
processed as a line feed by an unsafe UTF-8 decoder later in the |
pipeline. From a security point of view, ASCII compatibility of UTF-8 |
sequences means also, that ASCII characters are *only* allowed to be |
represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
reject overlong UTF-8 sequences for which a shorter encoding exists. |
|
4.1 Examples of an overlong ASCII character |
|
With a safe UTF-8 decoder, all of the following five overlong |
representations of the ASCII character slash ("/") should be rejected |
like a malformed UTF-8 sequence, for instance by substituting it with |
a replacement character. If you see a slash below, you do not have a |
safe UTF-8 decoder! |
|
4.1.1 U+002F = c0 af = "À¯" |
4.1.2 U+002F = e0 80 af = "à€¯" |
4.1.3 U+002F = f0 80 80 af = "ð€€¯" |
4.1.4 U+002F = f8 80 80 80 af = "ø€€€¯" |
4.1.5 U+002F = fc 80 80 80 80 af = "ü€€€€¯" |
|
4.2 Maximum overlong sequences |
|
Below you see the highest Unicode value that is still resulting in an |
overlong sequence if represented with the given number of bytes. This |
is a boundary test for safe UTF-8 decoders. All five characters should |
be rejected like malformed UTF-8 sequences. |
|
4.2.1 U-0000007F = c1 bf = "Á¿" |
4.2.2 U-000007FF = e0 9f bf = "àŸ¿" |
4.2.3 U-0000FFFF = f0 8f bf bf = "ð¿¿" |
4.2.4 U-001FFFFF = f8 87 bf bf bf = "ø‡¿¿¿" |
4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "üƒ¿¿¿¿" |
|
4.3 Overlong representation of the NUL character |
|
The following five sequences should also be rejected like malformed |
UTF-8 sequences and should not be treated like the ASCII NUL |
character. |
|
4.3.1 U+0000 = c0 80 = "" |
4.3.2 U+0000 = e0 80 80 = "à€€" |
4.3.3 U+0000 = f0 80 80 80 = "ð€€€" |
4.3.4 U+0000 = f8 80 80 80 80 = "ø€€€€" |
4.3.5 U+0000 = fc 80 80 80 80 80 = "ü€€€€€" |
|
5 Illegal code positions |
|
The following UTF-8 sequences should be rejected like malformed |
sequences, because they never represent valid ISO 10646 characters and |
a UTF-8 decoder that accepts them might introduce security problems |
comparable to overlong UTF-8 sequences. |
|
5.1 Single UTF-16 surrogates |
|
5.1.1 U+D800 = ed a0 80 = "í €" |
5.1.2 U+DB7F = ed ad bf = "í­¿" |
5.1.3 U+DB80 = ed ae 80 = "í®€" |
5.1.4 U+DBFF = ed af bf = "í¯¿" |
5.1.5 U+DC00 = ed b0 80 = "í°€" |
5.1.6 U+DF80 = ed be 80 = "í¾€" |
5.1.7 U+DFFF = ed bf bf = "í¿¿" |
|
5.2 Paired UTF-16 surrogates |
|
5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "𐀀" |
5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "𐏿" |
5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "󯰀" |
5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "í­¿í¿¿" |
5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "󰀀" |
5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "󰏿" |
5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "􏰀" |
5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "􏿿" |
|
5.3 Other illegal code positions |
|
5.3.1 U+FFFE = ef bf be = "￾" |
5.3.2 U+FFFF = ef bf bf = "￿" |
|
THE END |
#expected CHARSET_OK
UTF-8 decoder capability and stress test
----------------------------------------
Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
This test file can help you examine, how your UTF-8 decoder handles
various types of correct, malformed, or otherwise interesting UTF-8
sequences. This file is not meant to be a conformance test. It does
not prescribes any particular outcome and therefore there is no way to
"pass" or "fail" this test file, even though the texts suggests a
preferable decoder behaviour at some places. The aim is instead to
help you think about and test the behaviour of your UTF-8 on a
systematic collection of unusual inputs. Experience so far suggests
that most first-time authors of UTF-8 decoders find at least one
serious problem in their decoder by using this file.
The test lines below cover boundary conditions, malformed UTF-8
sequences as well as correctly encoded UTF-8 sequences of Unicode code
points that should never occur in a correct UTF-8 file.
According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
receiving UTF-8 shall interpret a "malformed sequence in the same way
that it interprets a character that is outside the adopted subset" and
"characters that are not within the adopted subset shall be indicated
to the user" by a receiving device. A quite commonly used approach in
UTF-8 decoders is to replace any malformed UTF-8 sequence by a
replacement character (U+FFFD), which looks a bit like an inverted
question mark, or a similar symbol. It might be a good idea to
visually distinguish a malformed UTF-8 sequence from a correctly
encoded Unicode character that is just not available in the current
font but otherwise fully legal, even though ISO 10646-1 doesn't
mandate this. In any case, just ignoring malformed sequences or
unavailable characters does not conform to ISO 10646, will make
debugging more difficult, and can lead to user confusion.
Please check, whether a malformed UTF-8 sequence is (1) represented at
all, (2) represented by exactly one single replacement character (or
equivalent signal), and (3) the following quotation mark after an
illegal UTF-8 sequence is correctly displayed, i.e. proper
resynchronization takes place immageately after any malformed
sequence. This file says "THE END" in the last line, so if you don't
see that, your decoder crashed somehow before, which should always be
cause for concern.
All lines in this file are exactly 79 characters long (plus the line
feed). In addition, all lines end with "|", except for the two test
lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
U+0000 and U+007F. If you display this file with a fixed-width font,
these "|" characters should all line up in column 79 (right margin).
This allows you to test quickly, whether your UTF-8 decoder finds the
correct number of characters in every line, that is whether each
malformed sequences is replaced by a single replacement character.
Note that as an alternative to the notion of malformed sequence used
here, it is also a perfectly acceptable (and in some situations even
preferable) solution to represent each individual byte of a malformed
sequence by a replacement character. If you follow this strategy in
your decoder, then please ignore the "|" column.
Here come the tests: |
|
1 Some correct UTF-8 text |
|
You should see the Greek word 'kosme': "κόσμε" |
|
2 Boundary condition test cases |
|
2.1 First possible sequence of a certain length |
|
2.1.1 1 byte (U-00000000): ""
2.1.2 2 bytes (U-00000080): "€" |
2.1.3 3 bytes (U-00000800): "ࠀ" |
2.1.4 4 bytes (U-00010000): "𐀀" |
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
|
2.2 Last possible sequence of a certain length |
|
2.2.1 1 byte (U-0000007F): ""
2.2.2 2 bytes (U-000007FF): "߿" |
2.2.3 3 bytes (U-0000FFFF): "�" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
|
2.3 Other boundary conditions |
|
2.3.1 U-0000D7FF = ed 9f bf = "퟿" |
2.3.2 U-0000E000 = ee 80 80 = "" |
2.3.3 U-0000FFFD = ef bf bd = "�" |
2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" |
2.3.5 U-00110000 = f4 90 80 80 = "ô€€" |
|
3 Malformed sequences |
|
3.1 Unexpected continuation bytes |
|
Each unexpected continuation byte should be separately signalled as a |
malformed sequence of its own. |
|
3.1.1 First continuation byte 0x80: "�" |
3.1.2 Last continuation byte 0xbf: "�" |
|
3.1.3 2 continuation bytes: "��" |
3.1.4 3 continuation bytes: "���" |
3.1.5 4 continuation bytes: "����" |
3.1.6 5 continuation bytes: "�����" |
3.1.7 6 continuation bytes: "������" |
3.1.8 7 continuation bytes: "�������" |
|
3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
|
"���������������� |
���������������� |
���������������� |
����������������" |
|
3.2 Lonely start characters |
|
3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
each followed by a space character: |
|
"� � � � � � � � � � � � � � � � |
� � � � � � � � � � � � � � � � " |
|
3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
each followed by a space character: |
|
"� � � � � � � � � � � � � � � � " |
|
3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
each followed by a space character: |
|
"� � � � � � � � " |
|
3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
each followed by a space character: |
|
"� � � � " |
|
3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
each followed by a space character: |
|
"� � " |
|
3.3 Sequences with last continuation byte missing |
|
All bytes of an incomplete sequence should be signalled as a single |
malformed sequence, i.e., you should see only a single replacement |
character in each of the next 10 tests. (Characters as in section 2) |
|
3.3.1 2-byte sequence with last byte missing (U+0000): "�" |
3.3.2 3-byte sequence with last byte missing (U+0000): "�" |
3.3.3 4-byte sequence with last byte missing (U+0000): "�" |
3.3.4 5-byte sequence with last byte missing (U+0000): "�" |
3.3.5 6-byte sequence with last byte missing (U+0000): "�" |
3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" |
3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" |
3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "�" |
3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "�" |
3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�" |
|
3.4 Concatenation of incomplete sequences |
|
All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
sequences being signalled: |
|
"����������" |
|
3.5 Impossible bytes |
|
The following two bytes cannot appear in a correct UTF-8 string |
|
3.5.1 fe = "�" |
3.5.2 ff = "�" |
3.5.3 fe fe ff ff = "����" |
|
4 Overlong sequences |
|
The following sequences are not malformed according to the letter of |
the Unicode 2.0 standard. However, they are longer then necessary and |
a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
decoder" should reject them just like malformed sequences for two |
reasons: (1) It helps to debug applications if overlong sequences are |
not treated as valid representations of characters, because this helps |
to spot problems more quickly. (2) Overlong sequences provide |
alternative representations of characters, that could maliciously be |
used to bypass filters that check only for ASCII characters. For |
instance, a 2-byte encoded line feed (LF) would not be caught by a |
line counter that counts only 0x0a bytes, but it would still be |
processed as a line feed by an unsafe UTF-8 decoder later in the |
pipeline. From a security point of view, ASCII compatibility of UTF-8 |
sequences means also, that ASCII characters are *only* allowed to be |
represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
reject overlong UTF-8 sequences for which a shorter encoding exists. |
|
4.1 Examples of an overlong ASCII character |
|
With a safe UTF-8 decoder, all of the following five overlong |
representations of the ASCII character slash ("/") should be rejected |
like a malformed UTF-8 sequence, for instance by substituting it with |
a replacement character. If you see a slash below, you do not have a |
safe UTF-8 decoder! |
|
4.1.1 U+002F = c0 af = "�" |
4.1.2 U+002F = e0 80 af = "�" |
4.1.3 U+002F = f0 80 80 af = "�" |
4.1.4 U+002F = f8 80 80 80 af = "�" |
4.1.5 U+002F = fc 80 80 80 80 af = "�" |
|
4.2 Maximum overlong sequences |
|
Below you see the highest Unicode value that is still resulting in an |
overlong sequence if represented with the given number of bytes. This |
is a boundary test for safe UTF-8 decoders. All five characters should |
be rejected like malformed UTF-8 sequences. |
|
4.2.1 U-0000007F = c1 bf = "�" |
4.2.2 U-000007FF = e0 9f bf = "�" |
4.2.3 U-0000FFFF = f0 8f bf bf = "�" |
4.2.4 U-001FFFFF = f8 87 bf bf bf = "�" |
4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "�" |
|
4.3 Overlong representation of the NUL character |
|
The following five sequences should also be rejected like malformed |
UTF-8 sequences and should not be treated like the ASCII NUL |
character. |
|
4.3.1 U+0000 = c0 80 = "�" |
4.3.2 U+0000 = e0 80 80 = "�" |
4.3.3 U+0000 = f0 80 80 80 = "�" |
4.3.4 U+0000 = f8 80 80 80 80 = "�" |
4.3.5 U+0000 = fc 80 80 80 80 80 = "�" |
|
5 Illegal code positions |
|
The following UTF-8 sequences should be rejected like malformed |
sequences, because they never represent valid ISO 10646 characters and |
a UTF-8 decoder that accepts them might introduce security problems |
comparable to overlong UTF-8 sequences. |
|
5.1 Single UTF-16 surrogates |
|
5.1.1 U+D800 = ed a0 80 = "�" |
5.1.2 U+DB7F = ed ad bf = "�" |
5.1.3 U+DB80 = ed ae 80 = "�" |
5.1.4 U+DBFF = ed af bf = "�" |
5.1.5 U+DC00 = ed b0 80 = "�" |
5.1.6 U+DF80 = ed be 80 = "�" |
5.1.7 U+DFFF = ed bf bf = "�" |
|
5.2 Paired UTF-16 surrogates |
|
5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "��" |
5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "��" |
5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "��" |
5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "��" |
5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "��" |
5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "��" |
5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "��" |
5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "��" |
|
5.3 Other illegal code positions |
|
5.3.1 U+FFFE = ef bf be = "�" |
5.3.2 U+FFFF = ef bf bf = "�" |
|
THE END |
#reset
/programs/network/netsurf/libparserutils/test/data/cscodec-utf8/simple.dat
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes:
Added: svn:mime-type
+application/octet-stream
\ No newline at end of property
/programs/network/netsurf/libparserutils/test/data/input/INDEX
0,0 → 1,5
# Index file for inputstream tests
#
# Test Description
 
UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file
/programs/network/netsurf/libparserutils/test/data/input/UTF-8-test.txt
0,0 → 1,271
UTF-8 decoder capability and stress test
----------------------------------------
 
Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2003-02-19
 
This test file can help you examine, how your UTF-8 decoder handles
various types of correct, malformed, or otherwise interesting UTF-8
sequences. This file is not meant to be a conformance test. It does
not prescribes any particular outcome and therefore there is no way to
"pass" or "fail" this test file, even though the texts suggests a
preferable decoder behaviour at some places. The aim is instead to
help you think about and test the behaviour of your UTF-8 on a
systematic collection of unusual inputs. Experience so far suggests
that most first-time authors of UTF-8 decoders find at least one
serious problem in their decoder by using this file.
 
The test lines below cover boundary conditions, malformed UTF-8
sequences as well as correctly encoded UTF-8 sequences of Unicode code
points that should never occur in a correct UTF-8 file.
 
According to ISO 10646-1:2000, sections D.7 and 2.3c, a device
receiving UTF-8 shall interpret a "malformed sequence in the same way
that it interprets a character that is outside the adopted subset" and
"characters that are not within the adopted subset shall be indicated
to the user" by a receiving device. A quite commonly used approach in
UTF-8 decoders is to replace any malformed UTF-8 sequence by a
replacement character (U+FFFD), which looks a bit like an inverted
question mark, or a similar symbol. It might be a good idea to
visually distinguish a malformed UTF-8 sequence from a correctly
encoded Unicode character that is just not available in the current
font but otherwise fully legal, even though ISO 10646-1 doesn't
mandate this. In any case, just ignoring malformed sequences or
unavailable characters does not conform to ISO 10646, will make
debugging more difficult, and can lead to user confusion.
 
Please check, whether a malformed UTF-8 sequence is (1) represented at
all, (2) represented by exactly one single replacement character (or
equivalent signal), and (3) the following quotation mark after an
illegal UTF-8 sequence is correctly displayed, i.e. proper
resynchronization takes place immageately after any malformed
sequence. This file says "THE END" in the last line, so if you don't
see that, your decoder crashed somehow before, which should always be
cause for concern.
 
All lines in this file are exactly 79 characters long (plus the line
feed). In addition, all lines end with "|", except for the two test
lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls
U+0000 and U+007F. If you display this file with a fixed-width font,
these "|" characters should all line up in column 79 (right margin).
This allows you to test quickly, whether your UTF-8 decoder finds the
correct number of characters in every line, that is whether each
malformed sequences is replaced by a single replacement character.
 
Note that as an alternative to the notion of malformed sequence used
here, it is also a perfectly acceptable (and in some situations even
preferable) solution to represent each individual byte of a malformed
sequence by a replacement character. If you follow this strategy in
your decoder, then please ignore the "|" column.
 
 
Here come the tests: |
|
1 Some correct UTF-8 text |
|
You should see the Greek word 'kosme': "κόσμε" |
|
2 Boundary condition test cases |
|
2.1 First possible sequence of a certain length |
|
2.1.1 1 byte (U-00000000): ""
2.1.2 2 bytes (U-00000080): "€" |
2.1.3 3 bytes (U-00000800): "ࠀ" |
2.1.4 4 bytes (U-00010000): "𐀀" |
2.1.5 5 bytes (U-00200000): "øˆ€€€" |
2.1.6 6 bytes (U-04000000): "ü„€€€€" |
|
2.2 Last possible sequence of a certain length |
|
2.2.1 1 byte (U-0000007F): ""
2.2.2 2 bytes (U-000007FF): "߿" |
2.2.3 3 bytes (U-0000FFFF): "￿" |
2.2.4 4 bytes (U-001FFFFF): "÷¿¿¿" |
2.2.5 5 bytes (U-03FFFFFF): "û¿¿¿¿" |
2.2.6 6 bytes (U-7FFFFFFF): "ý¿¿¿¿¿" |
|
2.3 Other boundary conditions |
|
2.3.1 U-0000D7FF = ed 9f bf = "퟿" |
2.3.2 U-0000E000 = ee 80 80 = "" |
2.3.3 U-0000FFFD = ef bf bd = "�" |
2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" |
2.3.5 U-00110000 = f4 90 80 80 = "ô€€" |
|
3 Malformed sequences |
|
3.1 Unexpected continuation bytes |
|
Each unexpected continuation byte should be separately signalled as a |
malformed sequence of its own. |
|
3.1.1 First continuation byte 0x80: "€" |
3.1.2 Last continuation byte 0xbf: "¿" |
|
3.1.3 2 continuation bytes: "€¿" |
3.1.4 3 continuation bytes: "€¿€" |
3.1.5 4 continuation bytes: "€¿€¿" |
3.1.6 5 continuation bytes: "€¿€¿€" |
3.1.7 6 continuation bytes: "€¿€¿€¿" |
3.1.8 7 continuation bytes: "€¿€¿€¿€" |
|
3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): |
|
"€‚ƒ„…†‡ˆ‰Š‹ŒŽ |
‘’“”•–—˜™š›œžŸ |
 ¡¢£¤¥¦§¨©ª«¬­®¯ |
°±²³´µ¶·¸¹º»¼½¾¿" |
|
3.2 Lonely start characters |
|
3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
each followed by a space character: |
|
"À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï |
Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß " |
|
3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), |
each followed by a space character: |
|
"à á â ã ä å æ ç è é ê ë ì í î ï " |
|
3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
each followed by a space character: |
|
"ð ñ ò ó ô õ ö ÷ " |
|
3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
each followed by a space character: |
|
"ø ù ú û " |
|
3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
each followed by a space character: |
|
"ü ý " |
|
3.3 Sequences with last continuation byte missing |
|
All bytes of an incomplete sequence should be signalled as a single |
malformed sequence, i.e., you should see only a single replacement |
character in each of the next 10 tests. (Characters as in section 2) |
|
3.3.1 2-byte sequence with last byte missing (U+0000): "À" |
3.3.2 3-byte sequence with last byte missing (U+0000): "à€" |
3.3.3 4-byte sequence with last byte missing (U+0000): "ð€€" |
3.3.4 5-byte sequence with last byte missing (U+0000): "ø€€€" |
3.3.5 6-byte sequence with last byte missing (U+0000): "ü€€€€" |
3.3.6 2-byte sequence with last byte missing (U-000007FF): "ß" |
3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "ï¿" |
3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "÷¿¿" |
3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "û¿¿¿" |
3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "ý¿¿¿¿" |
|
3.4 Concatenation of incomplete sequences |
|
All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
sequences being signalled: |
|
"Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿" |
|
3.5 Impossible bytes |
|
The following two bytes cannot appear in a correct UTF-8 string |
|
3.5.1 fe = "þ" |
3.5.2 ff = "ÿ" |
3.5.3 fe fe ff ff = "þþÿÿ" |
|
4 Overlong sequences |
|
The following sequences are not malformed according to the letter of |
the Unicode 2.0 standard. However, they are longer then necessary and |
a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 |
decoder" should reject them just like malformed sequences for two |
reasons: (1) It helps to debug applications if overlong sequences are |
not treated as valid representations of characters, because this helps |
to spot problems more quickly. (2) Overlong sequences provide |
alternative representations of characters, that could maliciously be |
used to bypass filters that check only for ASCII characters. For |
instance, a 2-byte encoded line feed (LF) would not be caught by a |
line counter that counts only 0x0a bytes, but it would still be |
processed as a line feed by an unsafe UTF-8 decoder later in the |
pipeline. From a security point of view, ASCII compatibility of UTF-8 |
sequences means also, that ASCII characters are *only* allowed to be |
represented by ASCII bytes in the range 0x00-0x7f. To ensure this |
aspect of ASCII compatibility, use only "safe UTF-8 decoders" that |
reject overlong UTF-8 sequences for which a shorter encoding exists. |
|
4.1 Examples of an overlong ASCII character |
|
With a safe UTF-8 decoder, all of the following five overlong |
representations of the ASCII character slash ("/") should be rejected |
like a malformed UTF-8 sequence, for instance by substituting it with |
a replacement character. If you see a slash below, you do not have a |
safe UTF-8 decoder! |
|
4.1.1 U+002F = c0 af = "À¯" |
4.1.2 U+002F = e0 80 af = "à€¯" |
4.1.3 U+002F = f0 80 80 af = "ð€€¯" |
4.1.4 U+002F = f8 80 80 80 af = "ø€€€¯" |
4.1.5 U+002F = fc 80 80 80 80 af = "ü€€€€¯" |
|
4.2 Maximum overlong sequences |
|
Below you see the highest Unicode value that is still resulting in an |
overlong sequence if represented with the given number of bytes. This |
is a boundary test for safe UTF-8 decoders. All five characters should |
be rejected like malformed UTF-8 sequences. |
|
4.2.1 U-0000007F = c1 bf = "Á¿" |
4.2.2 U-000007FF = e0 9f bf = "àŸ¿" |
4.2.3 U-0000FFFF = f0 8f bf bf = "ð¿¿" |
4.2.4 U-001FFFFF = f8 87 bf bf bf = "ø‡¿¿¿" |
4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "üƒ¿¿¿¿" |
|
4.3 Overlong representation of the NUL character |
|
The following five sequences should also be rejected like malformed |
UTF-8 sequences and should not be treated like the ASCII NUL |
character. |
|
4.3.1 U+0000 = c0 80 = "" |
4.3.2 U+0000 = e0 80 80 = "à€€" |
4.3.3 U+0000 = f0 80 80 80 = "ð€€€" |
4.3.4 U+0000 = f8 80 80 80 80 = "ø€€€€" |
4.3.5 U+0000 = fc 80 80 80 80 80 = "ü€€€€€" |
|
5 Illegal code positions |
|
The following UTF-8 sequences should be rejected like malformed |
sequences, because they never represent valid ISO 10646 characters and |
a UTF-8 decoder that accepts them might introduce security problems |
comparable to overlong UTF-8 sequences. |
|
5.1 Single UTF-16 surrogates |
|
5.1.1 U+D800 = ed a0 80 = "í €" |
5.1.2 U+DB7F = ed ad bf = "í­¿" |
5.1.3 U+DB80 = ed ae 80 = "í®€" |
5.1.4 U+DBFF = ed af bf = "í¯¿" |
5.1.5 U+DC00 = ed b0 80 = "í°€" |
5.1.6 U+DF80 = ed be 80 = "í¾€" |
5.1.7 U+DFFF = ed bf bf = "í¿¿" |
|
5.2 Paired UTF-16 surrogates |
|
5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "𐀀" |
5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "𐏿" |
5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "󯰀" |
5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "í­¿í¿¿" |
5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "󰀀" |
5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "󰏿" |
5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "􏰀" |
5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "􏿿" |
|
5.3 Other illegal code positions |
|
5.3.1 U+FFFE = ef bf be = "￾" |
5.3.2 U+FFFF = ef bf bf = "￿" |
|
THE END |
/programs/network/netsurf/libparserutils/test/filter.c
0,0 → 1,349
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
#include <parserutils/parserutils.h>
 
#include "utils/utils.h"
 
#include "input/filter.h"
 
#include "testutils.h"
 
/* Allocator callback passed to the filter: forwards to realloc() and
 * ignores the client private pointer. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *block = realloc(ptr, len);

	UNUSED(pw);

	return block;
}
 
/**
 * Print the conversion state after a call into the filter.
 *
 * Emits one line of the form "'<in>' <inlen> '<outbuf>' <outlen>", where
 * <in> is printed with a precision of inlen bytes and <outbuf> with a
 * precision of (out - outbuf) bytes.
 *
 * \param in      Current input pointer
 * \param inlen   Current input length
 * \param outbuf  Start of the output buffer
 * \param out     Current output pointer (within outbuf)
 * \param outlen  Current output length
 */
static void dump_state(const uint8_t *in, size_t inlen,
		const uint8_t *outbuf, const uint8_t *out, size_t outlen)
{
	/* %s expects char pointers, so cast the byte buffers explicitly */
	printf("'%.*s' %d '%.*s' %d\n",
			(int) inlen, (const char *) in, (int) inlen,
			(int) (out - outbuf), (const char *) outbuf,
			(int) outlen);
}

/**
 * Exercise the input filter with a sequence of UTF-8 scenarios: valid
 * input, undersized output buffers, illegal bytes, and input that is
 * split in the middle of a multi-byte sequence. Each scenario resets the
 * filter and compares the output buffer with the expected bytes.
 *
 * Prints "PASS" and returns 0 if every assertion holds; a failing
 * assertion terminates the program via the testutils assert override.
 */
int main(int argc, char **argv)
{
	parserutils_filter_optparams params;
	parserutils_filter *input;
	uint8_t inbuf[64], outbuf[64];
	size_t inlen, outlen;
	const uint8_t *in = inbuf;
	uint8_t *out = outbuf;

	UNUSED(argc);
	UNUSED(argv);

	/* Create input filter */
	assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
			PARSERUTILS_OK);

	/* Convert filter to UTF-8 encoding */
	params.encoding.name = "UTF-8";
	assert(parserutils__filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING,
			(parserutils_filter_optparams *) &params) ==
			PARSERUTILS_OK);


	/* Simple case - valid input & output buffer large enough */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xa0o!");
	inlen = strlen((const char *) inbuf);
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xc2\xa0o!",
			SLEN("hell\xc2\xa0o!")) == 0);


	/* Too small an output buffer; no encoding edge cases */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hello!");
	inlen = strlen((const char *) inbuf);
	outbuf[0] = '\0';
	outlen = 5;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_NOMEM);

	dump_state(in, inlen, outbuf, out, outlen);

	/* Retry with the rest of the 64-byte buffer made available */
	outlen = 64 - 5 + outlen;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hello!",
			SLEN("hello!")) == 0);


	/* Illegal input sequence; output buffer large enough */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\x96o!");
	inlen = strlen((const char *) inbuf);
	outbuf[0] = '\0';
	outlen = 64;

	/* Input does loose decoding, converting to U+FFFD if illegal
	 * input is encountered */
	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
			SLEN("hell\xef\xbf\xbdo!")) == 0);


	/* Input ends mid-sequence */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xa0o!");
	inlen = strlen((const char *) inbuf) - 3;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	/* Make the withheld trailing bytes available */
	inlen += 3;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xc2\xa0o!",
			SLEN("hell\xc2\xa0o!")) == 0);


	/* Input ends mid-sequence, but second attempt has too small a
	 * buffer, but large enough to write out the incomplete character. */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xa0o!");
	inlen = strlen((const char *) inbuf) - 3;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 3;
	outlen = 3;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_NOMEM);

	dump_state(in, inlen, outbuf, out, outlen);

	outlen = 64 - 7;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xc2\xa0o!",
			SLEN("hell\xc2\xa0o!")) == 0);


	/* Input ends mid-sequence, but second attempt has too small a
	 * buffer, not large enough to write out the incomplete character. */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xa0o!");
	inlen = strlen((const char *) inbuf) - 3;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 3;
	outlen = 1;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_NOMEM);

	dump_state(in, inlen, outbuf, out, outlen);

	outlen = 60;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xc2\xa0o!",
			SLEN("hell\xc2\xa0o!")) == 0);


	/* Input ends mid-sequence, but second attempt contains
	 * invalid character */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xc2o!");
	inlen = strlen((const char *) inbuf) - 3;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 3;

	/* Input does loose decoding, converting to U+FFFD if illegal
	 * input is encountered */
	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!",
			SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0);


	/* Input ends mid-sequence, but second attempt contains another
	 * incomplete character */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!");
	inlen = strlen((const char *) inbuf) - 5;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 2;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 3;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!",
			SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0);


	/* Input ends mid-sequence, but second attempt contains insufficient
	 * data to complete the incomplete character */
	in = inbuf;
	out = outbuf;
	strcpy((char *) inbuf, "hell\xe2\x80\xa2o!");
	inlen = strlen((const char *) inbuf) - 4;
	outbuf[0] = '\0';
	outlen = 64;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 1;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	inlen += 3;

	assert(parserutils__filter_process_chunk(input, &in, &inlen,
			&out, &outlen) == PARSERUTILS_OK);

	dump_state(in, inlen, outbuf, out, outlen);

	assert(parserutils__filter_reset(input) == PARSERUTILS_OK);

	assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
			SLEN("hell\xe2\x80\xa2o!")) == 0);


	/* Clean up */
	parserutils__filter_destroy(input);

	printf("PASS\n");

	return 0;
}
/programs/network/netsurf/libparserutils/test/inputstream.c
0,0 → 1,102
#include <inttypes.h>
#include <stdio.h>
 
#include <parserutils/parserutils.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/input/inputstream.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* RISC OS builds only: define a dynamic-area name and a maximum size of
 * 128 * 1024 * 1024 bytes.
 * NOTE(review): presumably these are read by the RISC OS C runtime to
 * configure the heap for this test -- confirm against the runtime docs. */
#ifdef __riscos
const char * const __dynamic_da_name = "InputStream";
int __dynamic_da_max_size = 128*1024*1024;
#endif
 
/* Allocator callback passed to the inputstream: forwards to realloc()
 * and ignores the client private pointer. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *block = realloc(ptr, len);

	UNUSED(pw);

	return block;
}
 
/**
 * Feed the file named by argv[1] through a UTF-8 inputstream in
 * CHUNK_SIZE pieces, consuming characters as they become available and
 * inserting "hello!!!" after every 'a' seen while full chunks are being
 * processed. A final insert is made before end-of-input is signalled and
 * the remainder of the stream is drained.
 *
 * \return 0 after printing "PASS"; 1 on usage or file errors. Failed
 *         assertions terminate via the testutils assert override.
 */
int main(int argc, char **argv)
{
	parserutils_inputstream *stream;
	FILE *fp;
	long fsize;
	size_t len;
#define CHUNK_SIZE (4096)
	uint8_t buf[CHUNK_SIZE];
	const uint8_t *c;
	size_t clen;

	if (argc != 2) {
		printf("Usage: %s <filename>\n", argv[0]);
		return 1;
	}

	assert(parserutils_inputstream_create("UTF-8", 1, NULL,
			myrealloc, NULL, &stream) == PARSERUTILS_OK);

	fp = fopen(argv[1], "rb");
	if (fp == NULL) {
		printf("Failed opening %s\n", argv[1]);
		/* Don't leak the stream on the error path */
		parserutils_inputstream_destroy(stream);
		return 1;
	}

	/* Measure the file. ftell() reports failure as -1, which must not
	 * be converted to a (huge) size_t. */
	fsize = -1L;
	if (fseek(fp, 0, SEEK_END) == 0)
		fsize = ftell(fp);

	if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
		printf("Failed reading size of %s\n", argv[1]);
		fclose(fp);
		parserutils_inputstream_destroy(stream);
		return 1;
	}

	len = (size_t) fsize;

	/* Process all complete chunks */
	while (len >= CHUNK_SIZE) {
		size_t nread = fread(buf, 1, CHUNK_SIZE, fp);
		assert(nread == CHUNK_SIZE);

		assert(parserutils_inputstream_append(stream,
				buf, CHUNK_SIZE) == PARSERUTILS_OK);

		len -= CHUNK_SIZE;

		while (parserutils_inputstream_peek(stream, 0, &c, &clen) !=
				PARSERUTILS_NEEDDATA) {
			/* Sample the peeked byte before advancing, rather
			 * than dereferencing the pointer afterwards */
			const uint8_t first = *c;

			parserutils_inputstream_advance(stream, clen);
			if (first == 'a') {
				assert(parserutils_inputstream_insert(stream,
						(const uint8_t *) "hello!!!",
						SLEN("hello!!!")) == PARSERUTILS_OK);
			}
		}
	}

	/* Append whatever is left (less than one chunk) */
	if (len > 0) {
		size_t nread = fread(buf, 1, len, fp);
		assert(nread == len);

		assert(parserutils_inputstream_append(stream,
				buf, len) == PARSERUTILS_OK);
	}

	fclose(fp);

	assert(parserutils_inputstream_insert(stream,
			(const uint8_t *) "hello!!!",
			SLEN("hello!!!")) == PARSERUTILS_OK);

	/* A NULL/0 append is issued before draining until EOF is reported */
	assert(parserutils_inputstream_append(stream, NULL, 0) ==
			PARSERUTILS_OK);

	while (parserutils_inputstream_peek(stream, 0, &c, &clen) !=
			PARSERUTILS_EOF) {
		parserutils_inputstream_advance(stream, clen);
	}

	parserutils_inputstream_destroy(stream);

	printf("PASS\n");

	return 0;
}
 
/programs/network/netsurf/libparserutils/test/regression/INDEX
0,0 → 1,7
# Index for testcases
#
# Test Description DataDir
 
filter-segv Segfault in input filtering
stream-nomem Inputstream buffer expansion
filter-badenc-segv Segfault on resetting bad encoding in filter
/programs/network/netsurf/libparserutils/test/regression/Makefile
0,0 → 1,7
# Tests
# Each entry has the form <test-name>:<source-file>.
# NOTE(review): presumably parsed by $(NSBUILD)/Makefile.subdir, which is
# included below -- confirm the expected format there.
DIR_TEST_ITEMS := filter-segv:filter-segv.c \
stream-nomem:stream-nomem.c filter-badenc-segv:filter-badenc-segv.c

# Add $(CURDIR)/test to the include search path; the test sources in this
# directory include "testutils.h".
# NOTE(review): assumes make runs from the library's top-level directory.
CFLAGS := $(CFLAGS) -I$(CURDIR)/test

include $(NSBUILD)/Makefile.subdir
/programs/network/netsurf/libparserutils/test/regression/filter-badenc-segv.c
0,0 → 1,50
#include <stdio.h>
#include <stdlib.h>
 
#include <parserutils/parserutils.h>
 
#include "input/filter.h"
 
#include "testutils.h"
 
/* Allocator callback passed to the filter: forwards to realloc() and
 * ignores the client private pointer. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *block = realloc(ptr, len);

	UNUSED(pw);

	return block;
}
 
/**
 * Regression test: select the "GBK" encoding on a filter twice in a row
 * and check that both calls return the expected status. The expected
 * status is PARSERUTILS_BADENCODING when WITHOUT_ICONV_FILTER is defined
 * and PARSERUTILS_OK otherwise.
 */
int main(int argc, char **argv)
{
	parserutils_filter_optparams params;
	parserutils_filter *input;
	parserutils_error expected;
	int attempt;

	UNUSED(argc);
	UNUSED(argv);

#ifdef WITHOUT_ICONV_FILTER
	expected = PARSERUTILS_BADENCODING;
#else
	expected = PARSERUTILS_OK;
#endif

	assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
			PARSERUTILS_OK);

	/* Set the same encoding twice; the second call is the one this
	 * regression test exists for */
	for (attempt = 0; attempt < 2; attempt++) {
		params.encoding.name = "GBK";
		assert(parserutils__filter_setopt(input,
				PARSERUTILS_FILTER_SET_ENCODING, &params) ==
				expected);
	}

	parserutils__filter_destroy(input);

	printf("PASS\n");

	return 0;
}
/programs/network/netsurf/libparserutils/test/regression/filter-segv.c
0,0 → 1,32
#include <stdio.h>
#include <stdlib.h>
 
#include <parserutils/parserutils.h>
 
#include "input/filter.h"
 
#include "testutils.h"
 
/* Allocator callback passed to the filter: forwards to realloc() and
 * ignores the client private pointer. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *block = realloc(ptr, len);

	UNUSED(pw);

	return block;
}
 
/**
 * Regression test: create a UTF-8 filter and destroy it again straight
 * away, then report success.
 */
int main(int argc, char **argv)
{
	parserutils_filter *input;

	/* Command line is not used */
	(void) argc;
	(void) argv;

	assert(parserutils__filter_create("UTF-8", myrealloc, NULL, &input) ==
			PARSERUTILS_OK);

	parserutils__filter_destroy(input);

	puts("PASS");

	return 0;
}
/programs/network/netsurf/libparserutils/test/regression/stream-nomem.c
0,0 → 1,86
#include <stdio.h>
#include <string.h>
 
#include <parserutils/parserutils.h>
#include <parserutils/input/inputstream.h>
 
#include "utils/utils.h"
 
#include "testutils.h"
 
/* Allocator callback passed to the inputstream: forwards to realloc()
 * and ignores the client private pointer. */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	void *block = realloc(ptr, len);

	UNUSED(pw);

	return block;
}
 
/**
 * Regression test: append a buffer slightly larger than 4k whose last
 * eight bytes are "123", the three bytes EF BF BD, then "45", signal end
 * of input and drain the stream.
 */
int main(int argc, char **argv)
{
	/* This is specially calculated so that the inputstream is forced to
	 * reallocate (it assumes that the inputstream's buffer chunk size
	 * is 4k) */
#define BUFFER_SIZE (4096 + 4)
	/* Test data written over the end of the buffer. The 0xef byte will
	 * occupy the 4095th byte in the buffer and thus cause the entirety
	 * of U+FFFD to be buffered until after the buffer has been
	 * enlarged */
	static const uint8_t tail[8] = {
		'1', '2', '3', 0xef, 0xbf, 0xbd, '4', '5'
	};
	uint8_t input_buffer[BUFFER_SIZE];
	parserutils_inputstream *stream;
	const uint8_t *c;
	size_t clen;

	UNUSED(argc);
	UNUSED(argv);

	/* Populate the buffer with something sane */
	memset(input_buffer, 'a', BUFFER_SIZE);
	/* Now, set up our test data */
	memcpy(input_buffer + BUFFER_SIZE - sizeof tail, tail, sizeof tail);

	assert(parserutils_inputstream_create("UTF-8", 0,
			NULL, myrealloc, NULL, &stream) == PARSERUTILS_OK);

	assert(parserutils_inputstream_append(stream,
			input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);

	assert(parserutils_inputstream_append(stream, NULL, 0) ==
			PARSERUTILS_OK);

	/* Consume everything until the stream reports EOF */
	for (;;) {
		if (parserutils_inputstream_peek(stream, 0, &c, &clen) ==
				PARSERUTILS_EOF)
			break;

		parserutils_inputstream_advance(stream, clen);
	}

	/* Disabled check of the stream's buffer contents, kept for
	 * reference (it needs: uint8_t *buffer; size_t buflen;):

	assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) ==
			CSS_OK);

	assert(buflen == BUFFER_SIZE);

	printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));

	assert( buffer[BUFFER_SIZE - 6] == '3' &&
		buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' &&
		buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' &&
		buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' &&
		buffer[BUFFER_SIZE - 2] == '4');

	free(buffer);
	*/

	parserutils_inputstream_destroy(stream);

	printf("PASS\n");

	return 0;
}
 
/programs/network/netsurf/libparserutils/test/testutils.h
0,0 → 1,123
#ifndef test_testutils_h_
#define test_testutils_h_
 
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
 
/* Mark a variable or parameter as intentionally unused by assigning it
 * to itself. Only defined here if no earlier header already defined it. */
#ifndef UNUSED
#define UNUSED(x) ((x) = (x))
#endif
 
/* Replace the standard assert mechanism so that testcases can keep
 * writing plain assert() calls while a failure prints a "FAIL" line and
 * exits with a failure status for the testrunner to pick up. */
void __assert2(const char *expr, const char *function,
		const char *file, int line);

/* Report a failed assertion (expression text and line only) and exit. */
void __assert2(const char *expr, const char *function,
		const char *file, int line)
{
	/* Function and file names are accepted but not reported */
	(void) function;
	(void) file;

	printf("FAIL - %s at line %d\n", expr, line);

	exit(EXIT_FAILURE);
}

#define assert(expr) \
	((expr) ? (void) 0 : __assert2(#expr, __func__, __FILE__, __LINE__))
 
 
/* Callback invoked by parse_testfile() for each line it hands out:
 * receives the line data, its length as measured by parse_strlen(), and
 * the client's private pointer. Returning false stops the parse. */
typedef bool (*line_func)(const char *data, size_t datalen, void *pw);

/* Forward declarations of the helpers defined later in this header.
 * parse_strlen is declared with internal linkage. */
static size_t parse_strlen(const char *str, size_t limit);
bool parse_testfile(const char *filename, line_func callback, void *pw);
size_t parse_filesize(const char *filename);
 
/**
 * Drive a callback over the lines of a testcase data file
 *
 * Lines are read with a 300 byte buffer; a line consisting solely of a
 * newline is skipped. Parsing stops as soon as the callback returns false.
 *
 * \param filename  Name of file to parse
 * \param callback  Function called with each line of input data
 * \param pw        Client-specific private data, passed to the callback
 * \return true if the file was opened and every callback succeeded,
 *         false otherwise.
 */
bool parse_testfile(const char *filename, line_func callback, void *pw)
{
	char line[300];
	bool ok = true;
	FILE *stream = fopen(filename, "rb");

	if (stream == NULL) {
		printf("Failed opening %s\n", filename);
		return false;
	}

	/* Stop reading as soon as the callback rejects a line */
	while (ok && fgets(line, sizeof line, stream) != NULL) {
		if (line[0] != '\n') {
			ok = callback(line,
					parse_strlen(line, sizeof line - 1), pw);
		}
	}

	fclose(stream);

	return ok;
}
 
/**
 * Utility string length measurer; assumes strings are '\n' terminated
 *
 * The scan ends at the first '\n' or once limit - 1 bytes have been
 * examined, whichever comes first. NUL bytes are not treated as
 * terminators, so data containing embedded NULs is measured up to its
 * newline.
 *
 * \param str    String to measure length of
 * \param limit  Upper bound on string length
 * \return String length, including the terminating '\n' (never more than
 *         limit); 0 if str is NULL or limit is 0.
 */
size_t parse_strlen(const char *str, size_t limit)
{
	size_t len = 0;

	/* Nothing to measure. The limit check also stops "limit - 1" below
	 * from wrapping round and making the scan unbounded. */
	if (str == NULL || limit == 0)
		return 0;

	while (len < limit - 1 && str[len] != '\n')
		len++;

	/* Account for the newline (or the last byte the limit permits) */
	return len + 1;
}
 
/**
 * Read the size of a file
 *
 * The size is found by seeking to the end of the file and reading the
 * resulting position.
 *
 * \param filename  Name of file to read size of
 * \return File size (in bytes), or 0 on error (the file could not be
 *         opened, or seeking / reading the position failed)
 */
size_t parse_filesize(const char *filename)
{
	FILE *fp;
	long pos = -1L;

	fp = fopen(filename, "rb");
	if (fp == NULL) {
		printf("Failed opening %s\n", filename);
		return 0;
	}

	/* Only trust ftell() if the seek itself succeeded */
	if (fseek(fp, 0, SEEK_END) == 0)
		pos = ftell(fp);

	fclose(fp);

	/* ftell() reports failure as -1; don't let that become a huge size_t */
	if (pos < 0)
		return 0;

	return (size_t) pos;
}
 
 
#endif