WebSVN – Kolibri OS – Blame – /contrib/other/kpm/tinyxml/tinyxmlparser.cpp

Rev	Author	Line No.	Line
5725	serge	1	/*
		2	www.sourceforge.net/projects/tinyxml
		3	Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
		4
		5	This software is provided 'as-is', without any express or implied
		6	warranty. In no event will the authors be held liable for any
		7	damages arising from the use of this software.
		8
		9	Permission is granted to anyone to use this software for any
		10	purpose, including commercial applications, and to alter it and
		11	redistribute it freely, subject to the following restrictions:
		12
		13	1. The origin of this software must not be misrepresented; you must
		14	not claim that you wrote the original software. If you use this
		15	software in a product, an acknowledgment in the product documentation
		16	would be appreciated but is not required.
		17
		18	2. Altered source versions must be plainly marked as such, and
		19	must not be misrepresented as being the original software.
		20
		21	3. This notice may not be removed or altered from any source
		22	distribution.
		23	*/
		24
		25	#include "tinyxml.h"
		26
		27	//#define DEBUG_PARSER
		28	#if defined( DEBUG_PARSER )
		29	# if defined( DEBUG ) && defined( _MSC_VER )
		30	# include
		31	# define TIXML_LOG OutputDebugString
		32	# else
		33	# define TIXML_LOG printf
		34	# endif
		35	#endif
		36
		37	// Note tha "PutString" hardcodes the same list. This
		38	// is less flexible than it appears. Changing the entries
		39	// or order will break putstring.
		40	TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
		41	{
		42	{ "&", 5, '&' },
		43	{ "<", 4, '<' },
		44	{ ">", 4, '>' },
		45	{ """, 6, '\"' },
		46	{ "'", 6, '\'' }
		47	};
		48
		49	// Bunch of unicode info at:
		50	// http://www.unicode.org/faq/utf_bom.html
		51	// Including the basic of this table, which determines the #bytes in the
		52	// sequence from the lead byte. 1 placed for invalid sequences --
		53	// although the result will be junk, pass it through as much as possible.
		54	// Beware of the non-characters in UTF-8:
		55	// ef bb bf (Microsoft "lead bytes")
		56	// ef bf be
		57	// ef bf bf
		58
		59	const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
		60	const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
		61	const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
		62
		63	const int TiXmlBase::utf8ByteTable[256] =
		64	{
		65	// 0 1 2 3 4 5 6 7 8 9 a b c d e f
		66	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
		67	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
		68	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
		69	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
		70	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
		71	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
		72	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
		73	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
		74	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
		75	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
		76	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
		77	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
		78	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
		79	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
		80	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
		81	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
		82	};
		83
		84
		85	void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
		86	{
		87	const unsigned long BYTE_MASK = 0xBF;
		88	const unsigned long BYTE_MARK = 0x80;
		89	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
		90
		91	if (input < 0x80)
		92	*length = 1;
		93	else if ( input < 0x800 )
		94	*length = 2;
		95	else if ( input < 0x10000 )
		96	*length = 3;
		97	else if ( input < 0x200000 )
		98	*length = 4;
		99	else
		100	{ *length = 0; return; } // This code won't covert this correctly anyway.
		101
		102	output += *length;
		103
		104	// Scary scary fall throughs.
		105	switch (*length)
		106	{
		107	case 4:
		108	--output;
		109	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
		110	input >>= 6;
		111	case 3:
		112	--output;
		113	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
		114	input >>= 6;
		115	case 2:
		116	--output;
		117	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
		118	input >>= 6;
		119	case 1:
		120	--output;
		121	output = (char)(input \| FIRST_BYTE_MARK[length]);
		122	}
		123	}
		124
		125
		126	/static/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /encoding/ )
		127	{
		128	// This will only work for low-ascii, everything else is assumed to be a valid
		129	// letter. I'm not sure this is the best approach, but it is quite tricky trying
		130	// to figure out alhabetical vs. not across encoding. So take a very
		131	// conservative approach.
		132
		133	// if ( encoding == TIXML_ENCODING_UTF8 )
		134	// {
		135	if ( anyByte < 127 )
		136	return isalpha( anyByte );
		137	else
		138	return 1; // What else to do? The unicode set is huge...get the english ones right.
		139	// }
		140	// else
		141	// {
		142	// return isalpha( anyByte );
		143	// }
		144	}
		145
		146
		147	/static/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /encoding/ )
		148	{
		149	// This will only work for low-ascii, everything else is assumed to be a valid
		150	// letter. I'm not sure this is the best approach, but it is quite tricky trying
		151	// to figure out alhabetical vs. not across encoding. So take a very
		152	// conservative approach.
		153
		154	// if ( encoding == TIXML_ENCODING_UTF8 )
		155	// {
		156	if ( anyByte < 127 )
		157	return isalnum( anyByte );
		158	else
		159	return 1; // What else to do? The unicode set is huge...get the english ones right.
		160	// }
		161	// else
		162	// {
		163	// return isalnum( anyByte );
		164	// }
		165	}
		166
		167
		168	class TiXmlParsingData
		169	{
		170	friend class TiXmlDocument;
		171	public:
		172	void Stamp( const char* now, TiXmlEncoding encoding );
		173
		174	const TiXmlCursor& Cursor() { return cursor; }
		175
		176	private:
		177	// Only used by the document!
		178	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
		179	{
		180	assert( start );
		181	stamp = start;
		182	tabsize = _tabsize;
		183	cursor.row = row;
		184	cursor.col = col;
		185	}
		186
		187	TiXmlCursor cursor;
		188	const char* stamp;
		189	int tabsize;
		190	};
		191
		192
		193	void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
		194	{
		195	assert( now );
		196
		197	// Do nothing if the tabsize is 0.
		198	if ( tabsize < 1 )
		199	{
		200	return;
		201	}
		202
		203	// Get the current row, column.
		204	int row = cursor.row;
		205	int col = cursor.col;
		206	const char* p = stamp;
		207	assert( p );
		208
		209	while ( p < now )
		210	{
		211	// Treat p as unsigned, so we have a happy compiler.
		212	const unsigned char* pU = (const unsigned char*)p;
		213
		214	// Code contributed by Fletcher Dunn: (modified by lee)
		215	switch (*pU) {
		216	case 0:
		217	// We should never get here, but in case we do, don't
		218	// advance past the terminating null character, ever
		219	return;
		220
		221	case '\r':
		222	// bump down to the next line
		223	++row;
		224	col = 0;
		225	// Eat the character
		226	++p;
		227
		228	// Check for \r\n sequence, and treat this as a single character
		229	if (*p == '\n') {
		230	++p;
		231	}
		232	break;
		233
		234	case '\n':
		235	// bump down to the next line
		236	++row;
		237	col = 0;
		238
		239	// Eat the character
		240	++p;
		241
		242	// Check for \n\r sequence, and treat this as a single
		243	// character. (Yes, this bizarre thing does occur still
		244	// on some arcane platforms...)
		245	if (*p == '\r') {
		246	++p;
		247	}
		248	break;
		249
		250	case '\t':
		251	// Eat the character
		252	++p;
		253
		254	// Skip to next tab stop
		255	col = (col / tabsize + 1) * tabsize;
		256	break;
		257
		258	case TIXML_UTF_LEAD_0:
		259	if ( encoding == TIXML_ENCODING_UTF8 )
		260	{
		261	if ( (p+1) && (p+2) )
		262	{
		263	// In these cases, don't advance the column. These are
		264	// 0-width spaces.
		265	if ( (pU+1)==TIXML_UTF_LEAD_1 && (pU+2)==TIXML_UTF_LEAD_2 )
		266	p += 3;
		267	else if ( (pU+1)==0xbfU && (pU+2)==0xbeU )
		268	p += 3;
		269	else if ( (pU+1)==0xbfU && (pU+2)==0xbfU )
		270	p += 3;
		271	else
		272	{ p +=3; ++col; } // A normal character.
		273	}
		274	}
		275	else
		276	{
		277	++p;
		278	++col;
		279	}
		280	break;
		281
		282	default:
		283	if ( encoding == TIXML_ENCODING_UTF8 )
		284	{
		285	// Eat the 1 to 4 byte utf8 character.
		286	int step = TiXmlBase::utf8ByteTable[((const unsigned char)p)];
		287	if ( step == 0 )
		288	step = 1; // Error case from bad encoding, but handle gracefully.
		289	p += step;
		290
		291	// Just advance one column, of course.
		292	++col;
		293	}
		294	else
		295	{
		296	++p;
		297	++col;
		298	}
		299	break;
		300	}
		301	}
		302	cursor.row = row;
		303	cursor.col = col;
		304	assert( cursor.row >= -1 );
		305	assert( cursor.col >= -1 );
		306	stamp = p;
		307	assert( stamp );
		308	}
		309
		310
		311	const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
		312	{
		313	if ( !p \|\| !*p )
		314	{
		315	return 0;
		316	}
		317	if ( encoding == TIXML_ENCODING_UTF8 )
		318	{
		319	while ( *p )
		320	{
		321	const unsigned char* pU = (const unsigned char*)p;
		322
		323	// Skip the stupid Microsoft UTF-8 Byte order marks
		324	if ( *(pU+0)==TIXML_UTF_LEAD_0
		325	&& *(pU+1)==TIXML_UTF_LEAD_1
		326	&& *(pU+2)==TIXML_UTF_LEAD_2 )
		327	{
		328	p += 3;
		329	continue;
		330	}
		331	else if(*(pU+0)==TIXML_UTF_LEAD_0
		332	&& *(pU+1)==0xbfU
		333	&& *(pU+2)==0xbeU )
		334	{
		335	p += 3;
		336	continue;
		337	}
		338	else if(*(pU+0)==TIXML_UTF_LEAD_0
		339	&& *(pU+1)==0xbfU
		340	&& *(pU+2)==0xbfU )
		341	{
		342	p += 3;
		343	continue;
		344	}
		345
		346	if ( IsWhiteSpace( *p ) )
		347	++p;
		348	else
		349	break;
		350	}
		351	}
		352	else
		353	{
		354	while ( p && IsWhiteSpace( p ) )
		355	++p;
		356	}
		357
		358	return p;
		359	}
		360
		361	#ifdef TIXML_USE_STL
		362	/static/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
		363	{
		364	for( ;; )
		365	{
		366	if ( !in->good() ) return false;
		367
		368	int c = in->peek();
		369	// At this scope, we can't get to a document. So fail silently.
		370	if ( !IsWhiteSpace( c ) \|\| c <= 0 )
		371	return true;
		372
		373	*tag += (char) in->get();
		374	}
		375	}
		376
		377	/static/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
		378	{
		379	//assert( character > 0 && character < 128 ); // else it won't work in utf-8
		380	while ( in->good() )
		381	{
		382	int c = in->peek();
		383	if ( c == character )
		384	return true;
		385	if ( c <= 0 ) // Silent failure: can't get document at this scope
		386	return false;
		387
		388	in->get();
		389	*tag += (char) c;
		390	}
		391	return false;
		392	}
		393	#endif
		394
		395	// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
		396	// "assign" optimization removes over 10% of the execution time.
		397	//
		398	const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
		399	{
		400	// Oddly, not supported on some comilers,
		401	//name->clear();
		402	// So use this:
		403	*name = "";
		404	assert( p );
		405
		406	// Names start with letters or underscores.
		407	// Of course, in unicode, tinyxml has no idea what a letter is. The
		408	// algorithm is generous.
		409	//
		410	// After that, they can be letters, underscores, numbers,
		411	// hyphens, or colons. (Colons are valid ony for namespaces,
		412	// but tinyxml can't tell namespaces from names.)
		413	if ( p && *p
		414	&& ( IsAlpha( (unsigned char) p, encoding ) \|\| p == '_' ) )
		415	{
		416	const char* start = p;
		417	while( p && *p
		418	&& ( IsAlphaNum( (unsigned char ) *p, encoding )
		419	\|\| *p == '_'
		420	\|\| *p == '-'
		421	\|\| *p == '.'
		422	\|\| *p == ':' ) )
		423	{
		424	//(name) += p; // expensive
		425	++p;
		426	}
		427	if ( p-start > 0 ) {
		428	name->assign( start, p-start );
		429	}
		430	return p;
		431	}
		432	return 0;
		433	}
		434
		435	const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
		436	{
		437	// Presume an entity, and pull it out.
		438	TIXML_STRING ent;
		439	int i;
		440	*length = 0;
		441
		442	if ( (p+1) && (p+1) == '#' && *(p+2) )
		443	{
		444	unsigned long ucs = 0;
		445	ptrdiff_t delta = 0;
		446	unsigned mult = 1;
		447
		448	if ( *(p+2) == 'x' )
		449	{
		450	// Hexadecimal.
		451	if ( !*(p+3) ) return 0;
		452
		453	const char* q = p+3;
		454	q = strchr( q, ';' );
		455
		456	if ( !q \|\| !*q ) return 0;
		457
		458	delta = q-p;
		459	--q;
		460
		461	while ( *q != 'x' )
		462	{
		463	if ( q >= '0' && q <= '9' )
		464	ucs += mult * (*q - '0');
		465	else if ( q >= 'a' && q <= 'f' )
		466	ucs += mult * (*q - 'a' + 10);
		467	else if ( q >= 'A' && q <= 'F' )
		468	ucs += mult * (*q - 'A' + 10 );
		469	else
		470	return 0;
		471	mult *= 16;
		472	--q;
		473	}
		474	}
		475	else
		476	{
		477	// Decimal.
		478	if ( !*(p+2) ) return 0;
		479
		480	const char* q = p+2;
		481	q = strchr( q, ';' );
		482
		483	if ( !q \|\| !*q ) return 0;
		484
		485	delta = q-p;
		486	--q;
		487
		488	while ( *q != '#' )
		489	{
		490	if ( q >= '0' && q <= '9' )
		491	ucs += mult * (*q - '0');
		492	else
		493	return 0;
		494	mult *= 10;
		495	--q;
		496	}
		497	}
		498	if ( encoding == TIXML_ENCODING_UTF8 )
		499	{
		500	// convert the UCS to UTF-8
		501	ConvertUTF32ToUTF8( ucs, value, length );
		502	}
		503	else
		504	{
		505	*value = (char)ucs;
		506	*length = 1;
		507	}
		508	return p + delta + 1;
		509	}
		510
		511	// Now try to match it.
		512	for( i=0; i
		513	{
		514	if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
		515	{
		516	assert( strlen( entity[i].str ) == entity[i].strLength );
		517	*value = entity[i].chr;
		518	*length = 1;
		519	return ( p + entity[i].strLength );
		520	}
		521	}
		522
		523	// So it wasn't an entity, its unrecognized, or something like that.
		524	value = p; // Don't put back the last one, since we return it!
		525	//*length = 1; // Leave unrecognized entities - this doesn't really work.
		526	// Just writes strange XML.
		527	return p+1;
		528	}
		529
		530
		531	bool TiXmlBase::StringEqual( const char* p,
		532	const char* tag,
		533	bool ignoreCase,
		534	TiXmlEncoding encoding )
		535	{
		536	assert( p );
		537	assert( tag );
		538	if ( !p \|\| !*p )
		539	{
		540	assert( 0 );
		541	return false;
		542	}
		543
		544	const char* q = p;
		545
		546	if ( ignoreCase )
		547	{
		548	while ( q && tag && ToLower( q, encoding ) == ToLower( tag, encoding ) )
		549	{
		550	++q;
		551	++tag;
		552	}
		553
		554	if ( *tag == 0 )
		555	return true;
		556	}
		557	else
		558	{
		559	while ( q && tag && q == tag )
		560	{
		561	++q;
		562	++tag;
		563	}
		564
		565	if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
		566	return true;
		567	}
		568	return false;
		569	}
		570
		571	const char* TiXmlBase::ReadText( const char* p,
		572	TIXML_STRING * text,
		573	bool trimWhiteSpace,
		574	const char* endTag,
		575	bool caseInsensitive,
		576	TiXmlEncoding encoding )
		577	{
		578	*text = "";
		579	if ( !trimWhiteSpace // certain tags always keep whitespace
		580	\|\| !condenseWhiteSpace ) // if true, whitespace is always kept
		581	{
		582	// Keep all the white space.
		583	while ( p && *p
		584	&& !StringEqual( p, endTag, caseInsensitive, encoding )
		585	)
		586	{
		587	int len;
		588	char cArr[4] = { 0, 0, 0, 0 };
		589	p = GetChar( p, cArr, &len, encoding );
		590	text->append( cArr, len );
		591	}
		592	}
		593	else
		594	{
		595	bool whitespace = false;
		596
		597	// Remove leading white space:
		598	p = SkipWhiteSpace( p, encoding );
		599	while ( p && *p
		600	&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
		601	{
		602	if ( p == '\r' \|\| p == '\n' )
		603	{
		604	whitespace = true;
		605	++p;
		606	}
		607	else if ( IsWhiteSpace( *p ) )
		608	{
		609	whitespace = true;
		610	++p;
		611	}
		612	else
		613	{
		614	// If we've found whitespace, add it before the
		615	// new character. Any whitespace just becomes a space.
		616	if ( whitespace )
		617	{
		618	(*text) += ' ';
		619	whitespace = false;
		620	}
		621	int len;
		622	char cArr[4] = { 0, 0, 0, 0 };
		623	p = GetChar( p, cArr, &len, encoding );
		624	if ( len == 1 )
		625	(*text) += cArr[0]; // more efficient
		626	else
		627	text->append( cArr, len );
		628	}
		629	}
		630	}
		631	if ( p )
		632	p += strlen( endTag );
		633	return p;
		634	}
		635
		636	#ifdef TIXML_USE_STL
		637
		638	void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
		639	{
		640	// The basic issue with a document is that we don't know what we're
		641	// streaming. Read something presumed to be a tag (and hope), then
		642	// identify it, and call the appropriate stream method on the tag.
		643	//
		644	// This "pre-streaming" will never read the closing ">" so the
		645	// sub-tag can orient itself.
		646
		647	if ( !StreamTo( in, '<', tag ) )
		648	{
		649	SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		650	return;
		651	}
		652
		653	while ( in->good() )
		654	{
		655	int tagIndex = (int) tag->length();
		656	while ( in->good() && in->peek() != '>' )
		657	{
		658	int c = in->get();
		659	if ( c <= 0 )
		660	{
		661	SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
		662	break;
		663	}
		664	(*tag) += (char) c;
		665	}
		666
		667	if ( in->good() )
		668	{
		669	// We now have something we presume to be a node of
		670	// some sort. Identify it, and call the node to
		671	// continue streaming.
		672	TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
		673
		674	if ( node )
		675	{
		676	node->StreamIn( in, tag );
		677	bool isElement = node->ToElement() != 0;
		678	delete node;
		679	node = 0;
		680
		681	// If this is the root element, we're done. Parsing will be
		682	// done by the >> operator.
		683	if ( isElement )
		684	{
		685	return;
		686	}
		687	}
		688	else
		689	{
		690	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
		691	return;
		692	}
		693	}
		694	}
		695	// We should have returned sooner.
		696	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
		697	}
		698
		699	#endif
		700
		701	const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
		702	{
		703	ClearError();
		704
		705	// Parse away, at the document level. Since a document
		706	// contains nothing but other tags, most of what happens
		707	// here is skipping white space.
		708	if ( !p \|\| !*p )
		709	{
		710	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		711	return 0;
		712	}
		713
		714	// Note that, for a document, this needs to come
		715	// before the while space skip, so that parsing
		716	// starts from the pointer we are given.
		717	location.Clear();
		718	if ( prevData )
		719	{
		720	location.row = prevData->cursor.row;
		721	location.col = prevData->cursor.col;
		722	}
		723	else
		724	{
		725	location.row = 0;
		726	location.col = 0;
		727	}
		728	TiXmlParsingData data( p, TabSize(), location.row, location.col );
		729	location = data.Cursor();
		730
		731	if ( encoding == TIXML_ENCODING_UNKNOWN )
		732	{
		733	// Check for the Microsoft UTF-8 lead bytes.
		734	const unsigned char* pU = (const unsigned char*)p;
		735	if ( (pU+0) && (pU+0) == TIXML_UTF_LEAD_0
		736	&& (pU+1) && (pU+1) == TIXML_UTF_LEAD_1
		737	&& (pU+2) && (pU+2) == TIXML_UTF_LEAD_2 )
		738	{
		739	encoding = TIXML_ENCODING_UTF8;
		740	useMicrosoftBOM = true;
		741	}
		742	}
		743
		744	p = SkipWhiteSpace( p, encoding );
		745	if ( !p )
		746	{
		747	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		748	return 0;
		749	}
		750
		751	while ( p && *p )
		752	{
		753	TiXmlNode* node = Identify( p, encoding );
		754	if ( node )
		755	{
		756	p = node->Parse( p, &data, encoding );
		757	LinkEndChild( node );
		758	}
		759	else
		760	{
		761	break;
		762	}
		763
		764	// Did we get encoding info?
		765	if ( encoding == TIXML_ENCODING_UNKNOWN
		766	&& node->ToDeclaration() )
		767	{
		768	TiXmlDeclaration* dec = node->ToDeclaration();
		769	const char* enc = dec->Encoding();
		770	assert( enc );
		771
		772	if ( *enc == 0 )
		773	encoding = TIXML_ENCODING_UTF8;
		774	else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
		775	encoding = TIXML_ENCODING_UTF8;
		776	else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
		777	encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
		778	else
		779	encoding = TIXML_ENCODING_LEGACY;
		780	}
		781
		782	p = SkipWhiteSpace( p, encoding );
		783	}
		784
		785	// Was this empty?
		786	if ( !firstChild ) {
		787	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
		788	return 0;
		789	}
		790
		791	// All is well.
		792	return p;
		793	}
		794
		795	void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
		796	{
		797	// The first error in a chain is more accurate - don't set again!
		798	if ( error )
		799	return;
		800
		801	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
		802	error = true;
		803	errorId = err;
		804	errorDesc = errorString[ errorId ];
		805
		806	errorLocation.Clear();
		807	if ( pError && data )
		808	{
		809	data->Stamp( pError, encoding );
		810	errorLocation = data->Cursor();
		811	}
		812	}
		813
		814
		815	TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
		816	{
		817	TiXmlNode* returnNode = 0;
		818
		819	p = SkipWhiteSpace( p, encoding );
		820	if( !p \|\| !p \|\| p != '<' )
		821	{
		822	return 0;
		823	}
		824
		825	TiXmlDocument* doc = GetDocument();
		826	p = SkipWhiteSpace( p, encoding );
		827
		828	if ( !p \|\| !*p )
		829	{
		830	return 0;
		831	}
		832
		833	// What is this thing?
		834	// - Elements start with a letter or underscore, but xml is reserved.
		835	// - Comments: ";
		1346
		1347	if ( !StringEqual( p, startTag, false, encoding ) )
		1348	{
		1349	document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
		1350	return 0;
		1351	}
		1352	p += strlen( startTag );
		1353
		1354	// [ 1475201 ] TinyXML parses entities in comments
		1355	// Oops - ReadText doesn't work, because we don't want to parse the entities.
		1356	// p = ReadText( p, &value, false, endTag, false, encoding );
		1357	//
		1358	// from the XML spec:
		1359	/*
		1360	[Definition: Comments may appear anywhere in a document outside other markup; in addition,
		1361	they may appear within the document type declaration at places allowed by the grammar.
		1362	They are not part of the document's character data; an XML processor MAY, but need not,
		1363	make it possible for an application to retrieve the text of comments. For compatibility,
		1364	the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
		1365	references MUST NOT be recognized within comments.
		1366
		1367	An example of a comment:
		1368
		1369
		1370	*/
		1371
		1372	value = "";
		1373	// Keep all the white space.
		1374	while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
		1375	{
		1376	value.append( p, 1 );
		1377	++p;
		1378	}
		1379	if ( p )
		1380	p += strlen( endTag );
		1381
		1382	return p;
		1383	}
		1384
		1385
		1386	const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
		1387	{
		1388	p = SkipWhiteSpace( p, encoding );
		1389	if ( !p \|\| !*p ) return 0;
		1390
		1391	// int tabsize = 4;
		1392	// if ( document )
		1393	// tabsize = document->TabSize();
		1394
		1395	if ( data )
		1396	{
		1397	data->Stamp( p, encoding );
		1398	location = data->Cursor();
		1399	}
		1400	// Read the name, the '=' and the value.
		1401	const char* pErr = p;
		1402	p = ReadName( p, &name, encoding );
		1403	if ( !p \|\| !*p )
		1404	{
		1405	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
		1406	return 0;
		1407	}
		1408	p = SkipWhiteSpace( p, encoding );
		1409	if ( !p \|\| !p \|\| p != '=' )
		1410	{
		1411	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
		1412	return 0;
		1413	}
		1414
		1415	++p; // skip '='
		1416	p = SkipWhiteSpace( p, encoding );
		1417	if ( !p \|\| !*p )
		1418	{
		1419	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
		1420	return 0;
		1421	}
		1422
		1423	const char* end;
		1424	const char SINGLE_QUOTE = '\'';
		1425	const char DOUBLE_QUOTE = '\"';
		1426
		1427	if ( *p == SINGLE_QUOTE )
		1428	{
		1429	++p;
		1430	end = "\'"; // single quote in string
		1431	p = ReadText( p, &value, false, end, false, encoding );
		1432	}
		1433	else if ( *p == DOUBLE_QUOTE )
		1434	{
		1435	++p;
		1436	end = "\""; // double quote in string
		1437	p = ReadText( p, &value, false, end, false, encoding );
		1438	}
		1439	else
		1440	{
		1441	// All attribute values should be in single or double quotes.
		1442	// But this is such a common error that the parser will try
		1443	// its best, even without them.
		1444	value = "";
		1445	while ( p && *p // existence
		1446	&& !IsWhiteSpace( *p ) // whitespace
		1447	&& p != '/' && p != '>' ) // tag end
		1448	{
		1449	if ( p == SINGLE_QUOTE \|\| p == DOUBLE_QUOTE ) {
		1450	// [ 1451649 ] Attribute values with trailing quotes not handled correctly
		1451	// We did not have an opening quote but seem to have a
		1452	// closing one. Give up and throw an error.
		1453	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
		1454	return 0;
		1455	}
		1456	value += *p;
		1457	++p;
		1458	}
		1459	}
		1460	return p;
		1461	}
		1462
		1463	#ifdef TIXML_USE_STL
		1464	void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
		1465	{
		1466	while ( in->good() )
		1467	{
		1468	int c = in->peek();
		1469	if ( !cdata && (c == '<' ) )
		1470	{
		1471	return;
		1472	}
		1473	if ( c <= 0 )
		1474	{
		1475	TiXmlDocument* document = GetDocument();
		1476	if ( document )
		1477	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
		1478	return;
		1479	}
		1480
		1481	(*tag) += (char) c;
		1482	in->get(); // "commits" the peek made above
		1483
		1484	if ( cdata && c == '>' && tag->size() >= 3 ) {
		1485	size_t len = tag->size();
		1486	if ( (tag)[len-2] == ']' && (tag)[len-3] == ']' ) {
		1487	// terminator of cdata.
		1488	return;
		1489	}
		1490	}
		1491	}
		1492	}
		1493	#endif
		1494
		1495	const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
		1496	{
		1497	value = "";
		1498	TiXmlDocument* document = GetDocument();
		1499
		1500	if ( data )
		1501	{
		1502	data->Stamp( p, encoding );
		1503	location = data->Cursor();
		1504	}
		1505
		1506	const char* const startTag = "
		1507	const char* const endTag = "]]>";
		1508
		1509	if ( cdata \|\| StringEqual( p, startTag, false, encoding ) )
		1510	{
		1511	cdata = true;
		1512
		1513	if ( !StringEqual( p, startTag, false, encoding ) )
		1514	{
		1515	document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
		1516	return 0;
		1517	}
		1518	p += strlen( startTag );
		1519
		1520	// Keep all the white space, ignore the encoding, etc.
		1521	while ( p && *p
		1522	&& !StringEqual( p, endTag, false, encoding )
		1523	)
		1524	{
		1525	value += *p;
		1526	++p;
		1527	}
		1528
		1529	TIXML_STRING dummy;
		1530	p = ReadText( p, &dummy, false, endTag, false, encoding );
		1531	return p;
		1532	}
		1533	else
		1534	{
		1535	bool ignoreWhite = true;
		1536
		1537	const char* end = "<";
		1538	p = ReadText( p, &value, ignoreWhite, end, false, encoding );
		1539	if ( p )
		1540	return p-1; // don't truncate the '<'
		1541	return 0;
		1542	}
		1543	}
		1544
		1545	#ifdef TIXML_USE_STL
		1546	void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
		1547	{
		1548	while ( in->good() )
		1549	{
		1550	int c = in->get();
		1551	if ( c <= 0 )
		1552	{
		1553	TiXmlDocument* document = GetDocument();
		1554	if ( document )
		1555	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
		1556	return;
		1557	}
		1558	(*tag) += (char) c;
		1559
		1560	if ( c == '>' )
		1561	{
		1562	// All is well.
		1563	return;
		1564	}
		1565	}
		1566	}
		1567	#endif
		1568
		1569	const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
		1570	{
		1571	p = SkipWhiteSpace( p, _encoding );
		1572	// Find the beginning, find the end, and look for
		1573	// the stuff in-between.
		1574	TiXmlDocument* document = GetDocument();
		1575	if ( !p \|\| !*p \|\| !StringEqual( p, "
		1576	{
		1577	if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
		1578	return 0;
		1579	}
		1580	if ( data )
		1581	{
		1582	data->Stamp( p, _encoding );
		1583	location = data->Cursor();
		1584	}
		1585	p += 5;
		1586
		1587	version = "";
		1588	encoding = "";
		1589	standalone = "";
		1590
		1591	while ( p && *p )
		1592	{
		1593	if ( *p == '>' )
		1594	{
		1595	++p;
		1596	return p;
		1597	}
		1598
		1599	p = SkipWhiteSpace( p, _encoding );
		1600	if ( StringEqual( p, "version", true, _encoding ) )
		1601	{
		1602	TiXmlAttribute attrib;
		1603	p = attrib.Parse( p, data, _encoding );
		1604	version = attrib.Value();
		1605	}
		1606	else if ( StringEqual( p, "encoding", true, _encoding ) )
		1607	{
		1608	TiXmlAttribute attrib;
		1609	p = attrib.Parse( p, data, _encoding );
		1610	encoding = attrib.Value();
		1611	}
		1612	else if ( StringEqual( p, "standalone", true, _encoding ) )
		1613	{
		1614	TiXmlAttribute attrib;
		1615	p = attrib.Parse( p, data, _encoding );
		1616	standalone = attrib.Value();
		1617	}
		1618	else
		1619	{
		1620	// Read over whatever it is.
		1621	while( p && p && p != '>' && !IsWhiteSpace( *p ) )
		1622	++p;
		1623	}
		1624	}
		1625	return 0;
		1626	}
		1627
		1628	bool TiXmlText::Blank() const
		1629	{
		1630	for ( unsigned i=0; i
		1631	if ( !IsWhiteSpace( value[i] ) )
		1632	return false;
		1633	return true;
		1634	}
		1635

Subversion Repositories Kolibri OS

(root)/contrib/other/kpm/tinyxml/tinyxmlparser.cpp @ 6552 – Rev 5725