Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5725 serge 1
/*
2
www.sourceforge.net/projects/tinyxml
3
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
 
5
This software is provided 'as-is', without any express or implied
6
warranty. In no event will the authors be held liable for any
7
damages arising from the use of this software.
8
 
9
Permission is granted to anyone to use this software for any
10
purpose, including commercial applications, and to alter it and
11
redistribute it freely, subject to the following restrictions:
12
 
13
1. The origin of this software must not be misrepresented; you must
14
not claim that you wrote the original software. If you use this
15
software in a product, an acknowledgment in the product documentation
16
would be appreciated but is not required.
17
 
18
2. Altered source versions must be plainly marked as such, and
19
must not be misrepresented as being the original software.
20
 
21
3. This notice may not be removed or altered from any source
22
distribution.
23
*/
24
 
25
#include "tinyxml.h"
26
 
27
// Defining DEBUG_PARSER makes TIXML_LOG available as an output routine:
// OutputDebugString for MSVC debug builds, printf everywhere else.
//#define DEBUG_PARSER
#if defined( DEBUG_PARSER )
#	if defined( DEBUG ) && defined( _MSC_VER )
#		include <windows.h>
#		define TIXML_LOG OutputDebugString
#	else
#		define TIXML_LOG printf
#	endif
#endif
36
 
37
// Note tha "PutString" hardcodes the same list. This
38
// is less flexible than it appears. Changing the entries
39
// or order will break putstring.
40
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
41
{
42
	{ "&",  5, '&' },
43
	{ "<",   4, '<' },
44
	{ ">",   4, '>' },
45
	{ """, 6, '\"' },
46
	{ "'", 6, '\'' }
47
};
48
 
49
// Bunch of unicode info at:
50
//		http://www.unicode.org/faq/utf_bom.html
51
// Including the basic of this table, which determines the #bytes in the
52
// sequence from the lead byte. 1 placed for invalid sequences --
53
// although the result will be junk, pass it through as much as possible.
54
// Beware of the non-characters in UTF-8:
55
//				ef bb bf (Microsoft "lead bytes")
56
//				ef bf be
57
//				ef bf bf
58
 
59
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
60
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
61
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
62
 
63
const int TiXmlBase::utf8ByteTable[256] =
64
{
65
	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
66
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
67
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
68
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
69
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
70
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
71
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
72
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
73
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
74
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
75
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
76
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
77
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
78
		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
79
		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
80
		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
81
		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
82
};
83
 
84
 
85
// Encodes the code point 'input' as UTF-8 into 'output' and stores the
// number of bytes written in '*length'. No terminator is written.
// Values of 0x200000 and above are not encoded: '*length' is set to 0
// and 'output' is left untouched.
void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
{
	const unsigned long CONTINUATION_MASK = 0xBF;
	const unsigned long CONTINUATION_MARK = 0x80;
	const unsigned long LEAD_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

	int byteCount = 0;
	if ( input < 0x80 )
		byteCount = 1;
	else if ( input < 0x800 )
		byteCount = 2;
	else if ( input < 0x10000 )
		byteCount = 3;
	else if ( input < 0x200000 )
		byteCount = 4;

	*length = byteCount;
	if ( byteCount == 0 )
		return;		// Out of range; nothing sensible can be written.

	// Fill the continuation bytes from the last one backwards, taking
	// six payload bits at a time.
	for ( int i = byteCount - 1; i > 0; --i )
	{
		output[i] = (char)((input | CONTINUATION_MARK) & CONTINUATION_MASK);
		input >>= 6;
	}

	// Whatever bits remain go into the lead byte, tagged with the
	// marker for this sequence length.
	output[0] = (char)(input | LEAD_MARK[byteCount]);
}
124
 
125
 
126
/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
	// Only bytes below 127 are actually classified (through isalpha).
	// Every other byte is reported as a letter: telling letters from
	// non-letters across encodings is tricky, so the check is
	// deliberately generous. The encoding argument is not consulted.
	return ( anyByte < 127 ) ? isalpha( anyByte ) : 1;
}
145
 
146
 
147
/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
{
	// Only bytes below 127 are actually classified (through isalnum).
	// Every other byte is reported as alphanumeric: telling letters from
	// non-letters across encodings is tricky, so the check is
	// deliberately generous. The encoding argument is not consulted.
	return ( anyByte < 127 ) ? isalnum( anyByte ) : 1;
}
166
 
167
 
168
class TiXmlParsingData
169
{
170
	friend class TiXmlDocument;
171
  public:
172
	void Stamp( const char* now, TiXmlEncoding encoding );
173
 
174
	const TiXmlCursor& Cursor()	{ return cursor; }
175
 
176
  private:
177
	// Only used by the document!
178
	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
179
	{
180
		assert( start );
181
		stamp = start;
182
		tabsize = _tabsize;
183
		cursor.row = row;
184
		cursor.col = col;
185
	}
186
 
187
	TiXmlCursor		cursor;
188
	const char*		stamp;
189
	int				tabsize;
190
};
191
 
192
 
193
void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
194
{
195
	assert( now );
196
 
197
	// Do nothing if the tabsize is 0.
198
	if ( tabsize < 1 )
199
	{
200
		return;
201
	}
202
 
203
	// Get the current row, column.
204
	int row = cursor.row;
205
	int col = cursor.col;
206
	const char* p = stamp;
207
	assert( p );
208
 
209
	while ( p < now )
210
	{
211
		// Treat p as unsigned, so we have a happy compiler.
212
		const unsigned char* pU = (const unsigned char*)p;
213
 
214
		// Code contributed by Fletcher Dunn: (modified by lee)
215
		switch (*pU) {
216
			case 0:
217
				// We *should* never get here, but in case we do, don't
218
				// advance past the terminating null character, ever
219
				return;
220
 
221
			case '\r':
222
				// bump down to the next line
223
				++row;
224
				col = 0;
225
				// Eat the character
226
				++p;
227
 
228
				// Check for \r\n sequence, and treat this as a single character
229
				if (*p == '\n') {
230
					++p;
231
				}
232
				break;
233
 
234
			case '\n':
235
				// bump down to the next line
236
				++row;
237
				col = 0;
238
 
239
				// Eat the character
240
				++p;
241
 
242
				// Check for \n\r sequence, and treat this as a single
243
				// character.  (Yes, this bizarre thing does occur still
244
				// on some arcane platforms...)
245
				if (*p == '\r') {
246
					++p;
247
				}
248
				break;
249
 
250
			case '\t':
251
				// Eat the character
252
				++p;
253
 
254
				// Skip to next tab stop
255
				col = (col / tabsize + 1) * tabsize;
256
				break;
257
 
258
			case TIXML_UTF_LEAD_0:
259
				if ( encoding == TIXML_ENCODING_UTF8 )
260
				{
261
					if ( *(p+1) && *(p+2) )
262
					{
263
						// In these cases, don't advance the column. These are
264
						// 0-width spaces.
265
						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
266
							p += 3;
267
						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
268
							p += 3;
269
						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
270
							p += 3;
271
						else
272
							{ p +=3; ++col; }	// A normal character.
273
					}
274
				}
275
				else
276
				{
277
					++p;
278
					++col;
279
				}
280
				break;
281
 
282
			default:
283
				if ( encoding == TIXML_ENCODING_UTF8 )
284
				{
285
					// Eat the 1 to 4 byte utf8 character.
286
					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
287
					if ( step == 0 )
288
						step = 1;		// Error case from bad encoding, but handle gracefully.
289
					p += step;
290
 
291
					// Just advance one column, of course.
292
					++col;
293
				}
294
				else
295
				{
296
					++p;
297
					++col;
298
				}
299
				break;
300
		}
301
	}
302
	cursor.row = row;
303
	cursor.col = col;
304
	assert( cursor.row >= -1 );
305
	assert( cursor.col >= -1 );
306
	stamp = p;
307
	assert( stamp );
308
}
309
 
310
 
311
const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
312
{
313
	if ( !p || !*p )
314
	{
315
		return 0;
316
	}
317
	if ( encoding == TIXML_ENCODING_UTF8 )
318
	{
319
		while ( *p )
320
		{
321
			const unsigned char* pU = (const unsigned char*)p;
322
 
323
			// Skip the stupid Microsoft UTF-8 Byte order marks
324
			if (	*(pU+0)==TIXML_UTF_LEAD_0
325
				 && *(pU+1)==TIXML_UTF_LEAD_1
326
				 && *(pU+2)==TIXML_UTF_LEAD_2 )
327
			{
328
				p += 3;
329
				continue;
330
			}
331
			else if(*(pU+0)==TIXML_UTF_LEAD_0
332
				 && *(pU+1)==0xbfU
333
				 && *(pU+2)==0xbeU )
334
			{
335
				p += 3;
336
				continue;
337
			}
338
			else if(*(pU+0)==TIXML_UTF_LEAD_0
339
				 && *(pU+1)==0xbfU
340
				 && *(pU+2)==0xbfU )
341
			{
342
				p += 3;
343
				continue;
344
			}
345
 
346
			if ( IsWhiteSpace( *p ) )
347
				++p;
348
			else
349
				break;
350
		}
351
	}
352
	else
353
	{
354
		while ( *p && IsWhiteSpace( *p ) )
355
			++p;
356
	}
357
 
358
	return p;
359
}
360
 
361
#ifdef TIXML_USE_STL
362
/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
363
{
364
	for( ;; )
365
	{
366
		if ( !in->good() ) return false;
367
 
368
		int c = in->peek();
369
		// At this scope, we can't get to a document. So fail silently.
370
		if ( !IsWhiteSpace( c ) || c <= 0 )
371
			return true;
372
 
373
		*tag += (char) in->get();
374
	}
375
}
376
 
377
// Moves characters from the stream onto the end of 'tag' until the next
// character equals 'character', which is left unread. Returns true when
// that character is found; false when a value <= 0 is peeked or the
// stream stops being good().
/*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
{
	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
	for ( ; in->good(); )
	{
		const int next = in->peek();
		if ( next == character )
			return true;

		// Silent failure: can't get document at this scope
		if ( next <= 0 )
			return false;

		in->get();
		*tag += (char) next;
	}
	return false;
}
393
#endif
394
 
395
// One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
396
// "assign" optimization removes over 10% of the execution time.
397
//
398
const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
399
{
400
	// Oddly, not supported on some comilers,
401
	//name->clear();
402
	// So use this:
403
	*name = "";
404
	assert( p );
405
 
406
	// Names start with letters or underscores.
407
	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
408
	// algorithm is generous.
409
	//
410
	// After that, they can be letters, underscores, numbers,
411
	// hyphens, or colons. (Colons are valid ony for namespaces,
412
	// but tinyxml can't tell namespaces from names.)
413
	if (    p && *p
414
		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
415
	{
416
		const char* start = p;
417
		while(		p && *p
418
				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
419
						 || *p == '_'
420
						 || *p == '-'
421
						 || *p == '.'
422
						 || *p == ':' ) )
423
		{
424
			//(*name) += *p; // expensive
425
			++p;
426
		}
427
		if ( p-start > 0 ) {
428
			name->assign( start, p-start );
429
		}
430
		return p;
431
	}
432
	return 0;
433
}
434
 
435
const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
436
{
437
	// Presume an entity, and pull it out.
438
    TIXML_STRING ent;
439
	int i;
440
	*length = 0;
441
 
442
	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
443
	{
444
		unsigned long ucs = 0;
445
		ptrdiff_t delta = 0;
446
		unsigned mult = 1;
447
 
448
		if ( *(p+2) == 'x' )
449
		{
450
			// Hexadecimal.
451
			if ( !*(p+3) ) return 0;
452
 
453
			const char* q = p+3;
454
			q = strchr( q, ';' );
455
 
456
			if ( !q || !*q ) return 0;
457
 
458
			delta = q-p;
459
			--q;
460
 
461
			while ( *q != 'x' )
462
			{
463
				if ( *q >= '0' && *q <= '9' )
464
					ucs += mult * (*q - '0');
465
				else if ( *q >= 'a' && *q <= 'f' )
466
					ucs += mult * (*q - 'a' + 10);
467
				else if ( *q >= 'A' && *q <= 'F' )
468
					ucs += mult * (*q - 'A' + 10 );
469
				else
470
					return 0;
471
				mult *= 16;
472
				--q;
473
			}
474
		}
475
		else
476
		{
477
			// Decimal.
478
			if ( !*(p+2) ) return 0;
479
 
480
			const char* q = p+2;
481
			q = strchr( q, ';' );
482
 
483
			if ( !q || !*q ) return 0;
484
 
485
			delta = q-p;
486
			--q;
487
 
488
			while ( *q != '#' )
489
			{
490
				if ( *q >= '0' && *q <= '9' )
491
					ucs += mult * (*q - '0');
492
				else
493
					return 0;
494
				mult *= 10;
495
				--q;
496
			}
497
		}
498
		if ( encoding == TIXML_ENCODING_UTF8 )
499
		{
500
			// convert the UCS to UTF-8
501
			ConvertUTF32ToUTF8( ucs, value, length );
502
		}
503
		else
504
		{
505
			*value = (char)ucs;
506
			*length = 1;
507
		}
508
		return p + delta + 1;
509
	}
510
 
511
	// Now try to match it.
512
	for( i=0; i
513
	{
514
		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
515
		{
516
			assert( strlen( entity[i].str ) == entity[i].strLength );
517
			*value = entity[i].chr;
518
			*length = 1;
519
			return ( p + entity[i].strLength );
520
		}
521
	}
522
 
523
	// So it wasn't an entity, its unrecognized, or something like that.
524
	*value = *p;	// Don't put back the last one, since we return it!
525
	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
526
					// Just writes strange XML.
527
	return p+1;
528
}
529
 
530
 
531
bool TiXmlBase::StringEqual( const char* p,
532
							 const char* tag,
533
							 bool ignoreCase,
534
							 TiXmlEncoding encoding )
535
{
536
	assert( p );
537
	assert( tag );
538
	if ( !p || !*p )
539
	{
540
		assert( 0 );
541
		return false;
542
	}
543
 
544
	const char* q = p;
545
 
546
	if ( ignoreCase )
547
	{
548
		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
549
		{
550
			++q;
551
			++tag;
552
		}
553
 
554
		if ( *tag == 0 )
555
			return true;
556
	}
557
	else
558
	{
559
		while ( *q && *tag && *q == *tag )
560
		{
561
			++q;
562
			++tag;
563
		}
564
 
565
		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
566
			return true;
567
	}
568
	return false;
569
}
570
 
571
const char* TiXmlBase::ReadText(	const char* p,
572
									TIXML_STRING * text,
573
									bool trimWhiteSpace,
574
									const char* endTag,
575
									bool caseInsensitive,
576
									TiXmlEncoding encoding )
577
{
578
    *text = "";
579
	if (    !trimWhiteSpace			// certain tags always keep whitespace
580
		 || !condenseWhiteSpace )	// if true, whitespace is always kept
581
	{
582
		// Keep all the white space.
583
		while (	   p && *p
584
				&& !StringEqual( p, endTag, caseInsensitive, encoding )
585
			  )
586
		{
587
			int len;
588
			char cArr[4] = { 0, 0, 0, 0 };
589
			p = GetChar( p, cArr, &len, encoding );
590
			text->append( cArr, len );
591
		}
592
	}
593
	else
594
	{
595
		bool whitespace = false;
596
 
597
		// Remove leading white space:
598
		p = SkipWhiteSpace( p, encoding );
599
		while (	   p && *p
600
				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
601
		{
602
			if ( *p == '\r' || *p == '\n' )
603
			{
604
				whitespace = true;
605
				++p;
606
			}
607
			else if ( IsWhiteSpace( *p ) )
608
			{
609
				whitespace = true;
610
				++p;
611
			}
612
			else
613
			{
614
				// If we've found whitespace, add it before the
615
				// new character. Any whitespace just becomes a space.
616
				if ( whitespace )
617
				{
618
					(*text) += ' ';
619
					whitespace = false;
620
				}
621
				int len;
622
				char cArr[4] = { 0, 0, 0, 0 };
623
				p = GetChar( p, cArr, &len, encoding );
624
				if ( len == 1 )
625
					(*text) += cArr[0];	// more efficient
626
				else
627
					text->append( cArr, len );
628
			}
629
		}
630
	}
631
	if ( p )
632
		p += strlen( endTag );
633
	return p;
634
}
635
 
636
#ifdef TIXML_USE_STL
637
 
638
void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
639
{
640
	// The basic issue with a document is that we don't know what we're
641
	// streaming. Read something presumed to be a tag (and hope), then
642
	// identify it, and call the appropriate stream method on the tag.
643
	//
644
	// This "pre-streaming" will never read the closing ">" so the
645
	// sub-tag can orient itself.
646
 
647
	if ( !StreamTo( in, '<', tag ) )
648
	{
649
		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
650
		return;
651
	}
652
 
653
	while ( in->good() )
654
	{
655
		int tagIndex = (int) tag->length();
656
		while ( in->good() && in->peek() != '>' )
657
		{
658
			int c = in->get();
659
			if ( c <= 0 )
660
			{
661
				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
662
				break;
663
			}
664
			(*tag) += (char) c;
665
		}
666
 
667
		if ( in->good() )
668
		{
669
			// We now have something we presume to be a node of
670
			// some sort. Identify it, and call the node to
671
			// continue streaming.
672
			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
673
 
674
			if ( node )
675
			{
676
				node->StreamIn( in, tag );
677
				bool isElement = node->ToElement() != 0;
678
				delete node;
679
				node = 0;
680
 
681
				// If this is the root element, we're done. Parsing will be
682
				// done by the >> operator.
683
				if ( isElement )
684
				{
685
					return;
686
				}
687
			}
688
			else
689
			{
690
				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
691
				return;
692
			}
693
		}
694
	}
695
	// We should have returned sooner.
696
	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
697
}
698
 
699
#endif
700
 
701
const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
702
{
703
	ClearError();
704
 
705
	// Parse away, at the document level. Since a document
706
	// contains nothing but other tags, most of what happens
707
	// here is skipping white space.
708
	if ( !p || !*p )
709
	{
710
		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
711
		return 0;
712
	}
713
 
714
	// Note that, for a document, this needs to come
715
	// before the while space skip, so that parsing
716
	// starts from the pointer we are given.
717
	location.Clear();
718
	if ( prevData )
719
	{
720
		location.row = prevData->cursor.row;
721
		location.col = prevData->cursor.col;
722
	}
723
	else
724
	{
725
		location.row = 0;
726
		location.col = 0;
727
	}
728
	TiXmlParsingData data( p, TabSize(), location.row, location.col );
729
	location = data.Cursor();
730
 
731
	if ( encoding == TIXML_ENCODING_UNKNOWN )
732
	{
733
		// Check for the Microsoft UTF-8 lead bytes.
734
		const unsigned char* pU = (const unsigned char*)p;
735
		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
736
			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
737
			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
738
		{
739
			encoding = TIXML_ENCODING_UTF8;
740
			useMicrosoftBOM = true;
741
		}
742
	}
743
 
744
    p = SkipWhiteSpace( p, encoding );
745
	if ( !p )
746
	{
747
		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
748
		return 0;
749
	}
750
 
751
	while ( p && *p )
752
	{
753
		TiXmlNode* node = Identify( p, encoding );
754
		if ( node )
755
		{
756
			p = node->Parse( p, &data, encoding );
757
			LinkEndChild( node );
758
		}
759
		else
760
		{
761
			break;
762
		}
763
 
764
		// Did we get encoding info?
765
		if (    encoding == TIXML_ENCODING_UNKNOWN
766
			 && node->ToDeclaration() )
767
		{
768
			TiXmlDeclaration* dec = node->ToDeclaration();
769
			const char* enc = dec->Encoding();
770
			assert( enc );
771
 
772
			if ( *enc == 0 )
773
				encoding = TIXML_ENCODING_UTF8;
774
			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
775
				encoding = TIXML_ENCODING_UTF8;
776
			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
777
				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
778
			else
779
				encoding = TIXML_ENCODING_LEGACY;
780
		}
781
 
782
		p = SkipWhiteSpace( p, encoding );
783
	}
784
 
785
	// Was this empty?
786
	if ( !firstChild ) {
787
		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
788
		return 0;
789
	}
790
 
791
	// All is well.
792
	return p;
793
}
794
 
795
void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
796
{
797
	// The first error in a chain is more accurate - don't set again!
798
	if ( error )
799
		return;
800
 
801
	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
802
	error   = true;
803
	errorId = err;
804
	errorDesc = errorString[ errorId ];
805
 
806
	errorLocation.Clear();
807
	if ( pError && data )
808
	{
809
		data->Stamp( pError, encoding );
810
		errorLocation = data->Cursor();
811
	}
812
}
813
 
814
 
815
TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
816
{
817
	TiXmlNode* returnNode = 0;
818
 
819
	p = SkipWhiteSpace( p, encoding );
820
	if( !p || !*p || *p != '<' )
821
	{
822
		return 0;
823
	}
824
 
825
	TiXmlDocument* doc = GetDocument();
826
	p = SkipWhiteSpace( p, encoding );
827
 
828
	if ( !p || !*p )
829
	{
830
		return 0;
831
	}
832
 
833
	// What is this thing?
834
	// - Elements start with a letter or underscore, but xml is reserved.
835
	// - Comments: ";
1346
 
1347
	if ( !StringEqual( p, startTag, false, encoding ) )
1348
	{
1349
		document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1350
		return 0;
1351
	}
1352
	p += strlen( startTag );
1353
 
1354
	// [ 1475201 ] TinyXML parses entities in comments
1355
	// Oops - ReadText doesn't work, because we don't want to parse the entities.
1356
	// p = ReadText( p, &value, false, endTag, false, encoding );
1357
	//
1358
	// from the XML spec:
1359
	/*
1360
	 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1361
	              they may appear within the document type declaration at places allowed by the grammar.
1362
				  They are not part of the document's character data; an XML processor MAY, but need not,
1363
				  make it possible for an application to retrieve the text of comments. For compatibility,
1364
				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1365
				  references MUST NOT be recognized within comments.
1366
 
1367
				  An example of a comment:
1368
 
1369
				  
1370
	*/
1371
 
1372
    value = "";
1373
	// Keep all the white space.
1374
	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
1375
	{
1376
		value.append( p, 1 );
1377
		++p;
1378
	}
1379
	if ( p )
1380
		p += strlen( endTag );
1381
 
1382
	return p;
1383
}
1384
 
1385
 
1386
const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1387
{
1388
	p = SkipWhiteSpace( p, encoding );
1389
	if ( !p || !*p ) return 0;
1390
 
1391
//	int tabsize = 4;
1392
//	if ( document )
1393
//		tabsize = document->TabSize();
1394
 
1395
	if ( data )
1396
	{
1397
		data->Stamp( p, encoding );
1398
		location = data->Cursor();
1399
	}
1400
	// Read the name, the '=' and the value.
1401
	const char* pErr = p;
1402
	p = ReadName( p, &name, encoding );
1403
	if ( !p || !*p )
1404
	{
1405
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1406
		return 0;
1407
	}
1408
	p = SkipWhiteSpace( p, encoding );
1409
	if ( !p || !*p || *p != '=' )
1410
	{
1411
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1412
		return 0;
1413
	}
1414
 
1415
	++p;	// skip '='
1416
	p = SkipWhiteSpace( p, encoding );
1417
	if ( !p || !*p )
1418
	{
1419
		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1420
		return 0;
1421
	}
1422
 
1423
	const char* end;
1424
	const char SINGLE_QUOTE = '\'';
1425
	const char DOUBLE_QUOTE = '\"';
1426
 
1427
	if ( *p == SINGLE_QUOTE )
1428
	{
1429
		++p;
1430
		end = "\'";		// single quote in string
1431
		p = ReadText( p, &value, false, end, false, encoding );
1432
	}
1433
	else if ( *p == DOUBLE_QUOTE )
1434
	{
1435
		++p;
1436
		end = "\"";		// double quote in string
1437
		p = ReadText( p, &value, false, end, false, encoding );
1438
	}
1439
	else
1440
	{
1441
		// All attribute values should be in single or double quotes.
1442
		// But this is such a common error that the parser will try
1443
		// its best, even without them.
1444
		value = "";
1445
		while (    p && *p											// existence
1446
				&& !IsWhiteSpace( *p )		// whitespace
1447
				&& *p != '/' && *p != '>' )							// tag end
1448
		{
1449
			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1450
				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
1451
				// We did not have an opening quote but seem to have a
1452
				// closing one. Give up and throw an error.
1453
				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1454
				return 0;
1455
			}
1456
			value += *p;
1457
			++p;
1458
		}
1459
	}
1460
	return p;
1461
}
1462
 
1463
#ifdef TIXML_USE_STL
1464
void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1465
{
1466
	while ( in->good() )
1467
	{
1468
		int c = in->peek();
1469
		if ( !cdata && (c == '<' ) )
1470
		{
1471
			return;
1472
		}
1473
		if ( c <= 0 )
1474
		{
1475
			TiXmlDocument* document = GetDocument();
1476
			if ( document )
1477
				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1478
			return;
1479
		}
1480
 
1481
		(*tag) += (char) c;
1482
		in->get();	// "commits" the peek made above
1483
 
1484
		if ( cdata && c == '>' && tag->size() >= 3 ) {
1485
			size_t len = tag->size();
1486
			if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1487
				// terminator of cdata.
1488
				return;
1489
			}
1490
		}
1491
	}
1492
}
1493
#endif
1494
 
1495
const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1496
{
1497
	value = "";
1498
	TiXmlDocument* document = GetDocument();
1499
 
1500
	if ( data )
1501
	{
1502
		data->Stamp( p, encoding );
1503
		location = data->Cursor();
1504
	}
1505
 
1506
	const char* const startTag = "
1507
	const char* const endTag   = "]]>";
1508
 
1509
	if ( cdata || StringEqual( p, startTag, false, encoding ) )
1510
	{
1511
		cdata = true;
1512
 
1513
		if ( !StringEqual( p, startTag, false, encoding ) )
1514
		{
1515
			document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1516
			return 0;
1517
		}
1518
		p += strlen( startTag );
1519
 
1520
		// Keep all the white space, ignore the encoding, etc.
1521
		while (	   p && *p
1522
				&& !StringEqual( p, endTag, false, encoding )
1523
			  )
1524
		{
1525
			value += *p;
1526
			++p;
1527
		}
1528
 
1529
		TIXML_STRING dummy;
1530
		p = ReadText( p, &dummy, false, endTag, false, encoding );
1531
		return p;
1532
	}
1533
	else
1534
	{
1535
		bool ignoreWhite = true;
1536
 
1537
		const char* end = "<";
1538
		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1539
		if ( p )
1540
			return p-1;	// don't truncate the '<'
1541
		return 0;
1542
	}
1543
}
1544
 
1545
#ifdef TIXML_USE_STL
1546
void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1547
{
1548
	while ( in->good() )
1549
	{
1550
		int c = in->get();
1551
		if ( c <= 0 )
1552
		{
1553
			TiXmlDocument* document = GetDocument();
1554
			if ( document )
1555
				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1556
			return;
1557
		}
1558
		(*tag) += (char) c;
1559
 
1560
		if ( c == '>' )
1561
		{
1562
			// All is well.
1563
			return;
1564
		}
1565
	}
1566
}
1567
#endif
1568
 
1569
const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1570
{
1571
	p = SkipWhiteSpace( p, _encoding );
1572
	// Find the beginning, find the end, and look for
1573
	// the stuff in-between.
1574
	TiXmlDocument* document = GetDocument();
1575
	if ( !p || !*p || !StringEqual( p, "
1576
	{
1577
		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1578
		return 0;
1579
	}
1580
	if ( data )
1581
	{
1582
		data->Stamp( p, _encoding );
1583
		location = data->Cursor();
1584
	}
1585
	p += 5;
1586
 
1587
	version = "";
1588
	encoding = "";
1589
	standalone = "";
1590
 
1591
	while ( p && *p )
1592
	{
1593
		if ( *p == '>' )
1594
		{
1595
			++p;
1596
			return p;
1597
		}
1598
 
1599
		p = SkipWhiteSpace( p, _encoding );
1600
		if ( StringEqual( p, "version", true, _encoding ) )
1601
		{
1602
			TiXmlAttribute attrib;
1603
			p = attrib.Parse( p, data, _encoding );
1604
			version = attrib.Value();
1605
		}
1606
		else if ( StringEqual( p, "encoding", true, _encoding ) )
1607
		{
1608
			TiXmlAttribute attrib;
1609
			p = attrib.Parse( p, data, _encoding );
1610
			encoding = attrib.Value();
1611
		}
1612
		else if ( StringEqual( p, "standalone", true, _encoding ) )
1613
		{
1614
			TiXmlAttribute attrib;
1615
			p = attrib.Parse( p, data, _encoding );
1616
			standalone = attrib.Value();
1617
		}
1618
		else
1619
		{
1620
			// Read over whatever it is.
1621
			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1622
				++p;
1623
		}
1624
	}
1625
	return 0;
1626
}
1627
 
1628
bool TiXmlText::Blank() const
1629
{
1630
	for ( unsigned i=0; i
1631
		if ( !IsWhiteSpace( value[i] ) )
1632
			return false;
1633
	return true;
1634
}
1635