Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5725 | serge | 1 | /* |
2 | www.sourceforge.net/projects/tinyxml |
||
3 | Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) |
||
4 | |||
5 | This software is provided 'as-is', without any express or implied |
||
6 | warranty. In no event will the authors be held liable for any |
||
7 | damages arising from the use of this software. |
||
8 | |||
9 | Permission is granted to anyone to use this software for any |
||
10 | purpose, including commercial applications, and to alter it and |
||
11 | redistribute it freely, subject to the following restrictions: |
||
12 | |||
13 | 1. The origin of this software must not be misrepresented; you must |
||
14 | not claim that you wrote the original software. If you use this |
||
15 | software in a product, an acknowledgment in the product documentation |
||
16 | would be appreciated but is not required. |
||
17 | |||
18 | 2. Altered source versions must be plainly marked as such, and |
||
19 | must not be misrepresented as being the original software. |
||
20 | |||
21 | 3. This notice may not be removed or altered from any source |
||
22 | distribution. |
||
23 | */ |
||
24 | |||
25 | #include "tinyxml.h" |
||
26 | |||
27 | //#define DEBUG_PARSER |
||
28 | #if defined( DEBUG_PARSER ) |
||
29 | # if defined( DEBUG ) && defined( _MSC_VER ) |
||
30 | # include |
||
31 | # define TIXML_LOG OutputDebugString |
||
32 | # else |
||
33 | # define TIXML_LOG printf |
||
34 | # endif |
||
35 | #endif |
||
36 | |||
37 | // Note that "PutString" hardcodes the same list. This |
||
38 | // is less flexible than it appears. Changing the entries |
||
39 | // or order will break putstring. |
||
40 | TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = |
||
41 | { |
||
42 | { "&", 5, '&' }, |
||
43 | { "<", 4, '<' }, |
||
44 | { ">", 4, '>' }, |
||
45 | { """, 6, '\"' }, |
||
46 | { "'", 6, '\'' } |
||
47 | }; |
||
48 | |||
49 | // Bunch of unicode info at: |
||
50 | // http://www.unicode.org/faq/utf_bom.html |
||
51 | // Including the basic of this table, which determines the #bytes in the |
||
52 | // sequence from the lead byte. 1 placed for invalid sequences -- |
||
53 | // although the result will be junk, pass it through as much as possible. |
||
54 | // Beware of the non-characters in UTF-8: |
||
55 | // ef bb bf (Microsoft "lead bytes") |
||
56 | // ef bf be |
||
57 | // ef bf bf |
||
58 | |||
// The three bytes EF BB BF that the parser treats as the Microsoft UTF-8
// lead-byte sequence (byte order mark).
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;
||
62 | |||
63 | const int TiXmlBase::utf8ByteTable[256] = |
||
64 | { |
||
65 | // 0 1 2 3 4 5 6 7 8 9 a b c d e f |
||
66 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 |
||
67 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 |
||
68 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 |
||
69 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 |
||
70 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 |
||
71 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 |
||
72 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 |
||
73 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range |
||
74 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid |
||
75 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90 |
||
76 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0 |
||
77 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0 |
||
78 | 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte |
||
79 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0 |
||
80 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte |
||
81 | 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid |
||
82 | }; |
||
83 | |||
84 | |||
85 | void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length ) |
||
86 | { |
||
87 | const unsigned long BYTE_MASK = 0xBF; |
||
88 | const unsigned long BYTE_MARK = 0x80; |
||
89 | const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
||
90 | |||
91 | if (input < 0x80) |
||
92 | *length = 1; |
||
93 | else if ( input < 0x800 ) |
||
94 | *length = 2; |
||
95 | else if ( input < 0x10000 ) |
||
96 | *length = 3; |
||
97 | else if ( input < 0x200000 ) |
||
98 | *length = 4; |
||
99 | else |
||
100 | { *length = 0; return; } // This code won't covert this correctly anyway. |
||
101 | |||
102 | output += *length; |
||
103 | |||
104 | // Scary scary fall throughs. |
||
105 | switch (*length) |
||
106 | { |
||
107 | case 4: |
||
108 | --output; |
||
109 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
||
110 | input >>= 6; |
||
111 | case 3: |
||
112 | --output; |
||
113 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
||
114 | input >>= 6; |
||
115 | case 2: |
||
116 | --output; |
||
117 | *output = (char)((input | BYTE_MARK) & BYTE_MASK); |
||
118 | input >>= 6; |
||
119 | case 1: |
||
120 | --output; |
||
121 | *output = (char)(input | FIRST_BYTE_MARK[*length]); |
||
122 | } |
||
123 | } |
||
124 | |||
125 | |||
126 | /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) |
||
127 | { |
||
128 | // This will only work for low-ascii, everything else is assumed to be a valid |
||
129 | // letter. I'm not sure this is the best approach, but it is quite tricky trying |
||
130 | // to figure out alhabetical vs. not across encoding. So take a very |
||
131 | // conservative approach. |
||
132 | |||
133 | // if ( encoding == TIXML_ENCODING_UTF8 ) |
||
134 | // { |
||
135 | if ( anyByte < 127 ) |
||
136 | return isalpha( anyByte ); |
||
137 | else |
||
138 | return 1; // What else to do? The unicode set is huge...get the english ones right. |
||
139 | // } |
||
140 | // else |
||
141 | // { |
||
142 | // return isalpha( anyByte ); |
||
143 | // } |
||
144 | } |
||
145 | |||
146 | |||
147 | /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ ) |
||
148 | { |
||
149 | // This will only work for low-ascii, everything else is assumed to be a valid |
||
150 | // letter. I'm not sure this is the best approach, but it is quite tricky trying |
||
151 | // to figure out alhabetical vs. not across encoding. So take a very |
||
152 | // conservative approach. |
||
153 | |||
154 | // if ( encoding == TIXML_ENCODING_UTF8 ) |
||
155 | // { |
||
156 | if ( anyByte < 127 ) |
||
157 | return isalnum( anyByte ); |
||
158 | else |
||
159 | return 1; // What else to do? The unicode set is huge...get the english ones right. |
||
160 | // } |
||
161 | // else |
||
162 | // { |
||
163 | // return isalnum( anyByte ); |
||
164 | // } |
||
165 | } |
||
166 | |||
167 | |||
168 | class TiXmlParsingData |
||
169 | { |
||
170 | friend class TiXmlDocument; |
||
171 | public: |
||
172 | void Stamp( const char* now, TiXmlEncoding encoding ); |
||
173 | |||
174 | const TiXmlCursor& Cursor() { return cursor; } |
||
175 | |||
176 | private: |
||
177 | // Only used by the document! |
||
178 | TiXmlParsingData( const char* start, int _tabsize, int row, int col ) |
||
179 | { |
||
180 | assert( start ); |
||
181 | stamp = start; |
||
182 | tabsize = _tabsize; |
||
183 | cursor.row = row; |
||
184 | cursor.col = col; |
||
185 | } |
||
186 | |||
187 | TiXmlCursor cursor; |
||
188 | const char* stamp; |
||
189 | int tabsize; |
||
190 | }; |
||
191 | |||
192 | |||
193 | void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ) |
||
194 | { |
||
195 | assert( now ); |
||
196 | |||
197 | // Do nothing if the tabsize is 0. |
||
198 | if ( tabsize < 1 ) |
||
199 | { |
||
200 | return; |
||
201 | } |
||
202 | |||
203 | // Get the current row, column. |
||
204 | int row = cursor.row; |
||
205 | int col = cursor.col; |
||
206 | const char* p = stamp; |
||
207 | assert( p ); |
||
208 | |||
209 | while ( p < now ) |
||
210 | { |
||
211 | // Treat p as unsigned, so we have a happy compiler. |
||
212 | const unsigned char* pU = (const unsigned char*)p; |
||
213 | |||
214 | // Code contributed by Fletcher Dunn: (modified by lee) |
||
215 | switch (*pU) { |
||
216 | case 0: |
||
217 | // We *should* never get here, but in case we do, don't |
||
218 | // advance past the terminating null character, ever |
||
219 | return; |
||
220 | |||
221 | case '\r': |
||
222 | // bump down to the next line |
||
223 | ++row; |
||
224 | col = 0; |
||
225 | // Eat the character |
||
226 | ++p; |
||
227 | |||
228 | // Check for \r\n sequence, and treat this as a single character |
||
229 | if (*p == '\n') { |
||
230 | ++p; |
||
231 | } |
||
232 | break; |
||
233 | |||
234 | case '\n': |
||
235 | // bump down to the next line |
||
236 | ++row; |
||
237 | col = 0; |
||
238 | |||
239 | // Eat the character |
||
240 | ++p; |
||
241 | |||
242 | // Check for \n\r sequence, and treat this as a single |
||
243 | // character. (Yes, this bizarre thing does occur still |
||
244 | // on some arcane platforms...) |
||
245 | if (*p == '\r') { |
||
246 | ++p; |
||
247 | } |
||
248 | break; |
||
249 | |||
250 | case '\t': |
||
251 | // Eat the character |
||
252 | ++p; |
||
253 | |||
254 | // Skip to next tab stop |
||
255 | col = (col / tabsize + 1) * tabsize; |
||
256 | break; |
||
257 | |||
258 | case TIXML_UTF_LEAD_0: |
||
259 | if ( encoding == TIXML_ENCODING_UTF8 ) |
||
260 | { |
||
261 | if ( *(p+1) && *(p+2) ) |
||
262 | { |
||
263 | // In these cases, don't advance the column. These are |
||
264 | // 0-width spaces. |
||
265 | if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 ) |
||
266 | p += 3; |
||
267 | else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU ) |
||
268 | p += 3; |
||
269 | else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU ) |
||
270 | p += 3; |
||
271 | else |
||
272 | { p +=3; ++col; } // A normal character. |
||
273 | } |
||
274 | } |
||
275 | else |
||
276 | { |
||
277 | ++p; |
||
278 | ++col; |
||
279 | } |
||
280 | break; |
||
281 | |||
282 | default: |
||
283 | if ( encoding == TIXML_ENCODING_UTF8 ) |
||
284 | { |
||
285 | // Eat the 1 to 4 byte utf8 character. |
||
286 | int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)]; |
||
287 | if ( step == 0 ) |
||
288 | step = 1; // Error case from bad encoding, but handle gracefully. |
||
289 | p += step; |
||
290 | |||
291 | // Just advance one column, of course. |
||
292 | ++col; |
||
293 | } |
||
294 | else |
||
295 | { |
||
296 | ++p; |
||
297 | ++col; |
||
298 | } |
||
299 | break; |
||
300 | } |
||
301 | } |
||
302 | cursor.row = row; |
||
303 | cursor.col = col; |
||
304 | assert( cursor.row >= -1 ); |
||
305 | assert( cursor.col >= -1 ); |
||
306 | stamp = p; |
||
307 | assert( stamp ); |
||
308 | } |
||
309 | |||
310 | |||
311 | const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ) |
||
312 | { |
||
313 | if ( !p || !*p ) |
||
314 | { |
||
315 | return 0; |
||
316 | } |
||
317 | if ( encoding == TIXML_ENCODING_UTF8 ) |
||
318 | { |
||
319 | while ( *p ) |
||
320 | { |
||
321 | const unsigned char* pU = (const unsigned char*)p; |
||
322 | |||
323 | // Skip the stupid Microsoft UTF-8 Byte order marks |
||
324 | if ( *(pU+0)==TIXML_UTF_LEAD_0 |
||
325 | && *(pU+1)==TIXML_UTF_LEAD_1 |
||
326 | && *(pU+2)==TIXML_UTF_LEAD_2 ) |
||
327 | { |
||
328 | p += 3; |
||
329 | continue; |
||
330 | } |
||
331 | else if(*(pU+0)==TIXML_UTF_LEAD_0 |
||
332 | && *(pU+1)==0xbfU |
||
333 | && *(pU+2)==0xbeU ) |
||
334 | { |
||
335 | p += 3; |
||
336 | continue; |
||
337 | } |
||
338 | else if(*(pU+0)==TIXML_UTF_LEAD_0 |
||
339 | && *(pU+1)==0xbfU |
||
340 | && *(pU+2)==0xbfU ) |
||
341 | { |
||
342 | p += 3; |
||
343 | continue; |
||
344 | } |
||
345 | |||
346 | if ( IsWhiteSpace( *p ) ) |
||
347 | ++p; |
||
348 | else |
||
349 | break; |
||
350 | } |
||
351 | } |
||
352 | else |
||
353 | { |
||
354 | while ( *p && IsWhiteSpace( *p ) ) |
||
355 | ++p; |
||
356 | } |
||
357 | |||
358 | return p; |
||
359 | } |
||
360 | |||
361 | #ifdef TIXML_USE_STL |
||
362 | /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag ) |
||
363 | { |
||
364 | for( ;; ) |
||
365 | { |
||
366 | if ( !in->good() ) return false; |
||
367 | |||
368 | int c = in->peek(); |
||
369 | // At this scope, we can't get to a document. So fail silently. |
||
370 | if ( !IsWhiteSpace( c ) || c <= 0 ) |
||
371 | return true; |
||
372 | |||
373 | *tag += (char) in->get(); |
||
374 | } |
||
375 | } |
||
376 | |||
377 | /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag ) |
||
378 | { |
||
379 | //assert( character > 0 && character < 128 ); // else it won't work in utf-8 |
||
380 | while ( in->good() ) |
||
381 | { |
||
382 | int c = in->peek(); |
||
383 | if ( c == character ) |
||
384 | return true; |
||
385 | if ( c <= 0 ) // Silent failure: can't get document at this scope |
||
386 | return false; |
||
387 | |||
388 | in->get(); |
||
389 | *tag += (char) c; |
||
390 | } |
||
391 | return false; |
||
392 | } |
||
393 | #endif |
||
394 | |||
395 | // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The |
||
396 | // "assign" optimization removes over 10% of the execution time. |
||
397 | // |
||
398 | const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ) |
||
399 | { |
||
400 | // Oddly, not supported on some comilers, |
||
401 | //name->clear(); |
||
402 | // So use this: |
||
403 | *name = ""; |
||
404 | assert( p ); |
||
405 | |||
406 | // Names start with letters or underscores. |
||
407 | // Of course, in unicode, tinyxml has no idea what a letter *is*. The |
||
408 | // algorithm is generous. |
||
409 | // |
||
410 | // After that, they can be letters, underscores, numbers, |
||
411 | // hyphens, or colons. (Colons are valid ony for namespaces, |
||
412 | // but tinyxml can't tell namespaces from names.) |
||
413 | if ( p && *p |
||
414 | && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) ) |
||
415 | { |
||
416 | const char* start = p; |
||
417 | while( p && *p |
||
418 | && ( IsAlphaNum( (unsigned char ) *p, encoding ) |
||
419 | || *p == '_' |
||
420 | || *p == '-' |
||
421 | || *p == '.' |
||
422 | || *p == ':' ) ) |
||
423 | { |
||
424 | //(*name) += *p; // expensive |
||
425 | ++p; |
||
426 | } |
||
427 | if ( p-start > 0 ) { |
||
428 | name->assign( start, p-start ); |
||
429 | } |
||
430 | return p; |
||
431 | } |
||
432 | return 0; |
||
433 | } |
||
434 | |||
435 | const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding ) |
||
436 | { |
||
437 | // Presume an entity, and pull it out. |
||
438 | TIXML_STRING ent; |
||
439 | int i; |
||
440 | *length = 0; |
||
441 | |||
442 | if ( *(p+1) && *(p+1) == '#' && *(p+2) ) |
||
443 | { |
||
444 | unsigned long ucs = 0; |
||
445 | ptrdiff_t delta = 0; |
||
446 | unsigned mult = 1; |
||
447 | |||
448 | if ( *(p+2) == 'x' ) |
||
449 | { |
||
450 | // Hexadecimal. |
||
451 | if ( !*(p+3) ) return 0; |
||
452 | |||
453 | const char* q = p+3; |
||
454 | q = strchr( q, ';' ); |
||
455 | |||
456 | if ( !q || !*q ) return 0; |
||
457 | |||
458 | delta = q-p; |
||
459 | --q; |
||
460 | |||
461 | while ( *q != 'x' ) |
||
462 | { |
||
463 | if ( *q >= '0' && *q <= '9' ) |
||
464 | ucs += mult * (*q - '0'); |
||
465 | else if ( *q >= 'a' && *q <= 'f' ) |
||
466 | ucs += mult * (*q - 'a' + 10); |
||
467 | else if ( *q >= 'A' && *q <= 'F' ) |
||
468 | ucs += mult * (*q - 'A' + 10 ); |
||
469 | else |
||
470 | return 0; |
||
471 | mult *= 16; |
||
472 | --q; |
||
473 | } |
||
474 | } |
||
475 | else |
||
476 | { |
||
477 | // Decimal. |
||
478 | if ( !*(p+2) ) return 0; |
||
479 | |||
480 | const char* q = p+2; |
||
481 | q = strchr( q, ';' ); |
||
482 | |||
483 | if ( !q || !*q ) return 0; |
||
484 | |||
485 | delta = q-p; |
||
486 | --q; |
||
487 | |||
488 | while ( *q != '#' ) |
||
489 | { |
||
490 | if ( *q >= '0' && *q <= '9' ) |
||
491 | ucs += mult * (*q - '0'); |
||
492 | else |
||
493 | return 0; |
||
494 | mult *= 10; |
||
495 | --q; |
||
496 | } |
||
497 | } |
||
498 | if ( encoding == TIXML_ENCODING_UTF8 ) |
||
499 | { |
||
500 | // convert the UCS to UTF-8 |
||
501 | ConvertUTF32ToUTF8( ucs, value, length ); |
||
502 | } |
||
503 | else |
||
504 | { |
||
505 | *value = (char)ucs; |
||
506 | *length = 1; |
||
507 | } |
||
508 | return p + delta + 1; |
||
509 | } |
||
510 | |||
511 | // Now try to match it. |
||
512 | for( i=0; i |
||
513 | { |
||
514 | if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 ) |
||
515 | { |
||
516 | assert( strlen( entity[i].str ) == entity[i].strLength ); |
||
517 | *value = entity[i].chr; |
||
518 | *length = 1; |
||
519 | return ( p + entity[i].strLength ); |
||
520 | } |
||
521 | } |
||
522 | |||
523 | // So it wasn't an entity, its unrecognized, or something like that. |
||
524 | *value = *p; // Don't put back the last one, since we return it! |
||
525 | //*length = 1; // Leave unrecognized entities - this doesn't really work. |
||
526 | // Just writes strange XML. |
||
527 | return p+1; |
||
528 | } |
||
529 | |||
530 | |||
531 | bool TiXmlBase::StringEqual( const char* p, |
||
532 | const char* tag, |
||
533 | bool ignoreCase, |
||
534 | TiXmlEncoding encoding ) |
||
535 | { |
||
536 | assert( p ); |
||
537 | assert( tag ); |
||
538 | if ( !p || !*p ) |
||
539 | { |
||
540 | assert( 0 ); |
||
541 | return false; |
||
542 | } |
||
543 | |||
544 | const char* q = p; |
||
545 | |||
546 | if ( ignoreCase ) |
||
547 | { |
||
548 | while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) ) |
||
549 | { |
||
550 | ++q; |
||
551 | ++tag; |
||
552 | } |
||
553 | |||
554 | if ( *tag == 0 ) |
||
555 | return true; |
||
556 | } |
||
557 | else |
||
558 | { |
||
559 | while ( *q && *tag && *q == *tag ) |
||
560 | { |
||
561 | ++q; |
||
562 | ++tag; |
||
563 | } |
||
564 | |||
565 | if ( *tag == 0 ) // Have we found the end of the tag, and everything equal? |
||
566 | return true; |
||
567 | } |
||
568 | return false; |
||
569 | } |
||
570 | |||
571 | const char* TiXmlBase::ReadText( const char* p, |
||
572 | TIXML_STRING * text, |
||
573 | bool trimWhiteSpace, |
||
574 | const char* endTag, |
||
575 | bool caseInsensitive, |
||
576 | TiXmlEncoding encoding ) |
||
577 | { |
||
578 | *text = ""; |
||
579 | if ( !trimWhiteSpace // certain tags always keep whitespace |
||
580 | || !condenseWhiteSpace ) // if true, whitespace is always kept |
||
581 | { |
||
582 | // Keep all the white space. |
||
583 | while ( p && *p |
||
584 | && !StringEqual( p, endTag, caseInsensitive, encoding ) |
||
585 | ) |
||
586 | { |
||
587 | int len; |
||
588 | char cArr[4] = { 0, 0, 0, 0 }; |
||
589 | p = GetChar( p, cArr, &len, encoding ); |
||
590 | text->append( cArr, len ); |
||
591 | } |
||
592 | } |
||
593 | else |
||
594 | { |
||
595 | bool whitespace = false; |
||
596 | |||
597 | // Remove leading white space: |
||
598 | p = SkipWhiteSpace( p, encoding ); |
||
599 | while ( p && *p |
||
600 | && !StringEqual( p, endTag, caseInsensitive, encoding ) ) |
||
601 | { |
||
602 | if ( *p == '\r' || *p == '\n' ) |
||
603 | { |
||
604 | whitespace = true; |
||
605 | ++p; |
||
606 | } |
||
607 | else if ( IsWhiteSpace( *p ) ) |
||
608 | { |
||
609 | whitespace = true; |
||
610 | ++p; |
||
611 | } |
||
612 | else |
||
613 | { |
||
614 | // If we've found whitespace, add it before the |
||
615 | // new character. Any whitespace just becomes a space. |
||
616 | if ( whitespace ) |
||
617 | { |
||
618 | (*text) += ' '; |
||
619 | whitespace = false; |
||
620 | } |
||
621 | int len; |
||
622 | char cArr[4] = { 0, 0, 0, 0 }; |
||
623 | p = GetChar( p, cArr, &len, encoding ); |
||
624 | if ( len == 1 ) |
||
625 | (*text) += cArr[0]; // more efficient |
||
626 | else |
||
627 | text->append( cArr, len ); |
||
628 | } |
||
629 | } |
||
630 | } |
||
631 | if ( p ) |
||
632 | p += strlen( endTag ); |
||
633 | return p; |
||
634 | } |
||
635 | |||
636 | #ifdef TIXML_USE_STL |
||
637 | |||
638 | void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag ) |
||
639 | { |
||
640 | // The basic issue with a document is that we don't know what we're |
||
641 | // streaming. Read something presumed to be a tag (and hope), then |
||
642 | // identify it, and call the appropriate stream method on the tag. |
||
643 | // |
||
644 | // This "pre-streaming" will never read the closing ">" so the |
||
645 | // sub-tag can orient itself. |
||
646 | |||
647 | if ( !StreamTo( in, '<', tag ) ) |
||
648 | { |
||
649 | SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
650 | return; |
||
651 | } |
||
652 | |||
653 | while ( in->good() ) |
||
654 | { |
||
655 | int tagIndex = (int) tag->length(); |
||
656 | while ( in->good() && in->peek() != '>' ) |
||
657 | { |
||
658 | int c = in->get(); |
||
659 | if ( c <= 0 ) |
||
660 | { |
||
661 | SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
662 | break; |
||
663 | } |
||
664 | (*tag) += (char) c; |
||
665 | } |
||
666 | |||
667 | if ( in->good() ) |
||
668 | { |
||
669 | // We now have something we presume to be a node of |
||
670 | // some sort. Identify it, and call the node to |
||
671 | // continue streaming. |
||
672 | TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING ); |
||
673 | |||
674 | if ( node ) |
||
675 | { |
||
676 | node->StreamIn( in, tag ); |
||
677 | bool isElement = node->ToElement() != 0; |
||
678 | delete node; |
||
679 | node = 0; |
||
680 | |||
681 | // If this is the root element, we're done. Parsing will be |
||
682 | // done by the >> operator. |
||
683 | if ( isElement ) |
||
684 | { |
||
685 | return; |
||
686 | } |
||
687 | } |
||
688 | else |
||
689 | { |
||
690 | SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
691 | return; |
||
692 | } |
||
693 | } |
||
694 | } |
||
695 | // We should have returned sooner. |
||
696 | SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
697 | } |
||
698 | |||
699 | #endif |
||
700 | |||
701 | const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding ) |
||
702 | { |
||
703 | ClearError(); |
||
704 | |||
705 | // Parse away, at the document level. Since a document |
||
706 | // contains nothing but other tags, most of what happens |
||
707 | // here is skipping white space. |
||
708 | if ( !p || !*p ) |
||
709 | { |
||
710 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
711 | return 0; |
||
712 | } |
||
713 | |||
714 | // Note that, for a document, this needs to come |
||
715 | // before the while space skip, so that parsing |
||
716 | // starts from the pointer we are given. |
||
717 | location.Clear(); |
||
718 | if ( prevData ) |
||
719 | { |
||
720 | location.row = prevData->cursor.row; |
||
721 | location.col = prevData->cursor.col; |
||
722 | } |
||
723 | else |
||
724 | { |
||
725 | location.row = 0; |
||
726 | location.col = 0; |
||
727 | } |
||
728 | TiXmlParsingData data( p, TabSize(), location.row, location.col ); |
||
729 | location = data.Cursor(); |
||
730 | |||
731 | if ( encoding == TIXML_ENCODING_UNKNOWN ) |
||
732 | { |
||
733 | // Check for the Microsoft UTF-8 lead bytes. |
||
734 | const unsigned char* pU = (const unsigned char*)p; |
||
735 | if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0 |
||
736 | && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1 |
||
737 | && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 ) |
||
738 | { |
||
739 | encoding = TIXML_ENCODING_UTF8; |
||
740 | useMicrosoftBOM = true; |
||
741 | } |
||
742 | } |
||
743 | |||
744 | p = SkipWhiteSpace( p, encoding ); |
||
745 | if ( !p ) |
||
746 | { |
||
747 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
748 | return 0; |
||
749 | } |
||
750 | |||
751 | while ( p && *p ) |
||
752 | { |
||
753 | TiXmlNode* node = Identify( p, encoding ); |
||
754 | if ( node ) |
||
755 | { |
||
756 | p = node->Parse( p, &data, encoding ); |
||
757 | LinkEndChild( node ); |
||
758 | } |
||
759 | else |
||
760 | { |
||
761 | break; |
||
762 | } |
||
763 | |||
764 | // Did we get encoding info? |
||
765 | if ( encoding == TIXML_ENCODING_UNKNOWN |
||
766 | && node->ToDeclaration() ) |
||
767 | { |
||
768 | TiXmlDeclaration* dec = node->ToDeclaration(); |
||
769 | const char* enc = dec->Encoding(); |
||
770 | assert( enc ); |
||
771 | |||
772 | if ( *enc == 0 ) |
||
773 | encoding = TIXML_ENCODING_UTF8; |
||
774 | else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) ) |
||
775 | encoding = TIXML_ENCODING_UTF8; |
||
776 | else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) ) |
||
777 | encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice |
||
778 | else |
||
779 | encoding = TIXML_ENCODING_LEGACY; |
||
780 | } |
||
781 | |||
782 | p = SkipWhiteSpace( p, encoding ); |
||
783 | } |
||
784 | |||
785 | // Was this empty? |
||
786 | if ( !firstChild ) { |
||
787 | SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding ); |
||
788 | return 0; |
||
789 | } |
||
790 | |||
791 | // All is well. |
||
792 | return p; |
||
793 | } |
||
794 | |||
795 | void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding ) |
||
796 | { |
||
797 | // The first error in a chain is more accurate - don't set again! |
||
798 | if ( error ) |
||
799 | return; |
||
800 | |||
801 | assert( err > 0 && err < TIXML_ERROR_STRING_COUNT ); |
||
802 | error = true; |
||
803 | errorId = err; |
||
804 | errorDesc = errorString[ errorId ]; |
||
805 | |||
806 | errorLocation.Clear(); |
||
807 | if ( pError && data ) |
||
808 | { |
||
809 | data->Stamp( pError, encoding ); |
||
810 | errorLocation = data->Cursor(); |
||
811 | } |
||
812 | } |
||
813 | |||
814 | |||
815 | TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding ) |
||
816 | { |
||
817 | TiXmlNode* returnNode = 0; |
||
818 | |||
819 | p = SkipWhiteSpace( p, encoding ); |
||
820 | if( !p || !*p || *p != '<' ) |
||
821 | { |
||
822 | return 0; |
||
823 | } |
||
824 | |||
825 | TiXmlDocument* doc = GetDocument(); |
||
826 | p = SkipWhiteSpace( p, encoding ); |
||
827 | |||
828 | if ( !p || !*p ) |
||
829 | { |
||
830 | return 0; |
||
831 | } |
||
832 | |||
833 | // What is this thing? |
||
834 | // - Elements start with a letter or underscore, but xml is reserved. |
||
835 | // - Comments: "; |
||
1346 | |||
1347 | if ( !StringEqual( p, startTag, false, encoding ) ) |
||
1348 | { |
||
1349 | document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding ); |
||
1350 | return 0; |
||
1351 | } |
||
1352 | p += strlen( startTag ); |
||
1353 | |||
1354 | // [ 1475201 ] TinyXML parses entities in comments |
||
1355 | // Oops - ReadText doesn't work, because we don't want to parse the entities. |
||
1356 | // p = ReadText( p, &value, false, endTag, false, encoding ); |
||
1357 | // |
||
1358 | // from the XML spec: |
||
1359 | /* |
||
1360 | [Definition: Comments may appear anywhere in a document outside other markup; in addition, |
||
1361 | they may appear within the document type declaration at places allowed by the grammar. |
||
1362 | They are not part of the document's character data; an XML processor MAY, but need not, |
||
1363 | make it possible for an application to retrieve the text of comments. For compatibility, |
||
1364 | the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity |
||
1365 | references MUST NOT be recognized within comments. |
||
1366 | |||
1367 | An example of a comment: |
||
1368 | |||
1369 | |||
1370 | */ |
||
1371 | |||
1372 | value = ""; |
||
1373 | // Keep all the white space. |
||
1374 | while ( p && *p && !StringEqual( p, endTag, false, encoding ) ) |
||
1375 | { |
||
1376 | value.append( p, 1 ); |
||
1377 | ++p; |
||
1378 | } |
||
1379 | if ( p ) |
||
1380 | p += strlen( endTag ); |
||
1381 | |||
1382 | return p; |
||
1383 | } |
||
1384 | |||
1385 | |||
1386 | const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
||
1387 | { |
||
1388 | p = SkipWhiteSpace( p, encoding ); |
||
1389 | if ( !p || !*p ) return 0; |
||
1390 | |||
1391 | // int tabsize = 4; |
||
1392 | // if ( document ) |
||
1393 | // tabsize = document->TabSize(); |
||
1394 | |||
1395 | if ( data ) |
||
1396 | { |
||
1397 | data->Stamp( p, encoding ); |
||
1398 | location = data->Cursor(); |
||
1399 | } |
||
1400 | // Read the name, the '=' and the value. |
||
1401 | const char* pErr = p; |
||
1402 | p = ReadName( p, &name, encoding ); |
||
1403 | if ( !p || !*p ) |
||
1404 | { |
||
1405 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding ); |
||
1406 | return 0; |
||
1407 | } |
||
1408 | p = SkipWhiteSpace( p, encoding ); |
||
1409 | if ( !p || !*p || *p != '=' ) |
||
1410 | { |
||
1411 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
||
1412 | return 0; |
||
1413 | } |
||
1414 | |||
1415 | ++p; // skip '=' |
||
1416 | p = SkipWhiteSpace( p, encoding ); |
||
1417 | if ( !p || !*p ) |
||
1418 | { |
||
1419 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
||
1420 | return 0; |
||
1421 | } |
||
1422 | |||
1423 | const char* end; |
||
1424 | const char SINGLE_QUOTE = '\''; |
||
1425 | const char DOUBLE_QUOTE = '\"'; |
||
1426 | |||
1427 | if ( *p == SINGLE_QUOTE ) |
||
1428 | { |
||
1429 | ++p; |
||
1430 | end = "\'"; // single quote in string |
||
1431 | p = ReadText( p, &value, false, end, false, encoding ); |
||
1432 | } |
||
1433 | else if ( *p == DOUBLE_QUOTE ) |
||
1434 | { |
||
1435 | ++p; |
||
1436 | end = "\""; // double quote in string |
||
1437 | p = ReadText( p, &value, false, end, false, encoding ); |
||
1438 | } |
||
1439 | else |
||
1440 | { |
||
1441 | // All attribute values should be in single or double quotes. |
||
1442 | // But this is such a common error that the parser will try |
||
1443 | // its best, even without them. |
||
1444 | value = ""; |
||
1445 | while ( p && *p // existence |
||
1446 | && !IsWhiteSpace( *p ) // whitespace |
||
1447 | && *p != '/' && *p != '>' ) // tag end |
||
1448 | { |
||
1449 | if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) { |
||
1450 | // [ 1451649 ] Attribute values with trailing quotes not handled correctly |
||
1451 | // We did not have an opening quote but seem to have a |
||
1452 | // closing one. Give up and throw an error. |
||
1453 | if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding ); |
||
1454 | return 0; |
||
1455 | } |
||
1456 | value += *p; |
||
1457 | ++p; |
||
1458 | } |
||
1459 | } |
||
1460 | return p; |
||
1461 | } |
||
1462 | |||
1463 | #ifdef TIXML_USE_STL |
||
1464 | void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag ) |
||
1465 | { |
||
1466 | while ( in->good() ) |
||
1467 | { |
||
1468 | int c = in->peek(); |
||
1469 | if ( !cdata && (c == '<' ) ) |
||
1470 | { |
||
1471 | return; |
||
1472 | } |
||
1473 | if ( c <= 0 ) |
||
1474 | { |
||
1475 | TiXmlDocument* document = GetDocument(); |
||
1476 | if ( document ) |
||
1477 | document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN ); |
||
1478 | return; |
||
1479 | } |
||
1480 | |||
1481 | (*tag) += (char) c; |
||
1482 | in->get(); // "commits" the peek made above |
||
1483 | |||
1484 | if ( cdata && c == '>' && tag->size() >= 3 ) { |
||
1485 | size_t len = tag->size(); |
||
1486 | if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) { |
||
1487 | // terminator of cdata. |
||
1488 | return; |
||
1489 | } |
||
1490 | } |
||
1491 | } |
||
1492 | } |
||
1493 | #endif |
||
1494 | |||
1495 | const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding ) |
||
1496 | { |
||
1497 | value = ""; |
||
1498 | TiXmlDocument* document = GetDocument(); |
||
1499 | |||
1500 | if ( data ) |
||
1501 | { |
||
1502 | data->Stamp( p, encoding ); |
||
1503 | location = data->Cursor(); |
||
1504 | } |
||
1505 | |||
1506 | const char* const startTag = " |
||
1507 | const char* const endTag = "]]>"; |
||
1508 | |||
1509 | if ( cdata || StringEqual( p, startTag, false, encoding ) ) |
||
1510 | { |
||
1511 | cdata = true; |
||
1512 | |||
1513 | if ( !StringEqual( p, startTag, false, encoding ) ) |
||
1514 | { |
||
1515 | document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding ); |
||
1516 | return 0; |
||
1517 | } |
||
1518 | p += strlen( startTag ); |
||
1519 | |||
1520 | // Keep all the white space, ignore the encoding, etc. |
||
1521 | while ( p && *p |
||
1522 | && !StringEqual( p, endTag, false, encoding ) |
||
1523 | ) |
||
1524 | { |
||
1525 | value += *p; |
||
1526 | ++p; |
||
1527 | } |
||
1528 | |||
1529 | TIXML_STRING dummy; |
||
1530 | p = ReadText( p, &dummy, false, endTag, false, encoding ); |
||
1531 | return p; |
||
1532 | } |
||
1533 | else |
||
1534 | { |
||
1535 | bool ignoreWhite = true; |
||
1536 | |||
1537 | const char* end = "<"; |
||
1538 | p = ReadText( p, &value, ignoreWhite, end, false, encoding ); |
||
1539 | if ( p ) |
||
1540 | return p-1; // don't truncate the '<' |
||
1541 | return 0; |
||
1542 | } |
||
1543 | } |
||
1544 | |||
#ifdef TIXML_USE_STL
// Reads a declaration from `in`, appending every character to `tag` up to
// and including the first '>'.
//
// A character value <= 0 returned by the stream is reported to the owning
// document (if there is one) as TIXML_ERROR_EMBEDDED_NULL and stops the read.
// The function also stops, without reporting anything, once the stream is no
// longer good().
void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	int ch = 0;

	// Loop until the closing '>' has been stored or the stream gives out.
	while ( ch != '>' && in->good() )
	{
		ch = in->get();
		if ( ch <= 0 )
		{
			if ( TiXmlDocument* doc = GetDocument() )
				doc->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}
		(*tag) += (char) ch;
	}
}
#endif
||
1568 | |||
1569 | const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding ) |
||
1570 | { |
||
1571 | p = SkipWhiteSpace( p, _encoding ); |
||
1572 | // Find the beginning, find the end, and look for |
||
1573 | // the stuff in-between. |
||
1574 | TiXmlDocument* document = GetDocument(); |
||
1575 | if ( !p || !*p || !StringEqual( p, " |
||
1576 | { |
||
1577 | if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding ); |
||
1578 | return 0; |
||
1579 | } |
||
1580 | if ( data ) |
||
1581 | { |
||
1582 | data->Stamp( p, _encoding ); |
||
1583 | location = data->Cursor(); |
||
1584 | } |
||
1585 | p += 5; |
||
1586 | |||
1587 | version = ""; |
||
1588 | encoding = ""; |
||
1589 | standalone = ""; |
||
1590 | |||
1591 | while ( p && *p ) |
||
1592 | { |
||
1593 | if ( *p == '>' ) |
||
1594 | { |
||
1595 | ++p; |
||
1596 | return p; |
||
1597 | } |
||
1598 | |||
1599 | p = SkipWhiteSpace( p, _encoding ); |
||
1600 | if ( StringEqual( p, "version", true, _encoding ) ) |
||
1601 | { |
||
1602 | TiXmlAttribute attrib; |
||
1603 | p = attrib.Parse( p, data, _encoding ); |
||
1604 | version = attrib.Value(); |
||
1605 | } |
||
1606 | else if ( StringEqual( p, "encoding", true, _encoding ) ) |
||
1607 | { |
||
1608 | TiXmlAttribute attrib; |
||
1609 | p = attrib.Parse( p, data, _encoding ); |
||
1610 | encoding = attrib.Value(); |
||
1611 | } |
||
1612 | else if ( StringEqual( p, "standalone", true, _encoding ) ) |
||
1613 | { |
||
1614 | TiXmlAttribute attrib; |
||
1615 | p = attrib.Parse( p, data, _encoding ); |
||
1616 | standalone = attrib.Value(); |
||
1617 | } |
||
1618 | else |
||
1619 | { |
||
1620 | // Read over whatever it is. |
||
1621 | while( p && *p && *p != '>' && !IsWhiteSpace( *p ) ) |
||
1622 | ++p; |
||
1623 | } |
||
1624 | } |
||
1625 | return 0; |
||
1626 | } |
||
1627 | |||
1628 | bool TiXmlText::Blank() const |
||
1629 | { |
||
1630 | for ( unsigned i=0; i |
||
1631 | if ( !IsWhiteSpace( value[i] ) ) |
||
1632 | return false; |
||
1633 | return true; |
||
1634 | }?xml",>=>' |
||
1635 |