Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
3584 | sourcerer | 1 | /* |
2 | * This file is part of Hubbub. |
||
3 | * Licensed under the MIT License, |
||
4 | * http://www.opensource.org/licenses/mit-license.php |
||
5 | * Copyright 2007 John-Mark Bell |
||
6 | * Copyright 2008 Andrew Sidwell |
||
7 | */ |
||
8 | #include |
||
9 | #include |
||
10 | #include |
||
11 | |||
12 | #include |
||
13 | |||
14 | typedef signed char int8_t; |
||
15 | typedef signed short int16_t; |
||
16 | typedef signed int int32_t; |
||
17 | |||
18 | typedef unsigned char uint8_t; |
||
19 | typedef unsigned short uint16_t; |
||
20 | typedef unsigned int uint32_t; |
||
21 | |||
22 | #include |
||
23 | |||
24 | #include |
||
25 | |||
26 | #include "utils/parserutilserror.h" |
||
27 | #include "utils/utils.h" |
||
28 | |||
29 | #include "hubbub/errors.h" |
||
30 | #include "tokeniser/entities.h" |
||
31 | #include "tokeniser/tokeniser.h" |
||
32 | |||
/**
 * Mapping from Windows-1252 codepoints 0x80..0x9F to the corresponding
 * UCS4 characters. Codepoints with no Windows-1252 assignment map to
 * U+FFFD REPLACEMENT CHARACTER.
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192,		/* 0x80 - 0x83 */
	0x201E, 0x2026, 0x2020, 0x2021,		/* 0x84 - 0x87 */
	0x02C6, 0x2030, 0x0160, 0x2039,		/* 0x88 - 0x8B */
	0x0152, 0xFFFD, 0x017D, 0xFFFD,		/* 0x8C - 0x8F */
	0xFFFD, 0x2018, 0x2019, 0x201C,		/* 0x90 - 0x93 */
	0x201D, 0x2022, 0x2013, 0x2014,		/* 0x94 - 0x97 */
	0x02DC, 0x2122, 0x0161, 0x203A,		/* 0x98 - 0x9B */
	0x0153, 0xFFFD, 0x017E, 0x0178		/* 0x9C - 0x9F */
};
||
42 | |||
43 | /** |
||
44 | * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER |
||
45 | */ |
||
46 | static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' }; |
||
47 | static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) }; |
||
48 | |||
49 | |||
50 | /** |
||
51 | * String for when we want to emit newlines |
||
52 | */ |
||
53 | static const uint8_t lf = '\n'; |
||
54 | static const hubbub_string lf_str = { &lf, 1 }; |
||
55 | |||
56 | |||
/**
 * Tokeniser states
 *
 * NOTE: declaration order (and thus the numeric value of each constant)
 * must be preserved; hubbub_tokeniser_run() dispatches on these values.
 */
typedef enum hubbub_tokeniser_state {
	/* Character data */
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	/* Tags and attributes */
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	/* Comments and markup declarations */
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	/* DOCTYPE */
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	/* CDATA sections */
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	/* Character references */
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;
||
105 | |||
106 | /** |
||
107 | * Context for tokeniser |
||
108 | */ |
||
109 | typedef struct hubbub_tokeniser_context { |
||
110 | size_t pending; /**< Count of pending chars */ |
||
111 | |||
112 | hubbub_string current_comment; /**< Current comment text */ |
||
113 | |||
114 | hubbub_token_type current_tag_type; /**< Type of current_tag */ |
||
115 | hubbub_tag current_tag; /**< Current tag */ |
||
116 | hubbub_doctype current_doctype; /**< Current doctype */ |
||
117 | hubbub_tokeniser_state prev_state; /**< Previous state */ |
||
118 | |||
119 | uint8_t last_start_tag_name[10]; /**< Name of the last start tag |
||
120 | * emitted */ |
||
121 | size_t last_start_tag_len; /**< Length of last start tag */ |
||
122 | |||
123 | struct { |
||
124 | uint32_t count; |
||
125 | bool match; |
||
126 | } close_tag_match; /**< State for matching close |
||
127 | * tags */ |
||
128 | |||
129 | struct { |
||
130 | uint32_t count; /**< Index into "DOCTYPE" */ |
||
131 | } match_doctype; /**< State for matching doctype */ |
||
132 | |||
133 | struct { |
||
134 | uint32_t count; /**< Index into "[CDATA[" */ |
||
135 | uint32_t end; /**< Index into "]]>" */ |
||
136 | } match_cdata; /**< State for matching cdata */ |
||
137 | |||
138 | struct { |
||
139 | size_t offset; /**< Offset in buffer */ |
||
140 | uint32_t length; /**< Length of entity */ |
||
141 | uint32_t codepoint; /**< UCS4 codepoint */ |
||
142 | bool complete; /**< True if match complete */ |
||
143 | |||
144 | uint32_t poss_length; /**< Optimistic length |
||
145 | * when matching named |
||
146 | * character references */ |
||
147 | uint8_t base; /**< Base for numeric |
||
148 | * entities */ |
||
149 | int32_t context; /**< Context for named |
||
150 | * entity search */ |
||
151 | size_t prev_len; /**< Previous byte length |
||
152 | * of str */ |
||
153 | bool had_data; /**< Whether we read |
||
154 | * anything after (x)? */ |
||
155 | bool overflow; /**< Whether this entity has |
||
156 | * has overflowed the maximum |
||
157 | * numeric entity value */ |
||
158 | hubbub_tokeniser_state return_state; /**< State we were |
||
159 | * called from */ |
||
160 | } match_entity; /**< Entity matching state */ |
||
161 | |||
162 | struct { |
||
163 | uint32_t line; /**< Current line of input */ |
||
164 | uint32_t col; /**< Current character in |
||
165 | * line */ |
||
166 | } position; /**< Position in source data */ |
||
167 | |||
168 | uint32_t allowed_char; /**< Used for quote matching */ |
||
169 | |||
170 | } hubbub_tokeniser_context; |
||
171 | |||
172 | /** |
||
173 | * Tokeniser data structure |
||
174 | */ |
||
175 | struct hubbub_tokeniser { |
||
176 | hubbub_tokeniser_state state; /**< Current tokeniser state */ |
||
177 | hubbub_content_model content_model; /**< Current content |
||
178 | * model flag */ |
||
179 | bool escape_flag; /**< Escape flag **/ |
||
180 | bool process_cdata_section; /**< Whether to process CDATA sections*/ |
||
181 | bool paused; /**< flag for if parsing is currently paused */ |
||
182 | |||
183 | parserutils_inputstream *input; /**< Input stream */ |
||
184 | parserutils_buffer *buffer; /**< Input buffer */ |
||
185 | parserutils_buffer *insert_buf; /**< Stream insertion buffer */ |
||
186 | |||
187 | hubbub_tokeniser_context context; /**< Tokeniser context */ |
||
188 | |||
189 | hubbub_token_handler token_handler; /**< Token handling callback */ |
||
190 | void *token_pw; /**< Token handler data */ |
||
191 | |||
192 | hubbub_error_handler error_handler; /**< Error handling callback */ |
||
193 | void *error_pw; /**< Error handler data */ |
||
194 | |||
195 | hubbub_allocator_fn alloc; /**< Memory (de)allocation function */ |
||
196 | void *alloc_pw; /**< Client private data */ |
||
197 | }; |
||
198 | |||
199 | static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser); |
||
200 | static hubbub_error hubbub_tokeniser_handle_character_reference_data( |
||
201 | hubbub_tokeniser *tokeniser); |
||
202 | static hubbub_error hubbub_tokeniser_handle_tag_open( |
||
203 | hubbub_tokeniser *tokeniser); |
||
204 | static hubbub_error hubbub_tokeniser_handle_close_tag_open( |
||
205 | hubbub_tokeniser *tokeniser); |
||
206 | static hubbub_error hubbub_tokeniser_handle_tag_name( |
||
207 | hubbub_tokeniser *tokeniser); |
||
208 | static hubbub_error hubbub_tokeniser_handle_before_attribute_name( |
||
209 | hubbub_tokeniser *tokeniser); |
||
210 | static hubbub_error hubbub_tokeniser_handle_attribute_name( |
||
211 | hubbub_tokeniser *tokeniser); |
||
212 | static hubbub_error hubbub_tokeniser_handle_after_attribute_name( |
||
213 | hubbub_tokeniser *tokeniser); |
||
214 | static hubbub_error hubbub_tokeniser_handle_before_attribute_value( |
||
215 | hubbub_tokeniser *tokeniser); |
||
216 | static hubbub_error hubbub_tokeniser_handle_attribute_value_dq( |
||
217 | hubbub_tokeniser *tokeniser); |
||
218 | static hubbub_error hubbub_tokeniser_handle_attribute_value_sq( |
||
219 | hubbub_tokeniser *tokeniser); |
||
220 | static hubbub_error hubbub_tokeniser_handle_attribute_value_uq( |
||
221 | hubbub_tokeniser *tokeniser); |
||
222 | static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value( |
||
223 | hubbub_tokeniser *tokeniser); |
||
224 | static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q( |
||
225 | hubbub_tokeniser *tokeniser); |
||
226 | static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag( |
||
227 | hubbub_tokeniser *tokeniser); |
||
228 | static hubbub_error hubbub_tokeniser_handle_bogus_comment( |
||
229 | hubbub_tokeniser *tokeniser); |
||
230 | static hubbub_error hubbub_tokeniser_handle_markup_declaration_open( |
||
231 | hubbub_tokeniser *tokeniser); |
||
232 | static hubbub_error hubbub_tokeniser_handle_match_comment( |
||
233 | hubbub_tokeniser *tokeniser); |
||
234 | static hubbub_error hubbub_tokeniser_handle_comment( |
||
235 | hubbub_tokeniser *tokeniser); |
||
236 | static hubbub_error hubbub_tokeniser_handle_match_doctype( |
||
237 | hubbub_tokeniser *tokeniser); |
||
238 | static hubbub_error hubbub_tokeniser_handle_doctype( |
||
239 | hubbub_tokeniser *tokeniser); |
||
240 | static hubbub_error hubbub_tokeniser_handle_before_doctype_name( |
||
241 | hubbub_tokeniser *tokeniser); |
||
242 | static hubbub_error hubbub_tokeniser_handle_doctype_name( |
||
243 | hubbub_tokeniser *tokeniser); |
||
244 | static hubbub_error hubbub_tokeniser_handle_after_doctype_name( |
||
245 | hubbub_tokeniser *tokeniser); |
||
246 | static hubbub_error hubbub_tokeniser_handle_match_public( |
||
247 | hubbub_tokeniser *tokeniser); |
||
248 | static hubbub_error hubbub_tokeniser_handle_before_doctype_public( |
||
249 | hubbub_tokeniser *tokeniser); |
||
250 | static hubbub_error hubbub_tokeniser_handle_doctype_public_dq( |
||
251 | hubbub_tokeniser *tokeniser); |
||
252 | static hubbub_error hubbub_tokeniser_handle_doctype_public_sq( |
||
253 | hubbub_tokeniser *tokeniser); |
||
254 | static hubbub_error hubbub_tokeniser_handle_after_doctype_public( |
||
255 | hubbub_tokeniser *tokeniser); |
||
256 | static hubbub_error hubbub_tokeniser_handle_match_system( |
||
257 | hubbub_tokeniser *tokeniser); |
||
258 | static hubbub_error hubbub_tokeniser_handle_before_doctype_system( |
||
259 | hubbub_tokeniser *tokeniser); |
||
260 | static hubbub_error hubbub_tokeniser_handle_doctype_system_dq( |
||
261 | hubbub_tokeniser *tokeniser); |
||
262 | static hubbub_error hubbub_tokeniser_handle_doctype_system_sq( |
||
263 | hubbub_tokeniser *tokeniser); |
||
264 | static hubbub_error hubbub_tokeniser_handle_after_doctype_system( |
||
265 | hubbub_tokeniser *tokeniser); |
||
266 | static hubbub_error hubbub_tokeniser_handle_bogus_doctype( |
||
267 | hubbub_tokeniser *tokeniser); |
||
268 | static hubbub_error hubbub_tokeniser_handle_match_cdata( |
||
269 | hubbub_tokeniser *tokeniser); |
||
270 | static hubbub_error hubbub_tokeniser_handle_cdata_block( |
||
271 | hubbub_tokeniser *tokeniser); |
||
272 | static hubbub_error hubbub_tokeniser_consume_character_reference( |
||
273 | hubbub_tokeniser *tokeniser, size_t off); |
||
274 | static hubbub_error hubbub_tokeniser_handle_numbered_entity( |
||
275 | hubbub_tokeniser *tokeniser); |
||
276 | static hubbub_error hubbub_tokeniser_handle_named_entity( |
||
277 | hubbub_tokeniser *tokeniser); |
||
278 | |||
279 | static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser, |
||
280 | const hubbub_string *chars); |
||
281 | static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser); |
||
282 | static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser); |
||
283 | static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser); |
||
284 | static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser, |
||
285 | bool force_quirks); |
||
286 | static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, |
||
287 | hubbub_token *token); |
||
288 | |||
289 | /** |
||
290 | * Create a hubbub tokeniser |
||
291 | * |
||
292 | * \param input Input stream instance |
||
293 | * \param alloc Memory (de)allocation function |
||
294 | * \param pw Pointer to client-specific private data (may be NULL) |
||
295 | * \param tokeniser Pointer to location to receive tokeniser instance |
||
296 | * \return HUBBUB_OK on success, |
||
297 | * HUBBUB_BADPARM on bad parameters, |
||
298 | * HUBBUB_NOMEM on memory exhaustion |
||
299 | */ |
||
300 | hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input, |
||
301 | hubbub_allocator_fn alloc, void *pw, |
||
302 | hubbub_tokeniser **tokeniser) |
||
303 | { |
||
304 | parserutils_error perror; |
||
305 | hubbub_tokeniser *tok; |
||
306 | |||
307 | if (input == NULL || alloc == NULL || tokeniser == NULL) |
||
308 | return HUBBUB_BADPARM; |
||
309 | |||
310 | tok = alloc(NULL, sizeof(hubbub_tokeniser), pw); |
||
311 | if (tok == NULL) |
||
312 | return HUBBUB_NOMEM; |
||
313 | |||
314 | perror = parserutils_buffer_create(alloc, pw, &tok->buffer); |
||
315 | if (perror != PARSERUTILS_OK) { |
||
316 | alloc(tok, 0, pw); |
||
317 | return hubbub_error_from_parserutils_error(perror); |
||
318 | } |
||
319 | |||
320 | perror = parserutils_buffer_create(alloc, pw, &tok->insert_buf); |
||
321 | if (perror != PARSERUTILS_OK) { |
||
322 | parserutils_buffer_destroy(tok->buffer); |
||
323 | alloc(tok, 0, pw); |
||
324 | return hubbub_error_from_parserutils_error(perror); |
||
325 | } |
||
326 | |||
327 | tok->state = STATE_DATA; |
||
328 | tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; |
||
329 | |||
330 | tok->escape_flag = false; |
||
331 | tok->process_cdata_section = false; |
||
332 | |||
333 | tok->paused = false; |
||
334 | |||
335 | tok->input = input; |
||
336 | |||
337 | tok->token_handler = NULL; |
||
338 | tok->token_pw = NULL; |
||
339 | |||
340 | tok->error_handler = NULL; |
||
341 | tok->error_pw = NULL; |
||
342 | |||
343 | tok->alloc = alloc; |
||
344 | tok->alloc_pw = pw; |
||
345 | |||
346 | memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); |
||
347 | |||
348 | *tokeniser = tok; |
||
349 | |||
350 | return HUBBUB_OK; |
||
351 | } |
||
352 | |||
353 | /** |
||
354 | * Destroy a hubbub tokeniser |
||
355 | * |
||
356 | * \param tokeniser The tokeniser instance to destroy |
||
357 | * \return HUBBUB_OK on success, appropriate error otherwise |
||
358 | */ |
||
359 | hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) |
||
360 | { |
||
361 | if (tokeniser == NULL) |
||
362 | return HUBBUB_BADPARM; |
||
363 | |||
364 | if (tokeniser->context.current_tag.attributes != NULL) { |
||
365 | tokeniser->alloc(tokeniser->context.current_tag.attributes, |
||
366 | 0, tokeniser->alloc_pw); |
||
367 | } |
||
368 | |||
369 | parserutils_buffer_destroy(tokeniser->insert_buf); |
||
370 | |||
371 | parserutils_buffer_destroy(tokeniser->buffer); |
||
372 | |||
373 | tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw); |
||
374 | |||
375 | return HUBBUB_OK; |
||
376 | } |
||
377 | |||
378 | /** |
||
379 | * Configure a hubbub tokeniser |
||
380 | * |
||
381 | * \param tokeniser The tokeniser instance to configure |
||
382 | * \param type The option type to set |
||
383 | * \param params Option-specific parameters |
||
384 | * \return HUBBUB_OK on success, appropriate error otherwise |
||
385 | */ |
||
386 | hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, |
||
387 | hubbub_tokeniser_opttype type, |
||
388 | hubbub_tokeniser_optparams *params) |
||
389 | { |
||
390 | hubbub_error err = HUBBUB_OK; |
||
391 | |||
392 | if (tokeniser == NULL || params == NULL) |
||
393 | return HUBBUB_BADPARM; |
||
394 | |||
395 | switch (type) { |
||
396 | case HUBBUB_TOKENISER_TOKEN_HANDLER: |
||
397 | tokeniser->token_handler = params->token_handler.handler; |
||
398 | tokeniser->token_pw = params->token_handler.pw; |
||
399 | break; |
||
400 | case HUBBUB_TOKENISER_ERROR_HANDLER: |
||
401 | tokeniser->error_handler = params->error_handler.handler; |
||
402 | tokeniser->error_pw = params->error_handler.pw; |
||
403 | break; |
||
404 | case HUBBUB_TOKENISER_CONTENT_MODEL: |
||
405 | tokeniser->content_model = params->content_model.model; |
||
406 | break; |
||
407 | case HUBBUB_TOKENISER_PROCESS_CDATA: |
||
408 | tokeniser->process_cdata_section = params->process_cdata; |
||
409 | break; |
||
410 | case HUBBUB_TOKENISER_PAUSE: |
||
411 | if (params->pause_parse == true) { |
||
412 | tokeniser->paused = true; |
||
413 | } else { |
||
414 | if (tokeniser->paused == true) { |
||
415 | tokeniser->paused = false; |
||
416 | err = hubbub_tokeniser_run(tokeniser); |
||
417 | } |
||
418 | } |
||
419 | } |
||
420 | |||
421 | return err; |
||
422 | } |
||
423 | |||
424 | /** |
||
425 | * Insert a chunk of data into the input stream. |
||
426 | * |
||
427 | * Inserts the given data into the input stream ready for parsing but |
||
428 | * does not cause any additional processing of the input. |
||
429 | * |
||
430 | * \param tokeniser Tokeniser instance |
||
431 | * \param data Data to insert (UTF-8 encoded) |
||
432 | * \param len Length, in bytes, of data |
||
433 | * \return HUBBUB_OK on success, appropriate error otherwise |
||
434 | */ |
||
435 | hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser, |
||
436 | const uint8_t *data, size_t len) |
||
437 | { |
||
438 | parserutils_error perror; |
||
439 | |||
440 | if (tokeniser == NULL || data == NULL) |
||
441 | return HUBBUB_BADPARM; |
||
442 | |||
443 | perror = parserutils_buffer_append(tokeniser->insert_buf, data, len); |
||
444 | if (perror != PARSERUTILS_OK) |
||
445 | return hubbub_error_from_parserutils_error(perror); |
||
446 | |||
447 | return HUBBUB_OK; |
||
448 | } |
||
449 | |||
450 | /** |
||
451 | * Process remaining data in the input stream |
||
452 | * |
||
453 | * \param tokeniser The tokeniser instance to invoke |
||
454 | * \return HUBBUB_OK on success, appropriate error otherwise |
||
455 | */ |
||
456 | hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) |
||
457 | { |
||
458 | hubbub_error cont = HUBBUB_OK; |
||
459 | |||
460 | if (tokeniser == NULL) |
||
461 | return HUBBUB_BADPARM; |
||
462 | |||
463 | if (tokeniser->paused == true) |
||
464 | return HUBBUB_PAUSED; |
||
465 | |||
466 | #if 0 |
||
467 | #define state(x) \ |
||
468 | case x: \ |
||
469 | printf( #x "\n"); |
||
470 | #else |
||
471 | #define state(x) \ |
||
472 | case x: |
||
473 | #endif |
||
474 | |||
475 | while (cont == HUBBUB_OK) { |
||
476 | switch (tokeniser->state) { |
||
477 | state(STATE_DATA) |
||
478 | cont = hubbub_tokeniser_handle_data(tokeniser); |
||
479 | break; |
||
480 | state(STATE_CHARACTER_REFERENCE_DATA) |
||
481 | cont = hubbub_tokeniser_handle_character_reference_data( |
||
482 | tokeniser); |
||
483 | break; |
||
484 | state(STATE_TAG_OPEN) |
||
485 | cont = hubbub_tokeniser_handle_tag_open(tokeniser); |
||
486 | break; |
||
487 | state(STATE_CLOSE_TAG_OPEN) |
||
488 | cont = hubbub_tokeniser_handle_close_tag_open( |
||
489 | tokeniser); |
||
490 | break; |
||
491 | state(STATE_TAG_NAME) |
||
492 | cont = hubbub_tokeniser_handle_tag_name(tokeniser); |
||
493 | break; |
||
494 | state(STATE_BEFORE_ATTRIBUTE_NAME) |
||
495 | cont = hubbub_tokeniser_handle_before_attribute_name( |
||
496 | tokeniser); |
||
497 | break; |
||
498 | state(STATE_ATTRIBUTE_NAME) |
||
499 | cont = hubbub_tokeniser_handle_attribute_name( |
||
500 | tokeniser); |
||
501 | break; |
||
502 | state(STATE_AFTER_ATTRIBUTE_NAME) |
||
503 | cont = hubbub_tokeniser_handle_after_attribute_name( |
||
504 | tokeniser); |
||
505 | break; |
||
506 | state(STATE_BEFORE_ATTRIBUTE_VALUE) |
||
507 | cont = hubbub_tokeniser_handle_before_attribute_value( |
||
508 | tokeniser); |
||
509 | break; |
||
510 | state(STATE_ATTRIBUTE_VALUE_DQ) |
||
511 | cont = hubbub_tokeniser_handle_attribute_value_dq( |
||
512 | tokeniser); |
||
513 | break; |
||
514 | state(STATE_ATTRIBUTE_VALUE_SQ) |
||
515 | cont = hubbub_tokeniser_handle_attribute_value_sq( |
||
516 | tokeniser); |
||
517 | break; |
||
518 | state(STATE_ATTRIBUTE_VALUE_UQ) |
||
519 | cont = hubbub_tokeniser_handle_attribute_value_uq( |
||
520 | tokeniser); |
||
521 | break; |
||
522 | state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) |
||
523 | cont = hubbub_tokeniser_handle_character_reference_in_attribute_value( |
||
524 | tokeniser); |
||
525 | break; |
||
526 | state(STATE_AFTER_ATTRIBUTE_VALUE_Q) |
||
527 | cont = hubbub_tokeniser_handle_after_attribute_value_q( |
||
528 | tokeniser); |
||
529 | break; |
||
530 | state(STATE_SELF_CLOSING_START_TAG) |
||
531 | cont = hubbub_tokeniser_handle_self_closing_start_tag( |
||
532 | tokeniser); |
||
533 | break; |
||
534 | state(STATE_BOGUS_COMMENT) |
||
535 | cont = hubbub_tokeniser_handle_bogus_comment( |
||
536 | tokeniser); |
||
537 | break; |
||
538 | state(STATE_MARKUP_DECLARATION_OPEN) |
||
539 | cont = hubbub_tokeniser_handle_markup_declaration_open( |
||
540 | tokeniser); |
||
541 | break; |
||
542 | state(STATE_MATCH_COMMENT) |
||
543 | cont = hubbub_tokeniser_handle_match_comment( |
||
544 | tokeniser); |
||
545 | break; |
||
546 | case STATE_COMMENT_START: |
||
547 | case STATE_COMMENT_START_DASH: |
||
548 | case STATE_COMMENT: |
||
549 | case STATE_COMMENT_END_DASH: |
||
550 | case STATE_COMMENT_END: |
||
551 | cont = hubbub_tokeniser_handle_comment(tokeniser); |
||
552 | break; |
||
553 | state(STATE_MATCH_DOCTYPE) |
||
554 | cont = hubbub_tokeniser_handle_match_doctype( |
||
555 | tokeniser); |
||
556 | break; |
||
557 | state(STATE_DOCTYPE) |
||
558 | cont = hubbub_tokeniser_handle_doctype(tokeniser); |
||
559 | break; |
||
560 | state(STATE_BEFORE_DOCTYPE_NAME) |
||
561 | cont = hubbub_tokeniser_handle_before_doctype_name( |
||
562 | tokeniser); |
||
563 | break; |
||
564 | state(STATE_DOCTYPE_NAME) |
||
565 | cont = hubbub_tokeniser_handle_doctype_name( |
||
566 | tokeniser); |
||
567 | break; |
||
568 | state(STATE_AFTER_DOCTYPE_NAME) |
||
569 | cont = hubbub_tokeniser_handle_after_doctype_name( |
||
570 | tokeniser); |
||
571 | break; |
||
572 | |||
573 | state(STATE_MATCH_PUBLIC) |
||
574 | cont = hubbub_tokeniser_handle_match_public( |
||
575 | tokeniser); |
||
576 | break; |
||
577 | state(STATE_BEFORE_DOCTYPE_PUBLIC) |
||
578 | cont = hubbub_tokeniser_handle_before_doctype_public( |
||
579 | tokeniser); |
||
580 | break; |
||
581 | state(STATE_DOCTYPE_PUBLIC_DQ) |
||
582 | cont = hubbub_tokeniser_handle_doctype_public_dq( |
||
583 | tokeniser); |
||
584 | break; |
||
585 | state(STATE_DOCTYPE_PUBLIC_SQ) |
||
586 | cont = hubbub_tokeniser_handle_doctype_public_sq( |
||
587 | tokeniser); |
||
588 | break; |
||
589 | state(STATE_AFTER_DOCTYPE_PUBLIC) |
||
590 | cont = hubbub_tokeniser_handle_after_doctype_public( |
||
591 | tokeniser); |
||
592 | break; |
||
593 | state(STATE_MATCH_SYSTEM) |
||
594 | cont = hubbub_tokeniser_handle_match_system( |
||
595 | tokeniser); |
||
596 | break; |
||
597 | state(STATE_BEFORE_DOCTYPE_SYSTEM) |
||
598 | cont = hubbub_tokeniser_handle_before_doctype_system( |
||
599 | tokeniser); |
||
600 | break; |
||
601 | state(STATE_DOCTYPE_SYSTEM_DQ) |
||
602 | cont = hubbub_tokeniser_handle_doctype_system_dq( |
||
603 | tokeniser); |
||
604 | break; |
||
605 | state(STATE_DOCTYPE_SYSTEM_SQ) |
||
606 | cont = hubbub_tokeniser_handle_doctype_system_sq( |
||
607 | tokeniser); |
||
608 | break; |
||
609 | state(STATE_AFTER_DOCTYPE_SYSTEM) |
||
610 | cont = hubbub_tokeniser_handle_after_doctype_system( |
||
611 | tokeniser); |
||
612 | break; |
||
613 | state(STATE_BOGUS_DOCTYPE) |
||
614 | cont = hubbub_tokeniser_handle_bogus_doctype( |
||
615 | tokeniser); |
||
616 | break; |
||
617 | state(STATE_MATCH_CDATA) |
||
618 | cont = hubbub_tokeniser_handle_match_cdata( |
||
619 | tokeniser); |
||
620 | break; |
||
621 | state(STATE_CDATA_BLOCK) |
||
622 | cont = hubbub_tokeniser_handle_cdata_block( |
||
623 | tokeniser); |
||
624 | break; |
||
625 | state(STATE_NUMBERED_ENTITY) |
||
626 | cont = hubbub_tokeniser_handle_numbered_entity( |
||
627 | tokeniser); |
||
628 | break; |
||
629 | state(STATE_NAMED_ENTITY) |
||
630 | cont = hubbub_tokeniser_handle_named_entity( |
||
631 | tokeniser); |
||
632 | break; |
||
633 | } |
||
634 | } |
||
635 | |||
636 | return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont; |
||
637 | } |
||
638 | |||
639 | |||
/**
 * Various macros for manipulating buffers.
 *
 * All of them append (cptr, length) to the tokeniser's collection buffer
 * and propagate any parserutils failure by returning from the ENCLOSING
 * function (they must only be used inside a function returning
 * hubbub_error).
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

/* Begin collecting a string: append the bytes and SET its length */
#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

/* Continue collecting: append the bytes and EXTEND the length.
 * Asserts that collection was already started with START_BUF. */
#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

/* As COLLECT, but permits the string to be empty ("maybe started") */
#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)
677 | |||
678 | |||
679 | /* this should always be called with an empty "chars" buffer */ |
||
680 | hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) |
||
681 | { |
||
682 | parserutils_error error; |
||
683 | hubbub_token token; |
||
684 | const uint8_t *cptr; |
||
685 | size_t len; |
||
686 | |||
687 | while ((error = parserutils_inputstream_peek(tokeniser->input, |
||
688 | tokeniser->context.pending, &cptr, &len)) == |
||
689 | PARSERUTILS_OK) { |
||
690 | const uint8_t c = *cptr; |
||
691 | |||
692 | if (c == '&' && |
||
693 | (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA || |
||
694 | tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) && |
||
695 | tokeniser->escape_flag == false) { |
||
696 | tokeniser->state = |
||
697 | STATE_CHARACTER_REFERENCE_DATA; |
||
698 | /* Don't eat the '&'; it'll be handled by entity |
||
699 | * consumption */ |
||
700 | break; |
||
701 | } else if (c == '-' && |
||
702 | tokeniser->escape_flag == false && |
||
703 | (tokeniser->content_model == |
||
704 | HUBBUB_CONTENT_MODEL_RCDATA || |
||
705 | tokeniser->content_model == |
||
706 | HUBBUB_CONTENT_MODEL_CDATA) && |
||
707 | tokeniser->context.pending >= 3) { |
||
708 | size_t ignore; |
||
709 | error = parserutils_inputstream_peek( |
||
710 | tokeniser->input, |
||
711 | tokeniser->context.pending - 3, |
||
712 | &cptr, |
||
713 | &ignore); |
||
714 | |||
715 | assert(error == PARSERUTILS_OK); |
||
716 | |||
717 | if (strncmp((char *)cptr, |
||
718 | "", SLEN("-->")) == 0) { |
||
757 | tokeniser->escape_flag = false; |
||
758 | } |
||
759 | |||
760 | tokeniser->context.pending += len; |
||
761 | } else if (c == '\0') { |
||
762 | if (tokeniser->context.pending > 0) { |
||
763 | /* Emit any pending characters */ |
||
764 | emit_current_chars(tokeniser); |
||
765 | } |
||
766 | |||
767 | /* Emit a replacement character */ |
||
768 | emit_character_token(tokeniser, &u_fffd_str); |
||
769 | |||
770 | /* Advance past NUL */ |
||
771 | parserutils_inputstream_advance(tokeniser->input, 1); |
||
772 | } else if (c == '\r') { |
||
773 | error = parserutils_inputstream_peek( |
||
774 | tokeniser->input, |
||
775 | tokeniser->context.pending + len, |
||
776 | &cptr, |
||
777 | &len); |
||
778 | |||
779 | if (error != PARSERUTILS_OK && |
||
780 | error != PARSERUTILS_EOF) { |
||
781 | break; |
||
782 | } |
||
783 | |||
784 | if (tokeniser->context.pending > 0) { |
||
785 | /* Emit any pending characters */ |
||
786 | emit_current_chars(tokeniser); |
||
787 | } |
||
788 | |||
789 | if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
790 | /* Emit newline */ |
||
791 | emit_character_token(tokeniser, &lf_str); |
||
792 | } |
||
793 | |||
794 | /* Advance over */ |
||
795 | parserutils_inputstream_advance(tokeniser->input, 1); |
||
796 | } else { |
||
797 | /* Just collect into buffer */ |
||
798 | tokeniser->context.pending += len; |
||
799 | } |
||
800 | } |
||
801 | |||
802 | if (tokeniser->state != STATE_TAG_OPEN && |
||
803 | (tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) && |
||
804 | tokeniser->context.pending > 0) { |
||
805 | /* Emit any pending characters */ |
||
806 | emit_current_chars(tokeniser); |
||
807 | } |
||
808 | |||
809 | if (error == PARSERUTILS_EOF) { |
||
810 | token.type = HUBBUB_TOKEN_EOF; |
||
811 | hubbub_tokeniser_emit_token(tokeniser, &token); |
||
812 | } |
||
813 | |||
814 | if (error == PARSERUTILS_EOF) { |
||
815 | return HUBBUB_NEEDDATA; |
||
816 | } else { |
||
817 | return hubbub_error_from_parserutils_error(error); |
||
818 | } |
||
819 | } |
||
820 | |||
821 | /* emit any pending tokens before calling */ |
||
822 | hubbub_error hubbub_tokeniser_handle_character_reference_data( |
||
823 | hubbub_tokeniser *tokeniser) |
||
824 | { |
||
825 | assert(tokeniser->context.pending == 0); |
||
826 | |||
827 | if (tokeniser->context.match_entity.complete == false) { |
||
828 | return hubbub_tokeniser_consume_character_reference(tokeniser, |
||
829 | tokeniser->context.pending); |
||
830 | } else { |
||
831 | hubbub_token token; |
||
832 | |||
833 | uint8_t utf8[6]; |
||
834 | uint8_t *utf8ptr = utf8; |
||
835 | size_t len = sizeof(utf8); |
||
836 | |||
837 | token.type = HUBBUB_TOKEN_CHARACTER; |
||
838 | |||
839 | if (tokeniser->context.match_entity.codepoint) { |
||
840 | parserutils_charset_utf8_from_ucs4( |
||
841 | tokeniser->context.match_entity.codepoint, |
||
842 | &utf8ptr, &len); |
||
843 | |||
844 | token.data.character.ptr = utf8; |
||
845 | token.data.character.len = sizeof(utf8) - len; |
||
846 | |||
847 | hubbub_tokeniser_emit_token(tokeniser, &token); |
||
848 | |||
849 | /* +1 for ampersand */ |
||
850 | parserutils_inputstream_advance(tokeniser->input, |
||
851 | tokeniser->context.match_entity.length |
||
852 | + 1); |
||
853 | } else { |
||
854 | parserutils_error error; |
||
855 | const uint8_t *cptr = NULL; |
||
856 | |||
857 | error = parserutils_inputstream_peek( |
||
858 | tokeniser->input, |
||
859 | tokeniser->context.pending, |
||
860 | &cptr, |
||
861 | &len); |
||
862 | if (error != PARSERUTILS_OK) { |
||
863 | return hubbub_error_from_parserutils_error( |
||
864 | error); |
||
865 | } |
||
866 | |||
867 | token.data.character.ptr = cptr; |
||
868 | token.data.character.len = len; |
||
869 | |||
870 | hubbub_tokeniser_emit_token(tokeniser, &token); |
||
871 | parserutils_inputstream_advance(tokeniser->input, len); |
||
872 | } |
||
873 | |||
874 | /* Reset for next time */ |
||
875 | tokeniser->context.match_entity.complete = false; |
||
876 | |||
877 | tokeniser->state = STATE_DATA; |
||
878 | } |
||
879 | |||
880 | return HUBBUB_OK; |
||
881 | } |
||
882 | |||
883 | /* this state always switches to another state straight away */ |
||
884 | /* this state expects the current character to be '<' */ |
||
885 | hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) |
||
886 | { |
||
887 | hubbub_tag *ctag = &tokeniser->context.current_tag; |
||
888 | |||
889 | size_t len; |
||
890 | const uint8_t *cptr; |
||
891 | parserutils_error error; |
||
892 | uint8_t c; |
||
893 | |||
894 | assert(tokeniser->context.pending == 1); |
||
895 | /* assert(tokeniser->context.chars.ptr[0] == '<'); */ |
||
896 | |||
897 | error = parserutils_inputstream_peek(tokeniser->input, |
||
898 | tokeniser->context.pending, &cptr, &len); |
||
899 | |||
900 | if (error != PARSERUTILS_OK) { |
||
901 | if (error == PARSERUTILS_EOF) { |
||
902 | /* Return to data state with '<' still in "chars" */ |
||
903 | tokeniser->state = STATE_DATA; |
||
904 | return HUBBUB_OK; |
||
905 | } else { |
||
906 | return hubbub_error_from_parserutils_error(error); |
||
907 | } |
||
908 | } |
||
909 | |||
910 | c = *cptr; |
||
911 | |||
912 | if (c == '/') { |
||
913 | tokeniser->context.pending += len; |
||
914 | |||
915 | tokeniser->context.close_tag_match.match = false; |
||
916 | tokeniser->context.close_tag_match.count = 0; |
||
917 | |||
918 | tokeniser->state = STATE_CLOSE_TAG_OPEN; |
||
919 | } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || |
||
920 | tokeniser->content_model == |
||
921 | HUBBUB_CONTENT_MODEL_CDATA) { |
||
922 | /* Return to data state with '<' still in "chars" */ |
||
923 | tokeniser->state = STATE_DATA; |
||
924 | } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { |
||
925 | if (c == '!') { |
||
926 | parserutils_inputstream_advance(tokeniser->input, |
||
927 | SLEN(" |
||
928 | |||
929 | tokeniser->context.pending = 0; |
||
930 | tokeniser->state = STATE_MARKUP_DECLARATION_OPEN; |
||
931 | } else if ('A' <= c && c <= 'Z') { |
||
932 | uint8_t lc = (c + 0x20); |
||
933 | |||
934 | START_BUF(ctag->name, &lc, len); |
||
935 | ctag->n_attributes = 0; |
||
936 | tokeniser->context.current_tag_type = |
||
937 | HUBBUB_TOKEN_START_TAG; |
||
938 | |||
939 | tokeniser->context.pending += len; |
||
940 | |||
941 | tokeniser->state = STATE_TAG_NAME; |
||
942 | } else if ('a' <= c && c <= 'z') { |
||
943 | START_BUF(ctag->name, cptr, len); |
||
944 | ctag->n_attributes = 0; |
||
945 | tokeniser->context.current_tag_type = |
||
946 | HUBBUB_TOKEN_START_TAG; |
||
947 | |||
948 | tokeniser->context.pending += len; |
||
949 | |||
950 | tokeniser->state = STATE_TAG_NAME; |
||
951 | } else if (c == '>') { |
||
952 | /** \todo parse error */ |
||
953 | |||
954 | tokeniser->context.pending += len; |
||
955 | tokeniser->state = STATE_DATA; |
||
956 | } else if (c == '?') { |
||
957 | /** \todo parse error */ |
||
958 | |||
959 | /* Cursor still at "<", need to advance past it */ |
||
960 | parserutils_inputstream_advance( |
||
961 | tokeniser->input, SLEN("<")); |
||
962 | tokeniser->context.pending = 0; |
||
963 | |||
964 | tokeniser->state = STATE_BOGUS_COMMENT; |
||
965 | } else { |
||
966 | /* Return to data state with '<' still in "chars" */ |
||
967 | tokeniser->state = STATE_DATA; |
||
968 | } |
||
969 | } |
||
970 | |||
971 | return HUBBUB_OK; |
||
972 | } |
||
973 | |||
/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
/**
 * Handle the 'close tag open' state.
 *
 * Entered with "</" pending. In RCDATA/CDATA an end tag is only honoured
 * when its name case-insensitively matches the last emitted start tag
 * (and is followed by whitespace, '>' or '/'); otherwise the "</" is left
 * pending so the data state emits it as text. In PCDATA (or on a match)
 * this begins an end-tag token, or falls back to bogus comment.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
	/* assert(tokeniser->context.chars.ptr[0] == '<'); */
	/* assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		/* Compare the candidate tag name, byte by byte, against the
		 * last start tag name (case-insensitive for ASCII: &~0x20
		 * folds 'a'-'z' onto 'A'-'Z') */
		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			/* Name matched; it only counts as a real close tag
			 * if followed by whitespace, '>' or '/' */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" pending */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			/* Lowercase ASCII letter (A-Z + 0x20 == a-z) */
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect ">" */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}
||
1118 | |||
/* this state expects tokeniser->context.current_tag to already have its
   first character set */
/**
 * Handle the 'tag name' state.
 *
 * Accumulates the tag name into context.current_tag.name, lowercasing
 * ASCII upper-case letters and substituting U+FFFD for NUL, until
 * whitespace, '>', '/' or EOF terminates the name.
 *
 * NOTE(review): COLLECT is a project macro defined elsewhere in this
 * file; it appears to append bytes to the buffer and return on error.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
	/* assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
	/* assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD REPLACEMENT CHARACTER */
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		/* Lowercase ASCII letter (A-Z + 0x20 == a-z) */
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
1173 | |||
/**
 * Handle the 'before attribute name' state.
 *
 * Skips whitespace; on any other character (except '>' and '/') grows
 * the current tag's attribute array by one and starts collecting a new
 * attribute name (lowercased / NUL-replaced as usual).
 *
 * NOTE(review): tokeniser->alloc is a realloc-style callback -- on
 * failure the old attributes block remains owned by ctag, so returning
 * HUBBUB_NOMEM here does not leak.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, HUBBUB_NOMEM on allocation failure,
 *         appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		/* Grow the attribute array by one entry */
		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			/* Lowercase ASCII letter (A-Z + 0x20 == a-z) */
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			/* NUL is replaced with U+FFFD */
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		/* New attribute starts with an empty value */
		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}
||
1246 | |||
/**
 * Handle the 'attribute name' state.
 *
 * Accumulates the current (last) attribute's name, lowercasing ASCII
 * upper-case letters and substituting U+FFFD for NUL, until whitespace,
 * '=', '>', '/' or EOF terminates it.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	/* The previous state must have started the name buffer */
	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD */
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		/* Lowercase ASCII letter (A-Z + 0x20 == a-z) */
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
1302 | |||
/**
 * Handle the 'after attribute name' state.
 *
 * Skips whitespace after an attribute name. '=' begins a value, '>'
 * finishes the tag, '/' begins a self-closing tag; any other character
 * starts a brand new attribute (same array-growth logic as the
 * before-attribute-name state).
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, HUBBUB_NOMEM on allocation failure,
 *         appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		/* Grow the attribute array by one entry; tokeniser->alloc
		 * is realloc-style, so the old block survives on failure */
		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			/* Lowercase ASCII letter (A-Z + 0x20 == a-z) */
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			/* NUL is replaced with U+FFFD */
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		/* New attribute starts with an empty value */
		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}
||
1378 | |||
/* this state is only ever triggered by an '=' */
/**
 * Handle the 'before attribute value' state.
 *
 * Skips whitespace, then selects the double-quoted, single-quoted or
 * unquoted value state depending on the first value character. '&' is
 * deliberately NOT consumed here -- it is reprocessed by the unquoted
 * value state so character-reference handling sees it.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD, value collection begins */
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}
||
1441 | |||
/**
 * Handle the 'attribute value (double-quoted)' state.
 *
 * Accumulates the value until the closing '"'. '&' diverts to the
 * character-reference state (with '"' as the additional allowed
 * terminator); NUL becomes U+FFFD; CR and CRLF are normalised to LF.
 *
 * NOTE(review): COLLECT_MS is a project macro defined elsewhere; it
 * appears to append to the value, starting the buffer if necessary.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD */
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Peek at the byte after the CR (note: this clobbers len,
		 * hence the explicit "+= 1" below) */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			/* Lone CR: emit an LF in its place; for CRLF the
			 * following LF will be collected on the next call */
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
1503 | |||
/**
 * Handle the 'attribute value (single-quoted)' state.
 *
 * Identical to the double-quoted state except the terminator (and the
 * character-reference allowed character) is '\'' instead of '"'.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD */
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		/* Peek at the byte after the CR (note: this clobbers len,
		 * hence the explicit "+= 1" below) */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			/* Lone CR: emit an LF in its place */
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
1565 | |||
/**
 * Handle the 'attribute value (unquoted)' state.
 *
 * Accumulates the value until whitespace or '>'. '&' diverts to the
 * character-reference state (no extra allowed character); NUL becomes
 * U+FFFD. Whitespace terminators include '\r' here -- CR normalisation
 * for unquoted values happens elsewhere (the value simply ends).
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a tag: emit what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	/* Either we arrived here via '&' (value not yet started), or the
	 * before-attribute-value state already started the buffer */
	assert(c == '&' ||
			ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		/* NUL is replaced with U+FFFD */
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
1620 | |||
/**
 * Handle the 'character reference in attribute value' state.
 *
 * First entry kicks off entity matching; once match_entity.complete is
 * set, either the resolved codepoint is UTF-8-encoded and appended to
 * the current attribute's value, or (no match) the literal '&' byte is
 * appended instead. Control then returns to whichever quoted/unquoted
 * value state recorded itself in context.prev_state.
 *
 * \param tokeniser  The tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			/* Encode the codepoint; on return len holds the
			 * space REMAINING, so the encoded length is
			 * sizeof(utf8) - len */
			parserutils_charset_utf8_from_ucs4(
					tokeniser->context.match_entity.codepoint,
					&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			/* No entity matched: deliberately shadow the outer
			 * len for this branch's peek of the '&' itself */
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}
||
1676 | |||
1677 | /* always switches state */ |
||
1678 | hubbub_error hubbub_tokeniser_handle_after_attribute_value_q( |
||
1679 | hubbub_tokeniser *tokeniser) |
||
1680 | { |
||
1681 | size_t len; |
||
1682 | const uint8_t *cptr; |
||
1683 | parserutils_error error; |
||
1684 | uint8_t c; |
||
1685 | |||
1686 | error = parserutils_inputstream_peek(tokeniser->input, |
||
1687 | tokeniser->context.pending, &cptr, &len); |
||
1688 | |||
1689 | if (error != PARSERUTILS_OK) { |
||
1690 | if (error == PARSERUTILS_EOF) { |
||
1691 | tokeniser->state = STATE_DATA; |
||
1692 | return emit_current_tag(tokeniser); |
||
1693 | } else { |
||
1694 | return hubbub_error_from_parserutils_error(error); |
||
1695 | } |
||
1696 | } |
||
1697 | |||
1698 | c = *cptr; |
||
1699 | |||
1700 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
1701 | tokeniser->context.pending += len; |
||
1702 | tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; |
||
1703 | } else if (c == '>') { |
||
1704 | tokeniser->context.pending += len; |
||
1705 | |||
1706 | tokeniser->state = STATE_DATA; |
||
1707 | return emit_current_tag(tokeniser); |
||
1708 | } else if (c == '/') { |
||
1709 | tokeniser->context.pending += len; |
||
1710 | tokeniser->state = STATE_SELF_CLOSING_START_TAG; |
||
1711 | } else { |
||
1712 | /** \todo parse error */ |
||
1713 | /* Reprocess character in before attribute name state */ |
||
1714 | tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; |
||
1715 | } |
||
1716 | |||
1717 | return HUBBUB_OK; |
||
1718 | } |
||
1719 | |||
1720 | hubbub_error hubbub_tokeniser_handle_self_closing_start_tag( |
||
1721 | hubbub_tokeniser *tokeniser) |
||
1722 | { |
||
1723 | size_t len; |
||
1724 | const uint8_t *cptr; |
||
1725 | parserutils_error error; |
||
1726 | uint8_t c; |
||
1727 | |||
1728 | error = parserutils_inputstream_peek(tokeniser->input, |
||
1729 | tokeniser->context.pending, &cptr, &len); |
||
1730 | |||
1731 | if (error != PARSERUTILS_OK) { |
||
1732 | if (error == PARSERUTILS_EOF) { |
||
1733 | tokeniser->state = STATE_DATA; |
||
1734 | return emit_current_tag(tokeniser); |
||
1735 | } else { |
||
1736 | return hubbub_error_from_parserutils_error(error); |
||
1737 | } |
||
1738 | } |
||
1739 | |||
1740 | c = *cptr; |
||
1741 | |||
1742 | if (c == '>') { |
||
1743 | tokeniser->context.pending += len; |
||
1744 | tokeniser->state = STATE_DATA; |
||
1745 | |||
1746 | tokeniser->context.current_tag.self_closing = true; |
||
1747 | return emit_current_tag(tokeniser); |
||
1748 | } else { |
||
1749 | /* Reprocess character in before attribute name state */ |
||
1750 | tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; |
||
1751 | } |
||
1752 | |||
1753 | return HUBBUB_OK; |
||
1754 | } |
||
1755 | |||
1756 | /* this state expects tokeniser->context.chars to be empty on first entry */ |
||
1757 | hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) |
||
1758 | { |
||
1759 | size_t len; |
||
1760 | const uint8_t *cptr; |
||
1761 | parserutils_error error; |
||
1762 | uint8_t c; |
||
1763 | |||
1764 | error = parserutils_inputstream_peek(tokeniser->input, |
||
1765 | tokeniser->context.pending, &cptr, &len); |
||
1766 | |||
1767 | if (error != PARSERUTILS_OK) { |
||
1768 | if (error == PARSERUTILS_EOF) { |
||
1769 | tokeniser->state = STATE_DATA; |
||
1770 | return emit_current_comment(tokeniser); |
||
1771 | } else { |
||
1772 | return hubbub_error_from_parserutils_error(error); |
||
1773 | } |
||
1774 | } |
||
1775 | |||
1776 | c = *cptr; |
||
1777 | |||
1778 | if (c == '>') { |
||
1779 | tokeniser->context.pending += len; |
||
1780 | tokeniser->state = STATE_DATA; |
||
1781 | return emit_current_comment(tokeniser); |
||
1782 | } else if (c == '\0') { |
||
1783 | error = parserutils_buffer_append(tokeniser->buffer, |
||
1784 | u_fffd, sizeof(u_fffd)); |
||
1785 | if (error != PARSERUTILS_OK) |
||
1786 | return hubbub_error_from_parserutils_error(error); |
||
1787 | |||
1788 | tokeniser->context.pending += len; |
||
1789 | } else if (c == '\r') { |
||
1790 | error = parserutils_inputstream_peek( |
||
1791 | tokeniser->input, |
||
1792 | tokeniser->context.pending, |
||
1793 | &cptr, |
||
1794 | &len); |
||
1795 | |||
1796 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
1797 | return hubbub_error_from_parserutils_error(error); |
||
1798 | } else if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
1799 | error = parserutils_buffer_append(tokeniser->buffer, |
||
1800 | &lf, sizeof(lf)); |
||
1801 | if (error != PARSERUTILS_OK) { |
||
1802 | return hubbub_error_from_parserutils_error( |
||
1803 | error); |
||
1804 | } |
||
1805 | } |
||
1806 | tokeniser->context.pending += len; |
||
1807 | } else { |
||
1808 | error = parserutils_buffer_append(tokeniser->buffer, |
||
1809 | (uint8_t *) cptr, len); |
||
1810 | if (error != PARSERUTILS_OK) |
||
1811 | return hubbub_error_from_parserutils_error(error); |
||
1812 | |||
1813 | tokeniser->context.pending += len; |
||
1814 | } |
||
1815 | |||
1816 | return HUBBUB_OK; |
||
1817 | } |
||
1818 | |||
1819 | /* this state always switches to another state straight away */ |
||
1820 | hubbub_error hubbub_tokeniser_handle_markup_declaration_open( |
||
1821 | hubbub_tokeniser *tokeniser) |
||
1822 | { |
||
1823 | size_t len; |
||
1824 | const uint8_t *cptr; |
||
1825 | parserutils_error error; |
||
1826 | uint8_t c; |
||
1827 | |||
1828 | assert(tokeniser->context.pending == 0); |
||
1829 | |||
1830 | error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len); |
||
1831 | |||
1832 | if (error != PARSERUTILS_OK) { |
||
1833 | if (error == PARSERUTILS_EOF) { |
||
1834 | tokeniser->state = STATE_BOGUS_COMMENT; |
||
1835 | return HUBBUB_OK; |
||
1836 | } else { |
||
1837 | return hubbub_error_from_parserutils_error(error); |
||
1838 | } |
||
1839 | } |
||
1840 | |||
1841 | c = *cptr; |
||
1842 | |||
1843 | if (c == '-') { |
||
1844 | tokeniser->context.pending = len; |
||
1845 | tokeniser->state = STATE_MATCH_COMMENT; |
||
1846 | } else if ((c & ~0x20) == 'D') { |
||
1847 | tokeniser->context.pending = len; |
||
1848 | tokeniser->context.match_doctype.count = len; |
||
1849 | tokeniser->state = STATE_MATCH_DOCTYPE; |
||
1850 | } else if (tokeniser->process_cdata_section == true && c == '[') { |
||
1851 | tokeniser->context.pending = len; |
||
1852 | tokeniser->context.match_cdata.count = len; |
||
1853 | tokeniser->state = STATE_MATCH_CDATA; |
||
1854 | } else { |
||
1855 | tokeniser->state = STATE_BOGUS_COMMENT; |
||
1856 | } |
||
1857 | |||
1858 | return HUBBUB_OK; |
||
1859 | } |
||
1860 | |||
1861 | |||
1862 | hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser) |
||
1863 | { |
||
1864 | size_t len; |
||
1865 | const uint8_t *cptr; |
||
1866 | parserutils_error error; |
||
1867 | |||
1868 | error = parserutils_inputstream_peek(tokeniser->input, |
||
1869 | tokeniser->context.pending, &cptr, &len); |
||
1870 | |||
1871 | if (error != PARSERUTILS_OK) { |
||
1872 | if (error == PARSERUTILS_EOF) { |
||
1873 | tokeniser->context.pending = |
||
1874 | tokeniser->context.current_comment.len = 0; |
||
1875 | tokeniser->state = STATE_BOGUS_COMMENT; |
||
1876 | return HUBBUB_OK; |
||
1877 | } else { |
||
1878 | return hubbub_error_from_parserutils_error(error); |
||
1879 | } |
||
1880 | } |
||
1881 | |||
1882 | tokeniser->context.pending = tokeniser->context.current_comment.len = 0; |
||
1883 | |||
1884 | if (*cptr == '-') { |
||
1885 | parserutils_inputstream_advance(tokeniser->input, SLEN("--")); |
||
1886 | tokeniser->state = STATE_COMMENT_START; |
||
1887 | } else { |
||
1888 | tokeniser->state = STATE_BOGUS_COMMENT; |
||
1889 | } |
||
1890 | |||
1891 | return HUBBUB_OK; |
||
1892 | } |
||
1893 | |||
1894 | |||
/**
 * Handle the comment family of tokeniser states.
 *
 * A single handler covers COMMENT_START, COMMENT_START_DASH, COMMENT,
 * COMMENT_END_DASH and COMMENT_END: tokeniser->state records which
 * sub-state is active and the '-' transitions below move between them.
 * Comment text accumulates in tokeniser->buffer.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF inside a comment: emit what was collected */
			tokeniser->state = STATE_DATA;
			return emit_current_comment(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
			tokeniser->state == STATE_COMMENT_START ||
			tokeniser->state == STATE_COMMENT_END)) {
		/* '>' closes the comment in these three sub-states
		 * (covers "<!-->", "<!--->" and the normal "-->") */
		tokeniser->context.pending += len;

		/** \todo parse error if state != COMMENT_END */
		tokeniser->state = STATE_DATA;
		return emit_current_comment(tokeniser);
	} else if (c == '-') {
		/* A dash advances through the dash-counting sub-states */
		if (tokeniser->state == STATE_COMMENT_START) {
			tokeniser->state = STATE_COMMENT_START_DASH;
		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT) {
			tokeniser->state = STATE_COMMENT_END_DASH;
		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
			tokeniser->state = STATE_COMMENT_END;
		} else if (tokeniser->state == STATE_COMMENT_END) {
			/* Third or later consecutive dash: each extra
			 * dash is literal comment text */
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
	} else {
		/* Ordinary character: pending dashes did not close the
		 * comment, so flush them as literal text first */
		if (tokeniser->state == STATE_COMMENT_START_DASH ||
				tokeniser->state == STATE_COMMENT_END_DASH) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "-", SLEN("-"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (tokeniser->state == STATE_COMMENT_END) {
			error = parserutils_buffer_append(tokeniser->buffer,
					(uint8_t *) "--", SLEN("--"));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		if (c == '\0') {
			/* NUL is replaced by U+FFFD */
			error = parserutils_buffer_append(tokeniser->buffer,
					u_fffd, sizeof(u_fffd));
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		} else if (c == '\r') {
			/* Newline normalisation: a lone CR becomes LF; for
			 * CRLF the LF is appended when the '\n' itself is
			 * processed on the next invocation */
			size_t next_len;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&next_len);
			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF && *cptr != '\n') {
				/* NOTE(review): unlike the bogus comment
				 * state, no LF is appended when the CR is
				 * the last character before EOF -- confirm
				 * this asymmetry is intended */
				error = parserutils_buffer_append(
						tokeniser->buffer,
						&lf, sizeof(lf));
				if (error != PARSERUTILS_OK) {
					return hubbub_error_from_parserutils_error(
							error);
				}
			}
		} else {
			error = parserutils_buffer_append(tokeniser->buffer,
					cptr, len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_COMMENT;
	}

	return HUBBUB_OK;
}
||
2003 | |||
2004 | |||
2005 | |||
2006 | |||
#define DOCTYPE "DOCTYPE"
/* Index of the final character of "DOCTYPE", not its length: matching
 * runs from count == 1 (the initial 'D' was consumed by the markup
 * declaration open state) up to and including this index. */
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1)

/**
 * Handle the "match DOCTYPE" state: case-insensitively match one
 * character of "DOCTYPE" per invocation. A mismatch or EOF falls
 * back to the bogus comment state; once the final character matches,
 * the keyword is consumed and a fresh doctype token is initialised.
 */
hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.match_doctype.count, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF mid-keyword: reprocess as bogus comment */
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);

	/* Case-insensitive compare: clearing bit 0x20 maps ASCII
	 * lower-case letters onto upper-case */
	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_comment.len =
				tokeniser->context.pending = 0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
		/* Skip over the DOCTYPE bit */
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);

		/* Reset the doctype token; public/system identifiers
		 * are missing until their keywords are matched */
		memset(&tokeniser->context.current_doctype, 0,
				sizeof tokeniser->context.current_doctype);
		tokeniser->context.current_doctype.public_missing = true;
		tokeniser->context.current_doctype.system_missing = true;
		tokeniser->context.pending = 0;

		tokeniser->state = STATE_DOCTYPE;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef DOCTYPE
#undef DOCTYPE_LEN
||
2065 | |||
2066 | hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser) |
||
2067 | { |
||
2068 | size_t len; |
||
2069 | const uint8_t *cptr; |
||
2070 | parserutils_error error; |
||
2071 | uint8_t c; |
||
2072 | |||
2073 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2074 | tokeniser->context.pending, &cptr, &len); |
||
2075 | |||
2076 | if (error != PARSERUTILS_OK) { |
||
2077 | if (error == PARSERUTILS_EOF) { |
||
2078 | tokeniser->state = STATE_BEFORE_DOCTYPE_NAME; |
||
2079 | return HUBBUB_OK; |
||
2080 | } else { |
||
2081 | return hubbub_error_from_parserutils_error(error); |
||
2082 | } |
||
2083 | } |
||
2084 | |||
2085 | c = *cptr; |
||
2086 | |||
2087 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2088 | tokeniser->context.pending += len; |
||
2089 | } |
||
2090 | |||
2091 | tokeniser->state = STATE_BEFORE_DOCTYPE_NAME; |
||
2092 | |||
2093 | return HUBBUB_OK; |
||
2094 | } |
||
2095 | |||
/**
 * Handle the "before DOCTYPE name" state: skip whitespace, then start
 * collecting the doctype name. NUL becomes U+FFFD and ASCII upper-case
 * is folded to lower-case.
 */
hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			/* Emit current doctype, force-quirks on */
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		/** \todo parse error */
		/* "<!DOCTYPE>": emit an empty, force-quirks doctype */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, true);
	} else {
		/* First name character: initialise the name buffer.
		 * START_BUF is a macro defined earlier in this file (not
		 * visible here); presumably it returns from this function
		 * on allocation failure -- see its definition. */
		if (c == '\0') {
			/* NUL becomes U+FFFD */
			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
		} else if ('A' <= c && c <= 'Z') {
			/* Fold ASCII upper-case to lower-case */
			uint8_t lc = c + 0x20;

			START_BUF(cdoc->name, &lc, len);
		} else {
			START_BUF(cdoc->name, cptr, len);
		}

		tokeniser->context.pending += len;
		tokeniser->state = STATE_DOCTYPE_NAME;
	}

	return HUBBUB_OK;
}
||
2146 | |||
/**
 * Handle the "DOCTYPE name" state: accumulate name characters until
 * whitespace (then "after DOCTYPE name") or '>' (emit). NUL becomes
 * U+FFFD; ASCII upper-case is folded to lower-case.
 */
hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
{
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF: emit with force-quirks on */
			tokeniser->state = STATE_DATA;
			return emit_current_doctype(tokeniser, true);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* Whitespace terminates the name */
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_doctype(tokeniser, false);
	} else if (c == '\0') {
		/* NUL becomes U+FFFD. COLLECT is a macro defined earlier
		 * in this file (not visible here); presumably it returns
		 * from this function on failure. */
		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		/* Fold ASCII upper-case to lower-case */
		uint8_t lc = c + 0x20;
		COLLECT(cdoc->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(cdoc->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}
||
2190 | |||
2191 | hubbub_error hubbub_tokeniser_handle_after_doctype_name( |
||
2192 | hubbub_tokeniser *tokeniser) |
||
2193 | { |
||
2194 | size_t len; |
||
2195 | const uint8_t *cptr; |
||
2196 | parserutils_error error; |
||
2197 | uint8_t c; |
||
2198 | |||
2199 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2200 | tokeniser->context.pending, &cptr, &len); |
||
2201 | |||
2202 | if (error != PARSERUTILS_OK) { |
||
2203 | if (error == PARSERUTILS_EOF) { |
||
2204 | tokeniser->state = STATE_DATA; |
||
2205 | return emit_current_doctype(tokeniser, true); |
||
2206 | } else { |
||
2207 | return hubbub_error_from_parserutils_error(error); |
||
2208 | } |
||
2209 | } |
||
2210 | |||
2211 | c = *cptr; |
||
2212 | tokeniser->context.pending += len; |
||
2213 | |||
2214 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2215 | /* pass over in silence */ |
||
2216 | } else if (c == '>') { |
||
2217 | tokeniser->state = STATE_DATA; |
||
2218 | return emit_current_doctype(tokeniser, false); |
||
2219 | } else if ((c & ~0x20) == 'P') { |
||
2220 | tokeniser->context.match_doctype.count = 1; |
||
2221 | tokeniser->state = STATE_MATCH_PUBLIC; |
||
2222 | } else if ((c & ~0x20) == 'S') { |
||
2223 | tokeniser->context.match_doctype.count = 1; |
||
2224 | tokeniser->state = STATE_MATCH_SYSTEM; |
||
2225 | } else { |
||
2226 | tokeniser->state = STATE_BOGUS_DOCTYPE; |
||
2227 | tokeniser->context.current_doctype.force_quirks = true; |
||
2228 | } |
||
2229 | |||
2230 | return HUBBUB_OK; |
||
2231 | } |
||
2232 | |||
#define PUBLIC "PUBLIC"
/* Index of the final character of "PUBLIC", not its length: matching
 * runs from count == 1 (the initial 'P' was consumed by the "after
 * DOCTYPE name" state) up to and including this index. */
#define PUBLIC_LEN (SLEN(PUBLIC) - 1)

/**
 * Handle the "match PUBLIC" state: case-insensitively match one
 * character of "PUBLIC" per invocation. A mismatch or EOF forces
 * quirks mode and falls back to the bogus doctype state.
 */
hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);

	/* Case-insensitive compare: clearing bit 0x20 maps ASCII
	 * lower-case letters onto upper-case */
	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
		/* Full keyword matched */
		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef PUBLIC
#undef PUBLIC_LEN
||
2279 | |||
2280 | hubbub_error hubbub_tokeniser_handle_before_doctype_public( |
||
2281 | hubbub_tokeniser *tokeniser) |
||
2282 | { |
||
2283 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2284 | size_t len; |
||
2285 | const uint8_t *cptr; |
||
2286 | parserutils_error error; |
||
2287 | uint8_t c; |
||
2288 | |||
2289 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2290 | tokeniser->context.pending, &cptr, &len); |
||
2291 | |||
2292 | if (error != PARSERUTILS_OK) { |
||
2293 | if (error == PARSERUTILS_EOF) { |
||
2294 | tokeniser->state = STATE_DATA; |
||
2295 | return emit_current_doctype(tokeniser, true); |
||
2296 | } else { |
||
2297 | return hubbub_error_from_parserutils_error(error); |
||
2298 | } |
||
2299 | } |
||
2300 | |||
2301 | c = *cptr; |
||
2302 | tokeniser->context.pending += len; |
||
2303 | |||
2304 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2305 | /* pass over in silence */ |
||
2306 | } else if (c == '"') { |
||
2307 | cdoc->public_missing = false; |
||
2308 | cdoc->public_id.len = 0; |
||
2309 | tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ; |
||
2310 | } else if (c == '\'') { |
||
2311 | cdoc->public_missing = false; |
||
2312 | cdoc->public_id.len = 0; |
||
2313 | tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ; |
||
2314 | } else if (c == '>') { |
||
2315 | tokeniser->state = STATE_DATA; |
||
2316 | return emit_current_doctype(tokeniser, true); |
||
2317 | } else { |
||
2318 | cdoc->force_quirks = true; |
||
2319 | tokeniser->state = STATE_BOGUS_DOCTYPE; |
||
2320 | } |
||
2321 | |||
2322 | return HUBBUB_OK; |
||
2323 | } |
||
2324 | |||
2325 | hubbub_error hubbub_tokeniser_handle_doctype_public_dq( |
||
2326 | hubbub_tokeniser *tokeniser) |
||
2327 | { |
||
2328 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2329 | size_t len; |
||
2330 | const uint8_t *cptr; |
||
2331 | parserutils_error error; |
||
2332 | uint8_t c; |
||
2333 | |||
2334 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2335 | tokeniser->context.pending, &cptr, &len); |
||
2336 | |||
2337 | if (error != PARSERUTILS_OK) { |
||
2338 | if (error == PARSERUTILS_EOF) { |
||
2339 | tokeniser->state = STATE_DATA; |
||
2340 | return emit_current_doctype(tokeniser, true); |
||
2341 | } else { |
||
2342 | return hubbub_error_from_parserutils_error(error); |
||
2343 | } |
||
2344 | } |
||
2345 | |||
2346 | c = *cptr; |
||
2347 | |||
2348 | if (c == '"') { |
||
2349 | tokeniser->context.pending += len; |
||
2350 | tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC; |
||
2351 | } else if (c == '>') { |
||
2352 | tokeniser->context.pending += len; |
||
2353 | tokeniser->state = STATE_DATA; |
||
2354 | return emit_current_doctype(tokeniser, true); |
||
2355 | } else if (c == '\0') { |
||
2356 | COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd)); |
||
2357 | tokeniser->context.pending += len; |
||
2358 | } else if (c == '\r') { |
||
2359 | error = parserutils_inputstream_peek( |
||
2360 | tokeniser->input, |
||
2361 | tokeniser->context.pending, |
||
2362 | &cptr, |
||
2363 | &len); |
||
2364 | |||
2365 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
2366 | return hubbub_error_from_parserutils_error(error); |
||
2367 | } else if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
2368 | COLLECT_MS(cdoc->public_id, &lf, sizeof(lf)); |
||
2369 | } |
||
2370 | |||
2371 | /* Collect '\r' */ |
||
2372 | tokeniser->context.pending += 1; |
||
2373 | } else { |
||
2374 | COLLECT_MS(cdoc->public_id, cptr, len); |
||
2375 | |||
2376 | tokeniser->context.pending += len; |
||
2377 | } |
||
2378 | |||
2379 | return HUBBUB_OK; |
||
2380 | } |
||
2381 | |||
2382 | hubbub_error hubbub_tokeniser_handle_doctype_public_sq( |
||
2383 | hubbub_tokeniser *tokeniser) |
||
2384 | { |
||
2385 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2386 | size_t len; |
||
2387 | const uint8_t *cptr; |
||
2388 | parserutils_error error; |
||
2389 | uint8_t c; |
||
2390 | |||
2391 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2392 | tokeniser->context.pending, &cptr, &len); |
||
2393 | |||
2394 | if (error != PARSERUTILS_OK) { |
||
2395 | if (error == PARSERUTILS_EOF) { |
||
2396 | tokeniser->state = STATE_DATA; |
||
2397 | return emit_current_doctype(tokeniser, true); |
||
2398 | } else { |
||
2399 | return hubbub_error_from_parserutils_error(error); |
||
2400 | } |
||
2401 | } |
||
2402 | |||
2403 | c = *cptr; |
||
2404 | |||
2405 | if (c == '\'') { |
||
2406 | tokeniser->context.pending += len; |
||
2407 | tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC; |
||
2408 | } else if (c == '>') { |
||
2409 | tokeniser->context.pending += len; |
||
2410 | tokeniser->state = STATE_DATA; |
||
2411 | return emit_current_doctype(tokeniser, true); |
||
2412 | } else if (c == '\0') { |
||
2413 | COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd)); |
||
2414 | tokeniser->context.pending += len; |
||
2415 | } else if (c == '\r') { |
||
2416 | error = parserutils_inputstream_peek( |
||
2417 | tokeniser->input, |
||
2418 | tokeniser->context.pending, |
||
2419 | &cptr, |
||
2420 | &len); |
||
2421 | |||
2422 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
2423 | return hubbub_error_from_parserutils_error(error); |
||
2424 | } else if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
2425 | COLLECT_MS(cdoc->public_id, &lf, sizeof(lf)); |
||
2426 | } |
||
2427 | |||
2428 | /* Collect '\r' */ |
||
2429 | tokeniser->context.pending += 1; |
||
2430 | } else { |
||
2431 | COLLECT_MS(cdoc->public_id, cptr, len); |
||
2432 | tokeniser->context.pending += len; |
||
2433 | } |
||
2434 | |||
2435 | return HUBBUB_OK; |
||
2436 | } |
||
2437 | |||
2438 | |||
2439 | hubbub_error hubbub_tokeniser_handle_after_doctype_public( |
||
2440 | hubbub_tokeniser *tokeniser) |
||
2441 | { |
||
2442 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2443 | size_t len; |
||
2444 | const uint8_t *cptr; |
||
2445 | parserutils_error error; |
||
2446 | uint8_t c; |
||
2447 | |||
2448 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2449 | tokeniser->context.pending, &cptr, &len); |
||
2450 | |||
2451 | if (error != PARSERUTILS_OK) { |
||
2452 | if (error == PARSERUTILS_EOF) { |
||
2453 | tokeniser->state = STATE_DATA; |
||
2454 | return emit_current_doctype(tokeniser, true); |
||
2455 | } else { |
||
2456 | return hubbub_error_from_parserutils_error(error); |
||
2457 | } |
||
2458 | } |
||
2459 | |||
2460 | c = *cptr; |
||
2461 | tokeniser->context.pending += len; |
||
2462 | |||
2463 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2464 | /* pass over in silence */ |
||
2465 | } else if (c == '"') { |
||
2466 | cdoc->system_missing = false; |
||
2467 | cdoc->system_id.len = 0; |
||
2468 | |||
2469 | tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ; |
||
2470 | } else if (c == '\'') { |
||
2471 | cdoc->system_missing = false; |
||
2472 | cdoc->system_id.len = 0; |
||
2473 | |||
2474 | tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ; |
||
2475 | } else if (c == '>') { |
||
2476 | tokeniser->state = STATE_DATA; |
||
2477 | return emit_current_doctype(tokeniser, false); |
||
2478 | } else { |
||
2479 | cdoc->force_quirks = true; |
||
2480 | tokeniser->state = STATE_BOGUS_DOCTYPE; |
||
2481 | } |
||
2482 | |||
2483 | return HUBBUB_OK; |
||
2484 | } |
||
2485 | |||
2486 | |||
2487 | |||
#define SYSTEM "SYSTEM"
/* Index of the final character of "SYSTEM", not its length: matching
 * runs from count == 1 (the initial 'S' was consumed by the "after
 * DOCTYPE name" state) up to and including this index. */
#define SYSTEM_LEN (SLEN(SYSTEM) - 1)

/**
 * Handle the "match SYSTEM" state: case-insensitively match one
 * character of "SYSTEM" per invocation. A mismatch or EOF forces
 * quirks mode and falls back to the bogus doctype state.
 */
hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->context.current_doctype.force_quirks = true;
			tokeniser->state = STATE_BOGUS_DOCTYPE;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);

	/* Case-insensitive compare: clearing bit 0x20 maps ASCII
	 * lower-case letters onto upper-case */
	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
		tokeniser->context.current_doctype.force_quirks = true;
		tokeniser->state = STATE_BOGUS_DOCTYPE;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
		/* Full keyword matched */
		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
	}

	tokeniser->context.match_doctype.count++;

	return HUBBUB_OK;
}

#undef SYSTEM
#undef SYSTEM_LEN
||
2534 | |||
2535 | hubbub_error hubbub_tokeniser_handle_before_doctype_system( |
||
2536 | hubbub_tokeniser *tokeniser) |
||
2537 | { |
||
2538 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2539 | size_t len; |
||
2540 | const uint8_t *cptr; |
||
2541 | parserutils_error error; |
||
2542 | uint8_t c; |
||
2543 | |||
2544 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2545 | tokeniser->context.pending, &cptr, &len); |
||
2546 | |||
2547 | if (error != PARSERUTILS_OK) { |
||
2548 | if (error == PARSERUTILS_EOF) { |
||
2549 | tokeniser->state = STATE_DATA; |
||
2550 | return emit_current_doctype(tokeniser, true); |
||
2551 | } else { |
||
2552 | return hubbub_error_from_parserutils_error(error); |
||
2553 | } |
||
2554 | } |
||
2555 | |||
2556 | c = *cptr; |
||
2557 | tokeniser->context.pending += len; |
||
2558 | |||
2559 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2560 | /* pass over */ |
||
2561 | } else if (c == '"') { |
||
2562 | cdoc->system_missing = false; |
||
2563 | cdoc->system_id.len = 0; |
||
2564 | |||
2565 | tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ; |
||
2566 | } else if (c == '\'') { |
||
2567 | cdoc->system_missing = false; |
||
2568 | cdoc->system_id.len = 0; |
||
2569 | |||
2570 | tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ; |
||
2571 | } else if (c == '>') { |
||
2572 | tokeniser->state = STATE_DATA; |
||
2573 | return emit_current_doctype(tokeniser, true); |
||
2574 | } else { |
||
2575 | cdoc->force_quirks = true; |
||
2576 | tokeniser->state = STATE_BOGUS_DOCTYPE; |
||
2577 | } |
||
2578 | |||
2579 | return HUBBUB_OK; |
||
2580 | } |
||
2581 | |||
2582 | hubbub_error hubbub_tokeniser_handle_doctype_system_dq( |
||
2583 | hubbub_tokeniser *tokeniser) |
||
2584 | { |
||
2585 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2586 | size_t len; |
||
2587 | const uint8_t *cptr; |
||
2588 | parserutils_error error; |
||
2589 | uint8_t c; |
||
2590 | |||
2591 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2592 | tokeniser->context.pending, &cptr, &len); |
||
2593 | |||
2594 | if (error != PARSERUTILS_OK) { |
||
2595 | if (error == PARSERUTILS_EOF) { |
||
2596 | tokeniser->state = STATE_DATA; |
||
2597 | return emit_current_doctype(tokeniser, true); |
||
2598 | } else { |
||
2599 | return hubbub_error_from_parserutils_error(error); |
||
2600 | } |
||
2601 | } |
||
2602 | |||
2603 | c = *cptr; |
||
2604 | |||
2605 | if (c == '"') { |
||
2606 | tokeniser->context.pending += len; |
||
2607 | tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM; |
||
2608 | } else if (c == '>') { |
||
2609 | tokeniser->context.pending += len; |
||
2610 | tokeniser->state = STATE_DATA; |
||
2611 | return emit_current_doctype(tokeniser, true); |
||
2612 | } else if (c == '\0') { |
||
2613 | COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd)); |
||
2614 | tokeniser->context.pending += len; |
||
2615 | } else if (c == '\r') { |
||
2616 | error = parserutils_inputstream_peek( |
||
2617 | tokeniser->input, |
||
2618 | tokeniser->context.pending, |
||
2619 | &cptr, |
||
2620 | &len); |
||
2621 | |||
2622 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
2623 | return hubbub_error_from_parserutils_error(error); |
||
2624 | } else if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
2625 | COLLECT_MS(cdoc->system_id, &lf, sizeof(lf)); |
||
2626 | } |
||
2627 | |||
2628 | /* Collect '\r' */ |
||
2629 | tokeniser->context.pending += 1; |
||
2630 | } else { |
||
2631 | COLLECT_MS(cdoc->system_id, cptr, len); |
||
2632 | tokeniser->context.pending += len; |
||
2633 | } |
||
2634 | |||
2635 | return HUBBUB_OK; |
||
2636 | } |
||
2637 | |||
2638 | hubbub_error hubbub_tokeniser_handle_doctype_system_sq( |
||
2639 | hubbub_tokeniser *tokeniser) |
||
2640 | { |
||
2641 | hubbub_doctype *cdoc = &tokeniser->context.current_doctype; |
||
2642 | size_t len; |
||
2643 | const uint8_t *cptr; |
||
2644 | parserutils_error error; |
||
2645 | uint8_t c; |
||
2646 | |||
2647 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2648 | tokeniser->context.pending, &cptr, &len); |
||
2649 | |||
2650 | if (error != PARSERUTILS_OK) { |
||
2651 | if (error == PARSERUTILS_EOF) { |
||
2652 | tokeniser->state = STATE_DATA; |
||
2653 | return emit_current_doctype(tokeniser, true); |
||
2654 | } else { |
||
2655 | return hubbub_error_from_parserutils_error(error); |
||
2656 | } |
||
2657 | } |
||
2658 | |||
2659 | c = *cptr; |
||
2660 | |||
2661 | if (c == '\'') { |
||
2662 | tokeniser->context.pending += len; |
||
2663 | tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM; |
||
2664 | } else if (c == '>') { |
||
2665 | tokeniser->context.pending += len; |
||
2666 | tokeniser->state = STATE_DATA; |
||
2667 | return emit_current_doctype(tokeniser, true); |
||
2668 | } else if (c == '\0') { |
||
2669 | COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd)); |
||
2670 | tokeniser->context.pending += len; |
||
2671 | } else if (c == '\r') { |
||
2672 | error = parserutils_inputstream_peek( |
||
2673 | tokeniser->input, |
||
2674 | tokeniser->context.pending, |
||
2675 | &cptr, |
||
2676 | &len); |
||
2677 | |||
2678 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
2679 | return hubbub_error_from_parserutils_error(error); |
||
2680 | } else if (error == PARSERUTILS_EOF || *cptr != '\n') { |
||
2681 | COLLECT_MS(cdoc->system_id, &lf, sizeof(lf)); |
||
2682 | } |
||
2683 | |||
2684 | /* Collect '\r' */ |
||
2685 | tokeniser->context.pending += 1; |
||
2686 | } else { |
||
2687 | COLLECT_MS(cdoc->system_id, cptr, len); |
||
2688 | tokeniser->context.pending += len; |
||
2689 | } |
||
2690 | |||
2691 | return HUBBUB_OK; |
||
2692 | } |
||
2693 | |||
2694 | hubbub_error hubbub_tokeniser_handle_after_doctype_system( |
||
2695 | hubbub_tokeniser *tokeniser) |
||
2696 | { |
||
2697 | size_t len; |
||
2698 | const uint8_t *cptr; |
||
2699 | parserutils_error error; |
||
2700 | uint8_t c; |
||
2701 | |||
2702 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2703 | tokeniser->context.pending, &cptr, &len); |
||
2704 | |||
2705 | if (error != PARSERUTILS_OK) { |
||
2706 | if (error == PARSERUTILS_EOF) { |
||
2707 | tokeniser->state = STATE_DATA; |
||
2708 | return emit_current_doctype(tokeniser, true); |
||
2709 | } else { |
||
2710 | return hubbub_error_from_parserutils_error(error); |
||
2711 | } |
||
2712 | } |
||
2713 | |||
2714 | c = *cptr; |
||
2715 | tokeniser->context.pending += len; |
||
2716 | |||
2717 | if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { |
||
2718 | /* pass over in silence */ |
||
2719 | } else if (c == '>') { |
||
2720 | tokeniser->state = STATE_DATA; |
||
2721 | return emit_current_doctype(tokeniser, false); |
||
2722 | } else { |
||
2723 | tokeniser->state = STATE_BOGUS_DOCTYPE; |
||
2724 | } |
||
2725 | |||
2726 | return HUBBUB_OK; |
||
2727 | } |
||
2728 | |||
2729 | |||
2730 | hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser) |
||
2731 | { |
||
2732 | size_t len; |
||
2733 | const uint8_t *cptr; |
||
2734 | parserutils_error error; |
||
2735 | uint8_t c; |
||
2736 | |||
2737 | error = parserutils_inputstream_peek(tokeniser->input, |
||
2738 | tokeniser->context.pending, &cptr, &len); |
||
2739 | |||
2740 | if (error != PARSERUTILS_OK) { |
||
2741 | if (error == PARSERUTILS_EOF) { |
||
2742 | tokeniser->state = STATE_DATA; |
||
2743 | return emit_current_doctype(tokeniser, false); |
||
2744 | } else { |
||
2745 | return hubbub_error_from_parserutils_error(error); |
||
2746 | } |
||
2747 | } |
||
2748 | |||
2749 | c = *cptr; |
||
2750 | tokeniser->context.pending += len; |
||
2751 | |||
2752 | if (c == '>') { |
||
2753 | tokeniser->state = STATE_DATA; |
||
2754 | return emit_current_doctype(tokeniser, false); |
||
2755 | } |
||
2756 | |||
2757 | return HUBBUB_OK; |
||
2758 | } |
||
2759 | |||
2760 | |||
2761 | |||
#define CDATA "[CDATA["
/* Index of the last character of the match string */
#define CDATA_LEN (SLEN(CDATA) - 1)

/**
 * Handle the match-CDATA state: match the "[CDATA[" sequence that follows
 * "<!", one input character per invocation.
 *
 * match_cdata.count is the number of bytes matched so far. A mismatch (or
 * EOF mid-match) abandons the attempt and reprocesses the input as a bogus
 * comment; a complete match consumes the sequence and enters the CDATA
 * block state.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF mid-match: restart as a bogus comment */
			tokeniser->context.current_comment.len =
					tokeniser->context.pending = 0;
			tokeniser->state = STATE_BOGUS_COMMENT;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);

	/* (c & ~0x20) upper-cases ASCII letters and leaves '[' unchanged */
	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
		/* Mismatch: restart as a bogus comment */
		tokeniser->context.current_comment.len =
				tokeniser->context.pending =
				0;
		tokeniser->state = STATE_BOGUS_COMMENT;
		return HUBBUB_OK;
	}

	tokeniser->context.pending += len;

	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
		/* Final character of "[CDATA[" matched: consume the whole
		 * sequence and switch to the CDATA block state */
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.match_cdata.count + len);
		tokeniser->context.pending = 0;
		tokeniser->context.match_cdata.end = 0;
		tokeniser->state = STATE_CDATA_BLOCK;
	}

	tokeniser->context.match_cdata.count += len;

	return HUBBUB_OK;
}

#undef CDATA
#undef CDATA_LEN
||
2815 | |||
2816 | |||
/**
 * Handle the CDATA block state: emit the block's content as character
 * tokens until the "]]>" terminator (or EOF) is seen.
 *
 * match_cdata.end counts the consecutive ']' bytes currently buffered
 * (0, 1 or 2), pending the possible "]]>" terminator.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* EOF terminates the block: flush what we have */
			tokeniser->state = STATE_DATA;
			return emit_current_chars(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
			tokeniser->context.match_cdata.end == 1)) {
		/* Possibly the start of "]]>": buffer it */
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end += len;
	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
		/* Remove the previous two "]]" */
		tokeniser->context.pending -= 2;

		/* Emit any pending characters
		 * NOTE(review): the return value is discarded here, unlike
		 * the EOF path above — confirm this is intentional */
		emit_current_chars(tokeniser);

		/* Now move past the "]]>" bit */
		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));

		tokeniser->state = STATE_DATA;
	} else if (c == '\0') {
		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		/* Perform NUL-byte replacement */
		emit_character_token(tokeniser, &u_fffd_str);

		parserutils_inputstream_advance(tokeniser->input, len);
		tokeniser->context.match_cdata.end = 0;
	} else if (c == '\r') {
		/* Look at the character after the CR so that CRLF
		 * collapses to a single LF */
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (tokeniser->context.pending > 0) {
			/* Emit any pending characters */
			emit_current_chars(tokeniser);
		}

		if (error == PARSERUTILS_EOF || *cptr != '\n') {
			/* Lone CR: emit a normalised newline now; a
			 * following LF would instead be emitted on the
			 * next invocation */
			emit_character_token(tokeniser, &lf_str);
		}

		/* Advance over \r */
		parserutils_inputstream_advance(tokeniser->input, 1);
		tokeniser->context.match_cdata.end = 0;
	} else {
		/* Ordinary data ('\n' included): accumulate */
		tokeniser->context.pending += len;
		tokeniser->context.match_cdata.end = 0;
	}

	return HUBBUB_OK;
}
||
2895 | |||
2896 | |||
/**
 * Begin consuming a character reference.
 *
 * Must be called with \p pos pointing at a '&' in the input stream.
 * Inspects the character following the ampersand, initialises the
 * match_entity context, and dispatches to the numbered- or named-entity
 * state — or completes immediately with codepoint 0 ("not an entity")
 * for characters that can never start one.
 *
 * \param tokeniser  Tokeniser instance
 * \param pos        Stream offset of the '&' character
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t pos)
{
	/* Snapshot the additional allowed character (set by attribute-value
	 * handling) before it is reset below */
	uint32_t allowed_char = tokeniser->context.allowed_char;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;
	size_t off;

	error = parserutils_inputstream_peek(tokeniser->input, pos,
			&cptr, &len);

	/* We should always start on an ampersand */
	assert(error == PARSERUTILS_OK);
	assert(len == 1 && *cptr == '&');

	off = pos + len;

	/* Look at the character after the ampersand */
	error = parserutils_inputstream_peek(tokeniser->input, off,
			&cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* "&" at EOF: complete with no entity matched */
			tokeniser->context.match_entity.complete = true;
			tokeniser->context.match_entity.codepoint = 0;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	/* Set things up */
	tokeniser->context.match_entity.offset = off;
	tokeniser->context.match_entity.poss_length = 0;
	tokeniser->context.match_entity.length = 0;
	tokeniser->context.match_entity.base = 0;
	tokeniser->context.match_entity.codepoint = 0;
	tokeniser->context.match_entity.had_data = false;
	tokeniser->context.match_entity.return_state = tokeniser->state;
	tokeniser->context.match_entity.complete = false;
	tokeniser->context.match_entity.overflow = false;
	tokeniser->context.match_entity.context = -1;
	tokeniser->context.match_entity.prev_len = len;

	/* Reset allowed character for future calls */
	tokeniser->context.allowed_char = '\0';

	/* Whitespace, '<', '&' and the additional allowed character (if
	 * any) can never start an entity: complete immediately */
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
			c == '<' || c == '&' ||
			(allowed_char && c == allowed_char)) {
		tokeniser->context.match_entity.complete = true;
		tokeniser->context.match_entity.codepoint = 0;
	} else if (c == '#') {
		/* Numeric character reference */
		tokeniser->context.match_entity.length += len;
		tokeniser->state = STATE_NUMBERED_ENTITY;
	} else {
		tokeniser->state = STATE_NAMED_ENTITY;
	}

	return HUBBUB_OK;
}
||
2963 | |||
2964 | |||
/**
 * Handle the numbered-entity state: parse the digits of "&#NNN;" /
 * "&#xHHH;" and convert them to a codepoint.
 *
 * Disallowed or out-of-range values are remapped: 0x80-0x9F via the
 * Windows-1252 table, CR to LF, everything else invalid to U+FFFD.
 *
 * \param tokeniser  Tokeniser instance
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len);

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	/* Determine the base on first entry (base == 0 until decided) */
	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
		uint8_t c = *cptr;
		if ((c & ~0x20) == 'X') {
			/* 'x' or 'X' prefix: hexadecimal */
			ctx->match_entity.base = 16;
			ctx->match_entity.length += len;
		} else {
			ctx->match_entity.base = 10;
		}
	}

	/* Accumulate digit characters */
	while ((error = parserutils_inputstream_peek(tokeniser->input,
			ctx->match_entity.offset + ctx->match_entity.length,
			&cptr, &len)) == PARSERUTILS_OK) {
		uint8_t c = *cptr;

		if (ctx->match_entity.base == 10 &&
				('0' <= c && c <= '9')) {
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint =
				ctx->match_entity.codepoint * 10 + (c - '0');

			ctx->match_entity.length += len;
		} else if (ctx->match_entity.base == 16 &&
				(('0' <= c && c <= '9') ||
				('A' <= (c & ~0x20) &&
						(c & ~0x20) <= 'F'))) {
			/* (c & ~0x20) upper-cases a-f so one range test
			 * covers both letter cases */
			ctx->match_entity.had_data = true;
			ctx->match_entity.codepoint *= 16;

			if ('0' <= c && c <= '9') {
				ctx->match_entity.codepoint += (c - '0');
			} else {
				ctx->match_entity.codepoint +=
						((c & ~0x20) - 'A' + 10);
			}

			ctx->match_entity.length += len;
		} else {
			/* Not a digit in this base: stop */
			break;
		}

		/* Flag overflow as soon as the value leaves the Unicode
		 * range; the accumulator may keep growing (and wrap, which
		 * is defined for unsigned), but the flag forces U+FFFD
		 * below */
		if (ctx->match_entity.codepoint >= 0x10FFFF) {
			ctx->match_entity.overflow = true;
		}
	}

	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
		return hubbub_error_from_parserutils_error(error);
	}

	/* Eat trailing semicolon, if any */
	if (error != PARSERUTILS_EOF && *cptr == ';') {
		ctx->match_entity.length += len;
	}

	/* Had data, so calculate final codepoint */
	if (ctx->match_entity.had_data) {
		uint32_t cp = ctx->match_entity.codepoint;

		if (0x80 <= cp && cp <= 0x9F) {
			/* C1 control range: remap via Windows-1252 */
			cp = cp1252Table[cp - 0x80];
		} else if (cp == 0x0D) {
			/* CR normalises to LF */
			cp = 0x000A;
		} else if (ctx->match_entity.overflow ||
				cp <= 0x0008 || cp == 0x000B ||
				(0x000E <= cp && cp <= 0x001F) ||
				(0x007F <= cp && cp <= 0x009F) ||
				(0xD800 <= cp && cp <= 0xDFFF) ||
				(0xFDD0 <= cp && cp <= 0xFDEF) ||
				(cp & 0xFFFE) == 0xFFFE) {
			/* the check for cp > 0x10FFFF per spec is performed
			 * in the loop above to avoid overflow */
			cp = 0xFFFD;
		}

		ctx->match_entity.codepoint = cp;
	}

	/* Flag completion */
	ctx->match_entity.complete = true;

	/* And back to the state we were entered in */
	tokeniser->state = ctx->match_entity.return_state;

	return HUBBUB_OK;
}
||
3068 | |||
3069 | hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) |
||
3070 | { |
||
3071 | hubbub_tokeniser_context *ctx = &tokeniser->context; |
||
3072 | |||
3073 | size_t len; |
||
3074 | const uint8_t *cptr; |
||
3075 | parserutils_error error; |
||
3076 | |||
3077 | while ((error = parserutils_inputstream_peek(tokeniser->input, |
||
3078 | ctx->match_entity.offset + |
||
3079 | ctx->match_entity.poss_length, |
||
3080 | &cptr, &len)) == PARSERUTILS_OK) { |
||
3081 | uint32_t cp; |
||
3082 | |||
3083 | uint8_t c = *cptr; |
||
3084 | hubbub_error error; |
||
3085 | |||
3086 | if (c > 0x7F) { |
||
3087 | /* Entity names are ASCII only */ |
||
3088 | break; |
||
3089 | } |
||
3090 | |||
3091 | error = hubbub_entities_search_step(c, &cp, |
||
3092 | &ctx->match_entity.context); |
||
3093 | if (error == HUBBUB_OK) { |
||
3094 | /* Had a match - store it for later */ |
||
3095 | ctx->match_entity.codepoint = cp; |
||
3096 | |||
3097 | ctx->match_entity.length = |
||
3098 | ctx->match_entity.poss_length + len; |
||
3099 | ctx->match_entity.poss_length = |
||
3100 | ctx->match_entity.length; |
||
3101 | } else if (error == HUBBUB_INVALID) { |
||
3102 | /* No further matches - use last found */ |
||
3103 | break; |
||
3104 | } else { |
||
3105 | /* Need more data */ |
||
3106 | ctx->match_entity.poss_length += len; |
||
3107 | } |
||
3108 | } |
||
3109 | |||
3110 | if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) { |
||
3111 | return hubbub_error_from_parserutils_error(error); |
||
3112 | } |
||
3113 | |||
3114 | if (ctx->match_entity.length > 0) { |
||
3115 | uint8_t c; |
||
3116 | error = parserutils_inputstream_peek(tokeniser->input, |
||
3117 | ctx->match_entity.offset + |
||
3118 | ctx->match_entity.length - 1, |
||
3119 | &cptr, &len); |
||
3120 | /* We're re-reading a character we've already read after. |
||
3121 | * Therefore, there's no way that an error may occur as |
||
3122 | * a result. */ |
||
3123 | assert(error == PARSERUTILS_OK); |
||
3124 | |||
3125 | c = *cptr; |
||
3126 | |||
3127 | if ((tokeniser->context.match_entity.return_state == |
||
3128 | STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) && |
||
3129 | c != ';') { |
||
3130 | error = parserutils_inputstream_peek(tokeniser->input, |
||
3131 | ctx->match_entity.offset + |
||
3132 | ctx->match_entity.length, |
||
3133 | &cptr, &len); |
||
3134 | /* We must have attempted to read one more character |
||
3135 | * than was present in the entity name, as that is the |
||
3136 | * only way to break out of the loop above. If that |
||
3137 | * failed, then any non-EOF case will have been handled |
||
3138 | * by the if statement after the loop thus it cannot |
||
3139 | * occur here. */ |
||
3140 | assert(error == PARSERUTILS_OK || |
||
3141 | error == PARSERUTILS_EOF); |
||
3142 | |||
3143 | if (error == PARSERUTILS_EOF) { |
||
3144 | ctx->match_entity.codepoint = 0; |
||
3145 | } |
||
3146 | |||
3147 | c = *cptr; |
||
3148 | if ((0x0030 <= c && c <= 0x0039) || |
||
3149 | (0x0041 <= c && c <= 0x005A) || |
||
3150 | (0x0061 <= c && c <= 0x007A)) { |
||
3151 | ctx->match_entity.codepoint = 0; |
||
3152 | } |
||
3153 | } |
||
3154 | } |
||
3155 | |||
3156 | /* Flag completion */ |
||
3157 | ctx->match_entity.complete = true; |
||
3158 | |||
3159 | /* And back to the state from whence we came */ |
||
3160 | tokeniser->state = ctx->match_entity.return_state; |
||
3161 | |||
3162 | return HUBBUB_OK; |
||
3163 | } |
||
3164 | |||
3165 | |||
3166 | |||
3167 | /*** Token emitting bits ***/ |
||
3168 | |||
3169 | /** |
||
3170 | * Emit a character token. |
||
3171 | * |
||
3172 | * \param tokeniser Tokeniser instance |
||
3173 | * \param chars Pointer to hubbub_string to emit |
||
3174 | * \return true |
||
3175 | */ |
||
3176 | hubbub_error emit_character_token(hubbub_tokeniser *tokeniser, |
||
3177 | const hubbub_string *chars) |
||
3178 | { |
||
3179 | hubbub_token token; |
||
3180 | |||
3181 | token.type = HUBBUB_TOKEN_CHARACTER; |
||
3182 | token.data.character = *chars; |
||
3183 | |||
3184 | return hubbub_tokeniser_emit_token(tokeniser, &token); |
||
3185 | } |
||
3186 | |||
3187 | /** |
||
3188 | * Emit the current pending characters being stored in the tokeniser context. |
||
3189 | * |
||
3190 | * \param tokeniser Tokeniser instance |
||
3191 | * \return true |
||
3192 | */ |
||
3193 | hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser) |
||
3194 | { |
||
3195 | hubbub_token token; |
||
3196 | size_t len; |
||
3197 | const uint8_t *cptr = NULL; |
||
3198 | parserutils_error error; |
||
3199 | |||
3200 | /* Calling this with nothing to output is a probable bug */ |
||
3201 | assert(tokeniser->context.pending > 0); |
||
3202 | |||
3203 | error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len); |
||
3204 | if (error != PARSERUTILS_OK) |
||
3205 | return hubbub_error_from_parserutils_error(error); |
||
3206 | |||
3207 | token.type = HUBBUB_TOKEN_CHARACTER; |
||
3208 | token.data.character.ptr = cptr; |
||
3209 | token.data.character.len = tokeniser->context.pending; |
||
3210 | |||
3211 | return hubbub_tokeniser_emit_token(tokeniser, &token); |
||
3212 | } |
||
3213 | |||
/**
 * Emit the current tag token being stored in the tokeniser context.
 *
 * Rewrites the name/attribute string pointers to their locations in the
 * tokeniser's buffer, removes duplicate attributes (first occurrence
 * wins), emits the token, and records the tag name of start tags for
 * later R?CDATA end-tag matching.
 *
 * \param tokeniser  Tokeniser instance
 * \return Result of the token emission
 */
hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
{
	hubbub_error err;
	hubbub_token token;
	uint32_t n_attributes;
	hubbub_attribute *attrs;
	uint8_t *ptr;
	uint32_t i, j;

	/* Emit current tag */
	token.type = tokeniser->context.current_tag_type;
	token.data.tag = tokeniser->context.current_tag;
	token.data.tag.ns = HUBBUB_NS_HTML;


	n_attributes = token.data.tag.n_attributes;
	attrs = token.data.tag.attributes;

	/* Set pointers correctly: the name and each attribute's name and
	 * value are stored contiguously in the tokeniser's buffer, so walk
	 * through it assigning pointers from the recorded lengths */
	ptr = tokeniser->buffer->data;
	token.data.tag.name.ptr = tokeniser->buffer->data;
	ptr += token.data.tag.name.len;

	for (i = 0; i < n_attributes; i++) {
		attrs[i].name.ptr = ptr;
		ptr += attrs[i].name.len;
		attrs[i].value.ptr = ptr;
		ptr += attrs[i].value.len;
	}


	/* Discard duplicate attributes */
	for (i = 0; i < n_attributes; i++) {
		for (j = 0; j < n_attributes; j++) {
			uint32_t move;

			if (j == i ||
				attrs[i].name.len !=
						attrs[j].name.len ||
				strncmp((char *) attrs[i].name.ptr,
						(char *) attrs[j].name.ptr,
						attrs[i].name.len) != 0) {
				/* Attributes don't match */
				continue;
			}

			/* Earlier duplicates of i would already have been
			 * removed by previous iterations, so any duplicate
			 * found here must come after i */
			assert(i < j);

			/* Calculate amount to move */
			move = (n_attributes - 1 - j) *
					sizeof(hubbub_attribute);

			if (move > 0) {
				memmove(&attrs[j],&attrs[j+1], move);
			}

			/* We've deleted an item, so we need to
			 * reprocess this index */
			j--;

			/* And reduce the number of attributes */
			n_attributes--;
		}
	}

	token.data.tag.n_attributes = n_attributes;

	err = hubbub_tokeniser_emit_token(tokeniser, &token);

	if (token.type == HUBBUB_TOKEN_START_TAG) {
		/* Save start tag name for R?CDATA */
		if (token.data.tag.name.len <
			sizeof(tokeniser->context.last_start_tag_name)) {
			/* Not NUL-terminated by strncpy in general, but the
			 * length is tracked separately in
			 * last_start_tag_len */
			strncpy((char *) tokeniser->context.last_start_tag_name,
				(const char *) token.data.tag.name.ptr,
				token.data.tag.name.len);
			tokeniser->context.last_start_tag_len =
					token.data.tag.name.len;
		} else {
			/* Name too long to be an R?CDATA element name */
			tokeniser->context.last_start_tag_name[0] = '\0';
			tokeniser->context.last_start_tag_len = 0;
		}
	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
		/* Reset content model after R?CDATA elements */
		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
	}

	/* Reset the self-closing flag */
	tokeniser->context.current_tag.self_closing = false;

	return err;
}
||
3312 | |||
3313 | /** |
||
3314 | * Emit the current comment token being stored in the tokeniser context. |
||
3315 | * |
||
3316 | * \param tokeniser Tokeniser instance |
||
3317 | * \return true |
||
3318 | */ |
||
3319 | hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser) |
||
3320 | { |
||
3321 | hubbub_token token; |
||
3322 | |||
3323 | token.type = HUBBUB_TOKEN_COMMENT; |
||
3324 | token.data.comment.ptr = tokeniser->buffer->data; |
||
3325 | token.data.comment.len = tokeniser->buffer->length; |
||
3326 | |||
3327 | return hubbub_tokeniser_emit_token(tokeniser, &token); |
||
3328 | } |
||
3329 | |||
3330 | /** |
||
3331 | * Emit the current doctype token being stored in the tokeniser context. |
||
3332 | * |
||
3333 | * \param tokeniser Tokeniser instance |
||
3334 | * \param force_quirks Force quirks mode on this document |
||
3335 | * \return true |
||
3336 | */ |
||
3337 | hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser, |
||
3338 | bool force_quirks) |
||
3339 | { |
||
3340 | hubbub_token token; |
||
3341 | |||
3342 | /* Emit doctype */ |
||
3343 | token.type = HUBBUB_TOKEN_DOCTYPE; |
||
3344 | token.data.doctype = tokeniser->context.current_doctype; |
||
3345 | if (force_quirks == true) |
||
3346 | token.data.doctype.force_quirks = true; |
||
3347 | |||
3348 | /* Set pointers correctly */ |
||
3349 | token.data.doctype.name.ptr = tokeniser->buffer->data; |
||
3350 | |||
3351 | if (token.data.doctype.public_missing == false) { |
||
3352 | token.data.doctype.public_id.ptr = tokeniser->buffer->data + |
||
3353 | token.data.doctype.name.len; |
||
3354 | } |
||
3355 | |||
3356 | if (token.data.doctype.system_missing == false) { |
||
3357 | token.data.doctype.system_id.ptr = tokeniser->buffer->data + |
||
3358 | token.data.doctype.name.len + |
||
3359 | token.data.doctype.public_id.len; |
||
3360 | } |
||
3361 | |||
3362 | return hubbub_tokeniser_emit_token(tokeniser, &token); |
||
3363 | } |
||
3364 | |||
/**
 * Emit a token, performing sanity checks if necessary
 *
 * Runs debug-build sanity checks on the token's strings, invokes the
 * client token handler (if any), then discards the tokeniser buffer,
 * advances the input stream past the pending bytes, and pushes any data
 * the handler placed in the insertion buffer back into the input stream.
 *
 * \param tokeniser  Tokeniser instance
 * \param token      Token to emit
 * \return HUBBUB_OK, or the error returned by the token handler
 */
hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token)
{
	hubbub_error err = HUBBUB_OK;

	assert(tokeniser != NULL);
	assert(token != NULL);
	/* The insertion buffer must be empty on entry; only the token
	 * handler call below may populate it */
	assert(tokeniser->insert_buf->length == 0);

#ifndef NDEBUG
	/* Sanity checks: no emitted string may contain a 0xFF byte
	 * (0xFF never appears in valid UTF-8) */
	switch (token->type) {
	case HUBBUB_TOKEN_DOCTYPE:
		assert(memchr(token->data.doctype.name.ptr, 0xff,
				token->data.doctype.name.len) == NULL);
		if (token->data.doctype.public_missing == false)
			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
					token->data.doctype.public_id.len) == NULL);
		if (token->data.doctype.system_missing == false)
			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
					token->data.doctype.system_id.len) == NULL);
		break;
	case HUBBUB_TOKEN_START_TAG:
	case HUBBUB_TOKEN_END_TAG:
	{
		uint32_t i;
		assert(memchr(token->data.tag.name.ptr, 0xff,
				token->data.tag.name.len) == NULL);
		for (i = 0; i < token->data.tag.n_attributes; i++) {
			hubbub_attribute *attr = &token->data.tag.attributes[i];

			assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
					NULL);
			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
					NULL);
		}
	}
		break;
	case HUBBUB_TOKEN_COMMENT:
		assert(memchr(token->data.comment.ptr, 0xff,
				token->data.comment.len) == NULL);
		break;
	case HUBBUB_TOKEN_CHARACTER:
		assert(memchr(token->data.character.ptr, 0xff,
				token->data.character.len) == NULL);
		break;
	case HUBBUB_TOKEN_EOF:
		break;
	}
#endif

	/* Emit the token */
	if (tokeniser->token_handler) {
		err = tokeniser->token_handler(token, tokeniser->token_pw);
	}

	/* Discard current buffer */
	if (tokeniser->buffer->length) {
		parserutils_buffer_discard(tokeniser->buffer, 0,
				tokeniser->buffer->length);
	}

	/* Advance the pointer */
	if (tokeniser->context.pending) {
		parserutils_inputstream_advance(tokeniser->input,
				tokeniser->context.pending);
		tokeniser->context.pending = 0;
	}

	/* If the token handler added data to the insertion buffer
	 * (presumably script-inserted input — confirm against the
	 * insertion API's callers), push it into the input stream and
	 * empty the buffer */
	if (tokeniser->insert_buf->length > 0) {
		parserutils_inputstream_insert(tokeniser->input,
				tokeniser->insert_buf->data,
				tokeniser->insert_buf->length);
		parserutils_buffer_discard(tokeniser->insert_buf, 0,
				tokeniser->insert_buf->length);
	}

	/* Ensure callback can pause the tokenise */
	if (err == HUBBUB_PAUSED) {
		tokeniser->paused = true;
	}

	return err;
}