/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org>
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;

typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;

#include <parserutils/charset/utf8.h>

#include <parserutils/input/inputstream.h>

#include "utils/parserutilserror.h"
#include "utils/utils.h"

#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

/**
 * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
 */
static const uint32_t cp1252Table[32] = {
	0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
	0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
	0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
	0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
};

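/* (How the table above is read, as a sketch: a numeric character
 * reference naming a codepoint in 128-159 -- which would otherwise be a
 * C1 control -- is presumably remapped through cp1252Table by the
 * numbered-entity handler, as HTML requires; e.g. &#128; yields U+20AC
 * EURO SIGN rather than U+0080.) */
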
/**
 * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
 */
static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' };
static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) };


/**
 * String for when we want to emit newlines
 */
static const uint8_t lf = '\n';
static const hubbub_string lf_str = { &lf, 1 };


/**
 * Tokeniser states
 */
typedef enum hubbub_tokeniser_state {
	STATE_DATA,
	STATE_CHARACTER_REFERENCE_DATA,
	STATE_TAG_OPEN,
	STATE_CLOSE_TAG_OPEN,
	STATE_TAG_NAME,
	STATE_BEFORE_ATTRIBUTE_NAME,
	STATE_ATTRIBUTE_NAME,
	STATE_AFTER_ATTRIBUTE_NAME,
	STATE_BEFORE_ATTRIBUTE_VALUE,
	STATE_ATTRIBUTE_VALUE_DQ,
	STATE_ATTRIBUTE_VALUE_SQ,
	STATE_ATTRIBUTE_VALUE_UQ,
	STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE,
	STATE_AFTER_ATTRIBUTE_VALUE_Q,
	STATE_SELF_CLOSING_START_TAG,
	STATE_BOGUS_COMMENT,
	STATE_MARKUP_DECLARATION_OPEN,
	STATE_MATCH_COMMENT,
	STATE_COMMENT_START,
	STATE_COMMENT_START_DASH,
	STATE_COMMENT,
	STATE_COMMENT_END_DASH,
	STATE_COMMENT_END,
	STATE_MATCH_DOCTYPE,
	STATE_DOCTYPE,
	STATE_BEFORE_DOCTYPE_NAME,
	STATE_DOCTYPE_NAME,
	STATE_AFTER_DOCTYPE_NAME,
	STATE_MATCH_PUBLIC,
	STATE_BEFORE_DOCTYPE_PUBLIC,
	STATE_DOCTYPE_PUBLIC_DQ,
	STATE_DOCTYPE_PUBLIC_SQ,
	STATE_AFTER_DOCTYPE_PUBLIC,
	STATE_MATCH_SYSTEM,
	STATE_BEFORE_DOCTYPE_SYSTEM,
	STATE_DOCTYPE_SYSTEM_DQ,
	STATE_DOCTYPE_SYSTEM_SQ,
	STATE_AFTER_DOCTYPE_SYSTEM,
	STATE_BOGUS_DOCTYPE,
	STATE_MATCH_CDATA,
	STATE_CDATA_BLOCK,
	STATE_NUMBERED_ENTITY,
	STATE_NAMED_ENTITY
} hubbub_tokeniser_state;

/**
 * Context for tokeniser
 */
typedef struct hubbub_tokeniser_context {
	size_t pending;				/**< Count of pending chars */

	hubbub_string current_comment;		/**< Current comment text */

	hubbub_token_type current_tag_type;	/**< Type of current_tag */
	hubbub_tag current_tag;			/**< Current tag */
	hubbub_doctype current_doctype;		/**< Current doctype */
	hubbub_tokeniser_state prev_state;	/**< Previous state */

	uint8_t last_start_tag_name[10];	/**< Name of the last start tag
						 * emitted */
	size_t last_start_tag_len;		/**< Length of last start tag */

	struct {
		uint32_t count;
		bool match;
	} close_tag_match;			/**< State for matching close
						 * tags */

	struct {
		uint32_t count;			/**< Index into "DOCTYPE" */
	} match_doctype;			/**< State for matching doctype */

	struct {
		uint32_t count;			/**< Index into "[CDATA[" */
		uint32_t end;			/**< Index into "]]>" */
	} match_cdata;				/**< State for matching cdata */

	struct {
		size_t offset;			/**< Offset in buffer */
		uint32_t length;		/**< Length of entity */
		uint32_t codepoint;		/**< UCS4 codepoint */
		bool complete;			/**< True if match complete */

		uint32_t poss_length;		/**< Optimistic length
						 * when matching named
						 * character references */
		uint8_t base;			/**< Base for numeric
						 * entities */
		int32_t context;		/**< Context for named
						 * entity search */
		size_t prev_len;		/**< Previous byte length
						 * of str */
		bool had_data;			/**< Whether we read
						 * anything after &#(x)? */
		bool overflow;			/**< Whether this entity has
						 * overflowed the maximum
						 * numeric entity value */
		hubbub_tokeniser_state return_state;	/**< State we were
							 * called from */
	} match_entity;				/**< Entity matching state */

	struct {
		uint32_t line;			/**< Current line of input */
		uint32_t col;			/**< Current character in
						 * line */
	} position;				/**< Position in source data */

	uint32_t allowed_char;			/**< Used for quote matching */

} hubbub_tokeniser_context;

/**
 * Tokeniser data structure
 */
struct hubbub_tokeniser {
	hubbub_tokeniser_state state;	/**< Current tokeniser state */
	hubbub_content_model content_model;	/**< Current content
						 * model flag */
	bool escape_flag;		/**< Escape flag */
	bool process_cdata_section;	/**< Whether to process CDATA sections */
	bool paused;			/**< Whether parsing is currently paused */

	parserutils_inputstream *input;	/**< Input stream */
	parserutils_buffer *buffer;	/**< Input buffer */
	parserutils_buffer *insert_buf; /**< Stream insertion buffer */

	hubbub_tokeniser_context context;	/**< Tokeniser context */

	hubbub_token_handler token_handler;	/**< Token handling callback */
	void *token_pw;				/**< Token handler data */

	hubbub_error_handler error_handler;	/**< Error handling callback */
	void *error_pw;				/**< Error handler data */

	hubbub_allocator_fn alloc;	/**< Memory (de)allocation function */
	void *alloc_pw;			/**< Client private data */
};

static hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_close_tag_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_tag_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_comment(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_name(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_public(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_before_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_after_doctype_system(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_bogus_doctype(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_match_cdata(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_cdata_block(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_consume_character_reference(
		hubbub_tokeniser *tokeniser, size_t off);
static hubbub_error hubbub_tokeniser_handle_numbered_entity(
		hubbub_tokeniser *tokeniser);
static hubbub_error hubbub_tokeniser_handle_named_entity(
		hubbub_tokeniser *tokeniser);

static inline hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
		const hubbub_string *chars);
static inline hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser);
static inline hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
		bool force_quirks);
static hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
		hubbub_token *token);

/**
 * Create a hubbub tokeniser
 *
 * \param input      Input stream instance
 * \param alloc      Memory (de)allocation function
 * \param pw         Pointer to client-specific private data (may be NULL)
 * \param tokeniser  Pointer to location to receive tokeniser instance
 * \return HUBBUB_OK on success,
 *         HUBBUB_BADPARM on bad parameters,
 *         HUBBUB_NOMEM on memory exhaustion
 */
hubbub_error hubbub_tokeniser_create(parserutils_inputstream *input,
		hubbub_allocator_fn alloc, void *pw,
		hubbub_tokeniser **tokeniser)
{
	parserutils_error perror;
	hubbub_tokeniser *tok;

	if (input == NULL || alloc == NULL || tokeniser == NULL)
		return HUBBUB_BADPARM;

	tok = alloc(NULL, sizeof(hubbub_tokeniser), pw);
	if (tok == NULL)
		return HUBBUB_NOMEM;

	perror = parserutils_buffer_create(alloc, pw, &tok->buffer);
	if (perror != PARSERUTILS_OK) {
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	perror = parserutils_buffer_create(alloc, pw, &tok->insert_buf);
	if (perror != PARSERUTILS_OK) {
		parserutils_buffer_destroy(tok->buffer);
		alloc(tok, 0, pw);
		return hubbub_error_from_parserutils_error(perror);
	}

	tok->state = STATE_DATA;
	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;

	tok->escape_flag = false;
	tok->process_cdata_section = false;

	tok->paused = false;

	tok->input = input;

	tok->token_handler = NULL;
	tok->token_pw = NULL;

	tok->error_handler = NULL;
	tok->error_pw = NULL;

	tok->alloc = alloc;
	tok->alloc_pw = pw;

	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));

	*tokeniser = tok;

	return HUBBUB_OK;
}
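
/*
 * Example usage (a sketch only: "stream", my_alloc and my_token_cb are
 * hypothetical client-supplied names, and error checking is elided):
 *
 *	hubbub_tokeniser *tok;
 *	hubbub_tokeniser_optparams params;
 *
 *	hubbub_tokeniser_create(stream, my_alloc, NULL, &tok);
 *
 *	params.token_handler.handler = my_token_cb;
 *	params.token_handler.pw = NULL;
 *	hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
 *			&params);
 *
 *	hubbub_tokeniser_run(tok);
 *	hubbub_tokeniser_destroy(tok);
 */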

/**
 * Destroy a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to destroy
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
{
	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->context.current_tag.attributes != NULL) {
		tokeniser->alloc(tokeniser->context.current_tag.attributes,
				0, tokeniser->alloc_pw);
	}

	parserutils_buffer_destroy(tokeniser->insert_buf);

	parserutils_buffer_destroy(tokeniser->buffer);

	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);

	return HUBBUB_OK;
}

/**
 * Configure a hubbub tokeniser
 *
 * \param tokeniser  The tokeniser instance to configure
 * \param type       The option type to set
 * \param params     Option-specific parameters
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
		hubbub_tokeniser_opttype type,
		hubbub_tokeniser_optparams *params)
{
	hubbub_error err = HUBBUB_OK;

	if (tokeniser == NULL || params == NULL)
		return HUBBUB_BADPARM;

	switch (type) {
	case HUBBUB_TOKENISER_TOKEN_HANDLER:
		tokeniser->token_handler = params->token_handler.handler;
		tokeniser->token_pw = params->token_handler.pw;
		break;
	case HUBBUB_TOKENISER_ERROR_HANDLER:
		tokeniser->error_handler = params->error_handler.handler;
		tokeniser->error_pw = params->error_handler.pw;
		break;
	case HUBBUB_TOKENISER_CONTENT_MODEL:
		tokeniser->content_model = params->content_model.model;
		break;
	case HUBBUB_TOKENISER_PROCESS_CDATA:
		tokeniser->process_cdata_section = params->process_cdata;
		break;
	case HUBBUB_TOKENISER_PAUSE:
		if (params->pause_parse == true) {
			tokeniser->paused = true;
		} else {
			if (tokeniser->paused == true) {
				tokeniser->paused = false;
				err = hubbub_tokeniser_run(tokeniser);
			}
		}
	}

	return err;
}

/**
 * Insert a chunk of data into the input stream.
 *
 * Inserts the given data into the input stream ready for parsing but
 * does not cause any additional processing of the input.
 *
 * \param tokeniser  Tokeniser instance
 * \param data       Data to insert (UTF-8 encoded)
 * \param len        Length, in bytes, of data
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_insert_chunk(hubbub_tokeniser *tokeniser,
		const uint8_t *data, size_t len)
{
	parserutils_error perror;

	if (tokeniser == NULL || data == NULL)
		return HUBBUB_BADPARM;

	perror = parserutils_buffer_append(tokeniser->insert_buf, data, len);
	if (perror != PARSERUTILS_OK)
		return hubbub_error_from_parserutils_error(perror);

	return HUBBUB_OK;
}

/**
 * Process remaining data in the input stream
 *
 * \param tokeniser  The tokeniser instance to invoke
 * \return HUBBUB_OK on success, appropriate error otherwise
 */
hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
{
	hubbub_error cont = HUBBUB_OK;

	if (tokeniser == NULL)
		return HUBBUB_BADPARM;

	if (tokeniser->paused == true)
		return HUBBUB_PAUSED;

#if 0
#define state(x) \
		case x: \
			printf( #x "\n");
#else
#define state(x) \
		case x:
#endif

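	/* (Flip the "#if 0" above to "#if 1" to make every state(x) case
	 * print its name, tracing the dispatch loop below.) */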
	while (cont == HUBBUB_OK) {
		switch (tokeniser->state) {
		state(STATE_DATA)
			cont = hubbub_tokeniser_handle_data(tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_DATA)
			cont = hubbub_tokeniser_handle_character_reference_data(
					tokeniser);
			break;
		state(STATE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_tag_open(tokeniser);
			break;
		state(STATE_CLOSE_TAG_OPEN)
			cont = hubbub_tokeniser_handle_close_tag_open(
					tokeniser);
			break;
		state(STATE_TAG_NAME)
			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_before_attribute_name(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_attribute_name(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_NAME)
			cont = hubbub_tokeniser_handle_after_attribute_name(
					tokeniser);
			break;
		state(STATE_BEFORE_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_before_attribute_value(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_DQ)
			cont = hubbub_tokeniser_handle_attribute_value_dq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_SQ)
			cont = hubbub_tokeniser_handle_attribute_value_sq(
					tokeniser);
			break;
		state(STATE_ATTRIBUTE_VALUE_UQ)
			cont = hubbub_tokeniser_handle_attribute_value_uq(
					tokeniser);
			break;
		state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE)
			cont = hubbub_tokeniser_handle_character_reference_in_attribute_value(
					tokeniser);
			break;
		state(STATE_AFTER_ATTRIBUTE_VALUE_Q)
			cont = hubbub_tokeniser_handle_after_attribute_value_q(
					tokeniser);
			break;
		state(STATE_SELF_CLOSING_START_TAG)
			cont = hubbub_tokeniser_handle_self_closing_start_tag(
					tokeniser);
			break;
		state(STATE_BOGUS_COMMENT)
			cont = hubbub_tokeniser_handle_bogus_comment(
					tokeniser);
			break;
		state(STATE_MARKUP_DECLARATION_OPEN)
			cont = hubbub_tokeniser_handle_markup_declaration_open(
					tokeniser);
			break;
		state(STATE_MATCH_COMMENT)
			cont = hubbub_tokeniser_handle_match_comment(
					tokeniser);
			break;
		case STATE_COMMENT_START:
		case STATE_COMMENT_START_DASH:
		case STATE_COMMENT:
		case STATE_COMMENT_END_DASH:
		case STATE_COMMENT_END:
			cont = hubbub_tokeniser_handle_comment(tokeniser);
			break;
		state(STATE_MATCH_DOCTYPE)
			cont = hubbub_tokeniser_handle_match_doctype(
					tokeniser);
			break;
		state(STATE_DOCTYPE)
			cont = hubbub_tokeniser_handle_doctype(tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_before_doctype_name(
					tokeniser);
			break;
		state(STATE_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_doctype_name(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_NAME)
			cont = hubbub_tokeniser_handle_after_doctype_name(
					tokeniser);
			break;

		state(STATE_MATCH_PUBLIC)
			cont = hubbub_tokeniser_handle_match_public(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_before_doctype_public(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_DQ)
			cont = hubbub_tokeniser_handle_doctype_public_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_PUBLIC_SQ)
			cont = hubbub_tokeniser_handle_doctype_public_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_PUBLIC)
			cont = hubbub_tokeniser_handle_after_doctype_public(
					tokeniser);
			break;
		state(STATE_MATCH_SYSTEM)
			cont = hubbub_tokeniser_handle_match_system(
					tokeniser);
			break;
		state(STATE_BEFORE_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_before_doctype_system(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_DQ)
			cont = hubbub_tokeniser_handle_doctype_system_dq(
					tokeniser);
			break;
		state(STATE_DOCTYPE_SYSTEM_SQ)
			cont = hubbub_tokeniser_handle_doctype_system_sq(
					tokeniser);
			break;
		state(STATE_AFTER_DOCTYPE_SYSTEM)
			cont = hubbub_tokeniser_handle_after_doctype_system(
					tokeniser);
			break;
		state(STATE_BOGUS_DOCTYPE)
			cont = hubbub_tokeniser_handle_bogus_doctype(
					tokeniser);
			break;
		state(STATE_MATCH_CDATA)
			cont = hubbub_tokeniser_handle_match_cdata(
					tokeniser);
			break;
		state(STATE_CDATA_BLOCK)
			cont = hubbub_tokeniser_handle_cdata_block(
					tokeniser);
			break;
		state(STATE_NUMBERED_ENTITY)
			cont = hubbub_tokeniser_handle_numbered_entity(
					tokeniser);
			break;
		state(STATE_NAMED_ENTITY)
			cont = hubbub_tokeniser_handle_named_entity(
					tokeniser);
			break;
		}
	}

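	/* A handler returns HUBBUB_NEEDDATA when the input stream runs
	 * dry (see e.g. the EOF path in hubbub_tokeniser_handle_data);
	 * from the caller's point of view that is a normal exit, so it
	 * is mapped back to HUBBUB_OK here. */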
	return (cont == HUBBUB_NEEDDATA) ? HUBBUB_OK : cont;
}


/**
 * Various macros for manipulating buffers.
 *
 * \todo make some of these inline functions (type-safety)
 * \todo document them properly here
 */

#define START_BUF(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len = (length); \
	} while (0)

#define COLLECT(str, cptr, length) \
	do { \
		parserutils_error perror; \
		assert(str.len != 0); \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

#define COLLECT_MS(str, cptr, length) \
	do { \
		parserutils_error perror; \
		perror = parserutils_buffer_append(tokeniser->buffer, \
				(uint8_t *) (cptr), (length)); \
		if (perror != PARSERUTILS_OK) \
			return hubbub_error_from_parserutils_error(perror); \
		(str).len += (length); \
	} while (0)

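/*
 * In brief (pending the \todo above):
 *
 *   START_BUF(str, cptr, length)  -- append length bytes at cptr to
 *       tokeniser->buffer and start str, setting str.len = length.
 *   COLLECT(str, cptr, length)    -- append to an already-started str
 *       (asserts str.len != 0), growing str.len by length.
 *   COLLECT_MS(str, cptr, length) -- as COLLECT, but without the
 *       non-empty assertion, for strings that may start empty.
 *
 * All three return from the enclosing function on allocation failure,
 * so they may only be used where that is safe.
 */
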

/* this should always be called with an empty "chars" buffer */
hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
{
	parserutils_error error;
	hubbub_token token;
	const uint8_t *cptr;
	size_t len;

	while ((error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len)) ==
					PARSERUTILS_OK) {
		const uint8_t c = *cptr;

		if (c == '&' &&
				(tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA ||
				tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) &&
				tokeniser->escape_flag == false) {
			tokeniser->state =
					STATE_CHARACTER_REFERENCE_DATA;
			/* Don't eat the '&'; it'll be handled by entity
			 * consumption */
			break;
		} else if (c == '-' &&
				tokeniser->escape_flag == false &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
				tokeniser->context.pending >= 3) {
			size_t ignore;
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 3,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *)cptr,
					"<!--", SLEN("<!--")) == 0) {
				tokeniser->escape_flag = true;
			}

			tokeniser->context.pending += len;
		} else if (c == '<' && (tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_PCDATA ||
					((tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
					tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA) &&
					tokeniser->escape_flag == false))) {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Buffer '<' */
			tokeniser->context.pending = len;

			tokeniser->state = STATE_TAG_OPEN;
			break;
		} else if (c == '>' && tokeniser->escape_flag == true &&
				(tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_RCDATA ||
				tokeniser->content_model ==
						HUBBUB_CONTENT_MODEL_CDATA)) {
			size_t ignore;

			/* Stop escaping if this '>' ends a "-->" */
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending - 2,
					&cptr,
					&ignore);

			assert(error == PARSERUTILS_OK);

			if (strncmp((char *)cptr,
					"-->", SLEN("-->")) == 0) {
				tokeniser->escape_flag = false;
			}

			tokeniser->context.pending += len;
		} else if (c == '\0') {
			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			/* Emit a replacement character */
			emit_character_token(tokeniser, &u_fffd_str);

			/* Advance past NUL */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else if (c == '\r') {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending + len,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				break;
			}

			if (tokeniser->context.pending > 0) {
				/* Emit any pending characters */
				emit_current_chars(tokeniser);
			}

			if (error == PARSERUTILS_EOF || *cptr != '\n') {
				/* Emit newline */
				emit_character_token(tokeniser, &lf_str);
			}

			/* Advance over */
			parserutils_inputstream_advance(tokeniser->input, 1);
		} else {
			/* Just collect into buffer */
			tokeniser->context.pending += len;
		}
	}

	if (tokeniser->state != STATE_TAG_OPEN &&
		(tokeniser->state != STATE_DATA || error == PARSERUTILS_EOF) &&
			tokeniser->context.pending > 0) {
		/* Emit any pending characters */
		emit_current_chars(tokeniser);
	}

	if (error == PARSERUTILS_EOF) {
		token.type = HUBBUB_TOKEN_EOF;
		hubbub_tokeniser_emit_token(tokeniser, &token);
	}

	if (error == PARSERUTILS_EOF) {
		return HUBBUB_NEEDDATA;
	} else {
		return hubbub_error_from_parserutils_error(error);
	}
}
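
/* Note on line endings: the data state above emits a single LF for both
 * CR and CRLF, so downstream consumers only ever see '\n'; the
 * attribute-value and bogus-comment states below normalise the same way. */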

/* emit any pending tokens before calling */
hubbub_error hubbub_tokeniser_handle_character_reference_data(
		hubbub_tokeniser *tokeniser)
{
	assert(tokeniser->context.pending == 0);

	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_token token;

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		token.type = HUBBUB_TOKEN_CHARACTER;

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			token.data.character.ptr = utf8;
			token.data.character.len = sizeof(utf8) - len;

			hubbub_tokeniser_emit_token(tokeniser, &token);

			/* +1 for ampersand */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.match_entity.length
							+ 1);
		} else {
			parserutils_error error;
			const uint8_t *cptr = NULL;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			token.data.character.ptr = cptr;
			token.data.character.len = len;

			hubbub_tokeniser_emit_token(tokeniser, &token);
			parserutils_inputstream_advance(tokeniser->input, len);
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		tokeniser->state = STATE_DATA;
	}

	return HUBBUB_OK;
}

/* this state always switches to another state straight away */
/* this state expects the current character to be '<' */
hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 1);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '/') {
		tokeniser->context.pending += len;

		tokeniser->context.close_tag_match.match = false;
		tokeniser->context.close_tag_match.count = 0;

		tokeniser->state = STATE_CLOSE_TAG_OPEN;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		/* Return to data state with '<' still in "chars" */
		tokeniser->state = STATE_DATA;
	} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) {
		if (c == '!') {
			parserutils_inputstream_advance(tokeniser->input,
					SLEN("<!"));

			tokeniser->context.pending = 0;
			tokeniser->state = STATE_MARKUP_DECLARATION_OPEN;
		} else if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);

			START_BUF(ctag->name, &lc, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(ctag->name, cptr, len);
			ctag->n_attributes = 0;
			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_START_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/** \todo parse error */

			tokeniser->context.pending += len;
			tokeniser->state = STATE_DATA;
		} else if (c == '?') {
			/** \todo parse error */

			/* Cursor still at "<", need to advance past it */
			parserutils_inputstream_advance(
					tokeniser->input, SLEN("<"));
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		} else {
			/* Return to data state with '<' still in "chars" */
			tokeniser->state = STATE_DATA;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.chars to be "</" */
/* this state never stays in this state for more than one character */
hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
{
	hubbub_tokeniser_context *ctx = &tokeniser->context;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending == 2);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
/*	assert(tokeniser->context.chars.ptr[1] == '/'); */

	/**\todo fragment case */

	if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
			tokeniser->content_model ==
					HUBBUB_CONTENT_MODEL_CDATA) {
		uint8_t *start_tag_name =
			tokeniser->context.last_start_tag_name;
		size_t start_tag_len =
			tokeniser->context.last_start_tag_len;

		while ((error = parserutils_inputstream_peek(tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len)) == PARSERUTILS_OK) {
			c = *cptr;

			if ((start_tag_name[ctx->close_tag_match.count] & ~0x20)
					!= (c & ~0x20)) {
				break;
			}

			ctx->close_tag_match.count += len;

			if (ctx->close_tag_match.count == start_tag_len) {
				ctx->close_tag_match.match = true;
				break;
			}
		}

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		}

		if (ctx->close_tag_match.match == true) {
			error = parserutils_inputstream_peek(
					tokeniser->input,
					ctx->pending +
						ctx->close_tag_match.count,
					&cptr,
					&len);

			if (error != PARSERUTILS_OK &&
					error != PARSERUTILS_EOF) {
				return hubbub_error_from_parserutils_error(
						error);
			} else if (error != PARSERUTILS_EOF) {
				c = *cptr;

				if (c != '\t' && c != '\n' && c != '\f' &&
						c != ' ' && c != '>' &&
						c != '/') {
					ctx->close_tag_match.match = false;
				}
			}
		}
	}

	if (ctx->close_tag_match.match == false &&
			tokeniser->content_model !=
					HUBBUB_CONTENT_MODEL_PCDATA) {
		/* We should emit "</" here, but instead we leave it in the
		 * buffer so the data state emits it with any characters
		 * following it */
		tokeniser->state = STATE_DATA;
	} else {
		error = parserutils_inputstream_peek(tokeniser->input,
				tokeniser->context.pending, &cptr, &len);

		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */

			/* Return to data state with "</" still in "chars" */
			tokeniser->state = STATE_DATA;
			return HUBBUB_OK;
		} else if (error != PARSERUTILS_OK) {
			return hubbub_error_from_parserutils_error(error);
		}

		c = *cptr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(tokeniser->context.current_tag.name,
					&lc, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if ('a' <= c && c <= 'z') {
			START_BUF(tokeniser->context.current_tag.name,
					cptr, len);
			tokeniser->context.current_tag.n_attributes = 0;

			tokeniser->context.current_tag_type =
					HUBBUB_TOKEN_END_TAG;

			tokeniser->context.pending += len;

			tokeniser->state = STATE_TAG_NAME;
		} else if (c == '>') {
			/* Cursor still at "</", need to collect '>' */
			tokeniser->context.pending += len;

			/* Now need to advance past "</>" */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			/** \todo parse error */
			tokeniser->state = STATE_DATA;
		} else {
			/** \todo parse error */

			/* Cursor still at "</", need to advance past it */
			parserutils_inputstream_advance(tokeniser->input,
					tokeniser->context.pending);
			tokeniser->context.pending = 0;

			tokeniser->state = STATE_BOGUS_COMMENT;
		}
	}

	return HUBBUB_OK;
}

/* this state expects tokeniser->context.current_tag to already have its
   first character set */
hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(tokeniser->context.pending > 0);
/*	assert(tokeniser->context.chars.ptr[0] == '<'); */
	assert(ctag->name.len > 0);
/*	assert(ctag->name.ptr); */

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->name, &lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->name, cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_before_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		/* pass over in silence */
		tokeniser->context.pending += len;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0);

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if ('A' <= c && c <= 'Z') {
		uint8_t lc = (c + 0x20);
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				&lc, len);
		tokeniser->context.pending += len;
	} else {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].name,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_after_attribute_name(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '=') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		hubbub_attribute *attr;

		if (c == '"' || c == '\'') {
			/** \todo parse error */
		}

		attr = tokeniser->alloc(ctag->attributes,
				(ctag->n_attributes + 1) *
					sizeof(hubbub_attribute),
				tokeniser->alloc_pw);
		if (attr == NULL)
			return HUBBUB_NOMEM;

		ctag->attributes = attr;

		if ('A' <= c && c <= 'Z') {
			uint8_t lc = (c + 0x20);
			START_BUF(attr[ctag->n_attributes].name, &lc, len);
		} else if (c == '\0') {
			START_BUF(attr[ctag->n_attributes].name,
					u_fffd, sizeof(u_fffd));
		} else {
			START_BUF(attr[ctag->n_attributes].name, cptr, len);
		}

		attr[ctag->n_attributes].ns = HUBBUB_NS_NULL;
		attr[ctag->n_attributes].value.ptr = NULL;
		attr[ctag->n_attributes].value.len = 0;

		ctag->n_attributes++;

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

/* this state is only ever triggered by an '=' */
hubbub_error hubbub_tokeniser_handle_before_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			/** \todo parse error */
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
	} else if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ;
	} else if (c == '&') {
		/* Don't consume the '&' -- reprocess in UQ state */
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ;
	} else if (c == '>') {
		/** \todo parse error */
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	} else {
		if (c == '=') {
			/** \todo parse error */
		}

		START_BUF(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);

		tokeniser->context.pending += len;
		tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_dq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '"') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '"';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume '\r' */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_sq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\'') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		tokeniser->context.allowed_char = '\'';
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '\0') {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else if (c == '\r') {
		error = parserutils_inputstream_peek(
				tokeniser->input,
				tokeniser->context.pending + len,
				&cptr,
				&len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
			return hubbub_error_from_parserutils_error(error);
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
			COLLECT_MS(ctag->attributes[
					ctag->n_attributes - 1].value,
					&lf, sizeof(lf));
		}

		/* Consume \r */
		tokeniser->context.pending += 1;
	} else {
		COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_attribute_value_uq(
		hubbub_tokeniser *tokeniser)
{
	hubbub_tag *ctag = &tokeniser->context.current_tag;
	uint8_t c;

	size_t len;
	const uint8_t *cptr;
	parserutils_error error;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	assert(c == '&' ||
		ctag->attributes[ctag->n_attributes - 1].value.len >= 1);

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '&') {
		tokeniser->context.prev_state = tokeniser->state;
		tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
		/* Don't eat the '&'; it'll be handled by entity consumption */
	} else if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '\0') {
		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				u_fffd, sizeof(u_fffd));
		tokeniser->context.pending += len;
	} else {
		if (c == '"' || c == '\'' || c == '=') {
			/** \todo parse error */
		}

		COLLECT(ctag->attributes[ctag->n_attributes - 1].value,
				cptr, len);
		tokeniser->context.pending += len;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_character_reference_in_attribute_value(
		hubbub_tokeniser *tokeniser)
{
	if (tokeniser->context.match_entity.complete == false) {
		return hubbub_tokeniser_consume_character_reference(tokeniser,
				tokeniser->context.pending);
	} else {
		hubbub_tag *ctag = &tokeniser->context.current_tag;
		hubbub_attribute *attr = &ctag->attributes[
				ctag->n_attributes - 1];

		uint8_t utf8[6];
		uint8_t *utf8ptr = utf8;
		size_t len = sizeof(utf8);

		if (tokeniser->context.match_entity.codepoint) {
			parserutils_charset_utf8_from_ucs4(
				tokeniser->context.match_entity.codepoint,
				&utf8ptr, &len);

			COLLECT_MS(attr->value, utf8, sizeof(utf8) - len);

			/* +1 for the ampersand */
			tokeniser->context.pending +=
					tokeniser->context.match_entity.length
					+ 1;
		} else {
			size_t len = 0;
			const uint8_t *cptr = NULL;
			parserutils_error error;

			error = parserutils_inputstream_peek(
					tokeniser->input,
					tokeniser->context.pending,
					&cptr,
					&len);
			if (error != PARSERUTILS_OK) {
				return hubbub_error_from_parserutils_error(
						error);
			}

			/* Insert the ampersand */
			COLLECT_MS(attr->value, cptr, len);
			tokeniser->context.pending += len;
		}

		/* Reset for next time */
		tokeniser->context.match_entity.complete = false;

		/* And back to the previous state */
		tokeniser->state = tokeniser->context.prev_state;
	}

	return HUBBUB_OK;
}

/* always switches state */
hubbub_error hubbub_tokeniser_handle_after_attribute_value_q(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	} else if (c == '>') {
		tokeniser->context.pending += len;

		tokeniser->state = STATE_DATA;
		return emit_current_tag(tokeniser);
	} else if (c == '/') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_SELF_CLOSING_START_TAG;
	} else {
		/** \todo parse error */
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}

hubbub_error hubbub_tokeniser_handle_self_closing_start_tag(
		hubbub_tokeniser *tokeniser)
{
	size_t len;
	const uint8_t *cptr;
	parserutils_error error;
	uint8_t c;

	error = parserutils_inputstream_peek(tokeniser->input,
			tokeniser->context.pending, &cptr, &len);

	if (error != PARSERUTILS_OK) {
		if (error == PARSERUTILS_EOF) {
			tokeniser->state = STATE_DATA;
			return emit_current_tag(tokeniser);
		} else {
			return hubbub_error_from_parserutils_error(error);
		}
	}

	c = *cptr;

	if (c == '>') {
		tokeniser->context.pending += len;
		tokeniser->state = STATE_DATA;

		tokeniser->context.current_tag.self_closing = true;
		return emit_current_tag(tokeniser);
	} else {
		/* Reprocess character in before attribute name state */
		tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
	}

	return HUBBUB_OK;
}
1755
 
1756
/* this state expects tokeniser->context.chars to be empty on first entry */
1757
hubbub_error hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
1758
{
1759
	size_t len;
1760
	const uint8_t *cptr;
1761
	parserutils_error error;
1762
	uint8_t c;
1763
 
1764
	error = parserutils_inputstream_peek(tokeniser->input,
1765
			tokeniser->context.pending, &cptr, &len);
1766
 
1767
	if (error != PARSERUTILS_OK) {
1768
		if (error == PARSERUTILS_EOF) {
1769
			tokeniser->state = STATE_DATA;
1770
			return emit_current_comment(tokeniser);
1771
		} else {
1772
			return hubbub_error_from_parserutils_error(error);
1773
		}
1774
	}
1775
 
1776
	c = *cptr;
1777
 
1778
	if (c == '>') {
1779
		tokeniser->context.pending += len;
1780
		tokeniser->state = STATE_DATA;
1781
		return emit_current_comment(tokeniser);
1782
	} else if (c == '\0') {
1783
		error = parserutils_buffer_append(tokeniser->buffer,
1784
				u_fffd, sizeof(u_fffd));
1785
		if (error != PARSERUTILS_OK)
1786
			return hubbub_error_from_parserutils_error(error);
1787
 
1788
		tokeniser->context.pending += len;
1789
	} else if (c == '\r') {
1790
		error = parserutils_inputstream_peek(
1791
				tokeniser->input,
1792
				tokeniser->context.pending,
1793
				&cptr,
1794
				&len);
1795
 
1796
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
1797
			return hubbub_error_from_parserutils_error(error);
1798
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
1799
			error = parserutils_buffer_append(tokeniser->buffer,
1800
					&lf, sizeof(lf));
1801
			if (error != PARSERUTILS_OK) {
1802
				return hubbub_error_from_parserutils_error(
1803
						error);
1804
			}
1805
		}
1806
		/* Collect '\r' */
		tokeniser->context.pending += 1;
1807
	} else {
1808
		error = parserutils_buffer_append(tokeniser->buffer,
1809
				(uint8_t *) cptr, len);
1810
		if (error != PARSERUTILS_OK)
1811
			return hubbub_error_from_parserutils_error(error);
1812
 
1813
		tokeniser->context.pending += len;
1814
	}
1815
 
1816
	return HUBBUB_OK;
1817
}
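
/*
 * A self-contained sketch of the rule the '\r' branch above implements (an
 * assumption about intent, not code from this file): HTML newline
 * normalisation, where "\r\n" and a bare "\r" each become one "\n".
 */
#if 0
/* Write one normalised byte to *out; return bytes consumed from s. */
static size_t sketch_normalise_newline(const uint8_t *s, size_t avail,
		uint8_t *out)
{
	if (s[0] == '\r') {
		*out = '\n';
		/* swallow a following '\n' so "\r\n" maps to a single "\n" */
		return (avail > 1 && s[1] == '\n') ? 2 : 1;
	}

	*out = s[0];
	return 1;
}
#endif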
1818
 
1819
/* this state always switches to another state straight away */
1820
hubbub_error hubbub_tokeniser_handle_markup_declaration_open(
1821
		hubbub_tokeniser *tokeniser)
1822
{
1823
	size_t len;
1824
	const uint8_t *cptr;
1825
	parserutils_error error;
1826
	uint8_t c;
1827
 
1828
	assert(tokeniser->context.pending == 0);
1829
 
1830
	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
1831
 
1832
	if (error != PARSERUTILS_OK) {
1833
		if (error == PARSERUTILS_EOF) {
1834
			tokeniser->state = STATE_BOGUS_COMMENT;
1835
			return HUBBUB_OK;
1836
		} else {
1837
			return hubbub_error_from_parserutils_error(error);
1838
		}
1839
	}
1840
 
1841
	c = *cptr;
1842
 
1843
	if (c == '-') {
1844
		tokeniser->context.pending = len;
1845
		tokeniser->state = STATE_MATCH_COMMENT;
1846
	} else if ((c & ~0x20) == 'D') {
1847
		tokeniser->context.pending = len;
1848
		tokeniser->context.match_doctype.count = len;
1849
		tokeniser->state = STATE_MATCH_DOCTYPE;
1850
	} else if (tokeniser->process_cdata_section == true && c == '[') {
1851
		tokeniser->context.pending = len;
1852
		tokeniser->context.match_cdata.count = len;
1853
		tokeniser->state = STATE_MATCH_CDATA;
1854
	} else {
1855
		tokeniser->state = STATE_BOGUS_COMMENT;
1856
	}
1857
 
1858
	return HUBBUB_OK;
1859
}
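
/*
 * Note on the dispatch above: after "<!" the next state is selected as
 *
 *   "<!--"        -> STATE_MATCH_COMMENT, then the comment states
 *   "<!doctype"   -> STATE_MATCH_DOCTYPE (matched case-insensitively)
 *   "<![CDATA["   -> STATE_MATCH_CDATA, only if process_cdata_section is set
 *   anything else -> STATE_BOGUS_COMMENT
 */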
1860
 
1861
 
1862
hubbub_error hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser)
1863
{
1864
	size_t len;
1865
	const uint8_t *cptr;
1866
	parserutils_error error;
1867
 
1868
	error = parserutils_inputstream_peek(tokeniser->input,
1869
			tokeniser->context.pending, &cptr, &len);
1870
 
1871
	if (error != PARSERUTILS_OK) {
1872
		if (error == PARSERUTILS_EOF) {
1873
			tokeniser->context.pending =
1874
				tokeniser->context.current_comment.len = 0;
1875
			tokeniser->state = STATE_BOGUS_COMMENT;
1876
			return HUBBUB_OK;
1877
		} else {
1878
			return hubbub_error_from_parserutils_error(error);
1879
		}
1880
	}
1881
 
1882
	tokeniser->context.pending = tokeniser->context.current_comment.len = 0;
1883
 
1884
	if (*cptr == '-') {
1885
		parserutils_inputstream_advance(tokeniser->input, SLEN("--"));
1886
		tokeniser->state = STATE_COMMENT_START;
1887
	} else {
1888
		tokeniser->state = STATE_BOGUS_COMMENT;
1889
	}
1890
 
1891
	return HUBBUB_OK;
1892
}
1893
 
1894
 
1895
hubbub_error hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
1896
{
1897
	size_t len;
1898
	const uint8_t *cptr;
1899
	parserutils_error error;
1900
	uint8_t c;
1901
 
1902
	error = parserutils_inputstream_peek(tokeniser->input,
1903
			tokeniser->context.pending, &cptr, &len);
1904
 
1905
	if (error != PARSERUTILS_OK) {
1906
		if (error == PARSERUTILS_EOF) {
1907
			tokeniser->state = STATE_DATA;
1908
			return emit_current_comment(tokeniser);
1909
		} else {
1910
			return hubbub_error_from_parserutils_error(error);
1911
		}
1912
	}
1913
 
1914
	c = *cptr;
1915
 
1916
	if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH ||
1917
			tokeniser->state == STATE_COMMENT_START ||
1918
			tokeniser->state == STATE_COMMENT_END)) {
1919
		tokeniser->context.pending += len;
1920
 
1921
		/** \todo parse error if state != COMMENT_END */
1922
		tokeniser->state = STATE_DATA;
1923
		return emit_current_comment(tokeniser);
1924
	} else if (c == '-') {
1925
		if (tokeniser->state == STATE_COMMENT_START) {
1926
			tokeniser->state = STATE_COMMENT_START_DASH;
1927
		} else if (tokeniser->state == STATE_COMMENT_START_DASH) {
1928
			tokeniser->state = STATE_COMMENT_END;
1929
		} else if (tokeniser->state == STATE_COMMENT) {
1930
			tokeniser->state = STATE_COMMENT_END_DASH;
1931
		} else if (tokeniser->state == STATE_COMMENT_END_DASH) {
1932
			tokeniser->state = STATE_COMMENT_END;
1933
		} else if (tokeniser->state == STATE_COMMENT_END) {
1934
			error = parserutils_buffer_append(tokeniser->buffer,
1935
					(uint8_t *) "-", SLEN("-"));
1936
			if (error != PARSERUTILS_OK) {
1937
				return hubbub_error_from_parserutils_error(
1938
						error);
1939
			}
1940
		}
1941
 
1942
		tokeniser->context.pending += len;
1943
	} else {
1944
		if (tokeniser->state == STATE_COMMENT_START_DASH ||
1945
				tokeniser->state == STATE_COMMENT_END_DASH) {
1946
			error = parserutils_buffer_append(tokeniser->buffer,
1947
					(uint8_t *) "-", SLEN("-"));
1948
			if (error != PARSERUTILS_OK) {
1949
				return hubbub_error_from_parserutils_error(
1950
						error);
1951
			}
1952
		} else if (tokeniser->state == STATE_COMMENT_END) {
1953
			error = parserutils_buffer_append(tokeniser->buffer,
1954
					(uint8_t *) "--", SLEN("--"));
1955
			if (error != PARSERUTILS_OK) {
1956
				return hubbub_error_from_parserutils_error(
1957
						error);
1958
			}
1959
		}
1960
 
1961
		if (c == '\0') {
1962
			error = parserutils_buffer_append(tokeniser->buffer,
1963
					u_fffd, sizeof(u_fffd));
1964
			if (error != PARSERUTILS_OK) {
1965
				return hubbub_error_from_parserutils_error(
1966
						error);
1967
			}
1968
		} else if (c == '\r') {
1969
			size_t next_len;
1970
			error = parserutils_inputstream_peek(
1971
					tokeniser->input,
1972
					tokeniser->context.pending + len,
1973
					&cptr,
1974
					&next_len);
1975
			if (error != PARSERUTILS_OK &&
1976
					error != PARSERUTILS_EOF) {
1977
				return hubbub_error_from_parserutils_error(
1978
						error);
1979
			} else if (error == PARSERUTILS_EOF ||
					*cptr != '\n') {
1980
				error = parserutils_buffer_append(
1981
						tokeniser->buffer,
1982
						&lf, sizeof(lf));
1983
				if (error != PARSERUTILS_OK) {
1984
					return hubbub_error_from_parserutils_error(
1985
							error);
1986
				}
1987
			}
1988
		} else {
1989
			error = parserutils_buffer_append(tokeniser->buffer,
1990
					cptr, len);
1991
			if (error != PARSERUTILS_OK) {
1992
				return hubbub_error_from_parserutils_error(
1993
						error);
1994
			}
1995
		}
1996
 
1997
		tokeniser->context.pending += len;
1998
		tokeniser->state = STATE_COMMENT;
1999
	}
2000
 
2001
	return HUBBUB_OK;
2002
}
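
/*
 * Note: one handler serves five comment states, and '-' bytes are deferred
 * rather than buffered immediately so that a closing "-->" is never copied
 * into the comment text.  Tracing "<!--a--b-->": 'a' is buffered; "--"
 * moves through COMMENT_END_DASH to COMMENT_END; 'b' then flushes the
 * withheld "--" followed by 'b'; the final "-->" emits the token.  The
 * resulting comment text is "a--b".
 */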
2003
 
2004
 
2005
 
2006
 
2007
#define DOCTYPE		"DOCTYPE"
2008
#define DOCTYPE_LEN	(SLEN(DOCTYPE) - 1)
2009
 
2010
hubbub_error hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
2011
{
2012
	size_t len;
2013
	const uint8_t *cptr;
2014
	parserutils_error error;
2015
	uint8_t c;
2016
 
2017
	error = parserutils_inputstream_peek(tokeniser->input,
2018
			tokeniser->context.match_doctype.count, &cptr, &len);
2019
 
2020
	if (error != PARSERUTILS_OK) {
2021
		if (error == PARSERUTILS_EOF) {
2022
			tokeniser->context.current_comment.len =
2023
					tokeniser->context.pending = 0;
2024
			tokeniser->state = STATE_BOGUS_COMMENT;
2025
			return HUBBUB_OK;
2026
		} else {
2027
			return hubbub_error_from_parserutils_error(error);
2028
		}
2029
	}
2030
 
2031
	c = *cptr;
2032
 
2033
	assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN);
2034
 
2035
	if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2036
		tokeniser->context.current_comment.len =
2037
				tokeniser->context.pending = 0;
2038
		tokeniser->state = STATE_BOGUS_COMMENT;
2039
		return HUBBUB_OK;
2040
	}
2041
 
2042
	tokeniser->context.pending += len;
2043
 
2044
	if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) {
2045
		/* Skip over the DOCTYPE bit */
2046
		parserutils_inputstream_advance(tokeniser->input,
2047
				tokeniser->context.pending);
2048
 
2049
		memset(&tokeniser->context.current_doctype, 0,
2050
				sizeof tokeniser->context.current_doctype);
2051
		tokeniser->context.current_doctype.public_missing = true;
2052
		tokeniser->context.current_doctype.system_missing = true;
2053
		tokeniser->context.pending = 0;
2054
 
2055
		tokeniser->state = STATE_DOCTYPE;
2056
	}
2057
 
2058
	tokeniser->context.match_doctype.count++;
2059
 
2060
	return HUBBUB_OK;
2061
}
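
/*
 * Illustrative sketch: "(c & ~0x20)" clears the ASCII case bit, so 'd'
 * (0x64) and 'D' (0x44) both compare equal to 'D'.  The keyword matchers
 * above all rely on this.  Stated as a tiny helper (name is hypothetical):
 */
#if 0
/* Valid for ASCII letters, which is all the keyword strings contain */
static bool sketch_matches_upper(uint8_t input_char, char upper)
{
	return (uint8_t) (input_char & ~0x20) == (uint8_t) upper;
}
#endif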
2062
 
2063
#undef DOCTYPE
2064
#undef DOCTYPE_LEN
2065
 
2066
hubbub_error hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
2067
{
2068
	size_t len;
2069
	const uint8_t *cptr;
2070
	parserutils_error error;
2071
	uint8_t c;
2072
 
2073
	error = parserutils_inputstream_peek(tokeniser->input,
2074
			tokeniser->context.pending, &cptr, &len);
2075
 
2076
	if (error != PARSERUTILS_OK) {
2077
		if (error == PARSERUTILS_EOF) {
2078
			tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2079
			return HUBBUB_OK;
2080
		} else {
2081
			return hubbub_error_from_parserutils_error(error);
2082
		}
2083
	}
2084
 
2085
	c = *cptr;
2086
 
2087
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2088
		tokeniser->context.pending += len;
2089
	}
2090
 
2091
	tokeniser->state = STATE_BEFORE_DOCTYPE_NAME;
2092
 
2093
	return HUBBUB_OK;
2094
}
2095
 
2096
hubbub_error hubbub_tokeniser_handle_before_doctype_name(
2097
		hubbub_tokeniser *tokeniser)
2098
{
2099
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2100
	size_t len;
2101
	const uint8_t *cptr;
2102
	parserutils_error error;
2103
	uint8_t c;
2104
 
2105
	error = parserutils_inputstream_peek(tokeniser->input,
2106
			tokeniser->context.pending, &cptr, &len);
2107
 
2108
	if (error != PARSERUTILS_OK) {
2109
		if (error == PARSERUTILS_EOF) {
2110
			/** \todo parse error */
2111
			/* Emit current doctype, force-quirks on */
2112
			tokeniser->state = STATE_DATA;
2113
			return emit_current_doctype(tokeniser, true);
2114
		} else {
2115
			return hubbub_error_from_parserutils_error(error);
2116
		}
2117
	}
2118
 
2119
	c = *cptr;
2120
 
2121
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2122
		/* pass over in silence */
2123
		tokeniser->context.pending += len;
2124
	} else if (c == '>') {
2125
		/** \todo parse error */
2126
		tokeniser->context.pending += len;
2127
		tokeniser->state = STATE_DATA;
2128
		return emit_current_doctype(tokeniser, true);
2129
	} else {
2130
		if (c == '\0') {
2131
			START_BUF(cdoc->name, u_fffd, sizeof(u_fffd));
2132
		} else if ('A' <= c && c <= 'Z') {
2133
			uint8_t lc = c + 0x20;
2134
 
2135
			START_BUF(cdoc->name, &lc, len);
2136
		} else {
2137
			START_BUF(cdoc->name, cptr, len);
2138
		}
2139
 
2140
		tokeniser->context.pending += len;
2141
		tokeniser->state = STATE_DOCTYPE_NAME;
2142
	}
2143
 
2144
	return HUBBUB_OK;
2145
}
2146
 
2147
hubbub_error hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
2148
{
2149
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2150
	size_t len;
2151
	const uint8_t *cptr;
2152
	parserutils_error error;
2153
	uint8_t c;
2154
 
2155
	error = parserutils_inputstream_peek(tokeniser->input,
2156
			tokeniser->context.pending, &cptr, &len);
2157
 
2158
	if (error != PARSERUTILS_OK) {
2159
		if (error == PARSERUTILS_EOF) {
2160
			tokeniser->state = STATE_DATA;
2161
			return emit_current_doctype(tokeniser, true);
2162
		} else {
2163
			return hubbub_error_from_parserutils_error(error);
2164
		}
2165
	}
2166
 
2167
	c = *cptr;
2168
 
2169
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2170
		tokeniser->context.pending += len;
2171
		tokeniser->state = STATE_AFTER_DOCTYPE_NAME;
2172
	} else if (c == '>') {
2173
		tokeniser->context.pending += len;
2174
		tokeniser->state = STATE_DATA;
2175
		return emit_current_doctype(tokeniser, false);
2176
	} else if (c == '\0') {
2177
		COLLECT(cdoc->name, u_fffd, sizeof(u_fffd));
2178
		tokeniser->context.pending += len;
2179
	} else if ('A' <= c && c <= 'Z') {
2180
		uint8_t lc = c + 0x20;
2181
		COLLECT(cdoc->name, &lc, len);
2182
		tokeniser->context.pending += len;
2183
	} else {
2184
		COLLECT(cdoc->name, cptr, len);
2185
		tokeniser->context.pending += len;
2186
	}
2187
 
2188
	return HUBBUB_OK;
2189
}
2190
 
2191
hubbub_error hubbub_tokeniser_handle_after_doctype_name(
2192
		hubbub_tokeniser *tokeniser)
2193
{
2194
	size_t len;
2195
	const uint8_t *cptr;
2196
	parserutils_error error;
2197
	uint8_t c;
2198
 
2199
	error = parserutils_inputstream_peek(tokeniser->input,
2200
			tokeniser->context.pending, &cptr, &len);
2201
 
2202
	if (error != PARSERUTILS_OK) {
2203
		if (error == PARSERUTILS_EOF) {
2204
			tokeniser->state = STATE_DATA;
2205
			return emit_current_doctype(tokeniser, true);
2206
		} else {
2207
			return hubbub_error_from_parserutils_error(error);
2208
		}
2209
	}
2210
 
2211
	c = *cptr;
2212
	tokeniser->context.pending += len;
2213
 
2214
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2215
		/* pass over in silence */
2216
	} else if (c == '>') {
2217
		tokeniser->state = STATE_DATA;
2218
		return emit_current_doctype(tokeniser, false);
2219
	} else if ((c & ~0x20) == 'P') {
2220
		tokeniser->context.match_doctype.count = 1;
2221
		tokeniser->state = STATE_MATCH_PUBLIC;
2222
	} else if ((c & ~0x20) == 'S') {
2223
		tokeniser->context.match_doctype.count = 1;
2224
		tokeniser->state = STATE_MATCH_SYSTEM;
2225
	} else {
2226
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2227
		tokeniser->context.current_doctype.force_quirks = true;
2228
	}
2229
 
2230
	return HUBBUB_OK;
2231
}
2232
 
2233
#define PUBLIC		"PUBLIC"
2234
#define PUBLIC_LEN	(SLEN(PUBLIC) - 1)
2235
 
2236
hubbub_error hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser)
2237
{
2238
	size_t len;
2239
	const uint8_t *cptr;
2240
	parserutils_error error;
2241
	uint8_t c;
2242
 
2243
	error = parserutils_inputstream_peek(tokeniser->input,
2244
			tokeniser->context.pending, &cptr, &len);
2245
 
2246
	if (error != PARSERUTILS_OK) {
2247
		if (error == PARSERUTILS_EOF) {
2248
			tokeniser->context.current_doctype.force_quirks = true;
2249
			tokeniser->state = STATE_BOGUS_DOCTYPE;
2250
			return HUBBUB_OK;
2251
		} else {
2252
			return hubbub_error_from_parserutils_error(error);
2253
		}
2254
	}
2255
 
2256
	c = *cptr;
2257
 
2258
	assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN);
2259
 
2260
	if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2261
		tokeniser->context.current_doctype.force_quirks = true;
2262
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2263
		return HUBBUB_OK;
2264
	}
2265
 
2266
	tokeniser->context.pending += len;
2267
 
2268
	if (tokeniser->context.match_doctype.count == PUBLIC_LEN) {
2269
		tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC;
2270
	}
2271
 
2272
	tokeniser->context.match_doctype.count++;
2273
 
2274
	return HUBBUB_OK;
2275
}
2276
 
2277
#undef PUBLIC
2278
#undef PUBLIC_LEN
2279
 
2280
hubbub_error hubbub_tokeniser_handle_before_doctype_public(
2281
		hubbub_tokeniser *tokeniser)
2282
{
2283
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2284
	size_t len;
2285
	const uint8_t *cptr;
2286
	parserutils_error error;
2287
	uint8_t c;
2288
 
2289
	error = parserutils_inputstream_peek(tokeniser->input,
2290
			tokeniser->context.pending, &cptr, &len);
2291
 
2292
	if (error != PARSERUTILS_OK) {
2293
		if (error == PARSERUTILS_EOF) {
2294
			tokeniser->state = STATE_DATA;
2295
			return emit_current_doctype(tokeniser, true);
2296
		} else {
2297
			return hubbub_error_from_parserutils_error(error);
2298
		}
2299
	}
2300
 
2301
	c = *cptr;
2302
	tokeniser->context.pending += len;
2303
 
2304
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2305
		/* pass over in silence */
2306
	} else if (c == '"') {
2307
		cdoc->public_missing = false;
2308
		cdoc->public_id.len = 0;
2309
		tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ;
2310
	} else if (c == '\'') {
2311
		cdoc->public_missing = false;
2312
		cdoc->public_id.len = 0;
2313
		tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ;
2314
	} else if (c == '>') {
2315
		tokeniser->state = STATE_DATA;
2316
		return emit_current_doctype(tokeniser, true);
2317
	} else {
2318
		cdoc->force_quirks = true;
2319
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2320
	}
2321
 
2322
	return HUBBUB_OK;
2323
}
2324
 
2325
hubbub_error hubbub_tokeniser_handle_doctype_public_dq(
2326
		hubbub_tokeniser *tokeniser)
2327
{
2328
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2329
	size_t len;
2330
	const uint8_t *cptr;
2331
	parserutils_error error;
2332
	uint8_t c;
2333
 
2334
	error = parserutils_inputstream_peek(tokeniser->input,
2335
			tokeniser->context.pending, &cptr, &len);
2336
 
2337
	if (error != PARSERUTILS_OK) {
2338
		if (error == PARSERUTILS_EOF) {
2339
			tokeniser->state = STATE_DATA;
2340
			return emit_current_doctype(tokeniser, true);
2341
		} else {
2342
			return hubbub_error_from_parserutils_error(error);
2343
		}
2344
	}
2345
 
2346
	c = *cptr;
2347
 
2348
	if (c == '"') {
2349
		tokeniser->context.pending += len;
2350
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2351
	} else if (c == '>') {
2352
		tokeniser->context.pending += len;
2353
		tokeniser->state = STATE_DATA;
2354
		return emit_current_doctype(tokeniser, true);
2355
	} else if (c == '\0') {
2356
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2357
		tokeniser->context.pending += len;
2358
	} else if (c == '\r') {
2359
		error = parserutils_inputstream_peek(
2360
				tokeniser->input,
2361
				tokeniser->context.pending + len,
2362
				&cptr,
2363
				&len);
2364
 
2365
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2366
			return hubbub_error_from_parserutils_error(error);
2367
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2368
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2369
		}
2370
 
2371
		/* Collect '\r' */
2372
		tokeniser->context.pending += 1;
2373
	} else {
2374
		COLLECT_MS(cdoc->public_id, cptr, len);
2375
 
2376
		tokeniser->context.pending += len;
2377
	}
2378
 
2379
	return HUBBUB_OK;
2380
}
2381
 
2382
hubbub_error hubbub_tokeniser_handle_doctype_public_sq(
2383
		hubbub_tokeniser *tokeniser)
2384
{
2385
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2386
	size_t len;
2387
	const uint8_t *cptr;
2388
	parserutils_error error;
2389
	uint8_t c;
2390
 
2391
	error = parserutils_inputstream_peek(tokeniser->input,
2392
			tokeniser->context.pending, &cptr, &len);
2393
 
2394
	if (error != PARSERUTILS_OK) {
2395
		if (error == PARSERUTILS_EOF) {
2396
			tokeniser->state = STATE_DATA;
2397
			return emit_current_doctype(tokeniser, true);
2398
		} else {
2399
			return hubbub_error_from_parserutils_error(error);
2400
		}
2401
	}
2402
 
2403
	c = *cptr;
2404
 
2405
	if (c == '\'') {
2406
		tokeniser->context.pending += len;
2407
		tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC;
2408
	} else if (c == '>') {
2409
		tokeniser->context.pending += len;
2410
		tokeniser->state = STATE_DATA;
2411
		return emit_current_doctype(tokeniser, true);
2412
	} else if (c == '\0') {
2413
		COLLECT_MS(cdoc->public_id, u_fffd, sizeof(u_fffd));
2414
		tokeniser->context.pending += len;
2415
	} else if (c == '\r') {
2416
		error = parserutils_inputstream_peek(
2417
				tokeniser->input,
2418
				tokeniser->context.pending + len,
2419
				&cptr,
2420
				&len);
2421
 
2422
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2423
			return hubbub_error_from_parserutils_error(error);
2424
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2425
			COLLECT_MS(cdoc->public_id, &lf, sizeof(lf));
2426
		}
2427
 
2428
		/* Collect '\r' */
2429
		tokeniser->context.pending += 1;
2430
	} else {
2431
		COLLECT_MS(cdoc->public_id, cptr, len);
2432
		tokeniser->context.pending += len;
2433
	}
2434
 
2435
	return HUBBUB_OK;
2436
}
2437
 
2438
 
2439
hubbub_error hubbub_tokeniser_handle_after_doctype_public(
2440
		hubbub_tokeniser *tokeniser)
2441
{
2442
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2443
	size_t len;
2444
	const uint8_t *cptr;
2445
	parserutils_error error;
2446
	uint8_t c;
2447
 
2448
	error = parserutils_inputstream_peek(tokeniser->input,
2449
			tokeniser->context.pending, &cptr, &len);
2450
 
2451
	if (error != PARSERUTILS_OK) {
2452
		if (error == PARSERUTILS_EOF) {
2453
			tokeniser->state = STATE_DATA;
2454
			return emit_current_doctype(tokeniser, true);
2455
		} else {
2456
			return hubbub_error_from_parserutils_error(error);
2457
		}
2458
	}
2459
 
2460
	c = *cptr;
2461
	tokeniser->context.pending += len;
2462
 
2463
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2464
		/* pass over in silence */
2465
	} else if (c == '"') {
2466
		cdoc->system_missing = false;
2467
		cdoc->system_id.len = 0;
2468
 
2469
		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2470
	} else if (c == '\'') {
2471
		cdoc->system_missing = false;
2472
		cdoc->system_id.len = 0;
2473
 
2474
		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2475
	} else if (c == '>') {
2476
		tokeniser->state = STATE_DATA;
2477
		return emit_current_doctype(tokeniser, false);
2478
	} else {
2479
		cdoc->force_quirks = true;
2480
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2481
	}
2482
 
2483
	return HUBBUB_OK;
2484
}
2485
 
2486
 
2487
 
2488
#define SYSTEM		"SYSTEM"
2489
#define SYSTEM_LEN	(SLEN(SYSTEM) - 1)
2490
 
2491
hubbub_error hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser)
2492
{
2493
	size_t len;
2494
	const uint8_t *cptr;
2495
	parserutils_error error;
2496
	uint8_t c;
2497
 
2498
	error = parserutils_inputstream_peek(tokeniser->input,
2499
			tokeniser->context.pending, &cptr, &len);
2500
 
2501
	if (error != PARSERUTILS_OK) {
2502
		if (error == PARSERUTILS_EOF) {
2503
			tokeniser->context.current_doctype.force_quirks = true;
2504
			tokeniser->state = STATE_BOGUS_DOCTYPE;
2505
			return HUBBUB_OK;
2506
		} else {
2507
			return hubbub_error_from_parserutils_error(error);
2508
		}
2509
	}
2510
 
2511
	c = *cptr;
2512
 
2513
	assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN);
2514
 
2515
	if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) {
2516
		tokeniser->context.current_doctype.force_quirks = true;
2517
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2518
		return HUBBUB_OK;
2519
	}
2520
 
2521
	tokeniser->context.pending += len;
2522
 
2523
	if (tokeniser->context.match_doctype.count == SYSTEM_LEN) {
2524
		tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM;
2525
	}
2526
 
2527
	tokeniser->context.match_doctype.count++;
2528
 
2529
	return HUBBUB_OK;
2530
}
2531
 
2532
#undef SYSTEM
2533
#undef SYSTEM_LEN
2534
 
2535
hubbub_error hubbub_tokeniser_handle_before_doctype_system(
2536
		hubbub_tokeniser *tokeniser)
2537
{
2538
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2539
	size_t len;
2540
	const uint8_t *cptr;
2541
	parserutils_error error;
2542
	uint8_t c;
2543
 
2544
	error = parserutils_inputstream_peek(tokeniser->input,
2545
			tokeniser->context.pending, &cptr, &len);
2546
 
2547
	if (error != PARSERUTILS_OK) {
2548
		if (error == PARSERUTILS_EOF) {
2549
			tokeniser->state = STATE_DATA;
2550
			return emit_current_doctype(tokeniser, true);
2551
		} else {
2552
			return hubbub_error_from_parserutils_error(error);
2553
		}
2554
	}
2555
 
2556
	c = *cptr;
2557
	tokeniser->context.pending += len;
2558
 
2559
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2560
		/* pass over */
2561
	} else if (c == '"') {
2562
		cdoc->system_missing = false;
2563
		cdoc->system_id.len = 0;
2564
 
2565
		tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ;
2566
	} else if (c == '\'') {
2567
		cdoc->system_missing = false;
2568
		cdoc->system_id.len = 0;
2569
 
2570
		tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ;
2571
	} else if (c == '>') {
2572
		tokeniser->state = STATE_DATA;
2573
		return emit_current_doctype(tokeniser, true);
2574
	} else {
2575
		cdoc->force_quirks = true;
2576
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2577
	}
2578
 
2579
	return HUBBUB_OK;
2580
}
2581
 
2582
hubbub_error hubbub_tokeniser_handle_doctype_system_dq(
2583
		hubbub_tokeniser *tokeniser)
2584
{
2585
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2586
	size_t len;
2587
	const uint8_t *cptr;
2588
	parserutils_error error;
2589
	uint8_t c;
2590
 
2591
	error = parserutils_inputstream_peek(tokeniser->input,
2592
			tokeniser->context.pending, &cptr, &len);
2593
 
2594
	if (error != PARSERUTILS_OK) {
2595
		if (error == PARSERUTILS_EOF) {
2596
			tokeniser->state = STATE_DATA;
2597
			return emit_current_doctype(tokeniser, true);
2598
		} else {
2599
			return hubbub_error_from_parserutils_error(error);
2600
		}
2601
	}
2602
 
2603
	c = *cptr;
2604
 
2605
	if (c == '"') {
2606
		tokeniser->context.pending += len;
2607
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2608
	} else if (c == '>') {
2609
		tokeniser->context.pending += len;
2610
		tokeniser->state = STATE_DATA;
2611
		return emit_current_doctype(tokeniser, true);
2612
	} else if (c == '\0') {
2613
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2614
		tokeniser->context.pending += len;
2615
	} else if (c == '\r') {
2616
		error = parserutils_inputstream_peek(
2617
				tokeniser->input,
2618
				tokeniser->context.pending + len,
2619
				&cptr,
2620
				&len);
2621
 
2622
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2623
			return hubbub_error_from_parserutils_error(error);
2624
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2625
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2626
		}
2627
 
2628
		/* Collect '\r' */
2629
		tokeniser->context.pending += 1;
2630
	} else {
2631
		COLLECT_MS(cdoc->system_id, cptr, len);
2632
		tokeniser->context.pending += len;
2633
	}
2634
 
2635
	return HUBBUB_OK;
2636
}
2637
 
2638
hubbub_error hubbub_tokeniser_handle_doctype_system_sq(
2639
		hubbub_tokeniser *tokeniser)
2640
{
2641
	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
2642
	size_t len;
2643
	const uint8_t *cptr;
2644
	parserutils_error error;
2645
	uint8_t c;
2646
 
2647
	error = parserutils_inputstream_peek(tokeniser->input,
2648
			tokeniser->context.pending, &cptr, &len);
2649
 
2650
	if (error != PARSERUTILS_OK) {
2651
		if (error == PARSERUTILS_EOF) {
2652
			tokeniser->state = STATE_DATA;
2653
			return emit_current_doctype(tokeniser, true);
2654
		} else {
2655
			return hubbub_error_from_parserutils_error(error);
2656
		}
2657
	}
2658
 
2659
	c = *cptr;
2660
 
2661
	if (c == '\'') {
2662
		tokeniser->context.pending += len;
2663
		tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM;
2664
	} else if (c == '>') {
2665
		tokeniser->context.pending += len;
2666
		tokeniser->state = STATE_DATA;
2667
		return emit_current_doctype(tokeniser, true);
2668
	} else if (c == '\0') {
2669
		COLLECT_MS(cdoc->system_id, u_fffd, sizeof(u_fffd));
2670
		tokeniser->context.pending += len;
2671
	} else if (c == '\r') {
2672
		error = parserutils_inputstream_peek(
2673
				tokeniser->input,
2674
				tokeniser->context.pending + len,
2675
				&cptr,
2676
				&len);
2677
 
2678
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2679
			return hubbub_error_from_parserutils_error(error);
2680
		} else if (error == PARSERUTILS_EOF || *cptr != '\n') {
2681
			COLLECT_MS(cdoc->system_id, &lf, sizeof(lf));
2682
		}
2683
 
2684
		/* Collect '\r' */
2685
		tokeniser->context.pending += 1;
2686
	} else {
2687
		COLLECT_MS(cdoc->system_id, cptr, len);
2688
		tokeniser->context.pending += len;
2689
	}
2690
 
2691
	return HUBBUB_OK;
2692
}
2693
 
2694
hubbub_error hubbub_tokeniser_handle_after_doctype_system(
2695
		hubbub_tokeniser *tokeniser)
2696
{
2697
	size_t len;
2698
	const uint8_t *cptr;
2699
	parserutils_error error;
2700
	uint8_t c;
2701
 
2702
	error = parserutils_inputstream_peek(tokeniser->input,
2703
			tokeniser->context.pending, &cptr, &len);
2704
 
2705
	if (error != PARSERUTILS_OK) {
2706
		if (error == PARSERUTILS_EOF) {
2707
			tokeniser->state = STATE_DATA;
2708
			return emit_current_doctype(tokeniser, true);
2709
		} else {
2710
			return hubbub_error_from_parserutils_error(error);
2711
		}
2712
	}
2713
 
2714
	c = *cptr;
2715
	tokeniser->context.pending += len;
2716
 
2717
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
2718
		/* pass over in silence */
2719
	} else if (c == '>') {
2720
		tokeniser->state = STATE_DATA;
2721
		return emit_current_doctype(tokeniser, false);
2722
	} else {
2723
		tokeniser->state = STATE_BOGUS_DOCTYPE;
2724
	}
2725
 
2726
	return HUBBUB_OK;
2727
}
2728
 
2729
 
2730
hubbub_error hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
2731
{
2732
	size_t len;
2733
	const uint8_t *cptr;
2734
	parserutils_error error;
2735
	uint8_t c;
2736
 
2737
	error = parserutils_inputstream_peek(tokeniser->input,
2738
			tokeniser->context.pending, &cptr, &len);
2739
 
2740
	if (error != PARSERUTILS_OK) {
2741
		if (error == PARSERUTILS_EOF) {
2742
			tokeniser->state = STATE_DATA;
2743
			return emit_current_doctype(tokeniser, false);
2744
		} else {
2745
			return hubbub_error_from_parserutils_error(error);
2746
		}
2747
	}
2748
 
2749
	c = *cptr;
2750
	tokeniser->context.pending += len;
2751
 
2752
	if (c == '>') {
2753
		tokeniser->state = STATE_DATA;
2754
		return emit_current_doctype(tokeniser, false);
2755
	}
2756
 
2757
	return HUBBUB_OK;
2758
}
2759
 
2760
 
2761
 
2762
#define CDATA		"[CDATA["
2763
#define CDATA_LEN	(SLEN(CDATA) - 1)
2764
 
2765
hubbub_error hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser)
2766
{
2767
	size_t len;
2768
	const uint8_t *cptr;
2769
	parserutils_error error;
2770
	uint8_t c;
2771
 
2772
	error = parserutils_inputstream_peek(tokeniser->input,
2773
			tokeniser->context.pending, &cptr, &len);
2774
 
2775
	if (error != PARSERUTILS_OK) {
2776
		if (error == PARSERUTILS_EOF) {
2777
			tokeniser->context.current_comment.len =
2778
					tokeniser->context.pending = 0;
2779
			tokeniser->state = STATE_BOGUS_COMMENT;
2780
			return HUBBUB_OK;
2781
		} else {
2782
			return hubbub_error_from_parserutils_error(error);
2783
		}
2784
	}
2785
 
2786
	c = *cptr;
2787
 
2788
	assert(tokeniser->context.match_cdata.count <= CDATA_LEN);
2789
 
2790
	if (CDATA[tokeniser->context.match_cdata.count] != (c & ~0x20)) {
2791
		tokeniser->context.current_comment.len =
2792
				tokeniser->context.pending = 0;
2794
		tokeniser->state = STATE_BOGUS_COMMENT;
2795
		return HUBBUB_OK;
2796
	}
2797
 
2798
	tokeniser->context.pending += len;
2799
 
2800
	if (tokeniser->context.match_cdata.count == CDATA_LEN) {
2801
		parserutils_inputstream_advance(tokeniser->input,
2802
				tokeniser->context.match_cdata.count + len);
2803
		tokeniser->context.pending = 0;
2804
		tokeniser->context.match_cdata.end = 0;
2805
		tokeniser->state = STATE_CDATA_BLOCK;
2806
	}
2807
 
2808
	tokeniser->context.match_cdata.count += len;
2809
 
2810
	return HUBBUB_OK;
2811
}
2812
 
2813
#undef CDATA
2814
#undef CDATA_LEN
2815
 
2816
 
2817
hubbub_error hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser)
2818
{
2819
	size_t len;
2820
	const uint8_t *cptr;
2821
	parserutils_error error;
2822
	uint8_t c;
2823
 
2824
	error = parserutils_inputstream_peek(tokeniser->input,
2825
			tokeniser->context.pending, &cptr, &len);
2826
 
2827
	if (error != PARSERUTILS_OK) {
2828
		if (error == PARSERUTILS_EOF) {
2829
			tokeniser->state = STATE_DATA;
2830
			return emit_current_chars(tokeniser);
2831
		} else {
2832
			return hubbub_error_from_parserutils_error(error);
2833
		}
2834
	}
2835
 
2836
	c = *cptr;
2837
 
2838
	if (c == ']' && (tokeniser->context.match_cdata.end == 0 ||
2839
			tokeniser->context.match_cdata.end == 1)) {
2840
		tokeniser->context.pending += len;
2841
		tokeniser->context.match_cdata.end += len;
2842
	} else if (c == '>' && tokeniser->context.match_cdata.end == 2) {
2843
		/* Remove the previous two ']' characters */
2844
		tokeniser->context.pending -= 2;
2845
 
2846
		/* Emit any pending characters */
2847
		emit_current_chars(tokeniser);
2848
 
2849
		/* Now move past the "]]>" bit */
2850
		parserutils_inputstream_advance(tokeniser->input, SLEN("]]>"));
2851
 
2852
		tokeniser->state = STATE_DATA;
2853
	} else if (c == '\0') {
2854
		if (tokeniser->context.pending > 0) {
2855
			/* Emit any pending characters */
2856
			emit_current_chars(tokeniser);
2857
		}
2858
 
2859
		/* Perform NUL-byte replacement */
2860
		emit_character_token(tokeniser, &u_fffd_str);
2861
 
2862
		parserutils_inputstream_advance(tokeniser->input, len);
2863
		tokeniser->context.match_cdata.end = 0;
2864
	} else if (c == '\r') {
2865
		error = parserutils_inputstream_peek(
2866
				tokeniser->input,
2867
				tokeniser->context.pending + len,
2868
				&cptr,
2869
				&len);
2870
 
2871
		if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2872
			return hubbub_error_from_parserutils_error(error);
2873
		}
2874
 
2875
		if (tokeniser->context.pending > 0) {
2876
			/* Emit any pending characters */
2877
			emit_current_chars(tokeniser);
2878
		}
2879
 
2880
		if (error == PARSERUTILS_EOF || *cptr != '\n') {
2881
			/* Emit newline */
2882
			emit_character_token(tokeniser, &lf_str);
2883
		}
2884
 
2885
		/* Advance over \r */
2886
		parserutils_inputstream_advance(tokeniser->input, 1);
2887
		tokeniser->context.match_cdata.end = 0;
2888
	} else {
2889
		tokeniser->context.pending += len;
2890
		tokeniser->context.match_cdata.end = 0;
2891
	}
2892
 
2893
	return HUBBUB_OK;
2894
}
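
/*
 * Note: with process_cdata_section enabled, "<![CDATA[x]]>" is emitted as
 * plain character tokens for "x".  The match_cdata.end counter holds back
 * up to two ']' bytes: they are only treated as markup when a '>' follows
 * (pending is wound back by 2 above); otherwise they fall through to the
 * final branch and are emitted as ordinary characters.
 */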
2895
 
2896
 
2897
hubbub_error hubbub_tokeniser_consume_character_reference(
2898
		hubbub_tokeniser *tokeniser, size_t pos)
2899
{
2900
	uint32_t allowed_char = tokeniser->context.allowed_char;
2901
 
2902
	size_t len;
2903
	const uint8_t *cptr;
2904
	parserutils_error error;
2905
	uint8_t c;
2906
	size_t off;
2907
 
2908
	error = parserutils_inputstream_peek(tokeniser->input, pos,
2909
			&cptr, &len);
2910
 
2911
	/* We should always start on an ampersand */
2912
	assert(error == PARSERUTILS_OK);
2913
	assert(len == 1 && *cptr == '&');
2914
 
2915
	off = pos + len;
2916
 
2917
	/* Look at the character after the ampersand */
2918
	error = parserutils_inputstream_peek(tokeniser->input, off,
2919
			&cptr, &len);
2920
 
2921
	if (error != PARSERUTILS_OK) {
2922
		if (error == PARSERUTILS_EOF) {
2923
			tokeniser->context.match_entity.complete = true;
2924
			tokeniser->context.match_entity.codepoint = 0;
2925
			return HUBBUB_OK;
2926
		} else {
2927
			return hubbub_error_from_parserutils_error(error);
2928
		}
2929
	}
2930
 
2931
	c = *cptr;
2932
 
2933
	/* Set things up */
2934
	tokeniser->context.match_entity.offset = off;
2935
	tokeniser->context.match_entity.poss_length = 0;
2936
	tokeniser->context.match_entity.length = 0;
2937
	tokeniser->context.match_entity.base = 0;
2938
	tokeniser->context.match_entity.codepoint = 0;
2939
	tokeniser->context.match_entity.had_data = false;
2940
	tokeniser->context.match_entity.return_state = tokeniser->state;
2941
	tokeniser->context.match_entity.complete = false;
2942
	tokeniser->context.match_entity.overflow = false;
2943
	tokeniser->context.match_entity.context = -1;
2944
	tokeniser->context.match_entity.prev_len = len;
2945
 
2946
	/* Reset allowed character for future calls */
2947
	tokeniser->context.allowed_char = '\0';
2948
 
2949
	if (c == '\t' || c == '\n' || c == '\f' || c == ' ' ||
2950
			c == '<' || c == '&' ||
2951
			(allowed_char && c == allowed_char)) {
2952
		tokeniser->context.match_entity.complete = true;
2953
		tokeniser->context.match_entity.codepoint = 0;
2954
	} else if (c == '#') {
2955
		tokeniser->context.match_entity.length += len;
2956
		tokeniser->state = STATE_NUMBERED_ENTITY;
2957
	} else {
2958
		tokeniser->state = STATE_NAMED_ENTITY;
2959
	}
2960
 
2961
	return HUBBUB_OK;
2962
}
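
/*
 * Note (an assumption from the HTML5 algorithm this appears to implement):
 * allowed_char is the "additional allowed character" of the character
 * reference algorithm.  The attribute value states are expected to set it
 * (e.g. to the active quote, or '>' when unquoted), and an '&' followed by
 * it, by whitespace, '<' or another '&' is not a reference at all:
 * complete is flagged with codepoint 0 and the caller emits the ampersand
 * literally.
 */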
2963
 
2964
 
2965
hubbub_error hubbub_tokeniser_handle_numbered_entity(
2966
		hubbub_tokeniser *tokeniser)
2967
{
2968
	hubbub_tokeniser_context *ctx = &tokeniser->context;
2969
 
2970
	size_t len;
2971
	const uint8_t *cptr;
2972
	parserutils_error error;
2973
 
2974
	error = parserutils_inputstream_peek(tokeniser->input,
2975
			ctx->match_entity.offset + ctx->match_entity.length,
2976
			&cptr, &len);
2977
 
2978
	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
2979
		return hubbub_error_from_parserutils_error(error);
2980
	}
2981
 
2982
	if (error != PARSERUTILS_EOF && ctx->match_entity.base == 0) {
2983
		uint8_t c = *cptr;
2984
		if ((c & ~0x20) == 'X') {
2985
			ctx->match_entity.base = 16;
2986
			ctx->match_entity.length += len;
2987
		} else {
2988
			ctx->match_entity.base = 10;
2989
		}
2990
	}
2991
 
2992
	while ((error = parserutils_inputstream_peek(tokeniser->input,
2993
			ctx->match_entity.offset + ctx->match_entity.length,
2994
			&cptr, &len)) == PARSERUTILS_OK) {
2995
		uint8_t c = *cptr;
2996
 
2997
		if (ctx->match_entity.base == 10 &&
2998
				('0' <= c && c <= '9')) {
2999
			ctx->match_entity.had_data = true;
3000
			ctx->match_entity.codepoint =
3001
				ctx->match_entity.codepoint * 10 + (c - '0');
3002
 
3003
			ctx->match_entity.length += len;
3004
		} else if (ctx->match_entity.base == 16 &&
3005
				(('0' <= c && c <= '9') ||
3006
				('A' <= (c & ~0x20) &&
3007
						(c & ~0x20) <= 'F'))) {
3008
			ctx->match_entity.had_data = true;
3009
			ctx->match_entity.codepoint *= 16;
3010
 
3011
			if ('0' <= c && c <= '9') {
3012
				ctx->match_entity.codepoint += (c - '0');
3013
			} else {
3014
				ctx->match_entity.codepoint +=
3015
						((c & ~0x20) - 'A' + 10);
3016
			}
3017
 
3018
			ctx->match_entity.length += len;
3019
		} else {
3020
			break;
3021
		}
3022
 
3023
		if (ctx->match_entity.codepoint >= 0x10FFFF) {
3024
			ctx->match_entity.overflow = true;
3025
		}
3026
	}
3027
 
3028
	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3029
		return hubbub_error_from_parserutils_error(error);
3030
	}
3031
 
3032
	/* Eat trailing semicolon, if any */
3033
	if (error != PARSERUTILS_EOF && *cptr == ';') {
3034
		ctx->match_entity.length += len;
3035
	}
3036
 
3037
	/* Had data, so calculate final codepoint */
3038
	if (ctx->match_entity.had_data) {
3039
		uint32_t cp = ctx->match_entity.codepoint;
3040
 
3041
		if (0x80 <= cp && cp <= 0x9F) {
3042
			cp = cp1252Table[cp - 0x80];
3043
		} else if (cp == 0x0D) {
3044
			cp = 0x000A;
3045
		} else if (ctx->match_entity.overflow ||
3046
				cp <= 0x0008 || cp == 0x000B ||
3047
				(0x000E <= cp && cp <= 0x001F) ||
3048
				(0x007F <= cp && cp <= 0x009F) ||
3049
				(0xD800 <= cp && cp <= 0xDFFF) ||
3050
				(0xFDD0 <= cp && cp <= 0xFDEF) ||
3051
				(cp & 0xFFFE) == 0xFFFE) {
3052
			/* the check for cp > 0x10FFFF per spec is performed
3053
			 * in the loop above to avoid overflow */
3054
			cp = 0xFFFD;
3055
		}
3056
 
3057
		ctx->match_entity.codepoint = cp;
3058
	}
3059
 
3060
	/* Flag completion */
3061
	ctx->match_entity.complete = true;
3062
 
3063
	/* And back to the state we were entered in */
3064
	tokeniser->state = ctx->match_entity.return_state;
3065
 
3066
	return HUBBUB_OK;
3067
}
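
/*
 * Worked examples for the code above:
 *
 *   "&#65;"    -> base 10, codepoint 65 ('A')
 *   "&#x20AC;" -> base 16, codepoint 0x20AC (EURO SIGN)
 *   "&#128;"   -> 128 lies in [0x80, 0x9F], so it is remapped through
 *                 cp1252Table[0] to 0x20AC instead of a C1 control
 *   "&#13;"    -> 0x0D is rewritten to 0x0A per newline normalisation
 *   "&#xD800;" -> a surrogate, replaced with U+FFFD
 */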
3068
 
3069
hubbub_error hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
3070
{
3071
	hubbub_tokeniser_context *ctx = &tokeniser->context;
3072
 
3073
	size_t len;
3074
	const uint8_t *cptr;
3075
	parserutils_error error;
3076
 
3077
	while ((error = parserutils_inputstream_peek(tokeniser->input,
3078
			ctx->match_entity.offset +
3079
					ctx->match_entity.poss_length,
3080
			&cptr, &len)) == PARSERUTILS_OK) {
3081
		uint32_t cp;
3082
 
3083
		uint8_t c = *cptr;
3084
		hubbub_error herror;
3085
 
3086
		if (c > 0x7F) {
3087
			/* Entity names are ASCII only */
3088
			break;
3089
		}
3090
 
3091
		herror = hubbub_entities_search_step(c, &cp,
3092
				&ctx->match_entity.context);
3093
		if (herror == HUBBUB_OK) {
3094
			/* Had a match - store it for later */
3095
			ctx->match_entity.codepoint = cp;
3096
 
3097
			ctx->match_entity.length =
3098
					ctx->match_entity.poss_length + len;
3099
			ctx->match_entity.poss_length =
3100
					ctx->match_entity.length;
3101
		} else if (herror == HUBBUB_INVALID) {
3102
			/* No further matches - use last found */
3103
			break;
3104
		} else {
3105
			/* Need more data */
3106
			ctx->match_entity.poss_length += len;
3107
		}
3108
	}
3109
 
3110
	if (error != PARSERUTILS_OK && error != PARSERUTILS_EOF) {
3111
		return hubbub_error_from_parserutils_error(error);
3112
	}
3113
 
3114
	if (ctx->match_entity.length > 0) {
3115
		uint8_t c;
3116
		error = parserutils_inputstream_peek(tokeniser->input,
3117
				ctx->match_entity.offset +
3118
					ctx->match_entity.length - 1,
3119
				&cptr, &len);
3120
		/* We're re-reading a character we've already read past.
3121
		 * Therefore, there's no way that an error may occur as
3122
		 * a result. */
3123
		assert(error == PARSERUTILS_OK);
3124
 
3125
		c = *cptr;
3126
 
3127
		if ((tokeniser->context.match_entity.return_state ==
3128
				STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
3129
				c != ';') {
3130
			error = parserutils_inputstream_peek(tokeniser->input,
3131
					ctx->match_entity.offset +
3132
						ctx->match_entity.length,
3133
					&cptr, &len);
3134
			/* We must have attempted to read one more character
3135
			 * than was present in the entity name, as that is the
3136
			 * only way to break out of the loop above. If that
3137
			 * failed, then any non-EOF case will have been handled
3138
			 * by the if statement after the loop thus it cannot
3139
			 * occur here. */
3140
			assert(error == PARSERUTILS_OK ||
3141
					error == PARSERUTILS_EOF);
3142
 
3143
			if (error == PARSERUTILS_EOF) {
3144
				ctx->match_entity.codepoint = 0;
3145
			}
3146
 
3147
			c = *cptr;
3148
			if ((0x0030 <= c && c <= 0x0039) ||
3149
					(0x0041 <= c && c <= 0x005A) ||
3150
					(0x0061 <= c && c <= 0x007A)) {
3151
				ctx->match_entity.codepoint = 0;
3152
			}
3153
		}
3154
	}
3155
 
3156
	/* Flag completion */
3157
	ctx->match_entity.complete = true;
3158
 
3159
	/* And back to the state from whence we came */
3160
	tokeniser->state = ctx->match_entity.return_state;
3161
 
3162
	return HUBBUB_OK;
3163
}
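
/*
 * Note: hubbub_entities_search_step() consumes one character per call, and
 * length/poss_length implement longest-prefix matching.  For "&notin;":
 * "not" matches first (U+00AC) and is remembered; "notin;" then matches
 * (U+2209) and replaces it; the next character fails, so the last full
 * match wins.  The block above also applies the spec's attribute rule: in
 * "I'm &notit; I tell you", an attribute value keeps the text literally
 * (the codepoint is zeroed because 'i' follows a semicolon-less match),
 * while ordinary data would replace "&not" with U+00AC.
 */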
3164
 
3165
 
3166
 
3167
/*** Token emitting bits ***/
3168
 
3169
/**
3170
 * Emit a character token.
3171
 *
3172
 * \param tokeniser	Tokeniser instance
3173
 * \param chars		Pointer to hubbub_string to emit
3174
 * \return	HUBBUB_OK on success, appropriate error otherwise
3175
 */
3176
hubbub_error emit_character_token(hubbub_tokeniser *tokeniser,
3177
		const hubbub_string *chars)
3178
{
3179
	hubbub_token token;
3180
 
3181
	token.type = HUBBUB_TOKEN_CHARACTER;
3182
	token.data.character = *chars;
3183
 
3184
	return hubbub_tokeniser_emit_token(tokeniser, &token);
3185
}
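
/*
 * Illustrative sketch (hypothetical values): fixed strings are emitted
 * through this helper exactly as the CDATA and '\r' paths above do with
 * u_fffd_str and lf_str:
 */
#if 0
static const uint8_t dash = '-';
static const hubbub_string dash_str = { &dash, 1 };

/* ... emit_character_token(tokeniser, &dash_str); ... */
#endif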
3186
 
3187
/**
3188
 * Emit the current pending characters being stored in the tokeniser context.
3189
 *
3190
 * \param tokeniser	Tokeniser instance
3191
 * \return	HUBBUB_OK on success, appropriate error otherwise
3192
 */
3193
hubbub_error emit_current_chars(hubbub_tokeniser *tokeniser)
3194
{
3195
	hubbub_token token;
3196
	size_t len;
3197
	const uint8_t *cptr = NULL;
3198
	parserutils_error error;
3199
 
3200
	/* Calling this with nothing to output is a probable bug */
3201
	assert(tokeniser->context.pending > 0);
3202
 
3203
	error = parserutils_inputstream_peek(tokeniser->input, 0, &cptr, &len);
3204
	if (error != PARSERUTILS_OK)
3205
		return hubbub_error_from_parserutils_error(error);
3206
 
3207
	token.type = HUBBUB_TOKEN_CHARACTER;
3208
	token.data.character.ptr = cptr;
3209
	token.data.character.len = tokeniser->context.pending;
3210
 
3211
	return hubbub_tokeniser_emit_token(tokeniser, &token);
3212
}
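
/*
 * Note: pending counts bytes that have been scanned but not yet consumed,
 * so the run of pending characters always starts at offset 0 of the
 * inputstream; that is why the peek above uses offset 0.  The stream is
 * only advanced past them in hubbub_tokeniser_emit_token(), once the token
 * pointing into the stream has been delivered to the client.
 */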
3213
 
3214
/**
3215
 * Emit the current tag token being stored in the tokeniser context.
3216
 *
3217
 * \param tokeniser	Tokeniser instance
3218
 * \return	HUBBUB_OK on success, appropriate error otherwise
3219
 */
3220
hubbub_error emit_current_tag(hubbub_tokeniser *tokeniser)
3221
{
3222
	hubbub_error err;
3223
	hubbub_token token;
3224
	uint32_t n_attributes;
3225
	hubbub_attribute *attrs;
3226
	uint8_t *ptr;
3227
	uint32_t i, j;
3228
 
3229
	/* Emit current tag */
3230
	token.type = tokeniser->context.current_tag_type;
3231
	token.data.tag = tokeniser->context.current_tag;
3232
	token.data.tag.ns = HUBBUB_NS_HTML;
3233
 
3234
 
3235
	n_attributes = token.data.tag.n_attributes;
3236
	attrs = token.data.tag.attributes;
3237
 
3238
	/* Set pointers correctly... */
3239
	ptr = tokeniser->buffer->data;
3240
	token.data.tag.name.ptr = tokeniser->buffer->data;
3241
	ptr += token.data.tag.name.len;
3242
 
3243
	for (i = 0; i < n_attributes; i++) {
3244
		attrs[i].name.ptr = ptr;
3245
		ptr += attrs[i].name.len;
3246
		attrs[i].value.ptr = ptr;
3247
		ptr += attrs[i].value.len;
3248
	}
3249
 
3250
 
3251
	/* Discard duplicate attributes */
3252
	for (i = 0; i < n_attributes; i++) {
3253
		for (j = 0; j < n_attributes; j++) {
3254
			uint32_t move;
3255
 
3256
			if (j == i ||
3257
				attrs[i].name.len !=
3258
						attrs[j].name.len ||
3259
				strncmp((char *) attrs[i].name.ptr,
3260
					(char *) attrs[j].name.ptr,
3261
					attrs[i].name.len) != 0) {
3262
				/* Attributes don't match */
3263
				continue;
3264
			}
3265
 
3266
			assert(i < j);
3267
 
3268
			/* Calculate amount to move */
3269
			move = (n_attributes - 1 - j) *
3270
					sizeof(hubbub_attribute);
3271
 
3272
			if (move > 0) {
3273
				memmove(&attrs[j], &attrs[j + 1], move);
3274
			}
3275
 
3276
			/* We've deleted an item, so we need to
3277
			 * reprocess this index */
3278
			j--;
3279
 
3280
			/* And reduce the number of attributes */
3281
			n_attributes--;
3282
		}
3283
	}
3284
 
3285
	token.data.tag.n_attributes = n_attributes;
3286
 
3287
	err = hubbub_tokeniser_emit_token(tokeniser, &token);
3288
 
3289
	if (token.type == HUBBUB_TOKEN_START_TAG) {
3290
		/* Save start tag name for R?CDATA */
3291
		if (token.data.tag.name.len <
3292
			sizeof(tokeniser->context.last_start_tag_name)) {
3293
			strncpy((char *) tokeniser->context.last_start_tag_name,
3294
				(const char *) token.data.tag.name.ptr,
3295
				token.data.tag.name.len);
3296
			tokeniser->context.last_start_tag_len =
3297
					token.data.tag.name.len;
3298
		} else {
3299
			tokeniser->context.last_start_tag_name[0] = '\0';
3300
			tokeniser->context.last_start_tag_len = 0;
3301
		}
3302
	} else /* if (token->type == HUBBUB_TOKEN_END_TAG) */ {
3303
		/* Reset content model after R?CDATA elements */
3304
		tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
3305
	}
3306
 
3307
	/* Reset the self-closing flag */
3308
	tokeniser->context.current_tag.self_closing = false;
3309
 
3310
	return err;
3311
}
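
/*
 * Note: the de-duplication pass above keeps the first occurrence of each
 * attribute name, so '<p id="a" id="b">' carries the single attribute
 * id="a", matching HTML5's handling of the duplicate-attribute parse
 * error.  The comparison is byte-wise, which assumes attribute names were
 * already lower-cased as they were collected.
 */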
3312
 
3313
/**
3314
 * Emit the current comment token being stored in the tokeniser context.
3315
 *
3316
 * \param tokeniser	Tokeniser instance
3317
 * \return	HUBBUB_OK on success, appropriate error otherwise
3318
 */
3319
hubbub_error emit_current_comment(hubbub_tokeniser *tokeniser)
3320
{
3321
	hubbub_token token;
3322
 
3323
	token.type = HUBBUB_TOKEN_COMMENT;
3324
	token.data.comment.ptr = tokeniser->buffer->data;
3325
	token.data.comment.len = tokeniser->buffer->length;
3326
 
3327
	return hubbub_tokeniser_emit_token(tokeniser, &token);
3328
}
3329
 
3330
/**
3331
 * Emit the current doctype token being stored in the tokeniser context.
3332
 *
3333
 * \param tokeniser	Tokeniser instance
3334
 * \param force_quirks	Force quirks mode on this document
3335
 * \return	HUBBUB_OK on success, appropriate error otherwise
3336
 */
3337
hubbub_error emit_current_doctype(hubbub_tokeniser *tokeniser,
3338
		bool force_quirks)
3339
{
3340
	hubbub_token token;
3341
 
3342
	/* Emit doctype */
3343
	token.type = HUBBUB_TOKEN_DOCTYPE;
3344
	token.data.doctype = tokeniser->context.current_doctype;
3345
	if (force_quirks == true)
3346
		token.data.doctype.force_quirks = true;
3347
 
3348
	/* Set pointers correctly */
3349
	token.data.doctype.name.ptr = tokeniser->buffer->data;
3350
 
3351
	if (token.data.doctype.public_missing == false) {
3352
		token.data.doctype.public_id.ptr = tokeniser->buffer->data +
3353
				token.data.doctype.name.len;
3354
	}
3355
 
3356
	if (token.data.doctype.system_missing == false) {
3357
		token.data.doctype.system_id.ptr = tokeniser->buffer->data +
3358
				token.data.doctype.name.len +
3359
				token.data.doctype.public_id.len;
3360
	}
3361
 
3362
	return hubbub_tokeniser_emit_token(tokeniser, &token);
3363
}
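
/*
 * Note: while a doctype is collected only lengths are stored; the three
 * strings sit back-to-back in tokeniser->buffer and the pointers are
 * reconstructed here:
 *
 *   buffer->data
 *   |<- name.len ->|<- public_id.len ->|<- system_id.len ->|
 *
 * emit_current_tag() uses the same packing for the tag name and attributes.
 */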
3364
 
3365
/**
3366
 * Emit a token, performing sanity checks if necessary
3367
 *
3368
 * \param tokeniser  Tokeniser instance
3369
 * \param token      Token to emit
3370
 * \return	HUBBUB_OK on success, or the token handler's error
 */
3371
hubbub_error hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
3372
		hubbub_token *token)
3373
{
3374
	hubbub_error err = HUBBUB_OK;
3375
 
3376
	assert(tokeniser != NULL);
3377
	assert(token != NULL);
3378
	assert(tokeniser->insert_buf->length == 0);
3379
 
3380
#ifndef NDEBUG
3381
	/* Sanity checks */
3382
	switch (token->type) {
3383
	case HUBBUB_TOKEN_DOCTYPE:
3384
		assert(memchr(token->data.doctype.name.ptr, 0xff,
3385
				token->data.doctype.name.len) == NULL);
3386
		if (token->data.doctype.public_missing == false)
3387
			assert(memchr(token->data.doctype.public_id.ptr, 0xff,
3388
				token->data.doctype.public_id.len) == NULL);
3389
		if (token->data.doctype.system_missing == false)
3390
			assert(memchr(token->data.doctype.system_id.ptr, 0xff,
3391
				token->data.doctype.system_id.len) == NULL);
3392
		break;
3393
	case HUBBUB_TOKEN_START_TAG:
3394
	case HUBBUB_TOKEN_END_TAG:
3395
	{
3396
		uint32_t i;
3397
		assert(memchr(token->data.tag.name.ptr, 0xff,
3398
				token->data.tag.name.len) == NULL);
3399
		for (i = 0; i < token->data.tag.n_attributes; i++) {
3400
			hubbub_attribute *attr = &token->data.tag.attributes[i];
3401
 
3402
			assert(memchr(attr->name.ptr, 0xff, attr->name.len) ==
3403
					NULL);
3404
			assert(memchr(attr->value.ptr, 0xff, attr->value.len) ==
3405
					NULL);
3406
		}
3407
	}
3408
		break;
3409
	case HUBBUB_TOKEN_COMMENT:
3410
		assert(memchr(token->data.comment.ptr, 0xff,
3411
				token->data.comment.len) == NULL);
3412
		break;
3413
	case HUBBUB_TOKEN_CHARACTER:
3414
		assert(memchr(token->data.character.ptr, 0xff,
3415
				token->data.character.len) == NULL);
3416
		break;
3417
	case HUBBUB_TOKEN_EOF:
3418
		break;
3419
	}
3420
#endif
3421
 
3422
	/* Emit the token */
3423
	if (tokeniser->token_handler) {
3424
		err = tokeniser->token_handler(token, tokeniser->token_pw);
3425
	}
3426
 
3427
	/* Discard current buffer */
3428
	if (tokeniser->buffer->length) {
3429
		parserutils_buffer_discard(tokeniser->buffer, 0,
3430
				tokeniser->buffer->length);
3431
	}
3432
 
3433
	/* Advance the pointer */
3434
	if (tokeniser->context.pending) {
3435
		parserutils_inputstream_advance(tokeniser->input,
3436
				tokeniser->context.pending);
3437
		tokeniser->context.pending = 0;
3438
	}
3439
 
3440
	if (tokeniser->insert_buf->length > 0) {
3441
		parserutils_inputstream_insert(tokeniser->input,
3442
				tokeniser->insert_buf->data,
3443
				tokeniser->insert_buf->length);
3444
		parserutils_buffer_discard(tokeniser->insert_buf, 0,
3445
				tokeniser->insert_buf->length);
3446
	}
3447
 
3448
	/* Allow the callback to pause the tokeniser */
3449
	if (err == HUBBUB_PAUSED) {
3450
		tokeniser->paused = true;
3451
	}
3452
 
3453
	return err;
3454
}
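
/*
 * Illustrative sketch (hypothetical client code): a token handler may
 * return HUBBUB_PAUSED, which sets tokeniser->paused above so the main
 * loop stops cleanly; the client later clears the pause and calls the
 * tokeniser again.
 */
#if 0
static hubbub_error sketch_pausing_handler(const hubbub_token *token,
		void *pw)
{
	(void) pw;

	if (token->type == HUBBUB_TOKEN_START_TAG) {
		/* e.g. stop after the first start tag and resume later */
		return HUBBUB_PAUSED;
	}

	return HUBBUB_OK;
}
#endif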