0,0 → 1,328 |
/* |
* This file is part of Hubbub. |
* Licensed under the MIT License, |
* http://www.opensource.org/licenses/mit-license.php |
* Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org> |
*/ |
|
#include <assert.h> |
#include <string.h> |
|
#include <parserutils/charset/mibenum.h> |
#include <parserutils/input/inputstream.h> |
|
#include <hubbub/parser.h> |
|
#include "charset/detect.h" |
#include "tokeniser/tokeniser.h" |
#include "treebuilder/treebuilder.h" |
#include "utils/parserutilserror.h" |
|
/** |
* Hubbub parser object |
*/ |
struct hubbub_parser { |
parserutils_inputstream *stream; /**< Input stream instance */ |
hubbub_tokeniser *tok; /**< Tokeniser instance */ |
hubbub_treebuilder *tb; /**< Treebuilder instance */ |
|
hubbub_allocator_fn alloc; /**< Memory (de)allocation function */ |
void *pw; /**< Client data */ |
}; |
|
/** |
* Create a hubbub parser |
* |
* \param enc Source document encoding, or NULL to autodetect |
* \param fix_enc Permit fixing up of encoding if it's frequently misused |
* \param alloc Memory (de)allocation function |
* \param pw Pointer to client-specific private data (may be NULL) |
* \param parser Pointer to location to receive parser instance |
* \return HUBBUB_OK on success, |
* HUBBUB_BADPARM on bad parameters, |
* HUBBUB_NOMEM on memory exhaustion, |
* HUBBUB_BADENCODING if ::enc is unsupported |
*/ |
hubbub_error hubbub_parser_create(const char *enc, bool fix_enc, |
hubbub_allocator_fn alloc, void *pw, hubbub_parser **parser) |
{ |
parserutils_error perror; |
hubbub_error error; |
hubbub_parser *p; |
|
if (alloc == NULL || parser == NULL) |
return HUBBUB_BADPARM; |
|
p = alloc(NULL, sizeof(hubbub_parser), pw); |
if (p == NULL) |
return HUBBUB_NOMEM; |
|
/* If we have an encoding and we're permitted to fix up likely broken |
* ones, then attempt to do so. */ |
if (enc != NULL && fix_enc == true) { |
uint16_t mibenum = parserutils_charset_mibenum_from_name(enc, |
strlen(enc)); |
|
if (mibenum != 0) { |
hubbub_charset_fix_charset(&mibenum); |
|
enc = parserutils_charset_mibenum_to_name(mibenum); |
} |
} |
|
perror = parserutils_inputstream_create(enc, |
enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN, |
hubbub_charset_extract, alloc, pw, &p->stream); |
if (perror != PARSERUTILS_OK) { |
alloc(p, 0, pw); |
return hubbub_error_from_parserutils_error(perror); |
} |
|
error = hubbub_tokeniser_create(p->stream, alloc, pw, &p->tok); |
if (error != HUBBUB_OK) { |
parserutils_inputstream_destroy(p->stream); |
alloc(p, 0, pw); |
return error; |
} |
|
error = hubbub_treebuilder_create(p->tok, alloc, pw, &p->tb); |
if (error != HUBBUB_OK) { |
hubbub_tokeniser_destroy(p->tok); |
parserutils_inputstream_destroy(p->stream); |
alloc(p, 0, pw); |
return error; |
} |
|
p->alloc = alloc; |
p->pw = pw; |
|
*parser = p; |
|
return HUBBUB_OK; |
} |
|
/** |
* Destroy a hubbub parser |
* |
* \param parser Parser instance to destroy |
* \return HUBBUB_OK on success, appropriate error otherwise |
*/ |
hubbub_error hubbub_parser_destroy(hubbub_parser *parser) |
{ |
if (parser == NULL) |
return HUBBUB_BADPARM; |
|
hubbub_treebuilder_destroy(parser->tb); |
|
hubbub_tokeniser_destroy(parser->tok); |
|
parserutils_inputstream_destroy(parser->stream); |
|
parser->alloc(parser, 0, parser->pw); |
|
return HUBBUB_OK; |
} |
|
/** |
* Configure a hubbub parser |
* |
* \param parser Parser instance to configure |
* \param type Option to set |
* \param params Option-specific parameters |
* \return HUBBUB_OK on success, appropriate error otherwise |
*/ |
hubbub_error hubbub_parser_setopt(hubbub_parser *parser, |
hubbub_parser_opttype type, |
hubbub_parser_optparams *params) |
{ |
hubbub_error result = HUBBUB_OK; |
|
if (parser == NULL || params == NULL) |
return HUBBUB_BADPARM; |
|
switch (type) { |
case HUBBUB_PARSER_TOKEN_HANDLER: |
if (parser->tb != NULL) { |
/* Client is defining their own token handler, |
* so we must destroy the default treebuilder */ |
hubbub_treebuilder_destroy(parser->tb); |
parser->tb = NULL; |
} |
result = hubbub_tokeniser_setopt(parser->tok, |
HUBBUB_TOKENISER_TOKEN_HANDLER, |
(hubbub_tokeniser_optparams *) params); |
break; |
|
case HUBBUB_PARSER_ERROR_HANDLER: |
/* The error handler does not cascade, so tell both the |
* treebuilder (if extant) and the tokeniser. */ |
if (parser->tb != NULL) { |
result = hubbub_treebuilder_setopt(parser->tb, |
HUBBUB_TREEBUILDER_ERROR_HANDLER, |
(hubbub_treebuilder_optparams *) params); |
} |
if (result == HUBBUB_OK) { |
result = hubbub_tokeniser_setopt(parser->tok, |
HUBBUB_TOKENISER_ERROR_HANDLER, |
(hubbub_tokeniser_optparams *) params); |
} |
break; |
|
case HUBBUB_PARSER_CONTENT_MODEL: |
result = hubbub_tokeniser_setopt(parser->tok, |
HUBBUB_TOKENISER_CONTENT_MODEL, |
(hubbub_tokeniser_optparams *) params); |
break; |
|
case HUBBUB_PARSER_PAUSE: |
result = hubbub_tokeniser_setopt(parser->tok, |
HUBBUB_TOKENISER_PAUSE, |
(hubbub_tokeniser_optparams *) params); |
break; |
|
case HUBBUB_PARSER_TREE_HANDLER: |
if (parser->tb != NULL) { |
result = hubbub_treebuilder_setopt(parser->tb, |
HUBBUB_TREEBUILDER_TREE_HANDLER, |
(hubbub_treebuilder_optparams *) params); |
} |
break; |
|
case HUBBUB_PARSER_DOCUMENT_NODE: |
if (parser->tb != NULL) { |
result = hubbub_treebuilder_setopt(parser->tb, |
HUBBUB_TREEBUILDER_DOCUMENT_NODE, |
(hubbub_treebuilder_optparams *) params); |
} |
break; |
|
case HUBBUB_PARSER_ENABLE_SCRIPTING: |
if (parser->tb != NULL) { |
result = hubbub_treebuilder_setopt(parser->tb, |
HUBBUB_TREEBUILDER_ENABLE_SCRIPTING, |
(hubbub_treebuilder_optparams *) params); |
} |
break; |
|
default: |
result = HUBBUB_INVALID; |
} |
|
return result; |
} |
|
/** |
* Insert a chunk of data into a hubbub parser input stream |
* |
* Inserts the given data into the input stream ready for parsing but |
* does not cause any additional processing of the input. This is |
* useful to allow hubbub callbacks to add computed data to the input. |
* |
* \param parser Parser instance to use |
* \param data Data to parse (encoded in UTF-8) |
* \param len Length, in bytes, of data |
* \return HUBBUB_OK on success, appropriate error otherwise |
*/ |
hubbub_error hubbub_parser_insert_chunk(hubbub_parser *parser, |
const uint8_t *data, size_t len) |
{ |
if (parser == NULL || data == NULL) |
return HUBBUB_BADPARM; |
|
return hubbub_tokeniser_insert_chunk(parser->tok, data, len); |
} |
|
/** |
* Pass a chunk of data to a hubbub parser for parsing |
* |
* \param parser Parser instance to use |
* \param data Data to parse (encoded in the input charset) |
* \param len Length, in bytes, of data |
* \return HUBBUB_OK on success, appropriate error otherwise |
*/ |
hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, |
const uint8_t *data, size_t len) |
{ |
parserutils_error perror; |
hubbub_error error; |
|
if (parser == NULL || data == NULL) |
return HUBBUB_BADPARM; |
|
perror = parserutils_inputstream_append(parser->stream, data, len); |
if (perror != PARSERUTILS_OK) |
return hubbub_error_from_parserutils_error(perror); |
|
error = hubbub_tokeniser_run(parser->tok); |
if (error == HUBBUB_BADENCODING) { |
/* Ok, we autodetected an encoding that we don't actually |
* support. We've not actually processed any data at this |
* point so fall back to Windows-1252 and hope for the best |
*/ |
perror = parserutils_inputstream_change_charset(parser->stream, |
"Windows-1252", HUBBUB_CHARSET_TENTATIVE); |
/* Under no circumstances should we get here if we've managed |
* to process data. If there is a way, I want to know about it |
*/ |
assert(perror != PARSERUTILS_INVALID); |
if (perror != PARSERUTILS_OK) |
return hubbub_error_from_parserutils_error(perror); |
|
/* Retry the tokenisation */ |
error = hubbub_tokeniser_run(parser->tok); |
} |
|
if (error != HUBBUB_OK) |
return error; |
|
return HUBBUB_OK; |
} |
|
/** |
* Inform the parser that the last chunk of data has been parsed |
* |
* \param parser Parser to inform |
* \return HUBBUB_OK on success, appropriate error otherwise |
*/ |
hubbub_error hubbub_parser_completed(hubbub_parser *parser) |
{ |
parserutils_error perror; |
hubbub_error error; |
|
if (parser == NULL) |
return HUBBUB_BADPARM; |
|
perror = parserutils_inputstream_append(parser->stream, NULL, 0); |
if (perror != PARSERUTILS_OK) |
return hubbub_error_from_parserutils_error(perror); |
|
error = hubbub_tokeniser_run(parser->tok); |
if (error != HUBBUB_OK) |
return error; |
|
return HUBBUB_OK; |
} |
|
/** |
* Read the document charset |
* |
* \param parser Parser instance to query |
* \param source Pointer to location to receive charset source |
* \return Pointer to charset name (constant; do not free), or NULL if unknown |
*/ |
const char *hubbub_parser_read_charset(hubbub_parser *parser, |
hubbub_charset_source *source) |
{ |
const char *name; |
uint32_t src; |
|
if (parser == NULL || source == NULL) |
return NULL; |
|
name = parserutils_inputstream_read_charset(parser->stream, &src); |
|
*source = (hubbub_charset_source) src; |
|
return name; |
} |
|