0,0 → 1,1212 |
/* |
* Copyright 2012 John-Mark Bell <jmb@netsurf-browser.org> |
* Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net> |
* |
* This file is part of NetSurf, http://www.netsurf-browser.org/ |
* |
* NetSurf is free software; you can redistribute it and/or modify |
* it under the terms of the GNU General Public License as published by |
* the Free Software Foundation; version 2 of the License. |
* |
* NetSurf is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
* GNU General Public License for more details. |
* |
* You should have received a copy of the GNU General Public License |
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
*/ |
|
/** \file |
* Save HTML document with dependencies (implementation). |
*/ |
|
#include "utils/config.h" |
|
#include <assert.h> |
#include <ctype.h> |
#include <errno.h> |
#include <stdio.h> |
#include <string.h> |
#include <sys/types.h> |
#include <regex.h> |
|
#include <dom/dom.h> |
|
#include "content/content.h" |
#include "content/hlcache.h" |
#include "css/css.h" |
#include "desktop/save_complete.h" |
#include "render/box.h" |
#include "render/html.h" |
#include "utils/corestrings.h" |
#include "utils/log.h" |
#include "utils/nsurl.h" |
#include "utils/utf8.h" |
#include "utils/utils.h" |
|
/**
 * Compiled regular expression matching a CSS \@import rule.
 *
 * Compiled by save_complete_init() and used when rewriting stylesheet
 * sources in save_complete_rewrite_stylesheet_urls().
 */
regex_t save_complete_import_re;
|
/** An entry in save_complete_list. */ |
typedef struct save_complete_entry { |
hlcache_handle *content; |
struct save_complete_entry *next; /**< Next entry in list */ |
} save_complete_entry; |
|
typedef struct save_complete_ctx { |
const char *path; |
save_complete_entry *list; |
save_complete_set_type_cb set_type; |
|
nsurl *base; |
FILE *fp; |
enum { STATE_NORMAL, STATE_IN_STYLE } iter_state; |
} save_complete_ctx; |
|
/**
 * Kind of notification delivered to a tree walk callback.
 */
typedef enum {
	EVENT_ENTER = 0,	/**< The walk has just arrived at the node */
	EVENT_LEAVE = 1		/**< The walk is about to move on from the node */
} save_complete_event_type;
|
|
/* Forward declarations for functions that are called before their
 * definitions appear in this file. */
static bool save_complete_save_imported_sheets(
		save_complete_ctx *ctx,
		struct nscss_import *imports,
		uint32_t import_count);
static bool save_complete_save_html(
		save_complete_ctx *ctx,
		hlcache_handle *c,
		bool index);
|
|
/**
 * Prepare a save-complete context for use.
 *
 * Every member is given a defined value so that no later code can read an
 * indeterminate pointer or state.
 *
 * \param ctx       Context to initialise
 * \param path      Path that the leafnames of saved files are appended to
 * \param set_type  Callback invoked with the path and MIME type of each
 *                  file written, or NULL
 */
static void save_complete_ctx_initialise(save_complete_ctx *ctx,
		const char *path, save_complete_set_type_cb set_type)
{
	ctx->path = path;
	ctx->list = NULL;
	ctx->set_type = set_type;

	/* Per-document state; overwritten when a document is written out */
	ctx->base = NULL;
	ctx->fp = NULL;
	ctx->iter_state = STATE_NORMAL;
}
|
/**
 * Release the resources owned by a save-complete context.
 *
 * Frees every list entry (the contents they point at are not touched) and
 * resets the list head so the context never holds a dangling pointer.
 *
 * \param ctx  Context to finalise
 */
static void save_complete_ctx_finalise(save_complete_ctx *ctx)
{
	save_complete_entry *entry = ctx->list;

	while (entry != NULL) {
		save_complete_entry *next = entry->next;
		free(entry);
		entry = next;
	}

	ctx->list = NULL;
}
|
static bool save_complete_ctx_add_content(save_complete_ctx *ctx, |
hlcache_handle *content) |
{ |
save_complete_entry *entry; |
|
entry = malloc(sizeof (*entry)); |
if (entry == NULL) |
return false; |
|
entry->content = content; |
entry->next = ctx->list; |
ctx->list = entry; |
|
return true; |
} |
|
|
static hlcache_handle *save_complete_ctx_find_content(save_complete_ctx *ctx, |
const nsurl *url) |
{ |
save_complete_entry *entry; |
|
for (entry = ctx->list; entry != NULL; entry = entry->next) |
if (nsurl_compare(url, |
hlcache_handle_get_url(entry->content), |
NSURL_COMPLETE)) |
return entry->content; |
|
return NULL; |
} |
|
|
static bool save_complete_ctx_has_content(save_complete_ctx *ctx, |
hlcache_handle *content) |
{ |
save_complete_entry *entry; |
|
for (entry = ctx->list; entry != NULL; entry = entry->next) |
if (entry->content == content) |
return true; |
|
return false; |
} |
|
static bool save_complete_save_buffer(save_complete_ctx *ctx, |
const char *leafname, const char *data, size_t data_len, |
lwc_string *mime_type) |
{ |
FILE *fp; |
bool error; |
char fullpath[PATH_MAX]; |
|
strncpy(fullpath, ctx->path, sizeof fullpath); |
error = path_add_part(fullpath, sizeof fullpath, leafname); |
if (error == false) { |
warn_user("NoMemory", NULL); |
return false; |
} |
|
fp = fopen(fullpath, "wb"); |
if (fp == NULL) { |
LOG(("fopen(): errno = %i", errno)); |
warn_user("SaveError", strerror(errno)); |
return false; |
} |
|
fwrite(data, sizeof(*data), data_len, fp); |
|
fclose(fp); |
|
if (ctx->set_type != NULL) |
ctx->set_type(fullpath, mime_type); |
|
return true; |
} |
|
/** |
* Rewrite stylesheet \@import rules for save complete. |
* |
* \param source stylesheet source |
* \param size size of source |
* \param base url of stylesheet |
* \param osize updated with the size of the result |
* \return converted source, or NULL on out of memory |
*/ |
|
static char *save_complete_rewrite_stylesheet_urls(save_complete_ctx *ctx, |
const char *source, unsigned long size, const nsurl *base, |
unsigned long *osize) |
{ |
char *rewritten; |
unsigned long offset = 0; |
unsigned int imports = 0; |
nserror error; |
|
/* count number occurrences of @import to (over)estimate result size */ |
/* can't use strstr because source is not 0-terminated string */ |
for (offset = 0; SLEN("@import") < size && |
offset <= size - SLEN("@import"); offset++) { |
if (source[offset] == '@' && |
tolower(source[offset + 1]) == 'i' && |
tolower(source[offset + 2]) == 'm' && |
tolower(source[offset + 3]) == 'p' && |
tolower(source[offset + 4]) == 'o' && |
tolower(source[offset + 5]) == 'r' && |
tolower(source[offset + 6]) == 't') |
imports++; |
} |
|
rewritten = malloc(size + imports * 20); |
if (rewritten == NULL) |
return NULL; |
*osize = 0; |
|
offset = 0; |
while (offset < size) { |
const char *import_url = NULL; |
char *import_url_copy; |
int import_url_len = 0; |
nsurl *url = NULL; |
regmatch_t match[11]; |
int m = regexec(&save_complete_import_re, source + offset, |
11, match, 0); |
if (m) |
break; |
|
if (match[2].rm_so != -1) { |
import_url = source + offset + match[2].rm_so; |
import_url_len = match[2].rm_eo - match[2].rm_so; |
} else if (match[4].rm_so != -1) { |
import_url = source + offset + match[4].rm_so; |
import_url_len = match[4].rm_eo - match[4].rm_so; |
} else if (match[6].rm_so != -1) { |
import_url = source + offset + match[6].rm_so; |
import_url_len = match[6].rm_eo - match[6].rm_so; |
} else if (match[8].rm_so != -1) { |
import_url = source + offset + match[8].rm_so; |
import_url_len = match[8].rm_eo - match[8].rm_so; |
} else if (match[10].rm_so != -1) { |
import_url = source + offset + match[10].rm_so; |
import_url_len = match[10].rm_eo - match[10].rm_so; |
} |
assert(import_url != NULL); |
|
import_url_copy = strndup(import_url, import_url_len); |
if (import_url_copy == NULL) { |
free(rewritten); |
return NULL; |
} |
|
error = nsurl_join(base, import_url_copy, &url); |
free(import_url_copy); |
if (error == NSERROR_NOMEM) { |
free(rewritten); |
return NULL; |
} |
|
/* copy data before match */ |
memcpy(rewritten + *osize, source + offset, match[0].rm_so); |
*osize += match[0].rm_so; |
|
if (url != NULL) { |
hlcache_handle *content; |
content = save_complete_ctx_find_content(ctx, url); |
if (content != NULL) { |
/* replace import */ |
char buf[64]; |
snprintf(buf, sizeof buf, "@import '%p'", |
content); |
memcpy(rewritten + *osize, buf, strlen(buf)); |
*osize += strlen(buf); |
} else { |
/* copy import */ |
memcpy(rewritten + *osize, |
source + offset + match[0].rm_so, |
match[0].rm_eo - match[0].rm_so); |
*osize += match[0].rm_eo - match[0].rm_so; |
} |
nsurl_unref(url); |
} else { |
/* copy import */ |
memcpy(rewritten + *osize, |
source + offset + match[0].rm_so, |
match[0].rm_eo - match[0].rm_so); |
*osize += match[0].rm_eo - match[0].rm_so; |
} |
|
assert(0 < match[0].rm_eo); |
offset += match[0].rm_eo; |
} |
|
/* copy rest of source */ |
if (offset < size) { |
memcpy(rewritten + *osize, source + offset, size - offset); |
*osize += size - offset; |
} |
|
return rewritten; |
} |
|
static bool save_complete_save_stylesheet(save_complete_ctx *ctx, |
hlcache_handle *css) |
{ |
const char *css_data; |
unsigned long css_size; |
char *source; |
unsigned long source_len; |
struct nscss_import *imports; |
uint32_t import_count; |
lwc_string *type; |
char filename[32]; |
bool result; |
|
if (save_complete_ctx_has_content(ctx, css)) |
return true; |
|
if (save_complete_ctx_add_content(ctx, css) == false) { |
warn_user("NoMemory", 0); |
return false; |
} |
|
imports = nscss_get_imports(css, &import_count); |
if (save_complete_save_imported_sheets(ctx, |
imports, import_count) == false) |
return false; |
|
css_data = content_get_source_data(css, &css_size); |
source = save_complete_rewrite_stylesheet_urls(ctx, css_data, css_size, |
hlcache_handle_get_url(css), &source_len); |
if (source == NULL) { |
warn_user("NoMemory", 0); |
return false; |
} |
|
type = content_get_mime_type(css); |
if (type == NULL) { |
free(source); |
return false; |
} |
|
snprintf(filename, sizeof filename, "%p", css); |
|
result = save_complete_save_buffer(ctx, filename, |
source, source_len, type); |
|
lwc_string_unref(type); |
free(source); |
|
return result; |
} |
|
/**
 * Save every stylesheet in an array of imports.
 *
 * An import with no content handle is skipped, in line with the NULL
 * checks applied to external stylesheets and objects elsewhere in this
 * file; passing NULL on would record and dereference a NULL handle.
 *
 * \param ctx           Save complete context
 * \param imports       Array of imports
 * \param import_count  Number of entries in \a imports
 * \return true on success, false as soon as one sheet fails to save
 */
static bool save_complete_save_imported_sheets(save_complete_ctx *ctx,
		struct nscss_import *imports, uint32_t import_count)
{
	uint32_t i;

	for (i = 0; i < import_count; i++) {
		if (imports[i].c == NULL)
			continue;

		if (save_complete_save_stylesheet(ctx, imports[i].c) == false)
			return false;
	}

	return true;
}
|
/**
 * Save one stylesheet referenced by an HTML document.
 *
 * For an internal sheet only its imports are saved here; an external
 * sheet is saved itself, unless it has no content.
 *
 * \param ctx    Save complete context
 * \param sheet  Stylesheet descriptor
 * \return true on success, false on error
 */
static bool save_complete_save_html_stylesheet(save_complete_ctx *ctx,
		struct html_stylesheet *sheet)
{
	if (sheet->type != HTML_STYLESHEET_INTERNAL) {
		if (sheet->data.external == NULL)
			return true;

		return save_complete_save_stylesheet(ctx,
				sheet->data.external);
	}

	return save_complete_save_imported_sheets(ctx,
			sheet->data.internal->imports,
			sheet->data.internal->import_count);
}
|
static bool save_complete_save_html_stylesheets(save_complete_ctx *ctx, |
hlcache_handle *c) |
{ |
struct html_stylesheet *sheets; |
unsigned int i, count; |
|
sheets = html_get_stylesheets(c, &count); |
|
for (i = STYLESHEET_START; i != count; i++) { |
if (save_complete_save_html_stylesheet(ctx, |
&sheets[i]) == false) |
return false; |
} |
|
return true; |
} |
|
static bool save_complete_save_html_object(save_complete_ctx *ctx, |
hlcache_handle *obj) |
{ |
const char *obj_data; |
unsigned long obj_size; |
lwc_string *type; |
bool result; |
char filename[32]; |
|
if (content_get_type(obj) == CONTENT_NONE) |
return true; |
|
obj_data = content_get_source_data(obj, &obj_size); |
if (obj_data == NULL) |
return true; |
|
if (save_complete_ctx_has_content(ctx, obj)) |
return true; |
|
if (save_complete_ctx_add_content(ctx, obj) == false) { |
warn_user("NoMemory", 0); |
return false; |
} |
|
if (content_get_type(obj) == CONTENT_HTML) { |
return save_complete_save_html(ctx, obj, false); |
} |
|
snprintf(filename, sizeof filename, "%p", obj); |
|
type = content_get_mime_type(obj); |
if (type == NULL) |
return false; |
|
result = save_complete_save_buffer(ctx, filename, |
obj_data, obj_size, type); |
|
lwc_string_unref(type); |
|
return result; |
} |
|
static bool save_complete_save_html_objects(save_complete_ctx *ctx, |
hlcache_handle *c) |
{ |
struct content_html_object *object; |
unsigned int count; |
|
object = html_get_objects(c, &count); |
|
for (; object != NULL; object = object->next) { |
if (object->content != NULL) { |
if (save_complete_save_html_object(ctx, |
object->content) == false) |
return false; |
} |
} |
|
return true; |
} |
|
static bool save_complete_libdom_treewalk(dom_node *root, |
bool (*callback)(dom_node *node, |
save_complete_event_type event_type, void *ctx), |
void *ctx) |
{ |
dom_node *node; |
|
node = dom_node_ref(root); /* tree root */ |
|
while (node != NULL) { |
dom_node *next = NULL; |
dom_exception exc; |
|
exc = dom_node_get_first_child(node, &next); |
if (exc != DOM_NO_ERR) { |
dom_node_unref(node); |
break; |
} |
|
if (next != NULL) { /* 1. children */ |
dom_node_unref(node); |
node = next; |
} else { |
exc = dom_node_get_next_sibling(node, &next); |
if (exc != DOM_NO_ERR) { |
dom_node_unref(node); |
break; |
} |
|
if (next != NULL) { /* 2. siblings */ |
if (callback(node, EVENT_LEAVE, ctx) == false) { |
return false; |
} |
dom_node_unref(node); |
node = next; |
} else { /* 3. ancestor siblings */ |
while (node != NULL) { |
exc = dom_node_get_next_sibling(node, |
&next); |
if (exc != DOM_NO_ERR) { |
dom_node_unref(node); |
node = NULL; |
break; |
} |
|
if (next != NULL) { |
dom_node_unref(next); |
break; |
} |
|
exc = dom_node_get_parent_node(node, |
&next); |
if (exc != DOM_NO_ERR) { |
dom_node_unref(node); |
node = NULL; |
break; |
} |
|
if (callback(node, EVENT_LEAVE, |
ctx) == false) { |
return false; |
} |
dom_node_unref(node); |
node = next; |
} |
|
if (node == NULL) |
break; |
|
exc = dom_node_get_next_sibling(node, &next); |
if (exc != DOM_NO_ERR) { |
dom_node_unref(node); |
break; |
} |
|
if (callback(node, EVENT_LEAVE, ctx) == false) { |
return false; |
} |
dom_node_unref(node); |
node = next; |
} |
} |
|
assert(node != NULL); |
|
if (callback(node, EVENT_ENTER, ctx) == false) { |
return false; /* callback caused early termination */ |
} |
|
} |
|
return true; |
} |
|
/**
 * Write a URL-valued attribute value, quoted, to the output document.
 *
 * The value is resolved against ctx->base. If the resulting URL belongs
 * to a recorded content, that content's saved leafname (its handle
 * formatted with "%p") is written; otherwise the absolute URL is written.
 * If the value cannot be resolved, it is written as given.
 *
 * \param ctx        Save complete context
 * \param value      Attribute value
 * \param value_len  Byte length of \a value
 * \return true on success, false on memory exhaustion or if the text
 *         could not be converted for output
 */
static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
		const char *value, size_t value_len)
{
	/* Initialised so that a failed join which leaves the output
	 * parameter untouched is seen as "no URL" rather than garbage. */
	nsurl *url = NULL;
	hlcache_handle *content;
	char *escaped;
	nserror error;
	utf8_convert_ret ret;

	error = nsurl_join(ctx->base, value, &url);
	if (error == NSERROR_NOMEM)
		return false;

	if (url != NULL) {
		content = save_complete_ctx_find_content(ctx, url);
		if (content != NULL) {
			/* found a match */
			nsurl_unref(url);

			fprintf(ctx->fp, "\"%p\"", content);
		} else {
			/* no match found */
			ret = utf8_to_html(nsurl_access(url), "UTF-8",
					nsurl_length(url), &escaped);
			nsurl_unref(url);

			if (ret != UTF8_CONVERT_OK)
				return false;

			fprintf(ctx->fp, "\"%s\"", escaped);

			free(escaped);
		}
	} else {
		ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
		if (ret != UTF8_CONVERT_OK)
			return false;

		fprintf(ctx->fp, "\"%s\"", escaped);

		free(escaped);
	}

	return true;
}
|
/**
 * Write an attribute value, quoted, to the output document.
 *
 * \param ctx        Save complete context
 * \param value      Attribute value
 * \param value_len  Byte length of \a value
 * \return true on success, false if the value could not be converted
 */
static bool save_complete_write_value(save_complete_ctx *ctx,
		const char *value, size_t value_len)
{
	char *escaped;

	if (utf8_to_html(value, "UTF-8", value_len, &escaped) !=
			UTF8_CONVERT_OK)
		return false;

	fprintf(ctx->fp, "\"%s\"", escaped);
	free(escaped);

	return true;
}
|
/**
 * Compare a counted string with a literal, ignoring case.
 *
 * \param data     String data
 * \param len      Byte length of \a data
 * \param literal  0-terminated string to compare against
 * \return true if the lengths are equal and the bytes match
 */
static bool save_complete_name_is(const char *data, size_t len,
		const char *literal)
{
	return len == strlen(literal) &&
			strncasecmp(data, literal, len) == 0;
}

/**
 * Write an attribute's value, rewriting it if it is a URL.
 *
 * \param ctx         Save complete context
 * \param node_name   Name of the element owning the attribute
 * \param attr_name   Name of the attribute
 * \param attr_value  Value of the attribute
 * \return true on success, false on error
 */
static bool save_complete_handle_attr_value(save_complete_ctx *ctx,
		dom_string *node_name, dom_string *attr_name,
		dom_string *attr_value)
{
	const char *elem = dom_string_data(node_name);
	size_t elem_len = dom_string_byte_length(node_name);
	const char *attr = dom_string_data(attr_name);
	size_t attr_len = dom_string_byte_length(attr_name);
	const char *value = dom_string_data(attr_value);
	size_t value_len = dom_string_byte_length(attr_value);
	bool is_url;

	/**
	 * We only need to consider the following cases:
	 *
	 * Attribute:      Elements:
	 *
	 * 1) data         <object>
	 * 2) href         <a> <area> <link>
	 * 3) src          <script> <input> <frame> <iframe> <img>
	 * 4) background   any (except those above)
	 */
	if (save_complete_name_is(attr, attr_len, "data")) {
		/* 1 */
		is_url = save_complete_name_is(elem, elem_len, "object");
	} else if (save_complete_name_is(attr, attr_len, "href")) {
		/* 2 */
		is_url = save_complete_name_is(elem, elem_len, "a") ||
			save_complete_name_is(elem, elem_len, "area") ||
			save_complete_name_is(elem, elem_len, "link");
	} else if (save_complete_name_is(attr, attr_len, "src")) {
		/* 3 */
		is_url = save_complete_name_is(elem, elem_len, "frame") ||
			save_complete_name_is(elem, elem_len, "iframe") ||
			save_complete_name_is(elem, elem_len, "input") ||
			save_complete_name_is(elem, elem_len, "img") ||
			save_complete_name_is(elem, elem_len, "script");
	} else {
		/* 4 */
		is_url = save_complete_name_is(attr, attr_len, "background");
	}

	if (is_url)
		return save_complete_rewrite_url_value(ctx, value, value_len);

	return save_complete_write_value(ctx, value, value_len);
}
|
static bool save_complete_handle_attr(save_complete_ctx *ctx, |
dom_string *node_name, dom_attr *attr) |
{ |
dom_string *name; |
const char *name_data; |
size_t name_len; |
dom_string *value; |
dom_exception error; |
|
error = dom_attr_get_name(attr, &name); |
if (error != DOM_NO_ERR) |
return false; |
|
if (name == NULL) |
return true; |
|
error = dom_attr_get_value(attr, &value); |
if (error != DOM_NO_ERR) { |
dom_string_unref(name); |
return false; |
} |
|
name_data = dom_string_data(name); |
name_len = dom_string_byte_length(name); |
|
fputc(' ', ctx->fp); |
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp); |
|
if (value != NULL) { |
fputc('=', ctx->fp); |
if (save_complete_handle_attr_value(ctx, node_name, |
name, value) == false) { |
dom_string_unref(value); |
dom_string_unref(name); |
return false; |
} |
} |
|
dom_string_unref(name); |
|
return true; |
} |
|
static bool save_complete_handle_attrs(save_complete_ctx *ctx, |
dom_string *node_name, dom_namednodemap *attrs) |
{ |
uint32_t length, i; |
dom_exception error; |
|
error = dom_namednodemap_get_length(attrs, &length); |
if (error != DOM_NO_ERR) |
return false; |
|
for (i = 0; i < length; i++) { |
dom_attr *attr; |
|
error = dom_namednodemap_item(attrs, i, (void *) &attr); |
if (error != DOM_NO_ERR) |
return false; |
|
if (attr == NULL) |
continue; |
|
if (save_complete_handle_attr(ctx, node_name, attr) == false) { |
dom_node_unref(attr); |
return false; |
} |
|
dom_node_unref(attr); |
} |
|
return true; |
} |
|
static bool save_complete_handle_element(save_complete_ctx *ctx, |
dom_node *node, save_complete_event_type event_type) |
{ |
dom_string *name; |
dom_namednodemap *attrs; |
const char *name_data; |
size_t name_len; |
bool process = true; |
dom_exception error; |
|
ctx->iter_state = STATE_NORMAL; |
|
error = dom_node_get_node_name(node, &name); |
if (error != DOM_NO_ERR) |
return false; |
|
if (name == NULL) |
return true; |
|
name_data = dom_string_data(name); |
name_len = dom_string_byte_length(name); |
|
if (name_len == SLEN("base") && |
strncasecmp(name_data, "base", name_len) == 0) { |
/* Elide BASE elements from the output */ |
process = false; |
} else if (name_len == SLEN("meta") && |
strncasecmp(name_data, "meta", name_len) == 0) { |
/* Don't emit close tags for META elements */ |
if (event_type == EVENT_LEAVE) { |
process = false; |
} else { |
/* Elide meta charsets */ |
dom_string *value; |
error = dom_element_get_attribute(node, |
corestring_dom_http_equiv, &value); |
if (error != DOM_NO_ERR) { |
dom_string_unref(name); |
return false; |
} |
|
if (value != NULL) { |
if (dom_string_length(value) == |
SLEN("Content-Type") && |
strncasecmp(dom_string_data(value), |
"Content-Type", |
SLEN("Content-Type")) == 0) |
process = false; |
|
dom_string_unref(value); |
} else { |
bool yes; |
|
error = dom_element_has_attribute(node, |
corestring_dom_charset, &yes); |
if (error != DOM_NO_ERR) { |
dom_string_unref(name); |
return false; |
} |
|
if (yes) |
process = false; |
} |
} |
} else if (event_type == EVENT_LEAVE && |
((name_len == SLEN("link") && |
strncasecmp(name_data, "link", name_len) == 0))) { |
/* Don't emit close tags for void elements */ |
process = false; |
} |
|
if (process == false) { |
dom_string_unref(name); |
return true; |
} |
|
fputc('<', ctx->fp); |
if (event_type == EVENT_LEAVE) |
fputc('/', ctx->fp); |
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp); |
|
if (event_type == EVENT_ENTER) { |
error = dom_node_get_attributes(node, &attrs); |
if (error != DOM_NO_ERR) { |
dom_string_unref(name); |
return false; |
} |
|
if (save_complete_handle_attrs(ctx, name, attrs) == false) { |
dom_namednodemap_unref(attrs); |
dom_string_unref(name); |
return false; |
} |
|
dom_namednodemap_unref(attrs); |
} |
|
fputc('>', ctx->fp); |
|
/* Rewrite contents of style elements */ |
if (event_type == EVENT_ENTER && name_len == SLEN("style") && |
strncasecmp(name_data, "style", name_len) == 0) { |
dom_string *content; |
|
error = dom_node_get_text_content(node, &content); |
if (error != DOM_NO_ERR) { |
dom_string_unref(name); |
return false; |
} |
|
if (content != NULL) { |
char *rewritten; |
unsigned long len; |
|
/* Rewrite @import rules */ |
rewritten = save_complete_rewrite_stylesheet_urls( |
ctx, |
dom_string_data(content), |
dom_string_byte_length(content), |
ctx->base, |
&len); |
if (rewritten == NULL) { |
dom_string_unref(content); |
dom_string_unref(name); |
return false; |
} |
|
dom_string_unref(content); |
|
fwrite(rewritten, sizeof(*rewritten), len, ctx->fp); |
|
free(rewritten); |
} |
|
ctx->iter_state = STATE_IN_STYLE; |
} else if (event_type == EVENT_ENTER && name_len == SLEN("head") && |
strncasecmp(name_data, "head", name_len) == 0) { |
/* If this is a HEAD element, insert a meta charset */ |
fputs("<META http-equiv=\"Content-Type\" " |
"content=\"text/html; charset=utf-8\">", |
ctx->fp); |
} |
|
dom_string_unref(name); |
|
return true; |
} |
|
static bool save_complete_node_handler(dom_node *node, |
save_complete_event_type event_type, void *ctxin) |
{ |
save_complete_ctx *ctx = ctxin; |
dom_node_type type; |
dom_exception error; |
utf8_convert_ret ret; |
|
error = dom_node_get_node_type(node, &type); |
if (error != DOM_NO_ERR) |
return false; |
|
if (type == DOM_ELEMENT_NODE) { |
return save_complete_handle_element(ctx, node, event_type); |
} else if (type == DOM_TEXT_NODE || type == DOM_COMMENT_NODE) { |
if (event_type != EVENT_ENTER) |
return true; |
|
if (ctx->iter_state != STATE_IN_STYLE) { |
/* Emit text content */ |
dom_string *text; |
const char *text_data; |
size_t text_len; |
|
error = dom_characterdata_get_data(node, &text); |
if (error != DOM_NO_ERR) { |
return false; |
} |
|
if (type == DOM_COMMENT_NODE) |
fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp); |
|
if (text != NULL) { |
char *escaped; |
|
text_data = dom_string_data(text); |
text_len = dom_string_byte_length(text); |
|
ret = utf8_to_html(text_data, "UTF-8", |
text_len, &escaped); |
if (ret != UTF8_CONVERT_OK) |
return false; |
|
fwrite(escaped, sizeof(*escaped), |
strlen(escaped), ctx->fp); |
|
free(escaped); |
|
dom_string_unref(text); |
} |
|
if (type == DOM_COMMENT_NODE) { |
fwrite("-->", 1, sizeof("-->") - 1, ctx->fp); |
} |
} |
|
} else if (type == DOM_DOCUMENT_TYPE_NODE) { |
dom_string *name; |
const char *name_data; |
size_t name_len; |
|
if (event_type != EVENT_ENTER) |
return true; |
|
error = dom_document_type_get_name(node, &name); |
if (error != DOM_NO_ERR) |
return false; |
|
if (name == NULL) |
return true; |
|
name_data = dom_string_data(name); |
name_len = dom_string_byte_length(name); |
|
fputs("<!DOCTYPE ", ctx->fp); |
fwrite(name_data, sizeof(*name_data), name_len, ctx->fp); |
|
dom_string_unref(name); |
|
error = dom_document_type_get_public_id(node, &name); |
if (error != DOM_NO_ERR) |
return false; |
|
if (name != NULL) { |
name_data = dom_string_data(name); |
name_len = dom_string_byte_length(name); |
|
if (name_len > 0) |
fprintf(ctx->fp, " PUBLIC \"%.*s\"", |
(int) name_len, name_data); |
|
dom_string_unref(name); |
} |
|
error = dom_document_type_get_system_id(node, &name); |
if (error != DOM_NO_ERR) |
return false; |
|
if (name != NULL) { |
name_data = dom_string_data(name); |
name_len = dom_string_byte_length(name); |
|
if (name_len > 0) |
fprintf(ctx->fp, " \"%.*s\"", |
(int) name_len, name_data); |
|
dom_string_unref(name); |
} |
|
fputc('>', ctx->fp); |
} else if (type == DOM_DOCUMENT_NODE) { |
/* Do nothing */ |
} else { |
LOG(("Unhandled node type: %d", type)); |
} |
|
return true; |
} |
|
static bool save_complete_save_html_document(save_complete_ctx *ctx, |
hlcache_handle *c, bool index) |
{ |
bool error; |
FILE *fp; |
dom_document *doc; |
lwc_string *mime_type; |
char filename[32]; |
char fullpath[PATH_MAX]; |
|
strncpy(fullpath, ctx->path, sizeof fullpath); |
|
if (index) |
snprintf(filename, sizeof filename, "index"); |
else |
snprintf(filename, sizeof filename, "%p", c); |
|
error = path_add_part(fullpath, sizeof fullpath, filename); |
if (error == false) { |
warn_user("NoMemory", NULL); |
return false; |
} |
|
fp = fopen(fullpath, "wb"); |
if (fp == NULL) { |
warn_user("NoMemory", NULL); |
return false; |
} |
|
ctx->base = html_get_base_url(c); |
ctx->fp = fp; |
ctx->iter_state = STATE_NORMAL; |
|
doc = html_get_document(c); |
|
if (save_complete_libdom_treewalk((dom_node *) doc, |
save_complete_node_handler, ctx) == false) { |
warn_user("NoMemory", 0); |
fclose(fp); |
return false; |
} |
|
fclose(fp); |
|
mime_type = content_get_mime_type(c); |
if (mime_type != NULL) { |
if (ctx->set_type != NULL) |
ctx->set_type(fullpath, mime_type); |
|
lwc_string_unref(mime_type); |
} |
|
return true; |
} |
|
/** |
* Save an HTML page with all dependencies, recursing through imported pages. |
* |
* \param ctx Save complete context |
* \param c Content to save |
* \param index true to save as "index" |
* \return true on success, false on error and error reported |
*/ |
static bool save_complete_save_html(save_complete_ctx *ctx, hlcache_handle *c, |
bool index) |
{ |
if (content_get_type(c) != CONTENT_HTML) |
return false; |
|
if (save_complete_ctx_has_content(ctx, c)) |
return true; |
|
if (save_complete_save_html_stylesheets(ctx, c) == false) |
return false; |
|
if (save_complete_save_html_objects(ctx, c) == false) |
return false; |
|
return save_complete_save_html_document(ctx, c, index); |
} |
|
|
/** |
* Create the inventory file listing original URLs. |
*/ |
|
static bool save_complete_inventory(save_complete_ctx *ctx) |
{ |
FILE *fp; |
bool error; |
save_complete_entry *entry; |
char fullpath[PATH_MAX]; |
|
strncpy(fullpath, ctx->path, sizeof fullpath); |
error = path_add_part(fullpath, sizeof fullpath, "Inventory"); |
if (error == false) { |
warn_user("NoMemory", NULL); |
return false; |
} |
|
fp = fopen(fullpath, "w"); |
if (fp == NULL) { |
LOG(("fopen(): errno = %i", errno)); |
warn_user("SaveError", strerror(errno)); |
return false; |
} |
|
for (entry = ctx->list; entry != NULL; entry = entry->next) { |
fprintf(fp, "%p %s\n", entry->content, |
nsurl_access(hlcache_handle_get_url( |
entry->content))); |
} |
|
fclose(fp); |
|
return true; |
} |
|
/* Documented in save_complete.h */ |
void save_complete_init(void) |
{ |
/* Match an @import rule - see CSS 2.1 G.1. */ |
regcomp_wrapper(&save_complete_import_re, |
"@import" /* IMPORT_SYM */ |
"[ \t\r\n\f]*" /* S* */ |
/* 1 */ |
"(" /* [ */ |
/* 2 3 */ |
"\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */ |
"|" |
/* 4 5 */ |
"'(([^']|[\\]')*)'" |
"|" /* | */ |
"url\\([ \t\r\n\f]*" /* URI (approximated) */ |
/* 6 7 */ |
"\"(([^\"]|[\\]\")*)\"" |
"[ \t\r\n\f]*\\)" |
"|" |
"url\\([ \t\r\n\f]*" |
/* 8 9 */ |
"'(([^']|[\\]')*)'" |
"[ \t\r\n\f]*\\)" |
"|" |
"url\\([ \t\r\n\f]*" |
/* 10 */ |
"([^) \t\r\n\f]*)" |
"[ \t\r\n\f]*\\)" |
")", /* ] */ |
REG_EXTENDED | REG_ICASE); |
} |
|
/* Documented in save_complete.h */ |
bool save_complete(hlcache_handle *c, const char *path, |
save_complete_set_type_cb set_type) |
{ |
bool result; |
save_complete_ctx ctx; |
|
save_complete_ctx_initialise(&ctx, path, set_type); |
|
result = save_complete_save_html(&ctx, c, true); |
|
if (result) |
result = save_complete_inventory(&ctx); |
|
save_complete_ctx_finalise(&ctx); |
|
return result; |
} |
|