Subversion Repositories Kolibri OS

Rev

Rev 4364 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4364 Rev 5043
1
/*
1
/*
2
 * Copyright 2006 John M Bell 
2
 * Copyright 2006 John M Bell 
3
 * Copyright 2009 John Tytgat 
3
 * Copyright 2009 John Tytgat 
4
 *
4
 *
5
 * This file is part of NetSurf, http://www.netsurf-browser.org/
5
 * This file is part of NetSurf, http://www.netsurf-browser.org/
6
 *
6
 *
7
 * NetSurf is free software; you can redistribute it and/or modify
7
 * NetSurf is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
8
 * it under the terms of the GNU General Public License as published by
9
 * the Free Software Foundation; version 2 of the License.
9
 * the Free Software Foundation; version 2 of the License.
10
 *
10
 *
11
 * NetSurf is distributed in the hope that it will be useful,
11
 * NetSurf is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
14
 * GNU General Public License for more details.
15
 *
15
 *
16
 * You should have received a copy of the GNU General Public License
16
 * You should have received a copy of the GNU General Public License
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 */
18
 */
19
 
19
 
20
/** \file
20
/** \file
21
 * Unified URL information database (implementation)
21
 * Unified URL information database (implementation)
22
 *
22
 *
23
 * URLs are stored in a tree-based structure as follows:
23
 * URLs are stored in a tree-based structure as follows:
24
 *
24
 *
25
 * The host component is extracted from each URL and, if a FQDN, split on
25
 * The host component is extracted from each URL and, if a FQDN, split on
26
 * every '.'. The tree is constructed by inserting each FQDN segment in
26
 * every '.'. The tree is constructed by inserting each FQDN segment in
27
 * reverse order. Duplicate nodes are merged.
27
 * reverse order. Duplicate nodes are merged.
28
 *
28
 *
29
 * If the host part of an URL is an IP address, then this is added to the
29
 * If the host part of an URL is an IP address, then this is added to the
30
 * tree verbatim (as if it were a TLD).
30
 * tree verbatim (as if it were a TLD).
31
 *
31
 *
32
 * This provides something looking like:
32
 * This provides something looking like:
33
 *
33
 *
34
 * 			      root (a sentinel)
34
 * 			      root (a sentinel)
35
 * 				|
35
 * 				|
36
 * 	-------------------------------------------------
36
 * 	-------------------------------------------------
37
 * 	|	|	|	|	|	|	|
37
 * 	|	|	|	|	|	|	|
38
 *     com     edu     gov  127.0.0.1  net     org     uk	TLDs
38
 *     com     edu     gov  127.0.0.1  net     org     uk	TLDs
39
 * 	|	|	|		|	|	|
39
 * 	|	|	|		|	|	|
40
 *    google   ...     ...             ...     ...     co	2LDs
40
 *    google   ...     ...             ...     ...     co	2LDs
41
 * 	|						|
41
 * 	|						|
42
 *     www					       bbc  Hosts/Subdomains
42
 *     www					       bbc  Hosts/Subdomains
43
 *							|
43
 *							|
44
 *						       www	...
44
 *						       www	...
45
 *
45
 *
46
 * Each of the nodes in this tree is a struct host_part. This stores the
46
 * Each of the nodes in this tree is a struct host_part. This stores the
47
 * FQDN segment (or IP address) with which the node is concerned. Each node
47
 * FQDN segment (or IP address) with which the node is concerned. Each node
48
 * may contain further information about paths on a host (struct path_data)
48
 * may contain further information about paths on a host (struct path_data)
49
 * or SSL certificate processing on a host-wide basis
49
 * or SSL certificate processing on a host-wide basis
50
 * (host_part::permit_invalid_certs).
50
 * (host_part::permit_invalid_certs).
51
 *
51
 *
52
 * Path data is concerned with storing various metadata about the path in
52
 * Path data is concerned with storing various metadata about the path in
53
 * question. This includes global history data, HTTP authentication details
53
 * question. This includes global history data, HTTP authentication details
54
 * and any associated HTTP cookies. This is stored as a tree of path segments
54
 * and any associated HTTP cookies. This is stored as a tree of path segments
55
 * hanging off the relevant host_part node.
55
 * hanging off the relevant host_part node.
56
 *
56
 *
57
 * Therefore, to find the last visited time of the URL
57
 * Therefore, to find the last visited time of the URL
58
 * http://www.example.com/path/to/resource.html, the FQDN tree would be
58
 * http://www.example.com/path/to/resource.html, the FQDN tree would be
59
 * traversed in the order root -> "com" -> "example" -> "www". The "www"
59
 * traversed in the order root -> "com" -> "example" -> "www". The "www"
60
 * node would have attached to it a tree of struct path_data:
60
 * node would have attached to it a tree of struct path_data:
61
 *
61
 *
62
 *			    (sentinel)
62
 *			    (sentinel)
63
 *				|
63
 *				|
64
 * 			       path
64
 * 			       path
65
 * 				|
65
 * 				|
66
 * 			       to
66
 * 			       to
67
 * 				|
67
 * 				|
68
 * 			   resource.html
68
 * 			   resource.html
69
 *
69
 *
70
 * This represents the absolute path "/path/to/resource.html". The leaf node
70
 * This represents the absolute path "/path/to/resource.html". The leaf node
71
 * "resource.html" contains the last visited time of the resource.
71
 * "resource.html" contains the last visited time of the resource.
72
 *
72
 *
73
 * The mechanism described above is, however, not particularly conducive to
73
 * The mechanism described above is, however, not particularly conducive to
74
 * fast searching of the database for a given URL (or URLs beginning with a
74
 * fast searching of the database for a given URL (or URLs beginning with a
75
 * given prefix). Therefore, an ancillary data structure is used to enable
75
 * given prefix). Therefore, an ancillary data structure is used to enable
76
 * fast searching. This structure simply reflects the contents of the
76
 * fast searching. This structure simply reflects the contents of the
77
 * database, with entries being added/removed at the same time as for the
77
 * database, with entries being added/removed at the same time as for the
78
 * core database. In order to ensure that degenerate cases are kept to a
78
 * core database. In order to ensure that degenerate cases are kept to a
79
 * minimum, we use an AAtree. This is an approximation of a Red-Black tree
79
 * minimum, we use an AAtree. This is an approximation of a Red-Black tree
80
 * with similar performance characteristics, but with a significantly
80
 * with similar performance characteristics, but with a significantly
81
 * simpler implementation. Entries in this tree comprise pointers to the
81
 * simpler implementation. Entries in this tree comprise pointers to the
82
 * leaf nodes of the host tree described above.
82
 * leaf nodes of the host tree described above.
83
 *
83
 *
84
 * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of 
84
 * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of 
85
 * non-normalised URLs with urldb will result in undefined behaviour and 
85
 * non-normalised URLs with urldb will result in undefined behaviour and 
86
 * potential crashes.
86
 * potential crashes.
87
 */
87
 */
88
 
88
 
89
#include 
89
#include 
90
#include 
90
#include 
91
#include 
91
#include 
92
#include 
92
#include 
93
#include 
93
#include 
94
#include 
94
#include 
95
#include 
95
#include 
96
#include 
96
#include 
97
 
97
 
98
#include 
98
#include 
99
 
99
 
100
#include "image/bitmap.h"
100
#include "image/bitmap.h"
101
#include "content/content.h"
101
#include "content/content.h"
102
#include "content/urldb.h"
102
#include "content/urldb.h"
103
#include "desktop/cookies.h"
103
#include "desktop/cookies.h"
104
#include "desktop/options.h"
104
#include "desktop/options.h"
105
#include "utils/log.h"
105
#include "utils/log.h"
106
#include "utils/corestrings.h"
106
#include "utils/corestrings.h"
107
#include "utils/filename.h"
107
#include "utils/filename.h"
108
#include "utils/url.h"
108
#include "utils/url.h"
109
#include "utils/utils.h"
109
#include "utils/utils.h"
110
 
110
 
111
struct cookie_internal_data {
111
struct cookie_internal_data {
112
	char *name;		/**< Cookie name */
112
	char *name;		/**< Cookie name */
113
	char *value;		/**< Cookie value */
113
	char *value;		/**< Cookie value */
114
	bool value_was_quoted;	/**< Value was quoted in Set-Cookie: */
114
	bool value_was_quoted;	/**< Value was quoted in Set-Cookie: */
115
	char *comment;		/**< Cookie comment */
115
	char *comment;		/**< Cookie comment */
116
	bool domain_from_set;	/**< Domain came from Set-Cookie: header */
116
	bool domain_from_set;	/**< Domain came from Set-Cookie: header */
117
	char *domain;		/**< Domain */
117
	char *domain;		/**< Domain */
118
	bool path_from_set;	/**< Path came from Set-Cookie: header */
118
	bool path_from_set;	/**< Path came from Set-Cookie: header */
119
	char *path;		/**< Path */
119
	char *path;		/**< Path */
120
	time_t expires;		/**< Expiry timestamp, or -1 for session */
120
	time_t expires;		/**< Expiry timestamp, or -1 for session */
121
	time_t last_used;	/**< Last used time */
121
	time_t last_used;	/**< Last used time */
122
	bool secure;		/**< Only send for HTTPS requests */
122
	bool secure;		/**< Only send for HTTPS requests */
123
	bool http_only;		/**< Only expose to HTTP(S) requests */
123
	bool http_only;		/**< Only expose to HTTP(S) requests */
124
	cookie_version version;	/**< Specification compliance */
124
	cookie_version version;	/**< Specification compliance */
125
	bool no_destroy;	/**< Never destroy this cookie,
125
	bool no_destroy;	/**< Never destroy this cookie,
126
				 * unless it's expired */
126
				 * unless it's expired */
127
 
127
 
128
	struct cookie_internal_data *prev;	/**< Previous in list */
128
	struct cookie_internal_data *prev;	/**< Previous in list */
129
	struct cookie_internal_data *next;	/**< Next in list */
129
	struct cookie_internal_data *next;	/**< Next in list */
130
};
130
};
131
 
131
 
132
/* A protection space is defined as a tuple canonical_root_url and realm.
132
/* A protection space is defined as a tuple canonical_root_url and realm.
133
 * This structure lives as linked list element in a leaf host_part struct
133
 * This structure lives as linked list element in a leaf host_part struct
134
 * so we need additional scheme and port to have a canonical_root_url.  */
134
 * so we need additional scheme and port to have a canonical_root_url.  */
135
struct prot_space_data {
135
struct prot_space_data {
136
	lwc_string *scheme;	/**< URL scheme of canonical hostname of this
136
	lwc_string *scheme;	/**< URL scheme of canonical hostname of this
137
				 * protection space. */
137
				 * protection space. */
138
	unsigned int port;	/**< Port number of canonical hostname of this
138
	unsigned int port;	/**< Port number of canonical hostname of this
139
				 * protection space. When 0, it means the
139
				 * protection space. When 0, it means the
140
				 * default port for given scheme, i.e. 80
140
				 * default port for given scheme, i.e. 80
141
				 * (http), 443 (https). */
141
				 * (http), 443 (https). */
142
	char *realm;		/**< Protection realm */
142
	char *realm;		/**< Protection realm */
143
 
143
 
144
	char *auth;		/**< Authentication details for this
144
	char *auth;		/**< Authentication details for this
145
				 * protection space in form
145
				 * protection space in form
146
				 * username:password */
146
				 * username:password */
147
	struct prot_space_data *next;	/**< Next sibling */
147
	struct prot_space_data *next;	/**< Next sibling */
148
};
148
};
149
 
149
 
150
struct cache_internal_data {
150
struct cache_internal_data {
151
	char filename[12];	/**< Cached filename, or first byte 0 for none */
151
	char filename[12];	/**< Cached filename, or first byte 0 for none */
152
};
152
};
153
 
153
 
154
struct url_internal_data {
154
struct url_internal_data {
155
	char *title;		/**< Resource title */
155
	char *title;		/**< Resource title */
156
	unsigned int visits;	/**< Visit count */
156
	unsigned int visits;	/**< Visit count */
157
	time_t last_visit;	/**< Last visit time */
157
	time_t last_visit;	/**< Last visit time */
158
	content_type type;	/**< Type of resource */
158
	content_type type;	/**< Type of resource */
159
};
159
};
160
 
160
 
161
struct path_data {
161
struct path_data {
162
	nsurl *url;		/**< Full URL */
162
	nsurl *url;		/**< Full URL */
163
	lwc_string *scheme;	/**< URL scheme for data */
163
	lwc_string *scheme;	/**< URL scheme for data */
164
	unsigned int port;	/**< Port number for data. When 0, it means
164
	unsigned int port;	/**< Port number for data. When 0, it means
165
				 * the default port for given scheme, i.e.
165
				 * the default port for given scheme, i.e.
166
				 * 80 (http), 443 (https). */
166
				 * 80 (http), 443 (https). */
167
	char *segment;		/**< Path segment for this node */
167
	char *segment;		/**< Path segment for this node */
168
	unsigned int frag_cnt;	/**< Number of entries in path_data::fragment */
168
	unsigned int frag_cnt;	/**< Number of entries in path_data::fragment */
169
	char **fragment;	/**< Array of fragments */
169
	char **fragment;	/**< Array of fragments */
170
	bool persistent;	/**< This entry should persist */
170
	bool persistent;	/**< This entry should persist */
171
 
171
 
172
	struct bitmap *thumb;	/**< Thumbnail image of resource */
172
	struct bitmap *thumb;	/**< Thumbnail image of resource */
173
	struct url_internal_data urld;	/**< URL data for resource */
173
	struct url_internal_data urld;	/**< URL data for resource */
174
	struct cache_internal_data cache;	/**< Cache data for resource */
174
	struct cache_internal_data cache;	/**< Cache data for resource */
175
	const struct prot_space_data *prot_space;	/**< Protection space
175
	const struct prot_space_data *prot_space;	/**< Protection space
176
				 * to which this resource belongs to. Can be
176
				 * to which this resource belongs to. Can be
177
				 * NULL when it does not belong to a protection
177
				 * NULL when it does not belong to a protection
178
				 * space or when it is not known. No
178
				 * space or when it is not known. No
179
				 * ownership (is with struct host_part::prot_space). */
179
				 * ownership (is with struct host_part::prot_space). */
180
	struct cookie_internal_data *cookies;	/**< Cookies associated with resource */
180
	struct cookie_internal_data *cookies;	/**< Cookies associated with resource */
181
	struct cookie_internal_data *cookies_end;	/**< Last cookie in list */
181
	struct cookie_internal_data *cookies_end;	/**< Last cookie in list */
182
 
182
 
183
	struct path_data *next;	/**< Next sibling */
183
	struct path_data *next;	/**< Next sibling */
184
	struct path_data *prev;	/**< Previous sibling */
184
	struct path_data *prev;	/**< Previous sibling */
185
	struct path_data *parent;	/**< Parent path segment */
185
	struct path_data *parent;	/**< Parent path segment */
186
	struct path_data *children;	/**< Child path segments */
186
	struct path_data *children;	/**< Child path segments */
187
	struct path_data *last;		/**< Last child */
187
	struct path_data *last;		/**< Last child */
188
};
188
};
189
 
189
 
190
struct host_part {
190
struct host_part {
191
	/**< Known paths on this host. This _must_ be first so that
191
	/**< Known paths on this host. This _must_ be first so that
192
	 * struct host_part *h = (struct host_part *)mypath; works */
192
	 * struct host_part *h = (struct host_part *)mypath; works */
193
	struct path_data paths;
193
	st