Subversion Repositories Kolibri OS

Rev

Rev 4364 | Go to most recent revision | Only display areas with differences | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4364 Rev 5043
1
/*
1
/*
2
 * Copyright 2006 Richard Wilson 
2
 * Copyright 2006 Richard Wilson 
3
 * Copyright 2005 James Bursa 
3
 * Copyright 2005 James Bursa 
4
 * Copyright 2005 John M Bell 
4
 * Copyright 2005 John M Bell 
5
 *
5
 *
6
 * This file is part of NetSurf, http://www.netsurf-browser.org/
6
 * This file is part of NetSurf, http://www.netsurf-browser.org/
7
 *
7
 *
8
 * NetSurf is free software; you can redistribute it and/or modify
8
 * NetSurf is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; version 2 of the License.
10
 * the Free Software Foundation; version 2 of the License.
11
 *
11
 *
12
 * NetSurf is distributed in the hope that it will be useful,
12
 * NetSurf is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
15
 * GNU General Public License for more details.
16
 *
16
 *
17
 * You should have received a copy of the GNU General Public License
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
 */
19
 */
20
 
20
 
21
/** \file
21
/** \file
22
 * URL parsing and joining (implementation).
22
 * URL parsing and joining (implementation).
23
 */
23
 */
24
 
24
 
25
#include 
25
#include 
26
#include 
26
#include 
27
#include 
27
#include 
28
#include 
28
#include 
29
#include 
29
#include 
30
#include 
30
#include 
31
#include 
31
#include 
32
 
32
 
33
#include "curl/curl.h"
33
#include "curl/curl.h"
34
#include "utils/config.h"
34
#include "utils/config.h"
35
#include "utils/log.h"
35
#include "utils/log.h"
36
#include "utils/url.h"
36
#include "utils/url.h"
37
#include "utils/utils.h"
37
#include "utils/utils.h"
-
 
38
#include "content/fetchers/http_msg.h"
-
 
39
#include "content/fetchers/http.h"
-
 
40
 
38
 
41
 
39
/*
 * Writable view of a set of URL components.  Within this file, pointers to
 * this structure and to struct url_components are cast to one another.
 */
struct url_components_internal {
	char *buffer;	/* buffer used for all the following data */
	char *scheme;		/* scheme text, without the trailing ':' */
	char *authority;	/* authority text, without the leading "//" */
	char *path;		/* path text */
	char *query;		/* query text, without the leading '?' */
	char *fragment;		/* fragment text, without the leading '#' */
};
47
 
50
 
48
 
51
 
49
/*
 * Regular expressions compiled by url_init(): url_re matches a complete URL
 * and captures its components; url_up_re matches a path segment followed by
 * a "/.." segment.
 */
regex_t url_re, url_up_re;
50
 
53
 
51
/**
54
/**
52
 * Initialise URL routines.
55
 * Initialise URL routines.
53
 *
56
 *
54
 * Compiles regular expressions required by the url_ functions.
57
 * Compiles regular expressions required by the url_ functions.
55
 */
58
 */
56
 
59
 
57
void url_init(void)
60
void url_init(void)
58
{
61
{
59
	/* regex from RFC 2396 */
62
	/* regex from RFC 2396 */
60
	regcomp_wrapper(&url_re, "^[[:space:]]*"
63
	regcomp_wrapper(&url_re, "^[[:space:]]*"
61
#define URL_RE_SCHEME 2
64
#define URL_RE_SCHEME 2
62
			"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"
65
			"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"
63
#define URL_RE_AUTHORITY 4
66
#define URL_RE_AUTHORITY 4
64
			"(//([^/?#[:space:]]*))?"
67
			"(//([^/?#[:space:]]*))?"
65
#define URL_RE_PATH 5
68
#define URL_RE_PATH 5
66
			"([^?#[:space:]]*)"
69
			"([^?#[:space:]]*)"
67
#define URL_RE_QUERY 7
70
#define URL_RE_QUERY 7
68
			"(\\?([^#[:space:]]*))?"
71
			"(\\?([^#[:space:]]*))?"
69
#define URL_RE_FRAGMENT 9
72
#define URL_RE_FRAGMENT 9
70
			"(#([^[:space:]]*))?"
73
			"(#([^[:space:]]*))?"
71
			"[[:space:]]*$", REG_EXTENDED);
74
			"[[:space:]]*$", REG_EXTENDED);
72
	regcomp_wrapper(&url_up_re,
75
	regcomp_wrapper(&url_up_re,
73
			"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"
76
			"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"
74
			"/[.][.](/|$)",
77
			"/[.][.](/|$)",
75
			REG_EXTENDED);
78
			REG_EXTENDED);
76
}
79
}
77
 
80
 
78
 
81
 
79
/**
 * Check whether a host string is an IP address.
 *
 * Recognises full dotted-quad IPv4 addresses and, unless NO_IPV6 is
 * defined, IPv6 addresses written inside square brackets (including those
 * containing embedded IPv4 addresses).
 *
 * If the string contains a '/', only the part before the first '/' is
 * examined (see the FIXME in the body).
 *
 * \param  host a hostname terminated by '\0'
 * \return true if the hostname is an IP address, false otherwise (also
 *         false if a temporary copy of the host could not be allocated)
 */
bool url_host_is_ip_address(const char *host)
{
	struct in_addr ipv4;
	size_t host_len = strlen(host);
	const char *sane_host = host;
	char *host_copy = NULL;	/* heap copy, only used when host has a '/' */
	const char *slash;
	bool is_ip = false;
#ifndef NO_IPV6
	struct in6_addr ipv6;
	char ipv6_addr[64];
	size_t ipv6_len;
#endif
	/* FIXME TODO: Some parts of urldb.c (and perhaps other parts of
	 * NetSurf) confuse hosts and "prefixes", so we can sometimes be
	 * erroneously passed more than just a host: trailing slashes, or
	 * even whole path segments.  A specific offender is
	 * urldb_iterate_partial, which takes a prefix to search for, but
	 * passes that prefix to functions that expect only hosts.
	 *
	 * For the time being, we accept such calls: if there is a / in the
	 * host parameter, we take a copy and cut it off at the /.  This is
	 * not a permanent solution; the callers that are in error should be
	 * found and fixed, after which this copy can be replaced with
	 * assert(strchr(host, '/') == NULL);
	 * -- rjek - 2010-11-04
	 */

	slash = strchr(host, '/');
	if (slash != NULL) {
		host_copy = strdup(host);
		if (host_copy == NULL)
			return false;

		/* length of the host part is exactly the offset of the '/' */
		host_len = slash - host;
		host_copy[host_len] = '\0';
		sane_host = host_copy;
		LOG(("WARNING: called with non-host '%s'", host));
	}

	/* quick reject: a character that can't appear in any address form */
	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
		goto out;

	if (inet_aton(sane_host, &ipv4) != 0) {
		/* This can only be a sane IPv4 address if it contains 3 dots.
		 * inet_aton is happy to treat "a", "a.b", "a.b.c", and
		 * "a.b.c.d" as valid IPv4 address strings, whereas we only
		 * support the full, dotted-quad, form.
		 */
		int num_dots = 0;
		size_t index;

		for (index = 0; index < host_len; index++) {
			if (sane_host[index] == '.')
				num_dots++;
		}

		is_ip = (num_dots == 3);
		goto out;
	}

#ifndef NO_IPV6
	/* an IPv6 literal must be wrapped in '[' ... ']' */
	if (host_len < 2 || sane_host[0] != '[' ||
			sane_host[host_len - 1] != ']')
		goto out;

	/* hand inet_pton only the text between the brackets */
	ipv6_len = host_len - 2;
	if (ipv6_len >= sizeof(ipv6_addr))
		goto out;
	memcpy(ipv6_addr, sane_host + 1, ipv6_len);
	ipv6_addr[ipv6_len] = '\0';

	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
		is_ip = true;
#endif

out:
	free(host_copy);
	return is_ip;
}
169
 
172
 
170
/**
173
/**
171
 * Split a URL into separate components
174
 * Split a URL into separate components
172
 *
175
 *
173
 * URLs passed to this function are assumed to be valid and no error checking
176
 * URLs passed to this function are assumed to be valid and no error checking
174
 * or recovery is attempted.
177
 * or recovery is attempted.
175
 *
178
 *
176
 * See RFC 3986 for reference.
179
 * See RFC 3986 for reference.
177
 *
180
 *
178
 * \param  url	     a valid absolute or relative URL
181
 * \param  url	     a valid absolute or relative URL
179
 * \param  result    pointer to buffer to hold components
182
 * \param  result    pointer to buffer to hold components
180
 * \return  URL_FUNC_OK on success
183
 * \return  URL_FUNC_OK on success
181
 */
184
 */
182
 
185
 
183
static url_func_result url_get_components(const char *url,
186
static url_func_result url_get_components(const char *url,
184
		struct url_components *result)
187
		struct url_components *result)
185
{
188
{
186
  	int storage_length;
189
  	int storage_length;
187
	char *storage_end;
190
	char *storage_end;
188
	const char *scheme;
191
	const char *scheme;
189
	const char *authority;
192
	const char *authority;
190
	const char *path;
193
	const char *path;
191
	const char *query;
194
	const char *query;
192
	const char *fragment;
195
	const char *fragment;
193
	struct url_components_internal *internal;
196
	struct url_components_internal *internal;
194
 
197
 
195
	assert(url);
198
	assert(url);
196
 
199
 
197
	/* clear our return value */
200
	/* clear our return value */
198
	internal = (struct url_components_internal *)result;
201
	internal = (struct url_components_internal *)result;
199
	memset(result, 0x00, sizeof(struct url_components));
202
	memset(result, 0x00, sizeof(struct url_components));
200
 
203
 
201
	/* get enough storage space for a URL with termination at each node */
204
	/* get enough storage space for a URL with termination at each node */
202
	storage_length = strlen(url) + 8;
205
	storage_length = strlen(url) + 8;
203
	internal->buffer = malloc(storage_length);
206
	internal->buffer = malloc(storage_length);
204
	if (!internal->buffer)
207
	if (!internal->buffer)
205
		return URL_FUNC_NOMEM;
208
		return URL_FUNC_NOMEM;
206
	storage_end = internal->buffer;
209
	storage_end = internal->buffer;
207
 
210
 
208
	/* look for a valid scheme */
211
	/* look for a valid scheme */
209
	scheme = url;
212
	scheme = url;
210
	if (isalpha(*scheme)) {
213
	if (isalpha(*scheme)) {
211
		for (scheme = url + 1;
214
		for (scheme = url + 1;
212
				((*scheme != ':') && (*scheme != '\0'));
215
				((*scheme != ':') && (*scheme != '\0'));
213
				scheme++) {
216
				scheme++) {
214
			if (!isalnum(*scheme) && (*scheme != '+') &&
217
			if (!isalnum(*scheme) && (*scheme != '+') &&
215
					(*scheme != '-') && (*scheme != '.'))
218
					(*scheme != '-') && (*scheme != '.'))
216
				break;
219
				break;
217
		}
220
		}
218
 
221
 
219
		if (*scheme == ':') {
222
		if (*scheme == ':') {
220
			memcpy(storage_end, url, scheme - url);
223
			memcpy(storage_end, url, scheme - url);
221
			storage_end[scheme - url] = '\0';
224
			storage_end[scheme - url] = '\0';
222
			result->scheme = storage_end;
225
			result->scheme = storage_end;
223
			storage_end += scheme - url + 1;
226
			storage_end += scheme - url + 1;
224
			scheme++;
227
			scheme++;
225
		} else {
228
		} else {
226
			scheme = url;
229
			scheme = url;
227
		}
230
		}
228
	}
231
	}
229
 
232
 
230
 
233
 
231
	/* look for an authority */
234
	/* look for an authority */
232
	authority = scheme;
235
	authority = scheme;
233
	if ((authority[0] == '/') && (authority[1] == '/')) {
236
	if ((authority[0] == '/') && (authority[1] == '/')) {
234
		authority = strpbrk(scheme + 2, "/?#");
237
		authority = strpbrk(scheme + 2, "/?#");
235
		if (!authority)
238
		if (!authority)
236
			authority = scheme + strlen(scheme);
239
			authority = scheme + strlen(scheme);
237
		memcpy(storage_end, scheme + 2, authority - scheme - 2);
240
		memcpy(storage_end, scheme + 2, authority - scheme - 2);
238
		storage_end[authority - scheme - 2] = '\0';
241
		storage_end[authority - scheme - 2] = '\0';
239
		result->authority = storage_end;
242
		result->authority = storage_end;
240
		storage_end += authority - scheme - 1;
243
		storage_end += authority - scheme - 1;
241
	}
244
	}
242
 
245
 
243
 
246
 
244
	/* look for a path */
247
	/* look for a path */
245
	path = authority;
248
	path = authority;
246
	if ((*path != '?') && (*path != '#') && (*path != '\0')) {
249
	if ((*path != '?') && (*path != '#') && (*path != '\0')) {
247
		path = strpbrk(path, "?#");
250
		path = strpbrk(path, "?#");
248
		if (!path)
251
		if (!path)
249
			path = authority + strlen(authority);
252
			path = authority + strlen(authority);
250
		memcpy(storage_end, authority, path - authority);
253
		memcpy(storage_end, authority, path - authority);
251
		storage_end[path - authority] = '\0';
254
		storage_end[path - authority] = '\0';
252
		result->path = storage_end;
255
		result->path = storage_end;
253
		storage_end += path - authority + 1;
256
		storage_end += path - authority + 1;
254
	}
257
	}
255
 
258
 
256
 
259
 
257
	/* look for a query */
260
	/* look for a query */
258
	query = path;
261
	query = path;
259
	if (*query == '?') {
262
	if (*query == '?') {
260
		query = strchr(query, '#');
263
		query = strchr(query, '#');
261
		if (!query)
264
		if (!query)
262
			query = path + strlen(path);
265
			query = path + strlen(path);
263
		memcpy(storage_end, path + 1, query - path - 1);
266
		memcpy(storage_end, path + 1, query - path - 1);
264
		storage_end[query - path - 1] = '\0';
267
		storage_end[query - path - 1] = '\0';
265
		result->query = storage_end;
268
		result->query = storage_end;
266
		storage_end += query - path;
269
		storage_end += query - path;
267
	}
270
	}
268
 
271
 
269
 
272
 
270
	/* look for a fragment */
273
	/* look for a fragment */
271
	fragment = query;
274
	fragment = query;
272
	if (*fragment == '#') {
275
	if (*fragment == '#') {
273
		fragment = query + strlen(query);
276
		fragment = query + strlen(query);
274
 
277
 
275
		/* make a copy of the result for the caller */
278
		/* make a copy of the result for the caller */
276
		memcpy(storage_end, query + 1, fragment - query - 1);
279
		memcpy(storage_end, query + 1, fragment - query - 1);
277
		storage_end[fragment - query - 1] = '\0';
280
		storage_end[fragment - query - 1] = '\0';
278
		result->fragment = storage_end;
281
		result->fragment = storage_end;
279
		storage_end += fragment - query;
282
		storage_end += fragment - query;
280
	}
283
	}
281
 
284
 
282
	assert((result->buffer + storage_length) >= storage_end);
285
	assert((result->buffer + storage_length) >= storage_end);
283
	return URL_FUNC_OK;
286
	return URL_FUNC_OK;
284
}
287
}
285
 
288
 
286
 
289
 
287
/**
290
/**
288
 * Reform a URL from separate components
291
 * Reform a URL from separate components
289
 *
292
 *
290
 * See RFC 3986 for reference.
293
 * See RFC 3986 for reference.
291
 *
294
 *
292
 * \param  components  the components to reform into a URL
295
 * \param  components  the components to reform into a URL
293
 * \return  a new URL allocated on the heap, or NULL on failure
296
 * \return  a new URL allocated on the heap, or NULL on failure
294
 */
297
 */
295
 
298
 
296
static char *url_reform_components(const struct url_components *components)
299
static char *url_reform_components(const struct url_components *components)
297
{
300
{
298
	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
301
	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
299
			fragment_len = 0;
302
			fragment_len = 0;
300
	char *result, *url;
303
	char *result, *url;
301
 
304
 
302
	/* 5.3 */
305
	/* 5.3 */
303
	if (components->scheme)
306
	if (components->scheme)
304
		scheme_len = strlen(components->scheme) + 1;
307
		scheme_len = strlen(components->scheme) + 1;
305
	if (components->authority)
308
	if (components->authority)
306
		authority_len = strlen(components->authority) + 2;
309
		authority_len = strlen(components->authority) + 2;
307
	if (components->path)
310
	if (components->path)
308
		path_len = strlen(components->path);
311
		path_len = strlen(components->path);
309
	if (components->query)
312
	if (components->query)
310
		query_len = strlen(components->query) + 1;
313
		query_len = strlen(components->query) + 1;
311
	if (components->fragment)
314
	if (components->fragment)
312
		fragment_len = strlen(components->fragment) + 1;
315
		fragment_len = strlen(components->fragment) + 1;
313
 
316
 
314
	/* claim memory */
317
	/* claim memory */
315
	url = result = malloc(scheme_len + authority_len + path_len +
318
	url = result = malloc(scheme_len + authority_len + path_len +
316
			query_len + fragment_len + 1);
319
			query_len + fragment_len + 1);
317
	if (!url) {
320
	if (!url) {
318
		LOG(("malloc failed"));
321
		LOG(("malloc failed"));
319
		return NULL;
322
		return NULL;
320
	}
323
	}
321
 
324
 
322
	/* rebuild URL */
325
	/* rebuild URL */
323
	if (components->scheme) {
326
	if (components->scheme) {
324
	  	sprintf(url, "%s:", components->scheme);
327
	  	sprintf(url, "%s:", components->scheme);
325
		url += scheme_len;
328
		url += scheme_len;
326
	}
329
	}
327
	if (components->authority) {
330
	if (components->authority) {
328
	  	sprintf(url, "//%s", components->authority);
331
	  	sprintf(url, "//%s", components->authority);
329
		url += authority_len;
332
		url += authority_len;
330
	}
333
	}
331
	if (components->path) {
334
	if (components->path) {
332
	  	sprintf(url, "%s", components->path);
335
	  	sprintf(url, "%s", components->path);
333
		url += path_len;
336
		url += path_len;
334
	}
337
	}
335
	if (components->query) {
338
	if (components->query) {
336
	  	sprintf(url, "?%s", components->query);
339
	  	sprintf(url, "?%s", components->query);
337
		url += query_len;
340
		url += query_len;
338
	}
341
	}
339
	if (components->fragment)
342
	if (components->fragment)
340
	  	sprintf(url, "#%s", components->fragment);
343
	  	sprintf(url, "#%s", components->fragment);
341
	return result;
344
	return result;
342
}
345
}
343
 
346
 
344
 
347
 
345
/**
348
/**
346
 * Release some url components from memory
349
 * Release some url components from memory
347
 *
350
 *
348
 * \param  result  pointer to buffer containing components
351
 * \param  result  pointer to buffer containing components
349
 */
352
 */
350
static void url_destroy_components(const struct url_components *components)
353
static void url_destroy_components(const struct url_components *components)
351
{
354
{
352
	const struct url_components_internal *internal;
355
	const struct url_components_internal *internal;
353
 
356
 
354
	assert(components);
357
	assert(components);
355
 
358
 
356
	internal = (const struct url_components_internal *)components;
359
	internal = (const struct url_components_internal *)components;
357
	if (internal->buffer)
360
	if (internal->buffer)
358
		free(internal->buffer);
361
		free(internal->buffer);
359
}
362
}
360
 
363
 
361
 
364
 
362
/**
365
/**
363
 * Resolve a relative URL to absolute form.
366
 * Resolve a relative URL to absolute form.
364
 *
367
 *
365
 * \param  rel	   relative URL
368
 * \param  rel	   relative URL
366
 * \param  base	   base URL, must be absolute and cleaned as by nsurl_create()
369
 * \param  base	   base URL, must be absolute and cleaned as by nsurl_create()
367
 * \param  result  pointer to pointer to buffer to hold absolute url
370
 * \param  result  pointer to pointer to buffer to hold absolute url
368
 * \return  URL_FUNC_OK on success
371
 * \return  URL_FUNC_OK on success
369
 */
372
 */
370
 
373
 
371
url_func_result url_join(const char *rel, const char *base, char **result)
374
url_func_result url_join(const char *rel, const char *base, char **result)
372
{
375
{
373
	url_func_result status = URL_FUNC_NOMEM;
376
	url_func_result status = URL_FUNC_NOMEM;
374
	struct url_components_internal base_components = {0,0,0,0,0,0};
377
	struct url_components_internal base_components = {0,0,0,0,0,0};
375
	struct url_components_internal *base_ptr = &base_components;
378
	struct url_components_internal *base_ptr = &base_components;
376
	struct url_components_internal rel_components = {0,0,0,0,0,0};
379
	struct url_components_internal rel_components = {0,0,0,0,0,0};
377
	struct url_components_internal *rel_ptr = &rel_components;
380
	struct url_components_internal *rel_ptr = &rel_components;
378
	struct url_components_internal merged_components = {0,0,0,0,0,0};
381
	struct url_components_internal merged_components = {0,0,0,0,0,0};
379
	struct url_components_internal *merged_ptr = &merged_components;
382
	struct url_components_internal *merged_ptr = &merged_components;
380
	char *merge_path = NULL, *split_point;
383
	char *merge_path = NULL, *split_point;
381
	char *input, *output, *start = NULL;
384
	char *input, *output, *start = NULL;
382
	int len, buf_len;
385
	int len, buf_len;
383
 
386
 
384
	(*result) = 0;
387
	(*result) = 0;
385
 
388
 
386
	assert(base);
389
	assert(base);
387
	assert(rel);
390
	assert(rel);
388
 
391
 
389
 
392
 
390
	/* break down the relative URL (not cached, corruptable) */
393
	/* break down the relative URL (not cached, corruptable) */
391
	status = url_get_components(rel, (struct url_components *) rel_ptr);
394
	status = url_get_components(rel, (struct url_components *) rel_ptr);
392
	if (status != URL_FUNC_OK) {
395
	if (status != URL_FUNC_OK) {
393
		LOG(("relative url '%s' failed to get components", rel));
396
		LOG(("relative url '%s' failed to get components", rel));
394
		return URL_FUNC_FAILED;
397
		return URL_FUNC_FAILED;
395
	}
398
	}
396
 
399
 
397
	/* [1] relative URL is absolute, use it entirely */
400
	/* [1] relative URL is absolute, use it entirely */
398
	merged_components = rel_components;
401
	merged_components = rel_components;
399
	if (rel_components.scheme)
402
	if (rel_components.scheme)
400
		goto url_join_reform_url;
403
		goto url_join_reform_url;
401
 
404
 
402
	/* break down the base URL (possibly cached, not corruptable) */
405
	/* break down the base URL (possibly cached, not corruptable) */
403
	status = url_get_components(base, (struct url_components *) base_ptr);
406
	status = url_get_components(base, (struct url_components *) base_ptr);
404
	if (status != URL_FUNC_OK) {
407
	if (status != URL_FUNC_OK) {
405
		url_destroy_components((struct url_components *) rel_ptr);
408
		url_destroy_components((struct url_components *) rel_ptr);
406
		LOG(("base url '%s' failed to get components", base));
409
		LOG(("base url '%s' failed to get components", base));
407
		return URL_FUNC_FAILED;
410
		return URL_FUNC_FAILED;
408
	}
411
	}
409
 
412
 
410
	/* [2] relative authority takes presidence */
413
	/* [2] relative authority takes presidence */
411
	merged_components.scheme = base_components.scheme;
414
	merged_components.scheme = base_components.scheme;
412
	if (rel_components.authority)
415
	if (rel_components.authority)
413
		goto url_join_reform_url;
416
		goto url_join_reform_url;
414
 
417
 
415
	/* [3] handle empty paths */
418
	/* [3] handle empty paths */
416
	merged_components.authority = base_components.authority;
419
	merged_components.authority = base_components.authority;
417
	if (!rel_components.path) {
420
	if (!rel_components.path) {
418
	  	merged_components.path = base_components.path;
421
	  	merged_components.path = base_components.path;
419
		if (!rel_components.query)
422
		if (!rel_components.query)
420
			merged_components.query = base_components.query;
423
			merged_components.query = base_components.query;
421
		goto url_join_reform_url;
424
		goto url_join_reform_url;
422
	}
425
	}
423
 
426
 
424
	/* [4] handle valid paths */
427
	/* [4] handle valid paths */
425
	if (rel_components.path[0] == '/')
428
	if (rel_components.path[0] == '/')
426
		merged_components.path = rel_components.path;
429
		merged_components.path = rel_components.path;
427
	else {
430
	else {
428
		/* 5.2.3 */
431
		/* 5.2.3 */
429
		if ((base_components.authority) && (!base_components.path)) {
432
		if ((base_components.authority) && (!base_components.path)) {
430
			merge_path = malloc(strlen(rel_components.path) + 2);
433
			merge_path = malloc(strlen(rel_components.path) + 2);
431
			if (!merge_path) {
434
			if (!merge_path) {
432
				LOG(("malloc failed"));
435
				LOG(("malloc failed"));
433
				goto url_join_no_mem;
436
				goto url_join_no_mem;
434
			}
437
			}
435
			sprintf(merge_path, "/%s", rel_components.path);
438
			sprintf(merge_path, "/%s", rel_components.path);
436
			merged_components.path = merge_path;
439
			merged_components.path = merge_path;
437
		} else {
440
		} else {
438
			split_point = base_components.path ?
441
			split_point = base_components.path ?
439
					strrchr(base_components.path, '/') :
442
					strrchr(base_components.path, '/') :
440
					NULL;
443
					NULL;
441
			if (!split_point) {
444
			if (!split_point) {
442
				merged_components.path = rel_components.path;
445
				merged_components.path = rel_components.path;
443
			} else {
446
			} else {
444
				len = ++split_point - base_components.path;
447
				len = ++split_point - base_components.path;
445
				buf_len = len + 1 + strlen(rel_components.path);
448
				buf_len = len + 1 + strlen(rel_components.path);
446
				merge_path = malloc(buf_len);
449
				merge_path = malloc(buf_len);
447
				if (!merge_path) {
450
				if (!merge_path) {
448
					LOG(("malloc failed"));
451
					LOG(("malloc failed"));
449
					goto url_join_no_mem;
452
					goto url_join_no_mem;
450
				}
453
				}
451
				memcpy(merge_path, base_components.path, len);
454
				memcpy(merge_path, base_components.path, len);
452
				memcpy(merge_path + len, rel_components.path,
455
				memcpy(merge_path + len, rel_components.path,
453
						strlen(rel_components.path));
456
						strlen(rel_components.path));
454
				merge_path[buf_len - 1] = '\0';
457
				merge_path[buf_len - 1] = '\0';
455
				merged_components.path = merge_path;
458
				merged_components.path = merge_path;
456
			}
459
			}
457
		}
460
		}
458
	}
461
	}
459
 
462
 
460
url_join_reform_url:
463
url_join_reform_url:
461
	/* 5.2.4 */
464
	/* 5.2.4 */
462
	input = merged_components.path;
465
	input = merged_components.path;
463
	if ((input) && (strchr(input, '.'))) {
466
	if ((input) && (strchr(input, '.'))) {
464
	  	/* [1] remove all dot references */
467
	  	/* [1] remove all dot references */
465
	  	output = start = malloc(strlen(input) + 1);
468
	  	output = start = malloc(strlen(input) + 1);
466
	  	if (!output) {
469
	  	if (!output) {
467
			LOG(("malloc failed"));
470
			LOG(("malloc failed"));
468
			goto url_join_no_mem;
471
			goto url_join_no_mem;
469
		}
472
		}
470
		merged_components.path = output;
473
		merged_components.path = output;
471
		*output = '\0';
474
		*output = '\0';
472
 
475
 
473
		while (*input != '\0') {
476
		while (*input != '\0') {
474
		  	/* [2A] */
477
		  	/* [2A] */
475
		  	if (input[0] == '.') {
478
		  	if (input[0] == '.') {
476
		  		if (input[1] == '/') {
479
		  		if (input[1] == '/') {
477
		  			input = input + 2;
480
		  			input = input + 2;
478
		  			continue;
481
		  			continue;
479
		  		} else if ((input[1] == '.') &&
482
		  		} else if ((input[1] == '.') &&
480
		  				(input[2] == '/')) {
483
		  				(input[2] == '/')) {
481
		  			input = input + 3;
484
		  			input = input + 3;
482
		  			continue;
485
		  			continue;
483
		  		}
486
		  		}
484
		  	}
487
		  	}
485
 
488
 
486
		  	/* [2B] */
489
		  	/* [2B] */
487
		  	if ((input[0] == '/') && (input[1] == '.')) {
490
		  	if ((input[0] == '/') && (input[1] == '.')) {
488
		  		if (input[2] == '/') {
491
		  		if (input[2] == '/') {
489
		  		  	input = input + 2;
492
		  		  	input = input + 2;
490
		  		  	continue;
493
		  		  	continue;
491
		  		} else if (input[2] == '\0') {
494
		  		} else if (input[2] == '\0') {
492
		  		  	input = input + 1;
495
		  		  	input = input + 1;
493
		  		  	*input = '/';
496
		  		  	*input = '/';
494
		  		  	continue;
497
		  		  	continue;
495
		  		}
498
		  		}
496
 
499
 
497
		  		/* [2C] */
500
		  		/* [2C] */
498
		  		if ((input[2] == '.') && ((input[3] == '/') ||
501
		  		if ((input[2] == '.') && ((input[3] == '/') ||
499
		  				(input[3] == '\0'))) {
502
		  				(input[3] == '\0'))) {
500
			  		if (input[3] == '/') {
503
			  		if (input[3] == '/') {
501
			  		  	input = input + 3;
504
			  		  	input = input + 3;
502
			  		} else {
505
			  		} else {
503
		  				input = input + 2;
506
		  				input = input + 2;
504
		  			  	*input = '/';
507
		  			  	*input = '/';
505
		  			}
508
		  			}
506
 
509
 
507
		  			if ((output > start) &&
510
		  			if ((output > start) &&
508
		  					(output[-1] == '/'))
511
		  					(output[-1] == '/'))
509
		  				*--output = '\0';
512
		  				*--output = '\0';
510
		  			split_point = strrchr(start, '/');
513
		  			split_point = strrchr(start, '/');
511
		  			if (!split_point)
514
		  			if (!split_point)
512
		  				output = start;
515
		  				output = start;
513
		  			else
516
		  			else
514
		  				output = split_point;
517
		  				output = split_point;
515
		  			*output = '\0';
518
		  			*output = '\0';
516
		  			continue;
519
		  			continue;
517
		  		}
520
		  		}
518
		  	}
521
		  	}
519
 
522
 
520
 
523
 
521
		  	/* [2D] */
524
		  	/* [2D] */
522
		  	if (input[0] == '.') {
525
		  	if (input[0] == '.') {
523
		  		if (input[1] == '\0') {
526
		  		if (input[1] == '\0') {
524
		  			input = input + 1;
527
		  			input = input + 1;
525
		  			continue;
528
		  			continue;
526
		  		} else if ((input[1] == '.') &&
529
		  		} else if ((input[1] == '.') &&
527
		  				(input[2] == '\0')) {
530
		  				(input[2] == '\0')) {
528
		  			input = input + 2;
531
		  			input = input + 2;
529
		  			continue;
532
		  			continue;
530
		  		}
533
		  		}
531
		  	}
534
		  	}
532
 
535
 
533
		  	/* [2E] */
536
		  	/* [2E] */
534
		  	if (*input == '/')
537
		  	if (*input == '/')
535
		  		*output++ = *input++;
538
		  		*output++ = *input++;
536
		  	while ((*input != '/') && (*input != '\0'))
539
		  	while ((*input != '/') && (*input != '\0'))
537
		  		*output++ = *input++;
540
		  		*output++ = *input++;
538
		  	*output = '\0';
541
		  	*output = '\0';
539
                }
542
                }
540
                /* [3] */
543
                /* [3] */
541
      		merged_components.path = start;
544
      		merged_components.path = start;
542
	}
545
	}
543
 
546
 
544
	/* 5.3 */
547
	/* 5.3 */
545
	*result = url_reform_components((struct url_components *) merged_ptr);
548
	*result = url_reform_components((struct url_components *) merged_ptr);
546
  	if (!(*result))
549
  	if (!(*result))
547
		goto url_join_no_mem;
550
		goto url_join_no_mem;
548
 
551
 
549
	/* return success */
552
	/* return success */
550
	status = URL_FUNC_OK;
553
	status = URL_FUNC_OK;
551
 
554
 
552
url_join_no_mem:
555
url_join_no_mem:
553
	free(start);
556
	free(start);
554
	free(merge_path);
557
	free(merge_path);
555
	url_destroy_components((struct url_components *) base_ptr);
558
	url_destroy_components((struct url_components *) base_ptr);
556
	url_destroy_components((struct url_components *) rel_ptr);
559
	url_destroy_components((struct url_components *) rel_ptr);
557
	return status;
560
	return status;
558
}
561
}
559
 
562
 
560
 
563
 
561
/**
564
/**
562
 * Return the host name from an URL.
565
 * Return the host name from an URL.
563
 *
566
 *
564
 * \param  url	   an absolute URL
567
 * \param  url	   an absolute URL
565
 * \param  result  pointer to pointer to buffer to hold host name
568
 * \param  result  pointer to pointer to buffer to hold host name
566
 * \return  URL_FUNC_OK on success
569
 * \return  URL_FUNC_OK on success
567
 */
570
 */
568
 
571
 
569
url_func_result url_host(const char *url, char **result)
572
url_func_result url_host(const char *url, char **result)
570
{
573
{
571
	url_func_result status;
574
	url_func_result status;
572
	struct url_components components;
575
	struct url_components components;
573
	const char *host_start, *host_end;
576
	const char *host_start, *host_end;
574
 
577
 
575
	assert(url);
578
	assert(url);
576
 
579
 
577
	status = url_get_components(url, &components);
580
	status = url_get_components(url, &components);
578
	if (status == URL_FUNC_OK) {
581
	if (status == URL_FUNC_OK) {
579
		if (!components.authority) {
582
		if (!components.authority) {
580
			url_destroy_components(&components);
583
			url_destroy_components(&components);
581
			return URL_FUNC_FAILED;
584
			return URL_FUNC_FAILED;
582
		}
585
		}
583
		host_start = strchr(components.authority, '@');
586
		host_start = strchr(components.authority, '@');
584
		host_start = host_start ? host_start + 1 : components.authority;
587
		host_start = host_start ? host_start + 1 : components.authority;
585
 
588
 
586
		/* skip over an IPv6 address if there is one */
589
		/* skip over an IPv6 address if there is one */
587
		if (host_start[0] == '[') {
590
		if (host_start[0] == '[') {
588
			host_end = strchr(host_start, ']') + 1;
591
			host_end = strchr(host_start, ']') + 1;
589
		} else {
592
		} else {
590
			host_end = strchr(host_start, ':');
593
			host_end = strchr(host_start, ':');
591
		}
594
		}
592
 
595
 
593
		if (!host_end)
596
		if (!host_end)
594
			host_end = components.authority +
597
			host_end = components.authority +
595
					strlen(components.authority);
598
					strlen(components.authority);
596
 
599
 
597
		*result = malloc(host_end - host_start + 1);
600
		*result = malloc(host_end - host_start + 1);
598
		if (!(*result)) {
601
		if (!(*result)) {
599
			url_destroy_components(&components);
602
			url_destroy_components(&components);
600
			return URL_FUNC_FAILED;
603
			return URL_FUNC_FAILED;
601
		}
604
		}
602
		memcpy((*result), host_start, host_end - host_start);
605
		memcpy((*result), host_start, host_end - host_start);
603
		(*result)[host_end - host_start] = '\0';
606
		(*result)[host_end - host_start] = '\0';
604
	}
607
	}
605
	url_destroy_components(&components);
608
	url_destroy_components(&components);
606
	return status;
609
	return status;
607
}
610
}
608
 
611
 
609
 
612
 
610
/**
613
/**
611
 * Return the scheme name from an URL.
614
 * Return the scheme name from an URL.
612
 *
615
 *
613
 * See RFC 3986, 3.1 for reference.
616
 * See RFC 3986, 3.1 for reference.
614
 *
617
 *
615
 * \param  url	   an absolute URL
618
 * \param  url	   an absolute URL
616
 * \param  result  pointer to pointer to buffer to hold scheme name
619
 * \param  result  pointer to pointer to buffer to hold scheme name
617
 * \return  URL_FUNC_OK on success
620
 * \return  URL_FUNC_OK on success
618
 */
621
 */
619
 
622
 
620
url_func_result url_scheme(const char *url, char **result)
623
url_func_result url_scheme(const char *url, char **result)
621
{
624
{
622
	url_func_result status;
625
	url_func_result status;
623
	struct url_components components;
626
	struct url_components components;
624
 
627
 
625
	assert(url);
628
	assert(url);
626
 
629
 
627
	status = url_get_components(url, &components);
630
	status = url_get_components(url, &components);
628
	if (status == URL_FUNC_OK) {
631
	if (status == URL_FUNC_OK) {
629
		if (!components.scheme) {
632
		if (!components.scheme) {
630
			status = URL_FUNC_FAILED;
633
			status = URL_FUNC_FAILED;
631
		} else {
634
		} else {
632
			*result = strdup(components.scheme);
635
			*result = strdup(components.scheme);
633
			if (!(*result))
636
			if (!(*result))
634
				status = URL_FUNC_NOMEM;
637
				status = URL_FUNC_NOMEM;
635
		}
638
		}
636
	}
639
	}
637
	url_destroy_components(&components);
640
	url_destroy_components(&components);
638
	return status;
641
	return status;
639
}
642
}
640
 
643
 
641
 
644
 
642
/**
645
/**
643
 * Extract path segment from an URL
646
 * Extract path segment from an URL
644
 *
647
 *
645
 * \param url	  an absolute URL
648
 * \param url	  an absolute URL
646
 * \param result  pointer to pointer to buffer to hold result
649
 * \param result  pointer to pointer to buffer to hold result
647
 * \return URL_FUNC_OK on success
650
 * \return URL_FUNC_OK on success
648
 */
651
 */
649
 
652
 
650
url_func_result url_path(const char *url, char **result)
653
url_func_result url_path(const char *url, char **result)
651
{
654
{
652
	url_func_result status;
655
	url_func_result status;
653
	struct url_components components;
656
	struct url_components components;
654
 
657
 
655
	assert(url);
658
	assert(url);
656
 
659
 
657
	status = url_get_components(url, &components);
660
	status = url_get_components(url, &components);
658
	if (status == URL_FUNC_OK) {
661
	if (status == URL_FUNC_OK) {
659
		if (!components.path) {
662
		if (!components.path) {
660
			status = URL_FUNC_FAILED;
663
			status = URL_FUNC_FAILED;
661
		} else {
664
		} else {
662
			*result = strdup(components.path);
665
			*result = strdup(components.path);
663
			if (!(*result))
666
			if (!(*result))
664
				status = URL_FUNC_NOMEM;
667
				status = URL_FUNC_NOMEM;
665
		}
668
		}
666
	}
669
	}
667
	url_destroy_components(&components);
670
	url_destroy_components(&components);
668
	return status;
671
	return status;
669
}
672
}
670
 
673
 
671
/**
674
/**
672
 * Attempt to find a nice filename for a URL.
675
 * Attempt to find a nice filename for a URL.
673
 *
676
 *
674
 * \param  url	   an absolute URL
677
 * \param  url	   an absolute URL
675
 * \param  result  pointer to pointer to buffer to hold filename
678
 * \param  result  pointer to pointer to buffer to hold filename
676
 * \param  remove_extensions  remove any extensions from the filename
679
 * \param  remove_extensions  remove any extensions from the filename
677
 * \return  URL_FUNC_OK on success
680
 * \return  URL_FUNC_OK on success
678
 */
681
 */
679
 
682
 
680
url_func_result url_nice(const char *url, char **result,
683
url_func_result url_nice(const char *url, char **result,
681
		bool remove_extensions)
684
		bool remove_extensions)
682
{
685
{
683
	int m;
686
	int m;
684
	regmatch_t match[10];
687
	regmatch_t match[10];
685
	regoff_t start, end;
688
	regoff_t start, end;
686
	size_t i;
689
	size_t i;
687
	char *dot;
690
	char *dot;
688
 
691
 
689
	*result = 0;
692
	*result = 0;
690
 
693
 
691
	m = regexec(&url_re, url, 10, match, 0);
694
	m = regexec(&url_re, url, 10, match, 0);
692
	if (m) {
695
	if (m) {
693
		LOG(("url '%s' failed to match regex", url));
696
		LOG(("url '%s' failed to match regex", url));
694
		return URL_FUNC_FAILED;
697
		return URL_FUNC_FAILED;
695
	}
698
	}
696
 
699
 
697
	/* extract the last component of the path, if possible */
700
	/* extract the last component of the path, if possible */
698
	if (match[URL_RE_PATH].rm_so == -1 || match[URL_RE_PATH].rm_so ==
701
	if (match[URL_RE_PATH].rm_so == -1 || match[URL_RE_PATH].rm_so ==
699
			match[URL_RE_PATH].rm_eo)
702
			match[URL_RE_PATH].rm_eo)
700
		goto no_path;  /* no path, or empty */
703
		goto no_path;  /* no path, or empty */
701
	for (end = match[URL_RE_PATH].rm_eo - 1;
704
	for (end = match[URL_RE_PATH].rm_eo - 1;
702
			end != match[URL_RE_PATH].rm_so && url[end] == '/';
705
			end != match[URL_RE_PATH].rm_so && url[end] == '/';
703
			end--)
706
			end--)
704
		;
707
		;
705
	if (end == match[URL_RE_PATH].rm_so)
708
	if (end == match[URL_RE_PATH].rm_so)
706
		goto no_path;  /* path is a string of '/' */
709
		goto no_path;  /* path is a string of '/' */
707
	end++;
710
	end++;
708
	for (start = end - 1;
711
	for (start = end - 1;
709
			start != match[URL_RE_PATH].rm_so && url[start] != '/';
712
			start != match[URL_RE_PATH].rm_so && url[start] != '/';
710
			start--)
713
			start--)
711
		;
714
		;
712
	if (url[start] == '/')
715
	if (url[start] == '/')
713
		start++;
716
		start++;
714
 
717
 
715
	if (!strncasecmp(url + start, "index.", 6) ||
718
	if (!strncasecmp(url + start, "index.", 6) ||
716
			!strncasecmp(url + start, "default.", 8)) {
719
			!strncasecmp(url + start, "default.", 8)) {
717
		/* try again */
720
		/* try again */
718
		if (start == match[URL_RE_PATH].rm_so)
721
		if (start == match[URL_RE_PATH].rm_so)
719
			goto no_path;
722
			goto no_path;
720
		for (end = start - 1;
723
		for (end = start - 1;
721
				end != match[URL_RE_PATH].rm_so &&
724
				end != match[URL_RE_PATH].rm_so &&
722
				url[end] == '/';
725
				url[end] == '/';
723
				end--)
726
				end--)
724
			;
727
			;
725
		if (end == match[URL_RE_PATH].rm_so)
728
		if (end == match[URL_RE_PATH].rm_so)
726
			goto no_path;
729
			goto no_path;
727
		end++;
730
		end++;
728
		for (start = end - 1;
731
		for (start = end - 1;
729
				start != match[URL_RE_PATH].rm_so &&
732
				start != match[URL_RE_PATH].rm_so &&
730
				url[start] != '/';
733
				url[start] != '/';
731
				start--)
734
				start--)
732
		;
735
		;
733
		if (url[start] == '/')
736
		if (url[start] == '/')
734
			start++;
737
			start++;
735
	}
738
	}
736
 
739
 
737
	*result = malloc(end - start + 1);
740
	*result = malloc(end - start + 1);
738
	if (!*result) {
741
	if (!*result) {
739
		LOG(("malloc failed"));
742
		LOG(("malloc failed"));
740
		return URL_FUNC_NOMEM;
743
		return URL_FUNC_NOMEM;
741
	}
744
	}
742
	strncpy(*result, url + start, end - start);
745
	strncpy(*result, url + start, end - start);
743
	(*result)[end - start] = 0;
746
	(*result)[end - start] = 0;
744
 
747
 
745
	if (remove_extensions) {
748
	if (remove_extensions) {
746
		dot = strchr(*result, '.');
749
		dot = strchr(*result, '.');
747
		if (dot && dot != *result)
750
		if (dot && dot != *result)
748
			*dot = 0;
751
			*dot = 0;
749
	}
752
	}
750
 
753
 
751
	return URL_FUNC_OK;
754
	return URL_FUNC_OK;
752
 
755
 
753
no_path:
756
no_path:
754
 
757
 
755
	/* otherwise, use the host name, with '.' replaced by '_' */
758
	/* otherwise, use the host name, with '.' replaced by '_' */
756
	if (match[URL_RE_AUTHORITY].rm_so != -1 &&
759
	if (match[URL_RE_AUTHORITY].rm_so != -1 &&
757
			match[URL_RE_AUTHORITY].rm_so !=
760
			match[URL_RE_AUTHORITY].rm_so !=
758
			match[URL_RE_AUTHORITY].rm_eo) {
761
			match[URL_RE_AUTHORITY].rm_eo) {
759
		*result = malloc(match[URL_RE_AUTHORITY].rm_eo -
762
		*result = malloc(match[URL_RE_AUTHORITY].rm_eo -
760
				match[URL_RE_AUTHORITY].rm_so + 1);
763
				match[URL_RE_AUTHORITY].rm_so + 1);
761
		if (!*result) {
764
		if (!*result) {
762
			LOG(("malloc failed"));
765
			LOG(("malloc failed"));
763
			return URL_FUNC_NOMEM;
766
			return URL_FUNC_NOMEM;
764
		}
767
		}
765
		strncpy(*result, url + match[URL_RE_AUTHORITY].rm_so,
768
		strncpy(*result, url + match[URL_RE_AUTHORITY].rm_so,
766
				match[URL_RE_AUTHORITY].rm_eo -
769
				match[URL_RE_AUTHORITY].rm_eo -
767
				match[URL_RE_AUTHORITY].rm_so);
770
				match[URL_RE_AUTHORITY].rm_so);
768
		(*result)[match[URL_RE_AUTHORITY].rm_eo -
771
		(*result)[match[URL_RE_AUTHORITY].rm_eo -
769
				match[URL_RE_AUTHORITY].rm_so] = 0;
772
				match[URL_RE_AUTHORITY].rm_so] = 0;
770
 
773
 
771
		for (i = 0; (*result)[i]; i++)
774
		for (i = 0; (*result)[i]; i++)
772
			if ((*result)[i] == '.')
775
			if ((*result)[i] == '.')
773
				(*result)[i] = '_';
776
				(*result)[i] = '_';
774
 
777
 
775
		return URL_FUNC_OK;
778
		return URL_FUNC_OK;
776
	}
779
	}
777
 
780
 
778
	return URL_FUNC_FAILED;
781
	return URL_FUNC_FAILED;
779
}
782
}
780
 
783
 
781
/**
784
/**
782
 * Convert an escaped string to plain.
785
 * Convert an escaped string to plain.
783
 * \param result unescaped string owned by caller must be freed with free()
786
 * \param result unescaped string owned by caller must be freed with free()
784
 * \return  URL_FUNC_OK on success
787
 * \return  URL_FUNC_OK on success
785
 */
788
 */
786
url_func_result url_unescape(const char *str, char **result)
789
url_func_result url_unescape(const char *str, char **result)
787
{
790
{
788
	char *curlstr;
791
	char *curlstr;
789
	char *retstr;
792
	char *retstr;
-
 
793
	/* curlstr = curl_unescape(str, 0); */
-
 
794
	LOG(("Address of str is : %x\n", str));
-
 
795
	LOG(("url is %s\n", str));
-
 
796
	
-
 
797
	LOG(("Calling http_unescape_url in url.c\n"));
-
 
798
	curlstr =  http_unescape_url(str);
-
 
799
	LOG(("http_unescape_url returned.\n"));
-
 
800
	__menuet__debug_out("http_unescape_url returned\n");
790
 
-
 
791
	curlstr = curl_unescape(str, 0);
801
 
792
	if (curlstr == NULL) {
802
	if (curlstr == NULL) {
793
		return URL_FUNC_NOMEM;
803
		return URL_FUNC_NOMEM;
794
	}
804
	}
-
 
805
	__menuet__debug_out("Calling strdup in url.c with : ");	
-
 
806
	__menuet__debug_out(curlstr);
-
 
807
	__menuet__debug_out("\n");
795
 
808
 
-
 
809
	retstr = strdup(curlstr);
796
	retstr = strdup(curlstr);
810
	/* free(curlstr); */ /* Doesn't work because mem not allocated with malloc/calloc/realloc*/
-
 
811
	/* TODO: Use mem_free here*/
-
 
812
 
797
	curl_free(curlstr);
813
	__menuet__debug_out("After strdup in url.c\n");	
-
 
814
 
798
 
815
	if (retstr == NULL) {
799
	if (retstr == NULL) {
816
	  __menuet__debug_out("retstr is NULL in url.c\n");	
800
		return URL_FUNC_NOMEM;
817
		return URL_FUNC_NOMEM;
801
	}
818
	}
802
 
819
 
803
	*result = retstr;
820
	*result = retstr;
-
 
821
	__menuet__debug_out("returning from url_unescape in url.c\n");
804
	return URL_FUNC_OK;
822
	return URL_FUNC_OK;
-
 
823
 
805
}
824
}
806
 
825
 
807
/**
826
/**
808
 * Escape a string suitable for inclusion in an URL.
827
 * Escape a string suitable for inclusion in an URL.
809
 *
828
 *
810
 * \param  unescaped      the unescaped string
829
 * \param  unescaped      the unescaped string
811
 * \param  toskip         number of bytes to skip in unescaped string
830
 * \param  toskip         number of bytes to skip in unescaped string
812
 * \param  sptoplus       true iff spaces should be converted to +
831
 * \param  sptoplus       true iff spaces should be converted to +
813
 * \param  escexceptions  NULL or a string of characters excluded to be escaped
832
 * \param  escexceptions  NULL or a string of characters excluded to be escaped
814
 * \param  result         pointer to pointer to buffer to hold escaped string
833
 * \param  result         pointer to pointer to buffer to hold escaped string
815
 * \return  URL_FUNC_OK on success
834
 * \return  URL_FUNC_OK on success
816
 */
835
 */
817
 
836
 
818
url_func_result url_escape(const char *unescaped, size_t toskip,
837
url_func_result url_escape(const char *unescaped, size_t toskip,
819
		bool sptoplus, const char *escexceptions, char **result)
838
		bool sptoplus, const char *escexceptions, char **result)
820
{
839
{
821
	size_t len;
840
	size_t len;
822
	char *escaped, *d, *tmpres;
841
	char *escaped, *d, *tmpres;
823
	const char *c;
842
	const char *c;
824
 
843
 
825
	if (!unescaped || !result)
844
	if (!unescaped || !result)
826
		return URL_FUNC_FAILED;
845
		return URL_FUNC_FAILED;
827
 
846
 
828
	*result = NULL;
847
	*result = NULL;
829
 
848
 
830
	len = strlen(unescaped);
849
	len = strlen(unescaped);
831
	if (len < toskip)
850
	if (len < toskip)
832
		return URL_FUNC_FAILED;
851
		return URL_FUNC_FAILED;
833
	len -= toskip;
852
	len -= toskip;
834
 
853
 
835
	escaped = malloc(len * 3 + 1);
854
	escaped = malloc(len * 3 + 1);
836
	if (!escaped)
855
	if (!escaped)
837
		return URL_FUNC_NOMEM;
856
		return URL_FUNC_NOMEM;
838
 
857
 
839
	for (c = unescaped + toskip, d = escaped; *c; c++) {
858
	for (c = unescaped + toskip, d = escaped; *c; c++) {
840
		/* Check if we should escape this byte.
859
		/* Check if we should escape this byte.
841
		 * '~' is unreserved and should not be percent encoded, if
860
		 * '~' is unreserved and should not be percent encoded, if
842
		 * you believe the spec; however, leaving it unescaped
861
		 * you believe the spec; however, leaving it unescaped
843
		 * breaks a bunch of websites, so we escape it anyway. */
862
		 * breaks a bunch of websites, so we escape it anyway. */
844
		if (!isascii(*c)
863
		if (!isascii(*c)
845
			|| (strchr(":/?#[]@" /* gen-delims */
864
			|| (strchr(":/?#[]@" /* gen-delims */
846
				  "!$&'()*+,;=" /* sub-delims */
865
				  "!$&'()*+,;=" /* sub-delims */
847
				  "<>%\"{}|\\^`~" /* others */,	*c)
866
				  "<>%\"{}|\\^`~" /* others */,	*c)
848
				&& (!escexceptions || !strchr(escexceptions, *c)))
867
				&& (!escexceptions || !strchr(escexceptions, *c)))
849
			|| *c <= 0x20 || *c == 0x7f) {
868
			|| *c <= 0x20 || *c == 0x7f) {
850
			if (*c == 0x20 && sptoplus) {
869
			if (*c == 0x20 && sptoplus) {
851
				*d++ = '+';
870
				*d++ = '+';
852
			} else {
871
			} else {
853
				*d++ = '%';
872
				*d++ = '%';
854
				*d++ = "0123456789ABCDEF"[((*c >> 4) & 0xf)];
873
				*d++ = "0123456789ABCDEF"[((*c >> 4) & 0xf)];
855
				*d++ = "0123456789ABCDEF"[(*c & 0xf)];
874
				*d++ = "0123456789ABCDEF"[(*c & 0xf)];
856
			}
875
			}
857
		} else {
876
		} else {
858
			/* unreserved characters: [a-zA-Z0-9-._] */
877
			/* unreserved characters: [a-zA-Z0-9-._] */
859
			*d++ = *c;
878
			*d++ = *c;
860
		}
879
		}
861
	}
880
	}
862
	*d++ = '\0';
881
	*d++ = '\0';
863
 
882
 
864
	tmpres = malloc(d - escaped + toskip);
883
	tmpres = malloc(d - escaped + toskip);
865
	if (!tmpres) {
884
	if (!tmpres) {
866
		free(escaped);
885
		free(escaped);
867
		return URL_FUNC_NOMEM;
886
		return URL_FUNC_NOMEM;
868
	}
887
	}
869
 
888
 
870
	memcpy(tmpres, unescaped, toskip); 
889
	memcpy(tmpres, unescaped, toskip); 
871
	memcpy(tmpres + toskip, escaped, d - escaped);
890
	memcpy(tmpres + toskip, escaped, d - escaped);
872
	*result = tmpres;
891
	*result = tmpres;
873
 
892
 
874
	free(escaped);
893
	free(escaped);
875
 
894
 
876
	return URL_FUNC_OK;
895
	return URL_FUNC_OK;
877
}
896
}
878
 
897
 
879
 
898
 
880
#ifdef TEST
899
#ifdef TEST
881
 
900
 
882
int main(int argc, char *argv[])
901
int main(int argc, char *argv[])
883
{
902
{
884
	int i;
903
	int i;
885
	url_func_result res;
904
	url_func_result res;
886
	char *s;
905
	char *s;
887
	url_init();
906
	url_init();
888
	for (i = 1; i != argc; i++) {
907
	for (i = 1; i != argc; i++) {
889
/*		printf("==> '%s'\n", argv[i]);
908
/*		printf("==> '%s'\n", argv[i]);
890
		res = url_normalize(argv[i], &s);
909
		res = url_normalize(argv[i], &s);
891
		if (res == URL_FUNC_OK) {
910
		if (res == URL_FUNC_OK) {
892
			printf("<== '%s'\n", s);
911
			printf("<== '%s'\n", s);
893
			free(s);
912
			free(s);
894
		}*/
913
		}*/
895
/*		printf("==> '%s'\n", argv[i]);
914
/*		printf("==> '%s'\n", argv[i]);
896
		res = url_host(argv[i], &s);
915
		res = url_host(argv[i], &s);
897
		if (res == URL_FUNC_OK) {
916
		if (res == URL_FUNC_OK) {
898
			printf("<== '%s'\n", s);
917
			printf("<== '%s'\n", s);
899
			free(s);
918
			free(s);
900
		}*/
919
		}*/
901
		if (1 != i) {
920
		if (1 != i) {
902
			res = url_join(argv[i], argv[1], &s);
921
			res = url_join(argv[i], argv[1], &s);
903
			if (res == URL_FUNC_OK) {
922
			if (res == URL_FUNC_OK) {
904
				printf("'%s' + '%s' \t= '%s'\n", argv[1],
923
				printf("'%s' + '%s' \t= '%s'\n", argv[1],
905
						argv[i], s);
924
						argv[i], s);
906
				free(s);
925
				free(s);
907
			}
926
			}
908
		}
927
		}
909
/*		printf("'%s' => ", argv[i]);
928
/*		printf("'%s' => ", argv[i]);
910
		res = url_nice(argv[i], &s, true);
929
		res = url_nice(argv[i], &s, true);
911
		if (res == URL_FUNC_OK) {
930
		if (res == URL_FUNC_OK) {
912
			printf("'%s', ", s);
931
			printf("'%s', ", s);
913
			free(s);
932
			free(s);
914
		} else {
933
		} else {
915
			printf("failed %u, ", res);
934
			printf("failed %u, ", res);
916
		}
935
		}
917
		res = url_nice(argv[i], &s, false);
936
		res = url_nice(argv[i], &s, false);
918
		if (res == URL_FUNC_OK) {
937
		if (res == URL_FUNC_OK) {
919
			printf("'%s', ", s);
938
			printf("'%s', ", s);
920
			free(s);
939
			free(s);
921
		} else {
940
		} else {
922
			printf("failed %u, ", res);
941
			printf("failed %u, ", res);
923
		}
942
		}
924
		printf("\n");*/
943
		printf("\n");*/
925
	}
944
	}
926
	return 0;
945
	return 0;
927
}
946
}
928
 
947
 
929
/**
 * Compile a regular expression, terminating the program on failure.
 *
 * \param  preg    regex object to fill in
 * \param  regex   regular expression source
 * \param  cflags  flags passed through to regcomp()
 *
 * If regcomp() fails, the expression and the regerror() message are
 * written to stderr and the process exits with status 1.
 */
void regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
{
	char message[200];
	int err = regcomp(preg, regex, cflags);

	if (err == 0)
		return;

	regerror(err, preg, message, sizeof message);
	fprintf(stderr, "Failed to compile regexp '%s'\n", regex);
	fprintf(stderr, "error: %s\n", message);
	exit(1);
}
941
 
960
 
942
#endif
961
#endif