Subversion Repositories Kolibri OS

Rev

Rev 4364 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
3584 sourcerer 1
/*
2
 * Copyright 2006 Richard Wilson 
3
 * Copyright 2005 James Bursa 
4
 * Copyright 2005 John M Bell 
5
 *
6
 * This file is part of NetSurf, http://www.netsurf-browser.org/
7
 *
8
 * NetSurf is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; version 2 of the License.
11
 *
12
 * NetSurf is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU General Public License
18
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
 
21
/** \file
22
 * URL parsing and joining (implementation).
23
 */
24
 
25
#include 
26
#include 
27
#include 
28
#include 
29
#include 
30
#include 
31
#include 
32
 
33
#include "curl/curl.h"
34
#include "utils/config.h"
35
#include "utils/log.h"
36
#include "utils/url.h"
37
#include "utils/utils.h"
5043 ashmew2 38
#include "content/fetchers/http_msg.h"
39
#include "content/fetchers/http.h"
3584 sourcerer 40
 
5043 ashmew2 41
 
3584 sourcerer 42
/**
 * Internal view of a set of URL components.
 *
 * Code in this file casts between this structure and struct url_components,
 * so the member order must not be changed.
 */
struct url_components_internal {
	char *buffer;		/* single allocation holding all of the strings below */
	char *scheme;		/* NULL when the URL has no scheme */
	char *authority;	/* NULL when the URL has no authority */
	char *path;		/* NULL when the URL has no path */
	char *query;		/* NULL when the URL has no query */
	char *fragment;		/* NULL when the URL has no fragment */
};
50
 
51
 
52
/* Compiled regular expressions used by the url_ functions; both are
 * compiled by url_init(). */
regex_t url_re, url_up_re;
53
 
54
/**
55
 * Initialise URL routines.
56
 *
57
 * Compiles regular expressions required by the url_ functions.
58
 */
59
 
60
void url_init(void)
61
{
62
	/* regex from RFC 2396 */
63
	regcomp_wrapper(&url_re, "^[[:space:]]*"
64
#define URL_RE_SCHEME 2
65
			"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"
66
#define URL_RE_AUTHORITY 4
67
			"(//([^/?#[:space:]]*))?"
68
#define URL_RE_PATH 5
69
			"([^?#[:space:]]*)"
70
#define URL_RE_QUERY 7
71
			"(\\?([^#[:space:]]*))?"
72
#define URL_RE_FRAGMENT 9
73
			"(#([^[:space:]]*))?"
74
			"[[:space:]]*$", REG_EXTENDED);
75
	regcomp_wrapper(&url_up_re,
76
			"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"
77
			"/[.][.](/|$)",
78
			REG_EXTENDED);
79
}
80
 
81
 
82
/**
 * Check whether a host string is an IP address.  It should support and
 * detect IPv4 addresses (dotted-quad form) and IPv6 addresses written in
 * square brackets (including those containing embedded IPv4 addresses.)
 *
 * \param  host a hostname terminated by '\0'
 * \return true if the hostname is an IP address, false otherwise
 *         (including when memory for a temporary copy cannot be obtained)
 */
bool url_host_is_ip_address(const char *host)
{
	struct in_addr ipv4;
	size_t host_len = strlen(host);
	const char *sane_host = host;
	char *host_copy = NULL;
	const char *slash;
	bool is_ip = false;
#ifndef NO_IPV6
	struct in6_addr ipv6;
	char ipv6_addr[64];
#endif
	/* FIXME TODO: Some parts of urldb.c (and perhaps other parts of
	 * NetSurf) confuse hosts and "prefixes", so we can sometimes be
	 * erroneously passed more than just a host: trailing slashes, or
	 * even whole path segments.  A specific criminal in this class is
	 * urldb_iterate_partial, which takes a prefix to search for, but
	 * passes that prefix to functions that expect only hosts.
	 *
	 * For the time being, we accept such calls; if there is a / in the
	 * host parameter we take a copy truncated at the /.  This is not a
	 * permanent solution; the callers that are in error should be found
	 * and fixed, after which this can be replaced with
	 * assert(strchr(host, '/') == NULL);
	 * -- rjek - 2010-11-04
	 */

	slash = strchr(host, '/');
	if (slash != NULL) {
		host_copy = strdup(host);
		if (host_copy == NULL)
			return false;
		/* the host proper is everything before the first slash */
		host_len = (size_t)(slash - host);
		host_copy[host_len] = '\0';
		sane_host = host_copy;
		LOG(("WARNING: called with non-host '%s'", host));
	}

	/* cheap rejection: anything outside this set cannot be an address */
	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
		goto out;

	if (inet_aton(sane_host, &ipv4) != 0) {
		/* This can only be a sane IPv4 address if it contains 3 dots.
		 * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
		 * and "a.b.c.d" as valid IPv4 address strings where we only
		 * support the full, dotted-quad, form.
		 */
		int num_dots = 0;
		size_t index;

		for (index = 0; index < host_len; index++) {
			if (sane_host[index] == '.')
				num_dots++;
		}

		is_ip = (num_dots == 3);
		goto out;
	}

#ifndef NO_IPV6
	/* an IPv6 literal must be wrapped in brackets: "[...]" */
	if (host_len < 2 || sane_host[0] != '[' ||
			sane_host[host_len - 1] != ']')
		goto out;

	/* inet_pton wants the address without the surrounding brackets */
	if (host_len - 2 >= sizeof(ipv6_addr))
		goto out;
	memcpy(ipv6_addr, sane_host + 1, host_len - 2);
	ipv6_addr[host_len - 2] = '\0';

	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
		is_ip = true;
#endif

out:
	free(host_copy);
	return is_ip;
}
172
 
173
/**
174
 * Split a URL into separate components
175
 *
176
 * URLs passed to this function are assumed to be valid and no error checking
177
 * or recovery is attempted.
178
 *
179
 * See RFC 3986 for reference.
180
 *
181
 * \param  url	     a valid absolute or relative URL
182
 * \param  result    pointer to buffer to hold components
183
 * \return  URL_FUNC_OK on success
184
 */
185
 
186
static url_func_result url_get_components(const char *url,
187
		struct url_components *result)
188
{
189
  	int storage_length;
190
	char *storage_end;
191
	const char *scheme;
192
	const char *authority;
193
	const char *path;
194
	const char *query;
195
	const char *fragment;
196
	struct url_components_internal *internal;
197
 
198
	assert(url);
199
 
200
	/* clear our return value */
201
	internal = (struct url_components_internal *)result;
202
	memset(result, 0x00, sizeof(struct url_components));
203
 
204
	/* get enough storage space for a URL with termination at each node */
205
	storage_length = strlen(url) + 8;
206
	internal->buffer = malloc(storage_length);
207
	if (!internal->buffer)
208
		return URL_FUNC_NOMEM;
209
	storage_end = internal->buffer;
210
 
211
	/* look for a valid scheme */
212
	scheme = url;
213
	if (isalpha(*scheme)) {
214
		for (scheme = url + 1;
215
				((*scheme != ':') && (*scheme != '\0'));
216
				scheme++) {
217
			if (!isalnum(*scheme) && (*scheme != '+') &&
218
					(*scheme != '-') && (*scheme != '.'))
219
				break;
220
		}
221
 
222
		if (*scheme == ':') {
223
			memcpy(storage_end, url, scheme - url);
224
			storage_end[scheme - url] = '\0';
225
			result->scheme = storage_end;
226
			storage_end += scheme - url + 1;
227
			scheme++;
228
		} else {
229
			scheme = url;
230
		}
231
	}
232
 
233
 
234
	/* look for an authority */
235
	authority = scheme;
236
	if ((authority[0] == '/') && (authority[1] == '/')) {
237
		authority = strpbrk(scheme + 2, "/?#");
238
		if (!authority)
239
			authority = scheme + strlen(scheme);
240
		memcpy(storage_end, scheme + 2, authority - scheme - 2);
241
		storage_end[authority - scheme - 2] = '\0';
242
		result->authority = storage_end;
243
		storage_end += authority - scheme - 1;
244
	}
245
 
246
 
247
	/* look for a path */
248
	path = authority;
249
	if ((*path != '?') && (*path != '#') && (*path != '\0')) {
250
		path = strpbrk(path, "?#");
251
		if (!path)
252
			path = authority + strlen(authority);
253
		memcpy(storage_end, authority, path - authority);
254
		storage_end[path - authority] = '\0';
255
		result->path = storage_end;
256
		storage_end += path - authority + 1;
257
	}
258
 
259
 
260
	/* look for a query */
261
	query = path;
262
	if (*query == '?') {
263
		query = strchr(query, '#');
264
		if (!query)
265
			query = path + strlen(path);
266
		memcpy(storage_end, path + 1, query - path - 1);
267
		storage_end[query - path - 1] = '\0';
268
		result->query = storage_end;
269
		storage_end += query - path;
270
	}
271
 
272
 
273
	/* look for a fragment */
274
	fragment = query;
275
	if (*fragment == '#') {
276
		fragment = query + strlen(query);
277
 
278
		/* make a copy of the result for the caller */
279
		memcpy(storage_end, query + 1, fragment - query - 1);
280
		storage_end[fragment - query - 1] = '\0';
281
		result->fragment = storage_end;
282
		storage_end += fragment - query;
283
	}
284
 
285
	assert((result->buffer + storage_length) >= storage_end);
286
	return URL_FUNC_OK;
287
}
288
 
289
 
290
/**
291
 * Reform a URL from separate components
292
 *
293
 * See RFC 3986 for reference.
294
 *
295
 * \param  components  the components to reform into a URL
296
 * \return  a new URL allocated on the heap, or NULL on failure
297
 */
298
 
299
static char *url_reform_components(const struct url_components *components)
300
{
301
	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
302
			fragment_len = 0;
303
	char *result, *url;
304
 
305
	/* 5.3 */
306
	if (components->scheme)
307
		scheme_len = strlen(components->scheme) + 1;
308
	if (components->authority)
309
		authority_len = strlen(components->authority) + 2;
310
	if (components->path)
311
		path_len = strlen(components->path);
312
	if (components->query)
313
		query_len = strlen(components->query) + 1;
314
	if (components->fragment)
315
		fragment_len = strlen(components->fragment) + 1;
316
 
317
	/* claim memory */
318
	url = result = malloc(scheme_len + authority_len + path_len +
319
			query_len + fragment_len + 1);
320
	if (!url) {
321
		LOG(("malloc failed"));
322
		return NULL;
323
	}
324
 
325
	/* rebuild URL */
326
	if (components->scheme) {
327
	  	sprintf(url, "%s:", components->scheme);
328
		url += scheme_len;
329
	}
330
	if (components->authority) {
331
	  	sprintf(url, "//%s", components->authority);
332
		url += authority_len;
333
	}
334
	if (components->path) {
335
	  	sprintf(url, "%s", components->path);
336
		url += path_len;
337
	}
338
	if (components->query) {
339
	  	sprintf(url, "?%s", components->query);
340
		url += query_len;
341
	}
342
	if (components->fragment)
343
	  	sprintf(url, "#%s", components->fragment);
344
	return result;
345
}
346
 
347
 
348
/**
349
 * Release some url components from memory
350
 *
351
 * \param  result  pointer to buffer containing components
352
 */
353
static void url_destroy_components(const struct url_components *components)
354
{
355
	const struct url_components_internal *internal;
356
 
357
	assert(components);
358
 
359
	internal = (const struct url_components_internal *)components;
360
	if (internal->buffer)
361
		free(internal->buffer);
362
}
363
 
364
 
365
/**
366
 * Resolve a relative URL to absolute form.
367
 *
368
 * \param  rel	   relative URL
369
 * \param  base	   base URL, must be absolute and cleaned as by nsurl_create()
370
 * \param  result  pointer to pointer to buffer to hold absolute url
371
 * \return  URL_FUNC_OK on success
372
 */
373
 
374
url_func_result url_join(const char *rel, const char *base, char **result)
375
{
376
	url_func_result status = URL_FUNC_NOMEM;
377
	struct url_components_internal base_components = {0,0,0,0,0,0};
378
	struct url_components_internal *base_ptr = &base_components;
379
	struct url_components_internal rel_components = {0,0,0,0,0,0};
380
	struct url_components_internal *rel_ptr = &rel_components;
381
	struct url_components_internal merged_components = {0,0,0,0,0,0};
382
	struct url_components_internal *merged_ptr = &merged_components;
383
	char *merge_path = NULL, *split_point;
384
	char *input, *output, *start = NULL;
385
	int len, buf_len;
386
 
387
	(*result) = 0;
388
 
389
	assert(base);
390
	assert(rel);
391
 
392
 
393
	/* break down the relative URL (not cached, corruptable) */
394
	status = url_get_components(rel, (struct url_components *) rel_ptr);
395
	if (status != URL_FUNC_OK) {
396
		LOG(("relative url '%s' failed to get components", rel));
397
		return URL_FUNC_FAILED;
398
	}
399
 
400
	/* [1] relative URL is absolute, use it entirely */
401
	merged_components = rel_components;
402
	if (rel_components.scheme)
403
		goto url_join_reform_url;
404
 
405
	/* break down the base URL (possibly cached, not corruptable) */
406
	status = url_get_components(base, (struct url_components *) base_ptr);
407
	if (status != URL_FUNC_OK) {
408
		url_destroy_components((struct url_components *) rel_ptr);
409
		LOG(("base url '%s' failed to get components", base));
410
		return URL_FUNC_FAILED;
411
	}
412
 
413
	/* [2] relative authority takes presidence */
414
	merged_components.scheme = base_components.scheme;
415
	if (rel_components.authority)
416
		goto url_join_reform_url;
417
 
418
	/* [3] handle empty paths */
419
	merged_components.authority = base_components.authority;
420
	if (!rel_components.path) {
421
	  	merged_components.path = base_components.path;
422
		if (!rel_components.query)
423
			merged_components.query = base_components.query;
424
		goto url_join_reform_url;
425
	}
426
 
427
	/* [4] handle valid paths */
428
	if (rel_components.path[0] == '/')
429
		merged_components.path = rel_components.path;
430
	else {
431
		/* 5.2.3 */
432
		if ((base_components.authority) && (!base_components.path)) {
433
			merge_path = malloc(strlen(rel_components.path) + 2);
434
			if (!merge_path) {
435
				LOG(("malloc failed"));
436
				goto url_join_no_mem;
437
			}
438
			sprintf(merge_path, "/%s", rel_components.path);
439
			merged_components.path = merge_path;
440
		} else {
441
			split_point = base_components.path ?
442
					strrchr(base_components.path, '/') :
443
					NULL;
444
			if (!split_point) {
445
				merged_components.path = rel_components.path;
446
			} else {
447
				len = ++split_point - base_components.path;
448
				buf_len = len + 1 + strlen(rel_components.path);
449
				merge_path = malloc(buf_len);
450
				if (!merge_path) {
451
					LOG(("malloc failed"));
452
					goto url_join_no_mem;
453
				}
454
				memcpy(merge_path, base_components.path, len);
455
				memcpy(merge_path + len, rel_components.path,
456
						strlen(rel_components.path));
457
				merge_path[buf_len - 1] = '\0';
458
				merged_components.path = merge_path;
459
			}
460
		}
461
	}
462
 
463
url_join_reform_url:
464
	/* 5.2.4 */
465
	input = merged_components.path;
466
	if ((input) && (strchr(input, '.'))) {
467
	  	/* [1] remove all dot references */
468
	  	output = start = malloc(strlen(input) + 1);
469
	  	if (!output) {
470
			LOG(("malloc failed"));
471
			goto url_join_no_mem;
472
		}
473
		merged_components.path = output;
474
		*output = '\0';
475
 
476
		while (*input != '\0') {
477
		  	/* [2A] */
478
		  	if (input[0] == '.') {
479
		  		if (input[1] == '/') {
480
		  			input = input + 2;
481
		  			continue;
482
		  		} else if ((input[1] == '.') &&
483
		  				(input[2] == '/')) {
484
		  			input = input + 3;
485
		  			continue;
486
		  		}
487
		  	}
488
 
489
		  	/* [2B] */
490
		  	if ((input[0] == '/') && (input[1] == '.')) {
491
		  		if (input[2] == '/') {
492
		  		  	input = input + 2;
493
		  		  	continue;
494
		  		} else if (input[2] == '\0') {
495
		  		  	input = input + 1;
496
		  		  	*input = '/';
497
		  		  	continue;
498
		  		}
499
 
500
		  		/* [2C] */
501
		  		if ((input[2] == '.') && ((input[3] == '/') ||
502
		  				(input[3] == '\0'))) {
503
			  		if (input[3] == '/') {
504
			  		  	input = input + 3;
505
			  		} else {
506
		  				input = input + 2;
507
		  			  	*input = '/';
508
		  			}
509
 
510
		  			if ((output > start) &&
511
		  					(output[-1] == '/'))
512
		  				*--output = '\0';
513
		  			split_point = strrchr(start, '/');
514
		  			if (!split_point)
515
		  				output = start;
516
		  			else
517
		  				output = split_point;
518
		  			*output = '\0';
519
		  			continue;
520
		  		}
521
		  	}
522
 
523
 
524
		  	/* [2D] */
525
		  	if (input[0] == '.') {
526
		  		if (input[1] == '\0') {
527
		  			input = input + 1;
528
		  			continue;
529
		  		} else if ((input[1] == '.') &&
530
		  				(input[2] == '\0')) {
531
		  			input = input + 2;
532
		  			continue;
533
		  		}
534
		  	}
535
 
536
		  	/* [2E] */
537
		  	if (*input == '/')
538
		  		*output++ = *input++;
539
		  	while ((*input != '/') && (*input != '\0'))
540
		  		*output++ = *input++;
541
		  	*output = '\0';
542
                }
543
                /* [3] */
544
      		merged_components.path = start;
545
	}
546
 
547
	/* 5.3 */
548
	*result = url_reform_components((struct url_components *) merged_ptr);
549
  	if (!(*result))
550
		goto url_join_no_mem;
551
 
552
	/* return success */
553
	status = URL_FUNC_OK;
554
 
555
url_join_no_mem:
556
	free(start);
557
	free(merge_path);
558
	url_destroy_components((struct url_components *) base_ptr);
559
	url_destroy_components((struct url_components *) rel_ptr);
560
	return status;
561
}
562
 
563
 
564
/**
565
 * Return the host name from an URL.
566
 *
567
 * \param  url	   an absolute URL
568
 * \param  result  pointer to pointer to buffer to hold host name
569
 * \return  URL_FUNC_OK on success
570
 */
571
 
572
url_func_result url_host(const char *url, char **result)
573
{
574
	url_func_result status;
575
	struct url_components components;
576
	const char *host_start, *host_end;
577
 
578
	assert(url);
579
 
580
	status = url_get_components(url, &components);
581
	if (status == URL_FUNC_OK) {
582
		if (!components.authority) {
583
			url_destroy_components(&components);
584
			return URL_FUNC_FAILED;
585
		}
586
		host_start = strchr(components.authority, '@');
587
		host_start = host_start ? host_start + 1 : components.authority;
588
 
589
		/* skip over an IPv6 address if there is one */
590
		if (host_start[0] == '[') {
591
			host_end = strchr(host_start, ']') + 1;
592
		} else {
593
			host_end = strchr(host_start, ':');
594
		}
595
 
596
		if (!host_end)
597
			host_end = components.authority +
598
					strlen(components.authority);
599
 
600
		*result = malloc(host_end - host_start + 1);
601
		if (!(*result)) {
602
			url_destroy_components(&components);
603
			return URL_FUNC_FAILED;
604
		}
605
		memcpy((*result), host_start, host_end - host_start);
606
		(*result)[host_end - host_start] = '\0';
607
	}
608
	url_destroy_components(&components);
609
	return status;
610
}
611
 
612
 
613
/**
614
 * Return the scheme name from an URL.
615
 *
616
 * See RFC 3986, 3.1 for reference.
617
 *
618
 * \param  url	   an absolute URL
619
 * \param  result  pointer to pointer to buffer to hold scheme name
620
 * \return  URL_FUNC_OK on success
621
 */
622
 
623
url_func_result url_scheme(const char *url, char **result)
624
{
625
	url_func_result status;
626
	struct url_components components;
627
 
628
	assert(url);
629
 
630
	status = url_get_components(url, &components);
631
	if (status == URL_FUNC_OK) {
632
		if (!components.scheme) {
633
			status = URL_FUNC_FAILED;
634
		} else {
635
			*result = strdup(components.scheme);
636
			if (!(*result))
637
				status = URL_FUNC_NOMEM;
638
		}
639
	}
640
	url_destroy_components(&components);
641
	return status;
642
}
643
 
644
 
645
/**
646
 * Extract path segment from an URL
647
 *
648
 * \param url	  an absolute URL
649
 * \param result  pointer to pointer to buffer to hold result
650
 * \return URL_FUNC_OK on success
651
 */
652
 
653
url_func_result url_path(const char *url, char **result)
654
{
655
	url_func_result status;
656
	struct url_components components;
657
 
658
	assert(url);
659
 
660
	status = url_get_components(url, &components);
661
	if (status == URL_FUNC_OK) {
662
		if (!components.path) {
663
			status = URL_FUNC_FAILED;
664
		} else {
665
			*result = strdup(components.path);
666
			if (!(*result))
667
				status = URL_FUNC_NOMEM;
668
		}
669
	}
670
	url_destroy_components(&components);
671
	return status;
672
}
673
 
674
/**
675
 * Attempt to find a nice filename for a URL.
676
 *
677
 * \param  url	   an absolute URL
678
 * \param  result  pointer to pointer to buffer to hold filename
679
 * \param  remove_extensions  remove any extensions from the filename
680
 * \return  URL_FUNC_OK on success
681
 */
682
 
683
url_func_result url_nice(const char *url, char **result,
684
		bool remove_extensions)
685
{
686
	int m;
687
	regmatch_t match[10];
688
	regoff_t start, end;
689
	size_t i;
690
	char *dot;
691
 
692
	*result = 0;
693
 
694
	m = regexec(&url_re, url, 10, match, 0);
695
	if (m) {
696
		LOG(("url '%s' failed to match regex", url));
697
		return URL_FUNC_FAILED;
698
	}
699
 
700
	/* extract the last component of the path, if possible */
701
	if (match[URL_RE_PATH].rm_so == -1 || match[URL_RE_PATH].rm_so ==
702
			match[URL_RE_PATH].rm_eo)
703
		goto no_path;  /* no path, or empty */
704
	for (end = match[URL_RE_PATH].rm_eo - 1;
705
			end != match[URL_RE_PATH].rm_so && url[end] == '/';
706
			end--)
707
		;
708
	if (end == match[URL_RE_PATH].rm_so)
709
		goto no_path;  /* path is a string of '/' */
710
	end++;
711
	for (start = end - 1;
712
			start != match[URL_RE_PATH].rm_so && url[start] != '/';
713
			start--)
714
		;
715
	if (url[start] == '/')
716
		start++;
717
 
718
	if (!strncasecmp(url + start, "index.", 6) ||
719
			!strncasecmp(url + start, "default.", 8)) {
720
		/* try again */
721
		if (start == match[URL_RE_PATH].rm_so)
722
			goto no_path;
723
		for (end = start - 1;
724
				end != match[URL_RE_PATH].rm_so &&
725
				url[end] == '/';
726
				end--)
727
			;
728
		if (end == match[URL_RE_PATH].rm_so)
729
			goto no_path;
730
		end++;
731
		for (start = end - 1;
732
				start != match[URL_RE_PATH].rm_so &&
733
				url[start] != '/';
734
				start--)
735
		;
736
		if (url[start] == '/')
737
			start++;
738
	}
739
 
740
	*result = malloc(end - start + 1);
741
	if (!*result) {
742
		LOG(("malloc failed"));
743
		return URL_FUNC_NOMEM;
744
	}
745
	strncpy(*result, url + start, end - start);
746
	(*result)[end - start] = 0;
747
 
748
	if (remove_extensions) {
749
		dot = strchr(*result, '.');
750
		if (dot && dot != *result)
751
			*dot = 0;
752
	}
753
 
754
	return URL_FUNC_OK;
755
 
756
no_path:
757
 
758
	/* otherwise, use the host name, with '.' replaced by '_' */
759
	if (match[URL_RE_AUTHORITY].rm_so != -1 &&
760
			match[URL_RE_AUTHORITY].rm_so !=
761
			match[URL_RE_AUTHORITY].rm_eo) {
762
		*result = malloc(match[URL_RE_AUTHORITY].rm_eo -
763
				match[URL_RE_AUTHORITY].rm_so + 1);
764
		if (!*result) {
765
			LOG(("malloc failed"));
766
			return URL_FUNC_NOMEM;
767
		}
768
		strncpy(*result, url + match[URL_RE_AUTHORITY].rm_so,
769
				match[URL_RE_AUTHORITY].rm_eo -
770
				match[URL_RE_AUTHORITY].rm_so);
771
		(*result)[match[URL_RE_AUTHORITY].rm_eo -
772
				match[URL_RE_AUTHORITY].rm_so] = 0;
773
 
774
		for (i = 0; (*result)[i]; i++)
775
			if ((*result)[i] == '.')
776
				(*result)[i] = '_';
777
 
778
		return URL_FUNC_OK;
779
	}
780
 
781
	return URL_FUNC_FAILED;
782
}
783
 
784
/**
785
 * Convert an escaped string to plain.
786
 * \param result unescaped string owned by caller must be freed with free()
787
 * \return  URL_FUNC_OK on success
788
 */
789
url_func_result url_unescape(const char *str, char **result)
790
{
791
	char *curlstr;
792
	char *retstr;
5043 ashmew2 793
	/* curlstr = curl_unescape(str, 0); */
794
	LOG(("Address of str is : %x\n", str));
795
	LOG(("url is %s\n", str));
796
 
797
	LOG(("Calling http_unescape_url in url.c\n"));
798
	curlstr =  http_unescape_url(str);
799
	LOG(("http_unescape_url returned.\n"));
800
	__menuet__debug_out("http_unescape_url returned\n");
3584 sourcerer 801
 
802
	if (curlstr == NULL) {
803
		return URL_FUNC_NOMEM;
804
	}
5043 ashmew2 805
	__menuet__debug_out("Calling strdup in url.c with : ");
806
	__menuet__debug_out(curlstr);
807
	__menuet__debug_out("\n");
3584 sourcerer 808
 
809
	retstr = strdup(curlstr);
5043 ashmew2 810
	/* free(curlstr); */ /* Doesn't work because mem not allocated with malloc/calloc/realloc*/
811
	/* TODO: Use mem_free here*/
3584 sourcerer 812
 
5043 ashmew2 813
	__menuet__debug_out("After strdup in url.c\n");
814
 
3584 sourcerer 815
	if (retstr == NULL) {
5043 ashmew2 816
	  __menuet__debug_out("retstr is NULL in url.c\n");
3584 sourcerer 817
		return URL_FUNC_NOMEM;
818
	}
819
 
820
	*result = retstr;
5043 ashmew2 821
	__menuet__debug_out("returning from url_unescape in url.c\n");
3584 sourcerer 822
	return URL_FUNC_OK;
5043 ashmew2 823
 
3584 sourcerer 824
}
825
 
826
/**
827
 * Escape a string suitable for inclusion in an URL.
828
 *
829
 * \param  unescaped      the unescaped string
830
 * \param  toskip         number of bytes to skip in unescaped string
831
 * \param  sptoplus       true iff spaces should be converted to +
832
 * \param  escexceptions  NULL or a string of characters excluded to be escaped
833
 * \param  result         pointer to pointer to buffer to hold escaped string
834
 * \return  URL_FUNC_OK on success
835
 */
836
 
837
url_func_result url_escape(const char *unescaped, size_t toskip,
838
		bool sptoplus, const char *escexceptions, char **result)
839
{
840
	size_t len;
841
	char *escaped, *d, *tmpres;
842
	const char *c;
843
 
844
	if (!unescaped || !result)
845
		return URL_FUNC_FAILED;
846
 
847
	*result = NULL;
848
 
849
	len = strlen(unescaped);
850
	if (len < toskip)
851
		return URL_FUNC_FAILED;
852
	len -= toskip;
853
 
854
	escaped = malloc(len * 3 + 1);
855
	if (!escaped)
856
		return URL_FUNC_NOMEM;
857
 
858
	for (c = unescaped + toskip, d = escaped; *c; c++) {
859
		/* Check if we should escape this byte.
860
		 * '~' is unreserved and should not be percent encoded, if
861
		 * you believe the spec; however, leaving it unescaped
862
		 * breaks a bunch of websites, so we escape it anyway. */
863
		if (!isascii(*c)
864
			|| (strchr(":/?#[]@" /* gen-delims */
865
				  "!$&'()*+,;=" /* sub-delims */
866
				  "<>%\"{}|\\^`~" /* others */,	*c)
867
				&& (!escexceptions || !strchr(escexceptions, *c)))
868
			|| *c <= 0x20 || *c == 0x7f) {
869
			if (*c == 0x20 && sptoplus) {
870
				*d++ = '+';
871
			} else {
872
				*d++ = '%';
873
				*d++ = "0123456789ABCDEF"[((*c >> 4) & 0xf)];
874
				*d++ = "0123456789ABCDEF"[(*c & 0xf)];
875
			}
876
		} else {
877
			/* unreserved characters: [a-zA-Z0-9-._] */
878
			*d++ = *c;
879
		}
880
	}
881
	*d++ = '\0';
882
 
883
	tmpres = malloc(d - escaped + toskip);
884
	if (!tmpres) {
885
		free(escaped);
886
		return URL_FUNC_NOMEM;
887
	}
888
 
889
	memcpy(tmpres, unescaped, toskip);
890
	memcpy(tmpres + toskip, escaped, d - escaped);
891
	*result = tmpres;
892
 
893
	free(escaped);
894
 
895
	return URL_FUNC_OK;
896
}
897
 
898
 
899
#ifdef TEST
900
 
901
int main(int argc, char *argv[])
902
{
903
	int i;
904
	url_func_result res;
905
	char *s;
906
	url_init();
907
	for (i = 1; i != argc; i++) {
908
/*		printf("==> '%s'\n", argv[i]);
909
		res = url_normalize(argv[i], &s);
910
		if (res == URL_FUNC_OK) {
911
			printf("<== '%s'\n", s);
912
			free(s);
913
		}*/
914
/*		printf("==> '%s'\n", argv[i]);
915
		res = url_host(argv[i], &s);
916
		if (res == URL_FUNC_OK) {
917
			printf("<== '%s'\n", s);
918
			free(s);
919
		}*/
920
		if (1 != i) {
921
			res = url_join(argv[i], argv[1], &s);
922
			if (res == URL_FUNC_OK) {
923
				printf("'%s' + '%s' \t= '%s'\n", argv[1],
924
						argv[i], s);
925
				free(s);
926
			}
927
		}
928
/*		printf("'%s' => ", argv[i]);
929
		res = url_nice(argv[i], &s, true);
930
		if (res == URL_FUNC_OK) {
931
			printf("'%s', ", s);
932
			free(s);
933
		} else {
934
			printf("failed %u, ", res);
935
		}
936
		res = url_nice(argv[i], &s, false);
937
		if (res == URL_FUNC_OK) {
938
			printf("'%s', ", s);
939
			free(s);
940
		} else {
941
			printf("failed %u, ", res);
942
		}
943
		printf("\n");*/
944
	}
945
	return 0;
946
}
947
 
948
/**
 * Compile a regular expression, terminating the program on failure.
 *
 * \param  preg    compiled expression to fill in
 * \param  regex   source text of the expression
 * \param  cflags  flags passed through to regcomp()
 */
void regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
{
	char errbuf[200];
	int r = regcomp(preg, regex, cflags);

	if (r == 0)
		return;

	/* report why the pattern was rejected, then give up */
	regerror(r, preg, errbuf, sizeof errbuf);
	fprintf(stderr, "Failed to compile regexp '%s'\n", regex);
	fprintf(stderr, "error: %s\n", errbuf);
	exit(1);
}
960
 
961
#endif