WebSVN – Kolibri OS – Diff – /contrib/network/netsurf/netsurf/utils/url.c


/*
 * Copyright 2006 Richard Wilson 
 * Copyright 2005 James Bursa 
 * Copyright 2005 John M Bell 
 *
 * This file is part of NetSurf, http://www.netsurf-browser.org/
 *
 * NetSurf is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * NetSurf is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
/** \file
 * URL parsing and joining (implementation).
 */
 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
 
#include "curl/curl.h"
#include "utils/config.h"
#include "utils/log.h"
#include "utils/url.h"
#include "utils/utils.h"
#include "content/fetchers/http_msg.h"
#include "content/fetchers/http.h"
 
 
struct url_components_internal {
	char *buffer;	/* buffer used for all the following data */
	char *scheme;
	char *authority;
	char *path;
	char *query;
	char *fragment;
};
 
 
/* Regular expressions compiled by url_init().  url_re is matched against
 * URLs by url_nice(); url_up_re is compiled but not referenced anywhere
 * else in the visible part of this file. */
regex_t url_re, url_up_re;
 
/* Indices of the capturing groups of url_re that hold each URL part. */
#define URL_RE_SCHEME 2
#define URL_RE_AUTHORITY 4
#define URL_RE_PATH 5
#define URL_RE_QUERY 7
#define URL_RE_FRAGMENT 9

/**
 * Prepare the URL routines for use.
 *
 * Compiles url_re and url_up_re through regcomp_wrapper().  It has to run
 * before any function that matches against those expressions.
 */

void url_init(void)
{
	/* Generic URL splitter; regex from RFC 2396.  The URL_RE_* indices
	 * above refer to the groups of this expression. */
	static const char split_pattern[] =
			"^[[:space:]]*"
			"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"
			"(//([^/?#[:space:]]*))?"
			"([^?#[:space:]]*)"
			"(\\?([^#[:space:]]*))?"
			"(#([^[:space:]]*))?"
			"[[:space:]]*$";

	/* A path segment other than "." or ".." followed by a "/.."
	 * segment. */
	static const char parent_pattern[] =
			"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"
			"/[.][.](/|$)";

	regcomp_wrapper(&url_re, split_pattern, REG_EXTENDED);
	regcomp_wrapper(&url_up_re, parent_pattern, REG_EXTENDED);
}
 
 
/**
 * Check whether a host string is an IP address.  It should support and
 * detect IPv4 addresses (full dotted-quad form only) and IPv6 addresses
 * written in square brackets, e.g. "[::1]" (including those containing
 * embedded IPv4 addresses.)
 *
 * \param  host a hostname terminated by '\0'
 * \return true if the hostname is an IP address, false otherwise
 *         (also false if memory for a temporary copy cannot be obtained)
 */
bool url_host_is_ip_address(const char *host)
{
	struct in_addr ipv4;
	size_t host_len;
	const char *sane_host = host;
	char *host_copy = NULL;
	const char *slash;
	bool is_ip = false;
#ifndef NO_IPV6
	struct in6_addr ipv6;
	char ipv6_addr[64];
#endif
	/* FIXME TODO: Some parts of urldb.c (and perhaps other parts of
	 * NetSurf) make confusions between hosts and "prefixes", we can
	 * sometimes be erroneously passed more than just a host.  Sometimes
	 * we may be passed trailing slashes, or even whole path segments.
	 * A specific criminal in this class is urldb_iterate_partial, which
	 * takes a prefix to search for, but passes that prefix to functions
	 * that expect only hosts.
	 *
	 * For the time being, we will accept such calls; we check if there
	 * is a / in the host parameter, and if there is, we take a copy of
	 * everything before the /.  This is not a permanent solution; we
	 * should search through NetSurf and find all the callers that are
	 * in error and fix them.  When doing this task, it might be wise
	 * to replace the hideousness below with code that doesn't have to do
	 * this, and add assert(strchr(host, '/') == NULL); somewhere.
	 * -- rjek - 2010-11-04
	 */

	slash = strchr(host, '/');
	if (slash == NULL) {
		host_len = strlen(host);
	} else {
		/* host_len is always the exact length of sane_host */
		host_len = slash - host;
		host_copy = malloc(host_len + 1);
		if (host_copy == NULL) {
			LOG(("malloc failed"));
			return false;
		}
		memcpy(host_copy, host, host_len);
		host_copy[host_len] = '\0';
		sane_host = host_copy;
		LOG(("WARNING: called with non-host '%s'", host));
	}

	/* Cheap rejection: any character that cannot appear in an address */
	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
		goto out;

	if (inet_aton(sane_host, &ipv4) != 0) {
		/* This can only be a sane IPv4 address if it contains 3 dots.
		 * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
		 * and "a.b.c.d" as valid IPv4 address strings where we only
		 * support the full, dotted-quad, form.
		 */
		int num_dots = 0;
		size_t index;

		for (index = 0; index < host_len; index++) {
			if (sane_host[index] == '.')
				num_dots++;
		}

		is_ip = (num_dots == 3);
		goto out;
	}

#ifndef NO_IPV6
	/* An IPv6 literal must be wrapped in square brackets */
	if (host_len < 2 || sane_host[0] != '[' ||
			sane_host[host_len - 1] != ']')
		goto out;

	/* Anything that does not fit the buffer is too long to be valid */
	if (host_len - 2 >= sizeof(ipv6_addr))
		goto out;

	/* inet_pton wants the address without the surrounding brackets */
	memcpy(ipv6_addr, sane_host + 1, host_len - 2);
	ipv6_addr[host_len - 2] = '\0';

	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
		is_ip = true;
#endif

out:
	free(host_copy);
	return is_ip;
}
 
/**
 * Split a URL into separate components
 *
 * URLs passed to this function are assumed to be valid and no error checking
 * or recovery is attempted.
 *
 * All component strings are stored in one heap buffer that is recorded in
 * the result; release it with url_destroy_components().  Components that
 * are absent from the URL are left as NULL.  Delimiters ("scheme:", "//",
 * "?", "#") are not included in the stored strings.
 *
 * See RFC 3986 for reference.
 *
 * \param  url	     a valid absolute or relative URL
 * \param  result    pointer to buffer to hold components
 * \return  URL_FUNC_OK on success, URL_FUNC_NOMEM if the storage buffer
 *	    could not be allocated
 */

static url_func_result url_get_components(const char *url,
		struct url_components *result)
{
	size_t storage_length;
	char *storage_end;
	const char *scheme;
	const char *authority;
	const char *path;
	const char *query;
	const char *fragment;
	struct url_components_internal *internal;

	assert(url);

	/* clear our return value */
	internal = (struct url_components_internal *)result;
	memset(result, 0x00, sizeof(struct url_components));

	/* get enough storage space for a URL with termination at each node */
	storage_length = strlen(url) + 8;
	internal->buffer = malloc(storage_length);
	if (!internal->buffer)
		return URL_FUNC_NOMEM;
	storage_end = internal->buffer;

	/* look for a valid scheme; the <ctype.h> functions require a value
	 * representable as unsigned char, so cast before calling them */
	scheme = url;
	if (isalpha((unsigned char)*scheme)) {
		for (scheme = url + 1;
				((*scheme != ':') && (*scheme != '\0'));
				scheme++) {
			if (!isalnum((unsigned char)*scheme) &&
					(*scheme != '+') &&
					(*scheme != '-') && (*scheme != '.'))
				break;
		}

		if (*scheme == ':') {
			memcpy(storage_end, url, scheme - url);
			storage_end[scheme - url] = '\0';
			result->scheme = storage_end;
			storage_end += scheme - url + 1;
			scheme++;	/* step over the ':' */
		} else {
			/* not a scheme after all; rescan from the start */
			scheme = url;
		}
	}


	/* look for an authority */
	authority = scheme;
	if ((authority[0] == '/') && (authority[1] == '/')) {
		authority = strpbrk(scheme + 2, "/?#");
		if (!authority)
			authority = scheme + strlen(scheme);
		memcpy(storage_end, scheme + 2, authority - scheme - 2);
		storage_end[authority - scheme - 2] = '\0';
		result->authority = storage_end;
		/* copied length plus terminator: (len - 2) + 1 */
		storage_end += authority - scheme - 1;
	}


	/* look for a path */
	path = authority;
	if ((*path != '?') && (*path != '#') && (*path != '\0')) {
		path = strpbrk(path, "?#");
		if (!path)
			path = authority + strlen(authority);
		memcpy(storage_end, authority, path - authority);
		storage_end[path - authority] = '\0';
		result->path = storage_end;
		storage_end += path - authority + 1;
	}


	/* look for a query */
	query = path;
	if (*query == '?') {
		query = strchr(query, '#');
		if (!query)
			query = path + strlen(path);
		memcpy(storage_end, path + 1, query - path - 1);
		storage_end[query - path - 1] = '\0';
		result->query = storage_end;
		storage_end += query - path;
	}


	/* look for a fragment */
	fragment = query;
	if (*fragment == '#') {
		fragment = query + strlen(query);

		/* make a copy of the result for the caller */
		memcpy(storage_end, query + 1, fragment - query - 1);
		storage_end[fragment - query - 1] = '\0';
		result->fragment = storage_end;
		storage_end += fragment - query;
	}

	/* sanity check: we must not have written past the allocation */
	assert((result->buffer + storage_length) >= storage_end);
	return URL_FUNC_OK;
}
 
 
/**
 * Reform a URL from separate components
 *
 * Components that are NULL are skipped.  The delimiters are re-inserted:
 * ':' after the scheme, "//" before the authority, '?' before the query
 * and '#' before the fragment.  If every component is NULL the result is
 * an empty string.
 *
 * See RFC 3986 for reference.
 *
 * \param  components  the components to reform into a URL
 * \return  a new URL allocated on the heap (caller frees with free()),
 *	    or NULL on failure
 */

static char *url_reform_components(const struct url_components *components)
{
	size_t scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
			fragment_len = 0;
	size_t total = 0;
	char *result, *url;

	/* 5.3: measure each present component plus its delimiter */
	if (components->scheme) {
		scheme_len = strlen(components->scheme);
		total += scheme_len + 1;
	}
	if (components->authority) {
		authority_len = strlen(components->authority);
		total += authority_len + 2;
	}
	if (components->path) {
		path_len = strlen(components->path);
		total += path_len;
	}
	if (components->query) {
		query_len = strlen(components->query);
		total += query_len + 1;
	}
	if (components->fragment) {
		fragment_len = strlen(components->fragment);
		total += fragment_len + 1;
	}

	/* claim memory */
	url = result = malloc(total + 1);
	if (!url) {
		LOG(("malloc failed"));
		return NULL;
	}

	/* rebuild URL */
	if (components->scheme) {
		memcpy(url, components->scheme, scheme_len);
		url += scheme_len;
		*url++ = ':';
	}
	if (components->authority) {
		*url++ = '/';
		*url++ = '/';
		memcpy(url, components->authority, authority_len);
		url += authority_len;
	}
	if (components->path) {
		memcpy(url, components->path, path_len);
		url += path_len;
	}
	if (components->query) {
		*url++ = '?';
		memcpy(url, components->query, query_len);
		url += query_len;
	}
	if (components->fragment) {
		*url++ = '#';
		memcpy(url, components->fragment, fragment_len);
		url += fragment_len;
	}

	/* always terminate, even when no component was present at all */
	*url = '\0';
	return result;
}
 
 
/**
 * Free the storage owned by a set of URL components
 *
 * Only the shared buffer is released; the component pointers themselves
 * are left untouched and must not be used afterwards.
 *
 * \param  components  components whose buffer should be released
 */
static void url_destroy_components(const struct url_components *components)
{
	assert(components);

	/* free() ignores NULL, so an unallocated buffer needs no guard */
	free(((const struct url_components_internal *)components)->buffer);
}
 
 
/**
 * Resolve a relative URL to absolute form.
 *
 * \param  rel	   relative URL
 * \param  base	   base URL, must be absolute and cleaned as by nsurl_create()
 * \param  result  pointer to pointer to buffer to hold absolute url
 * \return  URL_FUNC_OK on success
 */
 
url_func_result url_join(const char *rel, const char *base, char **result)
{
	url_func_result status = URL_FUNC_NOMEM;
	struct url_components_internal base_components = {0,0,0,0,0,0};
	struct url_components_internal *base_ptr = &base_components;
	struct url_components_internal rel_components = {0,0,0,0,0,0};
	struct url_components_internal *rel_ptr = &rel_components;
	struct url_components_internal merged_components = {0,0,0,0,0,0};
	struct url_components_internal *merged_ptr = &merged_components;
	char *merge_path = NULL, *split_point;
	char *input, *output, *start = NULL;
	int len, buf_len;
 
	(*result) = 0;
 
	assert(base);
	assert(rel);
 
 
	/* break down the relative URL (not cached, corruptable) */
	status = url_get_components(rel, (struct url_components *) rel_ptr);
	if (status != URL_FUNC_OK) {
		LOG(("relative url '%s' failed to get components", rel));
		return URL_FUNC_FAILED;
	}
 
	/* [1] relative URL is absolute, use it entirely */
	merged_components = rel_components;
	if (rel_components.scheme)
		goto url_join_reform_url;
 
	/* break down the base URL (possibly cached, not corruptable) */
	status = url_get_components(base, (struct url_components *) base_ptr);
	if (status != URL_FUNC_OK) {
		url_destroy_components((struct url_components *) rel_ptr);
		LOG(("base url '%s' failed to get components", base));
		return URL_FUNC_FAILED;
	}
 
	/* [2] relative authority takes presidence */
	merged_components.scheme = base_components.scheme;
	if (rel_components.authority)
		goto url_join_reform_url;
 
	/* [3] handle empty paths */
	merged_components.authority = base_components.authority;
	if (!rel_components.path) {
	  	merged_components.path = base_components.path;
		if (!rel_components.query)
			merged_components.query = base_components.query;
		goto url_join_reform_url;
	}
 
	/* [4] handle valid paths */
	if (rel_components.path[0] == '/')
		merged_components.path = rel_components.path;
	else {
		/* 5.2.3 */
		if ((base_components.authority) && (!base_components.path)) {
			merge_path = malloc(strlen(rel_components.path) + 2);
			if (!merge_path) {
				LOG(("malloc failed"));
				goto url_join_no_mem;
			}
			sprintf(merge_path, "/%s", rel_components.path);
			merged_components.path = merge_path;
		} else {
			split_point = base_components.path ?
					strrchr(base_components.path, '/') :
					NULL;
			if (!split_point) {
				merged_components.path = rel_components.path;
			} else {
				len = ++split_point - base_components.path;
				buf_len = len + 1 + strlen(rel_components.path);
				merge_path = malloc(buf_len);
				if (!merge_path) {
					LOG(("malloc failed"));
					goto url_join_no_mem;
				}
				memcpy(merge_path, base_components.path, len);
				memcpy(merge_path + len, rel_components.path,
						strlen(rel_components.path));
				merge_path[buf_len - 1] = '\0';
				merged_components.path = merge_path;
			}
		}
	}
 
url_join_reform_url:
	/* 5.2.4 */
	input = merged_components.path;
	if ((input) && (strchr(input, '.'))) {
	  	/* [1] remove all dot references */
	  	output = start = malloc(strlen(input) + 1);
	  	if (!output) {
			LOG(("malloc failed"));
			goto url_join_no_mem;
		}
		merged_components.path = output;
		*output = '\0';
 
		while (*input != '\0') {
		  	/* [2A] */
		  	if (input[0] == '.') {
		  		if (input[1] == '/') {
		  			input = input + 2;
		  			continue;
		  		} else if ((input[1] == '.') &&
		  				(input[2] == '/')) {
		  			input = input + 3;
		  			continue;
		  		}
		  	}
 
		  	/* [2B] */
		  	if ((input[0] == '/') && (input[1] == '.')) {
		  		if (input[2] == '/') {
		  		  	input = input + 2;
		  		  	continue;
		  		} else if (input[2] == '\0') {
		  		  	input = input + 1;
		  		  	*input = '/';
		  		  	continue;
		  		}
 
		  		/* [2C] */
		  		if ((input[2] == '.') && ((input[3] == '/') ||
		  				(input[3] == '\0'))) {
			  		if (input[3] == '/') {
			  		  	input = input + 3;
			  		} else {
		  				input = input + 2;
		  			  	*input = '/';
		  			}
 
		  			if ((output > start) &&
		  					(output[-1] == '/'))
		  				*--output = '\0';
		  			split_point = strrchr(start, '/');
		  			if (!split_point)
		  				output = start;
		  			else
		  				output = split_point;
		  			*output = '\0';
		  			continue;
		  		}
		  	}
 
 
		  	/* [2D] */
		  	if (input[0] == '.') {
		  		if (input[1] == '\0') {
		  			input = input + 1;
		  			continue;
		  		} else if ((input[1] == '.') &&
		  				(input[2] == '\0')) {
		  			input = input + 2;
		  			continue;
		  		}
		  	}
 
		  	/* [2E] */
		  	if (*input == '/')
		  		*output++ = *input++;
		  	while ((*input != '/') && (*input != '\0'))
		  		*output++ = *input++;
		  	*output = '\0';
                }
                /* [3] */
      		merged_components.path = start;
	}
 
	/* 5.3 */
	*result = url_reform_components((struct url_components *) merged_ptr);
  	if (!(*result))
		goto url_join_no_mem;
 
	/* return success */
	status = URL_FUNC_OK;
 
url_join_no_mem:
	free(start);
	free(merge_path);
	url_destroy_components((struct url_components *) base_ptr);
	url_destroy_components((struct url_components *) rel_ptr);
	return status;
}
 
 
/**
 * Return the host name from an URL.
 *
 * Any "userinfo@" prefix and ":port" suffix of the authority are left out.
 * A bracketed IPv6 literal is returned including its brackets.
 *
 * \param  url	   an absolute URL
 * \param  result  pointer to pointer to buffer to hold host name; only
 *		   written on success, the caller frees it with free()
 * \return  URL_FUNC_OK on success, URL_FUNC_FAILED if the URL has no
 *	    authority or the result could not be allocated, otherwise the
 *	    error from splitting the URL
 */

url_func_result url_host(const char *url, char **result)
{
	url_func_result status;
	struct url_components components;
	const char *host_start, *host_end;

	assert(url);

	status = url_get_components(url, &components);
	if (status == URL_FUNC_OK) {
		if (!components.authority) {
			url_destroy_components(&components);
			return URL_FUNC_FAILED;
		}
		host_start = strchr(components.authority, '@');
		host_start = host_start ? host_start + 1 : components.authority;

		/* skip over an IPv6 address if there is one */
		if (host_start[0] == '[') {
			host_end = strchr(host_start, ']');
			/* only step past the bracket if one was found; an
			 * unterminated literal falls through to the
			 * end-of-authority default below */
			if (host_end)
				host_end++;
		} else {
			host_end = strchr(host_start, ':');
		}

		if (!host_end)
			host_end = components.authority +
					strlen(components.authority);

		*result = malloc(host_end - host_start + 1);
		if (!(*result)) {
			url_destroy_components(&components);
			return URL_FUNC_FAILED;
		}
		memcpy((*result), host_start, host_end - host_start);
		(*result)[host_end - host_start] = '\0';
	}
	url_destroy_components(&components);
	return status;
}
 
 
/**
 * Return the scheme name from an URL.
 *
 * See RFC 3986, 3.1 for reference.
 *
 * \param  url	   an absolute URL
 * \param  result  pointer to pointer to buffer to hold scheme name
 * \return  URL_FUNC_OK on success
 */
 
url_func_result url_scheme(const char *url, char **result)
{
	url_func_result status;
	struct url_components components;
 
	assert(url);
 
	status = url_get_components(url, &components);
	if (status == URL_FUNC_OK) {
		if (!components.scheme) {
			status = URL_FUNC_FAILED;
		} else {
			*result = strdup(components.scheme);
			if (!(*result))
				status = URL_FUNC_NOMEM;
		}
	}
	url_destroy_components(&components);
	return status;
}
 
 
/**
 * Extract path segment from an URL
 *
 * \param url	  an absolute URL
 * \param result  pointer to pointer to buffer to hold result
 * \return URL_FUNC_OK on success
 */
 
url_func_result url_path(const char *url, char **result)
{
	url_func_result status;
	struct url_components components;
 
	assert(url);
 
	status = url_get_components(url, &components);
	if (status == URL_FUNC_OK) {
		if (!components.path) {
			status = URL_FUNC_FAILED;
		} else {
			*result = strdup(components.path);
			if (!(*result))
				status = URL_FUNC_NOMEM;
		}
	}
	url_destroy_components(&components);
	return status;
}
 
/**
 * Attempt to find a nice filename for a URL.
 *
 * The last path segment is preferred.  If that segment starts with
 * "index." or "default." (case-insensitively), the segment before it is
 * used instead.  When no usable path segment is found, the authority is
 * returned with every '.' replaced by '_'.
 *
 * url_init() must have been called first, since the URL is split by
 * matching it against url_re.
 *
 * \param  url	   an absolute URL
 * \param  result  pointer to pointer to buffer to hold filename; set to
 *		   NULL on failure, otherwise a heap string the caller frees
 * \param  remove_extensions  remove any extensions from the filename
 * \return  URL_FUNC_OK on success, URL_FUNC_FAILED if the URL does not
 *	    match or has neither a usable path nor an authority,
 *	    URL_FUNC_NOMEM on memory exhaustion
 */

url_func_result url_nice(const char *url, char **result,
		bool remove_extensions)
{
	int m;
	regmatch_t match[10];	/* whole match plus groups 1..9 of url_re */
	regoff_t start, end;	/* offsets into url: [start, end) is the name */
	size_t i;
	char *dot;

	*result = 0;

	m = regexec(&url_re, url, 10, match, 0);
	if (m) {
		LOG(("url '%s' failed to match regex", url));
		return URL_FUNC_FAILED;
	}

	/* extract the last component of the path, if possible */
	if (match[URL_RE_PATH].rm_so == -1 || match[URL_RE_PATH].rm_so ==
			match[URL_RE_PATH].rm_eo)
		goto no_path;  /* no path, or empty */
	/* walk back over trailing '/' characters */
	for (end = match[URL_RE_PATH].rm_eo - 1;
			end != match[URL_RE_PATH].rm_so && url[end] == '/';
			end--)
		;
	/* NOTE(review): this also triggers for a one-character path such
	 * as "a", because the scan above starts at rm_so in that case */
	if (end == match[URL_RE_PATH].rm_so)
		goto no_path;  /* path is a string of '/' */
	end++;
	/* walk back to the '/' that begins the last segment */
	for (start = end - 1;
			start != match[URL_RE_PATH].rm_so && url[start] != '/';
			start--)
		;
	if (url[start] == '/')
		start++;

	if (!strncasecmp(url + start, "index.", 6) ||
			!strncasecmp(url + start, "default.", 8)) {
		/* try again */
		/* same two scans as above, applied to the segment before */
		if (start == match[URL_RE_PATH].rm_so)
			goto no_path;
		for (end = start - 1;
				end != match[URL_RE_PATH].rm_so &&
				url[end] == '/';
				end--)
			;
		if (end == match[URL_RE_PATH].rm_so)
			goto no_path;
		end++;
		for (start = end - 1;
				start != match[URL_RE_PATH].rm_so &&
				url[start] != '/';
				start--)
		;
		if (url[start] == '/')
			start++;
	}

	*result = malloc(end - start + 1);
	if (!*result) {
		LOG(("malloc failed"));
		return URL_FUNC_NOMEM;
	}
	strncpy(*result, url + start, end - start);
	(*result)[end - start] = 0;

	if (remove_extensions) {
		/* cut at the first '.', unless the name starts with it */
		dot = strchr(*result, '.');
		if (dot && dot != *result)
			*dot = 0;
	}

	return URL_FUNC_OK;

no_path:

	/* otherwise, use the host name, with '.' replaced by '_' */
	if (match[URL_RE_AUTHORITY].rm_so != -1 &&
			match[URL_RE_AUTHORITY].rm_so !=
			match[URL_RE_AUTHORITY].rm_eo) {
		*result = malloc(match[URL_RE_AUTHORITY].rm_eo -
				match[URL_RE_AUTHORITY].rm_so + 1);
		if (!*result) {
			LOG(("malloc failed"));
			return URL_FUNC_NOMEM;
		}
		strncpy(*result, url + match[URL_RE_AUTHORITY].rm_so,
				match[URL_RE_AUTHORITY].rm_eo -
				match[URL_RE_AUTHORITY].rm_so);
		(*result)[match[URL_RE_AUTHORITY].rm_eo -
				match[URL_RE_AUTHORITY].rm_so] = 0;

		for (i = 0; (*result)[i]; i++)
			if ((*result)[i] == '.')
				(*result)[i] = '_';

		return URL_FUNC_OK;
	}

	/* neither a usable path nor an authority */
	return URL_FUNC_FAILED;
}
 
/**
 * Convert an escaped string to plain.
 *
 * The work is done by http_unescape_url(); its result is duplicated with
 * strdup() so the caller gets a buffer it can release with free().
 *
 * \param str     the escaped string
 * \param result  unescaped string owned by caller must be freed with free();
 *		  only written on success
 * \return  URL_FUNC_OK on success, URL_FUNC_NOMEM if either the unescape
 *	    call or the copy returned NULL
 */
url_func_result url_unescape(const char *str, char **result)
{
	char *curlstr;
	char *retstr;
	/* curlstr = curl_unescape(str, 0); */
	/* %p with a void pointer is the only portable way to print an
	 * address; %x expects an unsigned int */
	LOG(("Address of str is : %p\n", (const void *)str));
	LOG(("url is %s\n", str));

	LOG(("Calling http_unescape_url in url.c\n"));
	curlstr = http_unescape_url(str);
	LOG(("http_unescape_url returned.\n"));
	__menuet__debug_out("http_unescape_url returned\n");


	if (curlstr == NULL) {
		return URL_FUNC_NOMEM;
	}
	__menuet__debug_out("Calling strdup in url.c with : ");
	__menuet__debug_out(curlstr);
	__menuet__debug_out("\n");

	retstr = strdup(curlstr);
	/* free(curlstr); */ /* Doesn't work because mem not allocated with malloc/calloc/realloc*/
	/* TODO: Use mem_free here*/
	/* NOTE(review): until the TODO above is resolved, curlstr is never
	 * released on any path out of this function. */

	__menuet__debug_out("After strdup in url.c\n");

	if (retstr == NULL) {
		__menuet__debug_out("retstr is NULL in url.c\n");
		return URL_FUNC_NOMEM;
	}

	*result = retstr;
	__menuet__debug_out("returning from url_unescape in url.c\n");
	return URL_FUNC_OK;

}
 
/**
 * Percent-encode a string so it can be placed inside an URL.
 *
 * The first \a toskip bytes are copied through verbatim; only the rest of
 * the string is examined.  Non-ASCII bytes, control characters, space and
 * DEL are always encoded.  The reserved punctuation listed in the function
 * is encoded unless it appears in \a escexceptions.
 *
 * \param  unescaped      the unescaped string
 * \param  toskip         number of bytes to skip in unescaped string
 * \param  sptoplus       true iff spaces should be converted to +
 * \param  escexceptions  NULL or a string of characters excluded to be escaped
 * \param  result         pointer to pointer to buffer to hold escaped string;
 *			  the caller frees it with free()
 * \return  URL_FUNC_OK on success, URL_FUNC_FAILED on NULL arguments or
 *	    if \a toskip exceeds the string length, URL_FUNC_NOMEM on
 *	    memory exhaustion
 */

url_func_result url_escape(const char *unescaped, size_t toskip,
		bool sptoplus, const char *escexceptions, char **result)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	/* '~' is unreserved and should not be percent encoded, if you
	 * believe the spec; however, leaving it unescaped breaks a bunch
	 * of websites, so it is listed here anyway. */
	static const char reserved[] =
			":/?#[]@" /* gen-delims */
			"!$&'()*+,;=" /* sub-delims */
			"<>%\"{}|\\^`~" /* others */;
	size_t tail_len;
	size_t escaped_size;
	char *work, *out, *joined;
	const char *in;

	if (unescaped == NULL || result == NULL)
		return URL_FUNC_FAILED;

	*result = NULL;

	tail_len = strlen(unescaped);
	if (toskip > tail_len)
		return URL_FUNC_FAILED;
	tail_len -= toskip;

	/* worst case: every byte becomes a three-byte %XX sequence */
	work = malloc(tail_len * 3 + 1);
	if (work == NULL)
		return URL_FUNC_NOMEM;

	out = work;
	for (in = unescaped + toskip; *in != '\0'; in++) {
		const char ch = *in;
		bool must_escape;

		if (!isascii(ch) || ch <= 0x20 || ch == 0x7f) {
			/* never allowed through, exceptions or not */
			must_escape = true;
		} else if (strchr(reserved, ch) == NULL) {
			/* unreserved characters: [a-zA-Z0-9-._] */
			must_escape = false;
		} else {
			/* reserved: escape unless the caller exempted it */
			must_escape = (escexceptions == NULL) ||
					(strchr(escexceptions, ch) == NULL);
		}

		if (!must_escape) {
			*out++ = ch;
		} else if (ch == 0x20 && sptoplus) {
			*out++ = '+';
		} else {
			*out++ = '%';
			*out++ = hex_digits[((ch >> 4) & 0xf)];
			*out++ = hex_digits[(ch & 0xf)];
		}
	}
	*out++ = '\0';
	escaped_size = out - work;	/* includes the terminator */

	/* stitch the untouched prefix and the escaped tail together */
	joined = malloc(escaped_size + toskip);
	if (joined == NULL) {
		free(work);
		return URL_FUNC_NOMEM;
	}

	memcpy(joined, unescaped, toskip);
	memcpy(joined + toskip, work, escaped_size);
	*result = joined;

	free(work);

	return URL_FUNC_OK;
}
 
 
#ifdef TEST
 
/* Test harness (built only with TEST defined): treats argv[1] as the base
 * URL and prints the result of joining each later argument onto it.  The
 * commented-out sections are alternative checks kept for manual use. */
int main(int argc, char *argv[])
{
	int i;
	url_func_result res;
	char *s;

	url_init();

	i = 1;
	while (i != argc) {
/*		printf("==> '%s'\n", argv[i]);
		res = url_normalize(argv[i], &s);
		if (res == URL_FUNC_OK) {
			printf("<== '%s'\n", s);
			free(s);
		}*/
/*		printf("==> '%s'\n", argv[i]);
		res = url_host(argv[i], &s);
		if (res == URL_FUNC_OK) {
			printf("<== '%s'\n", s);
			free(s);
		}*/
		/* the base itself (argv[1]) is not joined onto itself */
		if (i != 1) {
			res = url_join(argv[i], argv[1], &s);
			if (res == URL_FUNC_OK) {
				printf("'%s' + '%s' \t= '%s'\n", argv[1],
						argv[i], s);
				free(s);
			}
		}
/*		printf("'%s' => ", argv[i]);
		res = url_nice(argv[i], &s, true);
		if (res == URL_FUNC_OK) {
			printf("'%s', ", s);
			free(s);
		} else {
			printf("failed %u, ", res);
		}
		res = url_nice(argv[i], &s, false);
		if (res == URL_FUNC_OK) {
			printf("'%s', ", s);
			free(s);
		} else {
			printf("failed %u, ", res);
		}
		printf("\n");*/
		i++;
	}

	return 0;
}
 
/* Compile a regular expression, or report the failure on stderr and
 * terminate the process with exit status 1.  On return the expression in
 * preg is ready for use. */
void regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
{
	int rc = regcomp(preg, regex, cflags);

	if (rc != 0) {
		char message[200];

		regerror(rc, preg, message, sizeof message);
		fprintf(stderr, "Failed to compile regexp '%s'\n", regex);
		fprintf(stderr, "error: %s\n", message);
		exit(1);
	}
}
 
#endif

Rev 4364	Rev 5043
1	/*	1	/*
2	* Copyright 2006 Richard Wilson	2	* Copyright 2006 Richard Wilson
3	* Copyright 2005 James Bursa	3	* Copyright 2005 James Bursa
4	* Copyright 2005 John M Bell	4	* Copyright 2005 John M Bell
5	*	5	*
6	* This file is part of NetSurf, http://www.netsurf-browser.org/	6	* This file is part of NetSurf, http://www.netsurf-browser.org/
7	*	7	*
8	* NetSurf is free software; you can redistribute it and/or modify	8	* NetSurf is free software; you can redistribute it and/or modify
9	* it under the terms of the GNU General Public License as published by	9	* it under the terms of the GNU General Public License as published by
10	* the Free Software Foundation; version 2 of the License.	10	* the Free Software Foundation; version 2 of the License.
11	*	11	*
12	* NetSurf is distributed in the hope that it will be useful,	12	* NetSurf is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.	15	* GNU General Public License for more details.
16	*	16	*
17	* You should have received a copy of the GNU General Public License	17	* You should have received a copy of the GNU General Public License
18	* along with this program. If not, see .	18	* along with this program. If not, see .
19	*/	19	*/
20		20
21	/** \file	21	/** \file
22	* URL parsing and joining (implementation).	22	* URL parsing and joining (implementation).
23	*/	23	*/
24		24
25	#include	25	#include
26	#include	26	#include
27	#include	27	#include
28	#include	28	#include
29	#include	29	#include
30	#include	30	#include
31	#include	31	#include
32		32
33	#include "curl/curl.h"	33	#include "curl/curl.h"
34	#include "utils/config.h"	34	#include "utils/config.h"
35	#include "utils/log.h"	35	#include "utils/log.h"
36	#include "utils/url.h"	36	#include "utils/url.h"
37	#include "utils/utils.h"	37	#include "utils/utils.h"
-		38	#include "content/fetchers/http_msg.h"
-		39	#include "content/fetchers/http.h"
-		40
38		41
39	struct url_components_internal {	42	struct url_components_internal {
40	char *buffer; /* buffer used for all the following data */	43	char *buffer; /* buffer used for all the following data */
41	char *scheme;	44	char *scheme;
42	char *authority;	45	char *authority;
43	char *path;	46	char *path;
44	char *query;	47	char *query;
45	char *fragment;	48	char *fragment;
46	};	49	};
47		50
48		51
49	regex_t url_re, url_up_re;	52	regex_t url_re, url_up_re;
50		53
51	/**	54	/**
52	* Initialise URL routines.	55	* Initialise URL routines.
53	*	56	*
54	* Compiles regular expressions required by the url_ functions.	57	* Compiles regular expressions required by the url_ functions.
55	*/	58	*/
56		59
57	void url_init(void)	60	void url_init(void)
58	{	61	{
59	/* regex from RFC 2396 */	62	/* regex from RFC 2396 */
60	regcomp_wrapper(&url_re, "^[[:space:]]*"	63	regcomp_wrapper(&url_re, "^[[:space:]]*"
61	#define URL_RE_SCHEME 2	64	#define URL_RE_SCHEME 2
62	"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"	65	"(([a-zA-Z][-a-zA-Z0-9+.]*):)?"
63	#define URL_RE_AUTHORITY 4	66	#define URL_RE_AUTHORITY 4
64	"(//([^/?#[:space:]]*))?"	67	"(//([^/?#[:space:]]*))?"
65	#define URL_RE_PATH 5	68	#define URL_RE_PATH 5
66	"([^?#[:space:]]*)"	69	"([^?#[:space:]]*)"
67	#define URL_RE_QUERY 7	70	#define URL_RE_QUERY 7
68	"(\\?([^#[:space:]]*))?"	71	"(\\?([^#[:space:]]*))?"
69	#define URL_RE_FRAGMENT 9	72	#define URL_RE_FRAGMENT 9
70	"(#([^[:space:]]*))?"	73	"(#([^[:space:]]*))?"
71	"[[:space:]]*$", REG_EXTENDED);	74	"[[:space:]]*$", REG_EXTENDED);
72	regcomp_wrapper(&url_up_re,	75	regcomp_wrapper(&url_up_re,
73	"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"	76	"/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)"
74	"/[.][.](/|$)",	77	"/[.][.](/|$)",
75	REG_EXTENDED);	78	REG_EXTENDED);
76	}	79	}
77		80
78		81
79	/**	82	/**
80	* Check whether a host string is an IP address. It should support and	83	* Check whether a host string is an IP address. It should support and
81	* detect IPv4 addresses (all of dotted-quad or subsets, decimal or	84	* detect IPv4 addresses (all of dotted-quad or subsets, decimal or
82	* hexadecimal notations) and IPv6 addresses (including those containing	85	* hexadecimal notations) and IPv6 addresses (including those containing
83	* embedded IPv4 addresses.)	86	* embedded IPv4 addresses.)
84	*	87	*
85	* \param host a hostname terminated by '\0'	88	* \param host a hostname terminated by '\0'
86	* \return true if the hostname is an IP address, false otherwise	89	* \return true if the hostname is an IP address, false otherwise
87	*/	90	*/
88	bool url_host_is_ip_address(const char *host)	91	bool url_host_is_ip_address(const char *host)
89	{	92	{
90	struct in_addr ipv4;	93	struct in_addr ipv4;
91	size_t host_len = strlen(host);	94	size_t host_len = strlen(host);
92	const char *sane_host;	95	const char *sane_host;
93	const char *slash;	96	const char *slash;
94	#ifndef NO_IPV6	97	#ifndef NO_IPV6
95	struct in6_addr ipv6;	98	struct in6_addr ipv6;
96	char ipv6_addr[64];	99	char ipv6_addr[64];
97	#endif	100	#endif
98	/* FIXME TODO: Some parts of urldb.c (and perhaps other parts of	101	/* FIXME TODO: Some parts of urldb.c (and perhaps other parts of
99	* NetSurf) make confusions between hosts and "prefixes", we can	102	* NetSurf) make confusions between hosts and "prefixes", we can
100	* sometimes be erroneously passed more than just a host. Sometimes	103	* sometimes be erroneously passed more than just a host. Sometimes
101	* we may be passed trailing slashes, or even whole path segments.	104	* we may be passed trailing slashes, or even whole path segments.
102	* A specific criminal in this class is urldb_iterate_partial, which	105	* A specific criminal in this class is urldb_iterate_partial, which
103	* takes a prefix to search for, but passes that prefix to functions	106	* takes a prefix to search for, but passes that prefix to functions
104	* that expect only hosts.	107	* that expect only hosts.
105	*	108	*
106	* For the time being, we will accept such calls; we check if there	109	* For the time being, we will accept such calls; we check if there
107	* is a / in the host parameter, and if there is, we take a copy and	110	* is a / in the host parameter, and if there is, we take a copy and
108	* replace the / with a \0. This is not a permanent solution; we	111	* replace the / with a \0. This is not a permanent solution; we
109	* should search through NetSurf and find all the callers that are	112	* should search through NetSurf and find all the callers that are
110	* in error and fix them. When doing this task, it might be wise	113	* in error and fix them. When doing this task, it might be wise
111	* to replace the hideousness below with code that doesn't have to do	114	* to replace the hideousness below with code that doesn't have to do
112	* this, and add assert(strchr(host, '/') == NULL); somewhere.	115	* this, and add assert(strchr(host, '/') == NULL); somewhere.
113	* -- rjek - 2010-11-04	116	* -- rjek - 2010-11-04
114	*/	117	*/
115		118
116	slash = strchr(host, '/');	119	slash = strchr(host, '/');
117	if (slash == NULL) {	120	if (slash == NULL) {
118	sane_host = host;	121	sane_host = host;
119	} else {	122	} else {
120	char *c = strdup(host);	123	char *c = strdup(host);
121	c[slash - host] = '\0';	124	c[slash - host] = '\0';
122	sane_host = c;	125	sane_host = c;
123	host_len = slash - host - 1;	126	host_len = slash - host - 1;
124	LOG(("WARNING: called with non-host '%s'", host));	127	LOG(("WARNING: called with non-host '%s'", host));
125	}	128	}
126		129
127	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)	130	if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len)
128	goto out_false;	131	goto out_false;
129		132
130	if (inet_aton(sane_host, &ipv4) != 0) {	133	if (inet_aton(sane_host, &ipv4) != 0) {
131	/* This can only be a sane IPv4 address if it contains 3 dots.	134	/* This can only be a sane IPv4 address if it contains 3 dots.
132	* Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",	135	* Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c",
133	* and "a.b.c.d" as valid IPv4 address strings where we only	136	* and "a.b.c.d" as valid IPv4 address strings where we only
134	* support the full, dotted-quad, form.	137	* support the full, dotted-quad, form.
135	*/	138	*/
136	int num_dots = 0;	139	int num_dots = 0;
137	size_t index;	140	size_t index;
138		141
139	for (index = 0; index < host_len; index++) {	142	for (index = 0; index < host_len; index++) {
140	if (sane_host[index] == '.')	143	if (sane_host[index] == '.')
141	num_dots++;	144	num_dots++;
142	}	145	}
143		146
144	if (num_dots == 3)	147	if (num_dots == 3)
145	goto out_true;	148	goto out_true;
146	else	149	else
147	goto out_false;	150	goto out_false;
148	}	151	}
149		152
150	#ifndef NO_IPV6	153	#ifndef NO_IPV6
151	if (sane_host[0] != '[' || sane_host[host_len] != ']')	154	if (sane_host[0] != '[' || sane_host[host_len] != ']')
152	goto out_false;	155	goto out_false;
153		156
154	strncpy(ipv6_addr, sane_host + 1, sizeof(ipv6_addr));	157	strncpy(ipv6_addr, sane_host + 1, sizeof(ipv6_addr));
155	ipv6_addr[sizeof(ipv6_addr) - 1] = '\0';	158	ipv6_addr[sizeof(ipv6_addr) - 1] = '\0';
156		159
157	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)	160	if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1)
158	goto out_true;	161	goto out_true;
159	#endif	162	#endif
160		163
161	out_false:	164	out_false:
162	if (slash != NULL) free((void *)sane_host);	165	if (slash != NULL) free((void *)sane_host);
163	return false;	166	return false;
164		167
165	out_true:	168	out_true:
166	if (slash != NULL) free((void *)sane_host);	169	if (slash != NULL) free((void *)sane_host);
167	return true;	170	return true;
168	}	171	}
169		172
170	/**	173	/**
171	* Split a URL into separate components	174	* Split a URL into separate components
172	*	175	*
173	* URLs passed to this function are assumed to be valid and no error checking	176	* URLs passed to this function are assumed to be valid and no error checking
174	* or recovery is attempted.	177	* or recovery is attempted.
175	*	178	*
176	* See RFC 3986 for reference.	179	* See RFC 3986 for reference.
177	*	180	*
178	* \param url a valid absolute or relative URL	181	* \param url a valid absolute or relative URL
179	* \param result pointer to buffer to hold components	182	* \param result pointer to buffer to hold components
180	* \return URL_FUNC_OK on success	183	* \return URL_FUNC_OK on success
181	*/	184	*/
182		185
183	static url_func_result url_get_components(const char *url,	186	static url_func_result url_get_components(const char *url,
184	struct url_components *result)	187	struct url_components *result)
185	{	188	{
186	int storage_length;	189	int storage_length;
187	char *storage_end;	190	char *storage_end;
188	const char *scheme;	191	const char *scheme;
189	const char *authority;	192	const char *authority;
190	const char *path;	193	const char *path;
191	const char *query;	194	const char *query;
192	const char *fragment;	195	const char *fragment;
193	struct url_components_internal *internal;	196	struct url_components_internal *internal;
194		197
195	assert(url);	198	assert(url);
196		199
197	/* clear our return value */	200	/* clear our return value */
198	internal = (struct url_components_internal *)result;	201	internal = (struct url_components_internal *)result;
199	memset(result, 0x00, sizeof(struct url_components));	202	memset(result, 0x00, sizeof(struct url_components));
200		203
201	/* get enough storage space for a URL with termination at each node */	204	/* get enough storage space for a URL with termination at each node */
202	storage_length = strlen(url) + 8;	205	storage_length = strlen(url) + 8;
203	internal->buffer = malloc(storage_length);	206	internal->buffer = malloc(storage_length);
204	if (!internal->buffer)	207	if (!internal->buffer)
205	return URL_FUNC_NOMEM;	208	return URL_FUNC_NOMEM;
206	storage_end = internal->buffer;	209	storage_end = internal->buffer;
207		210
208	/* look for a valid scheme */	211	/* look for a valid scheme */
209	scheme = url;	212	scheme = url;
210	if (isalpha(*scheme)) {	213	if (isalpha(*scheme)) {
211	for (scheme = url + 1;	214	for (scheme = url + 1;
212	((*scheme != ':') && (*scheme != '\0'));	215	((*scheme != ':') && (*scheme != '\0'));
213	scheme++) {	216	scheme++) {
214	if (!isalnum(*scheme) && (*scheme != '+') &&	217	if (!isalnum(*scheme) && (*scheme != '+') &&
215	(*scheme != '-') && (*scheme != '.'))	218	(*scheme != '-') && (*scheme != '.'))
216	break;	219	break;
217	}	220	}
218		221
219	if (*scheme == ':') {	222	if (*scheme == ':') {
220	memcpy(storage_end, url, scheme - url);	223	memcpy(storage_end, url, scheme - url);
221	storage_end[scheme - url] = '\0';	224	storage_end[scheme - url] = '\0';
222	result->scheme = storage_end;	225	result->scheme = storage_end;
223	storage_end += scheme - url + 1;	226	storage_end += scheme - url + 1;
224	scheme++;	227	scheme++;
225	} else {	228	} else {
226	scheme = url;	229	scheme = url;
227	}	230	}
228	}	231	}
229		232
230		233
231	/* look for an authority */	234	/* look for an authority */
232	authority = scheme;	235	authority = scheme;
233	if ((authority[0] == '/') && (authority[1] == '/')) {	236	if ((authority[0] == '/') && (authority[1] == '/')) {
234	authority = strpbrk(scheme + 2, "/?#");	237	authority = strpbrk(scheme + 2, "/?#");
235	if (!authority)	238	if (!authority)
236	authority = scheme + strlen(scheme);	239	authority = scheme + strlen(scheme);
237	memcpy(storage_end, scheme + 2, authority - scheme - 2);	240	memcpy(storage_end, scheme + 2, authority - scheme - 2);
238	storage_end[authority - scheme - 2] = '\0';	241	storage_end[authority - scheme - 2] = '\0';
239	result->authority = storage_end;	242	result->authority = storage_end;
240	storage_end += authority - scheme - 1;	243	storage_end += authority - scheme - 1;
241	}	244	}
242		245
243		246
244	/* look for a path */	247	/* look for a path */
245	path = authority;	248	path = authority;
246	if ((*path != '?') && (*path != '#') && (*path != '\0')) {	249	if ((*path != '?') && (*path != '#') && (*path != '\0')) {
247	path = strpbrk(path, "?#");	250	path = strpbrk(path, "?#");
248	if (!path)	251	if (!path)
249	path = authority + strlen(authority);	252	path = authority + strlen(authority);
250	memcpy(storage_end, authority, path - authority);	253	memcpy(storage_end, authority, path - authority);
251	storage_end[path - authority] = '\0';	254	storage_end[path - authority] = '\0';
252	result->path = storage_end;	255	result->path = storage_end;
253	storage_end += path - authority + 1;	256	storage_end += path - authority + 1;
254	}	257	}
255		258
256		259
257	/* look for a query */	260	/* look for a query */
258	query = path;	261	query = path;
259	if (*query == '?') {	262	if (*query == '?') {
260	query = strchr(query, '#');	263	query = strchr(query, '#');
261	if (!query)	264	if (!query)
262	query = path + strlen(path);	265	query = path + strlen(path);
263	memcpy(storage_end, path + 1, query - path - 1);	266	memcpy(storage_end, path + 1, query - path - 1);
264	storage_end[query - path - 1] = '\0';	267	storage_end[query - path - 1] = '\0';
265	result->query = storage_end;	268	result->query = storage_end;
266	storage_end += query - path;	269	storage_end += query - path;
267	}	270	}
268		271
269		272
270	/* look for a fragment */	273	/* look for a fragment */
271	fragment = query;	274	fragment = query;
272	if (*fragment == '#') {	275	if (*fragment == '#') {
273	fragment = query + strlen(query);	276	fragment = query + strlen(query);
274		277
275	/* make a copy of the result for the caller */	278	/* make a copy of the result for the caller */
276	memcpy(storage_end, query + 1, fragment - query - 1);	279	memcpy(storage_end, query + 1, fragment - query - 1);
277	storage_end[fragment - query - 1] = '\0';	280	storage_end[fragment - query - 1] = '\0';
278	result->fragment = storage_end;	281	result->fragment = storage_end;
279	storage_end += fragment - query;	282	storage_end += fragment - query;
280	}	283	}
281		284
282	assert((result->buffer + storage_length) >= storage_end);	285	assert((result->buffer + storage_length) >= storage_end);
283	return URL_FUNC_OK;	286	return URL_FUNC_OK;
284	}	287	}
285		288
286		289
287	/**	290	/**
288	* Reform a URL from separate components	291	* Reform a URL from separate components
289	*	292	*
290	* See RFC 3986 for reference.	293	* See RFC 3986 for reference.
291	*	294	*
292	* \param components the components to reform into a URL	295	* \param components the components to reform into a URL
293	* \return a new URL allocated on the heap, or NULL on failure	296	* \return a new URL allocated on the heap, or NULL on failure
294	*/	297	*/
295		298
296	static char *url_reform_components(const struct url_components *components)	299	static char *url_reform_components(const struct url_components *components)
297	{	300	{
298	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,	301	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
299	fragment_len = 0;	302	fragment_len = 0;
300	char *result, *url;	303	char *result, *url;
301		304
302	/* 5.3 */	305	/* 5.3 */
303	if (components->scheme)	306	if (components->scheme)
304	scheme_len = strlen(components->scheme) + 1;	307	scheme_len = strlen(components->scheme) + 1;
305	if (components->authority)	308	if (components->authority)
306	authority_len = strlen(components->authority) + 2;	309	authority_len = strlen(components->authority) + 2;
307	if (components->path)	310	if (components->path)
308	path_len = strlen(components->path);	311	path_len = strlen(components->path);
309	if (components->query)	312	if (components->query)
310	query_len = strlen(components->query) + 1;	313	query_len = strlen(components->query) + 1;
311	if (components->fragment)	314	if (components->fragment)
312	fragment_len = strlen(components->fragment) + 1;	315	fragment_len = strlen(components->fragment) + 1;
313		316
314	/* claim memory */	317	/* claim memory */
315	url = result = malloc(scheme_len + authority_len + path_len +	318	url = result = malloc(scheme_len + authority_len + path_len +
316	query_len + fragment_len + 1);	319	query_len + fragment_len + 1);
317	if (!url) {	320	if (!url) {
318	LOG(("malloc failed"));	321	LOG(("malloc failed"));
319	return NULL;	322	return NULL;
320	}	323	}
321		324
322	/* rebuild URL */	325	/* rebuild URL */
323	if (components->scheme) {	326	if (components->scheme) {
324	sprintf(url, "%s:", components->scheme);	327	sprintf(url, "%s:", components->scheme);
325	url += scheme_len;	328	url += scheme_len;
326	}	329	}
327	if (components->authority) {	330	if (components->authority) {
328	sprintf(url, "//%s", components->authority);	331	sprintf(url, "//%s", components->authority);
329	url += authority_len;	332	url += authority_len;
330	}	333	}
331	if (components->path) {	334	if (components->path) {
332	sprintf(url, "%s", components->path);	335	sprintf(url, "%s", components->path);
333	url += path_len;	336	url += path_len;
334	}	337	}
335	if (components->query) {	338	if (components->query) {
336	sprintf(url, "?%s", components->query);	339	sprintf(url, "?%s", components->query);
337	url += query_len;	340	url += query_len;
338	}	341	}
339	if (components->fragment)	342	if (components->fragment)
340	sprintf(url, "#%s", components->fragment);	343	sprintf(url, "#%s", components->fragment);
341	return result;	344	return result;
342	}	345	}
343		346
344		347
345	/**	348	/**
346	* Release some url components from memory	349	* Release some url components from memory
347	*	350	*
348	* \param result pointer to buffer containing components	351	* \param result pointer to buffer containing components
349	*/	352	*/
350	static void url_destroy_components(const struct url_components *components)	353	static void url_destroy_components(const struct url_components *components)
351	{	354	{
352	const struct url_components_internal *internal;	355	const struct url_components_internal *internal;
353		356
354	assert(components);	357	assert(components);
355		358
356	internal = (const struct url_components_internal *)components;	359	internal = (const struct url_components_internal *)components;
357	if (internal->buffer)	360	if (internal->buffer)
358	free(internal->buffer);	361	free(internal->buffer);
359	}	362	}
360		363
361		364
362	/**	365	/**
363	* Resolve a relative URL to absolute form.	366	* Resolve a relative URL to absolute form.
364	*	367	*
365	* \param rel relative URL	368	* \param rel relative URL
366	* \param base base URL, must be absolute and cleaned as by nsurl_create()	369	* \param base base URL, must be absolute and cleaned as by nsurl_create()
367	* \param result pointer to pointer to buffer to hold absolute url	370	* \param result pointer to pointer to buffer to hold absolute url
368	* \return URL_FUNC_OK on success	371	* \return URL_FUNC_OK on success
369	*/	372	*/
370		373
371	url_func_result url_join(const char *rel, const char *base, char **result)	374	url_func_result url_join(const char *rel, const char *base, char **result)
372	{	375	{
373	url_func_result status = URL_FUNC_NOMEM;	376	url_func_result status = URL_FUNC_NOMEM;
374	struct url_components_internal base_components = {0,0,0,0,0,0};	377	struct url_components_internal base_components = {0,0,0,0,0,0};
375	struct url_components_internal *base_ptr = &base_components;	378	struct url_components_internal *base_ptr = &base_components;
376	struct url_components_internal rel_components = {0,0,0,0,0,0};	379	struct url_components_internal rel_components = {0,0,0,0,0,0};
377	struct url_components_internal *rel_ptr = &rel_components;	380	struct url_components_internal *rel_ptr = &rel_components;
378	struct url_components_internal merged_components = {0,0,0,0,0,0};	381	struct url_components_internal merged_components = {0,0,0,0,0,0};
379	struct url_components_internal *merged_ptr = &merged_components;	382	struct url_components_internal *merged_ptr = &merged_components;
380	char *merge_path = NULL, *split_point;	383	char *merge_path = NULL, *split_point;
381	char *input, *output, *start = NULL;	384	char *input, *output, *start = NULL;
382	int len, buf_len;	385	int len, buf_len;
383		386
384	(*result) = 0;	387	(*result) = 0;
385		388
386	assert(base);	389	assert(base);
387	assert(rel);	390	assert(rel);
388		391
389		392
390	/* break down the relative URL (not cached, corruptable) */	393	/* break down the relative URL (not cached, corruptable) */
391	status = url_get_components(rel, (struct url_components *) rel_ptr);	394	status = url_get_components(rel, (struct url_components *) rel_ptr);
392	if (status != URL_FUNC_OK) {	395	if (status != URL_FUNC_OK) {
393	LOG(("relative url '%s' failed to get components", rel));	396	LOG(("relative url '%s' failed to get components", rel));
394	return URL_FUNC_FAILED;	397	return URL_FUNC_FAILED;
395	}	398	}
396		399
397	/* [1] relative URL is absolute, use it entirely */	400	/* [1] relative URL is absolute, use it entirely */
398	merged_components = rel_components;	401	merged_components = rel_components;
399	if (rel_components.scheme)	402	if (rel_components.scheme)
400	goto url_join_reform_url;	403	goto url_join_reform_url;
401		404
402	/* break down the base URL (possibly cached, not corruptable) */	405	/* break down the base URL (possibly cached, not corruptable) */
403	status = url_get_components(base, (struct url_components *) base_ptr);	406	status = url_get_components(base, (struct url_components *) base_ptr);
404	if (status != URL_FUNC_OK) {	407	if (status != URL_FUNC_OK) {
405	url_destroy_components((struct url_components *) rel_ptr);	408	url_destroy_components((struct url_components *) rel_ptr);
406	LOG(("base url '%s' failed to get components", base));	409	LOG(("base url '%s' failed to get components", base));
407	return URL_FUNC_FAILED;	410	return URL_FUNC_FAILED;
408	}	411	}
409		412
410	/* [2] relative authority takes precedence */	413	/* [2] relative authority takes precedence */
411	merged_components.scheme = base_components.scheme;	414	merged_components.scheme = base_components.scheme;
412	if (rel_components.authority)	415	if (rel_components.authority)
413	goto url_join_reform_url;	416	goto url_join_reform_url;
414		417
415	/* [3] handle empty paths */	418	/* [3] handle empty paths */
416	merged_components.authority = base_components.authority;	419	merged_components.authority = base_components.authority;
417	if (!rel_components.path) {	420	if (!rel_components.path) {
418	merged_components.path = base_components.path;	421	merged_components.path = base_components.path;
419	if (!rel_components.query)	422	if (!rel_components.query)
420	merged_components.query = base_components.query;	423	merged_components.query = base_components.query;
421	goto url_join_reform_url;	424	goto url_join_reform_url;
422	}	425	}
423		426
424	/* [4] handle valid paths */	427	/* [4] handle valid paths */
425	if (rel_components.path[0] == '/')	428	if (rel_components.path[0] == '/')
426	merged_components.path = rel_components.path;	429	merged_components.path = rel_components.path;
427	else {	430	else {
428	/* 5.2.3 */	431	/* 5.2.3 */
429	if ((base_components.authority) && (!base_components.path)) {	432	if ((base_components.authority) && (!base_components.path)) {
430	merge_path = malloc(strlen(rel_components.path) + 2);	433	merge_path = malloc(strlen(rel_components.path) + 2);
431	if (!merge_path) {	434	if (!merge_path) {
432	LOG(("malloc failed"));	435	LOG(("malloc failed"));
433	goto url_join_no_mem;	436	goto url_join_no_mem;
434	}	437	}
435	sprintf(merge_path, "/%s", rel_components.path);	438	sprintf(merge_path, "/%s", rel_components.path);
436	merged_components.path = merge_path;	439	merged_components.path = merge_path;
437	} else {	440	} else {
438	split_point = base_components.path ?	441	split_point = base_components.path ?
439	strrchr(base_components.path, '/') :	442	strrchr(base_components.path, '/') :
440	NULL;	443	NULL;
441	if (!split_point) {	444	if (!split_point) {
442	merged_components.path = rel_components.path;	445	merged_components.path = rel_components.path;
443	} else {	446	} else {
444	len = ++split_point - base_components.path;	447	len = ++split_point - base_components.path;
445	buf_len = len + 1 + strlen(rel_components.path);	448	buf_len = len + 1 + strlen(rel_components.path);
446	merge_path = malloc(buf_len);	449	merge_path = malloc(buf_len);
447	if (!merge_path) {	450	if (!merge_path) {
448	LOG(("malloc failed"));	451	LOG(("malloc failed"));
449	goto url_join_no_mem;	452	goto url_join_no_mem;
450	}	453	}
451	memcpy(merge_path, base_components.path, len);	454	memcpy(merge_path, base_components.path, len);
452	memcpy(merge_path + len, rel_components.path,	455	memcpy(merge_path + len, rel_components.path,
453	strlen(rel_components.path));	456	strlen(rel_components.path));
454	merge_path[buf_len - 1] = '\0';	457	merge_path[buf_len - 1] = '\0';
455	merged_components.path = merge_path;	458	merged_components.path = merge_path;
456	}	459	}
457	}	460	}
458	}	461	}
459		462
460	url_join_reform_url:	463	url_join_reform_url:
461	/* 5.2.4 */	464	/* 5.2.4 */
462	input = merged_components.path;	465	input = merged_components.path;
463	if ((input) && (strchr(input, '.'))) {	466	if ((input) && (strchr(input, '.'))) {
464	/* [1] remove all dot references */	467	/* [1] remove all dot references */
465	output = start = malloc(strlen(input) + 1);	468	output = start = malloc(strlen(input) + 1);
466	if (!output) {	469	if (!output) {
467	LOG(("malloc failed"));	470	LOG(("malloc failed"));
468	goto url_join_no_mem;	471	goto url_join_no_mem;
469	}	472	}
470	merged_components.path = output;	473	merged_components.path = output;
471	*output = '\0';	474	*output = '\0';
472		475
473	while (*input != '\0') {	476	while (*input != '\0') {
474	/* [2A] */	477	/* [2A] */
475	if (input[0] == '.') {	478	if (input[0] == '.') {
476	if (input[1] == '/') {	479	if (input[1] == '/') {
477	input = input + 2;	480	input = input + 2;
478	continue;	481	continue;
479	} else if ((input[1] == '.') &&	482	} else if ((input[1] == '.') &&
480	(input[2] == '/')) {	483	(input[2] == '/')) {
481	input = input + 3;	484	input = input + 3;
482	continue;	485	continue;
483	}	486	}
484	}	487	}
485		488
486	/* [2B] */	489	/* [2B] */
487	if ((input[0] == '/') && (input[1] == '.')) {	490	if ((input[0] == '/') && (input[1] == '.')) {
488	if (input[2] == '/') {	491	if (input[2] == '/') {
489	input = input + 2;	492	input = input + 2;
490	continue;	493	continue;
491	} else if (input[2] == '\0') {	494	} else if (input[2] == '\0') {
492	input = input + 1;	495	input = input + 1;
493	*input = '/';	496	*input = '/';
494	continue;	497	continue;
495	}	498	}
496		499
497	/* [2C] */	500	/* [2C] */

Subversion Repositories Kolibri OS

(root)/contrib/network/netsurf/netsurf/utils/url.c – Rev 4364 → 5043