/* URL parser and translator; implementation of RFC 2396. */
/* $Id: url.c,v 1.76 2003/06/26 23:49:00 jonas Exp $ */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "elinks.h"

#include "protocol/uri.h"
#include "util/error.h"
#include "util/memory.h"
#include "util/string.h"

/* Parser tracing switch. URIDEBUG is defined unconditionally here, so log()
 * currently always forwards its arguments to debug(). Remove the define
 * below to make every log() call expand to nothing. */
#define URIDEBUG
#ifdef URIDEBUG
#define log(msg, args...) debug(msg, ##args)
#else
#define log(msg, args...)
#endif

/* IRC discussion about this file, kept for reference:
 *
 * 03:14 < Miciah> Maybe you could replace those loops in init_uri_scan_index with some macros or somethin'.
 * 03:14 < Miciah> ...or a table of character pairs and associated bits.
 * 03:14 < Miciah> s/pairs/ranges/
 * 03:14 < fonseca> Miciah: ahh nice idea :)
 * 03:15 < fonseca> well it would be possible to generate it at compile time
 * 03:16 < Miciah> Are you suggesting a shell script, a huge table, or something really clever?
 * 03:16 < fonseca> huge table
 * 03:18 < Miciah> Could use memset for NON_ASCII characters.
 * 03:18 < Miciah> Why do you start with character 161 for NON_ASCII?
 * 03:19 < fonseca> dunno
 * 03:20 < Miciah> Seems like each character should be either reserved or non-reserved.
 * 03:21 < fonseca> i recall it got a little too complicated and confusing
 * 03:21 < fonseca> but the idea was good i think
 * 03:22 < Miciah> %s/LOOKIP/LOOKUP/
 */

/* Character class bits stored in uri_scan_index[]. The names follow the
 * grammar productions of RFC 2396. */
#define	DIGIT		(1 <<  0)	/* '0'..'9' */
#define	UPPER_ALPHA	(1 <<  1)	/* 'A'..'Z' */
#define	LOWER_ALPHA	(1 <<  2)	/* 'a'..'z' */
#define	HEX_DIGIT	(1 <<  3)	/* '0'..'9', 'A'..'F', 'a'..'f' */
#define	MARK		(1 <<  4)	/* - _ . ! ~ * ' ( ) */
#define	RESERVED	(1 <<  5)	/* ; & = + $ , : @ ? / */
#define	PATH_CHAR	(1 <<  6)	/* pchar: may appear in a path segment */
/* USER_INFO deliberately carries the MARK bit too, so a lookup with
 * USER_INFO also matches every mark character. */
#define	USER_INFO	((1 <<  7) | MARK)
#define REG_NAME	(1 <<  8)
#define PROTOCOL	(1 <<  9)	/* may appear in a scheme name */
#define REL_SEGMENT	(1 << 10)
#define URIC_NO_SLASH	(1 << 11)	/* may start an opaque_part */
#define NON_ASCII	(1 << 12)	/* only set by the disabled code below */
#define ALPHANUM	(LOWER_ALPHA | UPPER_ALPHA | DIGIT)
#define UNRESERVED	(ALPHANUM | MARK)
#define BASETYPE	(USER_INFO | REG_NAME | REL_SEGMENT | URIC_NO_SLASH)

/* Per-byte bitmap of the classes above; filled by init_uri_scan_index(). */
short int uri_scan_index[256];

/* OR @flags into the table entry of every character in the string @chars. */
static void
flag_uri_chars(const char *chars, int flags)
{
	for (; *chars; chars++)
		uri_scan_index[(unsigned char) *chars] |= flags;
}

/* Initiate bitmaps: clear uri_scan_index[] and set the class bits of every
 * character the URI scanner knows about. Must run before any lookup. */
void
init_uri_scan_index(void)
{
	int c;

	memset(uri_scan_index, 0, sizeof(uri_scan_index));

	for (c = '0'; c <= '9'; c++) {
		uri_scan_index[c] |= DIGIT | HEX_DIGIT | PROTOCOL |
				     PATH_CHAR | BASETYPE;
	}

	/* Upper and lower case letters are filled in pairs. */
	for (c = 'A'; c <= 'Z'; c++) {
		int lower = c - 'A' + 'a';

		uri_scan_index[c] |= UPPER_ALPHA | PROTOCOL | PATH_CHAR | BASETYPE;
		uri_scan_index[lower] |= LOWER_ALPHA | PROTOCOL | PATH_CHAR | BASETYPE;

		if (c <= 'F') {
			uri_scan_index[c] |= HEX_DIGIT;
			uri_scan_index[lower] |= HEX_DIGIT;
		}
	}

	/* RFC 2396: unreserved = alphanum | mark, so the mark characters
	 * belong to the same classes as the alphanumerics above. Without
	 * PATH_CHAR a path such as "index.html" would stop at the '.'. */
	flag_uri_chars("-_.!~*'()", MARK | PATH_CHAR | BASETYPE);

	flag_uri_chars(";&=+$,", RESERVED | BASETYPE);

	/* RFC 2396: pchar also contains "&=+$,". The ';' is left out on
	 * purpose since it separates a segment from its parameters. */
	flag_uri_chars("&=+$,", PATH_CHAR);

	flag_uri_chars(".+-", PROTOCOL);

	uri_scan_index[':'] |= RESERVED | PATH_CHAR | REG_NAME | REL_SEGMENT | USER_INFO;
	uri_scan_index['@'] |= RESERVED | PATH_CHAR | REG_NAME | REL_SEGMENT;
	uri_scan_index['?'] |= RESERVED | REL_SEGMENT;
	uri_scan_index['/'] |= RESERVED;

#if 0
	/* Disabled. The counter has to be wider than unsigned char, else
	 * `c <= 255' is always true and the loop never ends. */
	for (c = 161; c <= 255; c++)
		uri_scan_index[c] |= NON_ASCII;
#endif
}

/* Non-zero when character @c has at least one of the class bits in @flags
 * set in uri_scan_index[]. The cast keeps the index within 0..255 even when
 * @c is a plain (possibly signed) char. */
#define LOOKUP_INDEX(c, flags) (uri_scan_index[(unsigned char) (c)] & (flags))

/* Per-component character tests. */
#define IS_PROTOCOL(c)	LOOKUP_INDEX(c, PROTOCOL)
#define IS_USER_INFO(c)	LOOKUP_INDEX(c, USER_INFO)
#define IS_HOSTNAME(c)	LOOKUP_INDEX(c, RESERVED | UNRESERVED)
#define IS_DIGIT(c)	LOOKUP_INDEX(c, DIGIT)
#define IS_ALPHA(c)	LOOKUP_INDEX(c, LOWER_ALPHA | UPPER_ALPHA)
#define IS_PATHCHAR(c)	LOOKUP_INDEX(c, PATH_CHAR)
#define IS_QUERY(c)	LOOKUP_INDEX(c, RESERVED | UNRESERVED)
#define IS_FRAGMENT(c)	LOOKUP_INDEX(c, RESERVED | UNRESERVED)

/* The scan_*() macros advance the pointer lvalue @pos in place. They are
 * wrapped in do { } while (0) so each behaves as a single statement; @pos
 * is evaluated more than once and must not have side effects. */

/* Skip a scheme name: one letter followed by any PROTOCOL characters.
 * @pos is left untouched when it does not start with a letter. */
#define scan_protocol(pos) \
	do { \
		if (IS_ALPHA(*(pos))) { \
			(pos)++; \
			while (LOOKUP_INDEX(*(pos), PROTOCOL)) (pos)++; \
		} \
	} while (0)

/* Skip user info characters up to, but not including, the first ':'. */
#define scan_username(pos) \
	do { \
		while (IS_USER_INFO(*(pos)) && *(pos) != ':') (pos)++; \
	} while (0)

/* Skip the user info characters that follow the "user:" prefix. */
#define scan_password(pos) \
	do { \
		while (IS_USER_INFO(*(pos))) (pos)++; \
	} while (0)

/* Skip one host name label: a run of alphanumerics and '-'. Returns the
 * position of the first character that is not part of the label. */
static unsigned char *
skip_host_label(unsigned char *pos)
{
	while ((uri_scan_index[*pos] & ALPHANUM) || *pos == '-')
		pos++;

	return pos;
}

/* Skip an IPv4address (1*digit "." 1*digit "." 1*digit "." 1*digit).
 * Returns the position after the address, or @pos itself when the text
 * does not have four dot separated digit groups. */
static unsigned char *
skip_ipv4_address(unsigned char *pos)
{
	unsigned char *start = pos;
	int group;

	for (group = 0; group < 4; group++) {
		unsigned char *digits;

		if (group > 0) {
			if (*pos != '.') return start;
			pos++;
		}

		digits = pos;
		while (IS_DIGIT(*pos)) pos++;
		if (pos == digits) return start;
	}

	return pos;
}

/* Parse the hierarchic part of an absolute URI in place.
 *
 * @uri      receives pointers into the buffer (username, password, host,
 *           path); the separators that end a component are overwritten
 *           with NUL bytes, also when the parse fails later on.
 * @startpos points just after the first '/' that follows "scheme:".
 *
 * Returns the position after the parsed text, or @startpos when nothing
 * valid was found; parse_uri() treats the latter as a failure.
 *
 * NOTE(review): since the caller has consumed the first '/', a URI of the
 * form "scheme:/path" arrives here as "path", matches neither branch and
 * is reported as a failure. An empty authority ("scheme:///path") fails
 * as well. Confirm whether those forms should be accepted.
 * NOTE(review): no port is stored. Without userinfo, "host:port" leaves
 * the port text in uri->password; after "userinfo@host" a ":port" is not
 * consumed at all. */
static unsigned char *
parse_uri_hierarchic_part(struct uri *uri, unsigned char *startpos)
{
	unsigned char *uripos = startpos;

	/* hier_part     = ( net_path | abs_path ) [ "?" query ] */
	log("hier_part		: %s", uripos);

	if (*uripos == '/') {
		uripos++;

		log("net_path		: %s", uripos);
		/* net_path      = "//" authority [ abs_path ]
		 * authority     = server | reg_name
		 * server        = [ [ userinfo "@" ] hostport ] */
		uri->username = uripos;
		while (IS_USER_INFO(*uripos) && *uripos != ':') uripos++;
		if (uri->username == uripos) return startpos;

		if (*uripos == ':') {
			*uripos++ = 0;
			uri->password = uripos;
			while (IS_USER_INFO(*uripos)) uripos++;

			if (uri->password == uripos) return startpos;
		}

		if (*uripos == '@') {
			*uripos++ = 0;
			uri->host = uripos;
		} else {
			/* No userinfo: what was scanned as the user name is
			 * the host, and uripos already points behind it. */
			uri->host = uri->username;
		}

		log("host_port		: %s", uripos);
		/* hostport      = host [ ":" digit ] */
		/* host          = hostname | IPv4address */
		if (IS_ALPHA(*uripos)) {
			log("host_name		: %s", uripos);
			/* hostname      = *( domainlabel "." ) toplabel [ "." ]
			 * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
			 * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
			 */
			uripos = skip_host_label(uripos);
			while (*uripos == '.')
				uripos = skip_host_label(uripos + 1);

		} else if (IS_DIGIT(*uripos)) {
			unsigned char *address_end;

			log("ipv4address	: %s", uripos);
			/* IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit */
			address_end = skip_ipv4_address(uripos);
			if (address_end == uripos) return startpos;

			uripos = address_end;

		} else if (uri->host != uri->username) {
			/* An '@' was seen but no host follows it. */
			debug("no host after userinfo");
			return startpos;
		}
	}

	log("done net_path	: %s", uripos);

	/* net_path can contain abs_path */
	if (*uripos == '/') {
		log("abs_path		: %s", uripos);
		/* abs_path      = "/"  path_segments */
		/* path_segments = segment *( "/" segment )
		 * segment       = *pchar *( ";" pchar )
		 */
		/* The NUL terminates the authority; path excludes the '/'. */
		*uripos++ = 0;
		uri->path = uripos;
		do {
			while (*uripos == '/') uripos++;

			while (IS_PATHCHAR(*uripos)) uripos++;

			/* Step over each ";param"; the ';' itself is not a
			 * path char and has to be skipped explicitly. */
			while (*uripos == ';') {
				uripos++;
				while (IS_PATHCHAR(*uripos)) uripos++;
			}

		} while (*uripos == '/');
	}

	return uripos;
}

/* Split the URI string that uri->protocol points to into its components.
 *
 * On entry only uri->protocol is used; every other member of @uri is
 * reset to zero. The string is modified in place: the separators that end
 * a component are overwritten with NUL bytes and the members of @uri are
 * set to point into it. The string may already have been modified when
 * the parse fails.
 *
 * Returns the number of characters parsed (the length of the original
 * string) on success, or 0 when the string is not an acceptable URI. */
int
parse_uri(struct uri *uri)
{
	unsigned char *uripos;

	/* Check the pointer before it is dereferenced. */
	assert(uri);

	uripos = uri->protocol;
	if (!uripos) return 0;

	memset(uri, 0, sizeof(struct uri));
	uri->protocol = uripos;

	log("protocol		: %s", uripos);
	/* URI-reference = absoluteURI [ "#" fragment ]
	 * absoluteURI   = scheme ":" ( hier_part | opaque_part ) */
	if (IS_ALPHA(*uripos)) {
		uripos++;
		while (IS_PROTOCOL(*uripos)) uripos++;
	}

	/* This also rejects the end of the string. */
	if (*uripos != ':') return 0;

	*uripos++ = 0;

	/* Check protocol */

	if (*uripos == '/') {
		/* The hierarchic parser is handed the position after the
		 * '/' and returns it unchanged when it fails. */
		unsigned char *newpos = parse_uri_hierarchic_part(uri, ++uripos);

		if (newpos == uripos) return 0;
		uripos = newpos;

	} else if (uri_scan_index[*uripos] & URIC_NO_SLASH) {
		log("opaque_part		: %s", uripos);
		/* opaque_part   = uric_no_slash *( reserved | unreserved | escaped ) */

		uri->username = uripos;
		while (uri_scan_index[*uripos] & (RESERVED | UNRESERVED /* | ESCAPED */)) {
			if (*uripos == '@') {
				/* NOTE(review): the text after '@' is stored
				 * in uri->password; confirm that this is the
				 * intended member. */
				*uripos++ = 0;
				uri->password = uripos;
				/* Let the loop condition test the character
				 * after the '@'; stepping over it unchecked
				 * could run past the terminating NUL. */
				continue;
			}
			uripos++;
		}
	}

	/* The query and the fragment are each optional, so a fragment may
	 * follow directly without a query in between. */
	if (*uripos == '?') {
		*uripos++ = 0;

		log("query		: %s", uripos);
		/* query         = *( reserved | unreserved | escaped ) */
		uri->query = uripos;
		while (IS_QUERY(*uripos))
			uripos++;
	}

	if (*uripos == '#') {
		*uripos++ = 0;

		log("fragment		: %s", uripos);
		/* fragment      = *( reserved | unreserved | escaped ) */
		uri->fragment = uripos;
		while (IS_FRAGMENT(*uripos))
			uripos++;
	}

	/* Anything left over means an unexpected character was hit. */
	return !*uripos ? (int) (uripos - uri->protocol) : 0;
}

