# ---- thirdpart/urlobject/__init__.py (new file, mode 100644) ----
from urlobject import URLObject


# ---- thirdpart/urlobject/netloc.py (new file, mode 100644) ----
import urlparse


class Netloc(unicode):

    """
    A netloc string (``username:password@hostname:port``).

    Contains methods for accessing and (non-destructively) modifying those four
    components of the netloc. All methods return new instances.
    """

    def __repr__(self):
        return 'Netloc(%r)' % (unicode(self),)

    @classmethod
    def __unsplit(cls, username, password, hostname, port):
        """
        Put together a :class:`Netloc` from its constituent parts.

        A falsy ``username`` drops the whole auth section (including any
        password); a falsy ``password`` drops only the password. A ``port``
        of ``None`` drops the port section. A missing hostname (``None``) is
        treated as the empty string, and a hostname containing a colon (an
        IPv6 literal whose brackets were stripped by ``urlparse``) is wrapped
        in square brackets again so the result stays parseable.
        """
        auth_string = u''
        if username:
            auth_string = username
            if password:
                auth_string += u':' + password
            auth_string += u'@'

        # ``SplitResult.hostname`` is None for an empty netloc; concatenating
        # None would raise a TypeError.
        host_string = hostname or u''
        # An unbracketed colon in the host would be read back as the port
        # separator, so restore the brackets around IPv6 literals.
        if u':' in host_string and not host_string.startswith(u'['):
            host_string = u'[%s]' % host_string

        port_string = u''
        if port is not None:
            port_string = u':%d' % port
        return cls(auth_string + host_string + port_string)

    @property
    def username(self):
        """The username portion of this netloc, or ``None``."""
        return self.__urlsplit.username

    def with_username(self, username):
        """Replace or add a username to this netloc."""
        return self.__replace(username=username)

    def without_username(self):
        """Remove any username (and password) from this netloc."""
        return self.without_password().with_username('')

    @property
    def password(self):
        """The password portion of this netloc, or ``None``."""
        return self.__urlsplit.password

    def with_password(self, password):

        """
        Replace or add a password to this netloc.

        Raises a ``ValueError`` if you attempt to add a password to a netloc
        with no username.
        """

        if password and not self.username:
            raise ValueError("Can't set a password on a netloc with no username")
        return self.__replace(password=password)

    def without_password(self):
        """Remove any password from this netloc."""
        return self.with_password('')

    @property
    def auth(self):
        """The username and password of this netloc as a 2-tuple."""
        return (self.username, self.password)

    def with_auth(self, username, *password):
        """Replace or add a username and password in one method call."""
        netloc = self.without_auth()
        if password:
            return netloc.with_username(username).with_password(*password)
        return netloc.with_username(username)

    def without_auth(self):
        """Remove both the username and the password from this netloc."""
        return self.without_password().without_username()

    @property
    def hostname(self):
        """The hostname portion of this netloc."""
        return self.__urlsplit.hostname

    def with_hostname(self, hostname):
        """Replace the hostname on this netloc."""
        return self.__replace(hostname=hostname)

    @property
    def port(self):
        """The port number on this netloc (as an ``int``), or ``None``."""
        return self.__urlsplit.port

    def with_port(self, port):
        """Replace or add a port number to this netloc."""
        return self.__replace(port=port)

    def without_port(self):
        """Remove any port number from this netloc."""
        return self.__replace(port=None)

    @property
    def __urlsplit(self):
        # Wrap this netloc in an otherwise-empty SplitResult so that its
        # username/password/hostname/port accessors do the parsing.
        return urlparse.SplitResult('', self, '', '', '')

    def __replace(self, **params):
        """Replace any number of components on this netloc."""
        unsplit_args = {'username': self.username,
                        'password': self.password,
                        'hostname': self.hostname,
                        'port': self.port}
        unsplit_args.update(params)
        return self.__unsplit(**unsplit_args)


# ---- thirdpart/urlobject/path.py (new file, mode 100644) ----
# -*- coding: utf-8 -*-

import posixpath
import urllib
import urlparse


class Root(object):

    """A descriptor which always returns the root path."""

    def __get__(self, instance, cls):
        # Build the root from the owning class, so the result has the same
        # type as the class the attribute was looked up on.
        return cls('/')


class URLPath(unicode):

    """The path component of a URL, with helpers that return new paths."""

    root = Root()

    def __repr__(self):
        return 'URLPath(%r)' % (unicode(self),)

    @classmethod
    def join_segments(cls, segments, absolute=True):
        """
        Create a :class:`URLPath` from an iterable of segments.

        Each segment is percent-encoded via :meth:`add_segment`. When
        ``absolute`` is true (the default) the path starts with a slash;
        otherwise a relative path is built.
        """
        path = cls('/' if absolute else '')
        for segment in segments:
            path = path.add_segment(segment)
        return path

    @property
    def segments(self):
        """
        Split this path into (decoded) segments.

            >>> URLPath(u'/a/b/c').segments
            (u'a', u'b', u'c')

        Non-leaf nodes will have a trailing empty string, and percent encodes
        will be decoded:

            >>> URLPath(u'/a%20b/c%20d/').segments
            (u'a b', u'c d', u'')
        """
        segments = tuple(map(path_decode, self.split('/')))
        # A leading slash produces an empty first element; drop it.
        if segments[0] == u'':
            return segments[1:]
        return segments

    @property
    def parent(self):
        """
        The parent of this node.

            >>> URLPath(u'/a/b/c').parent
            URLPath(u'/a/b/')
            >>> URLPath(u'/foo/bar/').parent
            URLPath(u'/foo/')
        """
        if self.is_leaf:
            return self.relative('.')
        return self.relative('..')

    @property
    def is_leaf(self):
        """
        Is this path a leaf node?

            >>> URLPath(u'/a/b/c').is_leaf
            True
            >>> URLPath(u'/a/b/').is_leaf
            False

        An empty path is not a leaf.
        """
        # Coerce to bool so an empty path yields False rather than itself.
        return bool(self and self.segments[-1] != u'')

    @property
    def is_relative(self):
        """
        Is this path relative?

            >>> URLPath(u'a/b/c').is_relative
            True
            >>> URLPath(u'/a/b/c').is_relative
            False

        An empty path counts as relative.
        """
        # startswith() rather than self[0], which raises IndexError on u''.
        return not self.startswith(u'/')

    @property
    def is_absolute(self):
        """
        Is this path absolute?

            >>> URLPath(u'a/b/c').is_absolute
            False
            >>> URLPath(u'/a/b/c').is_absolute
            True

        An empty path is not absolute.
        """
        return self.startswith(u'/')

    def relative(self, rel_path):
        """
        Resolve a relative path against this one.

            >>> URLPath(u'/a/b/c').relative('.')
            URLPath(u'/a/b/')
            >>> URLPath(u'/a/b/c').relative('d')
            URLPath(u'/a/b/d')
            >>> URLPath(u'/a/b/c').relative('../d')
            URLPath(u'/a/d')
        """
        return type(self)(urlparse.urljoin(self, rel_path))

    def add_segment(self, segment):
        u"""
        Add a segment to this path.

            >>> URLPath(u'/a/b/').add_segment('c')
            URLPath(u'/a/b/c')

        Non-ASCII and reserved characters (including slashes) will be encoded:

            >>> URLPath(u'/a/b/').add_segment(u'dé/f')
            URLPath(u'/a/b/d%C3%A9%2Ff')
        """
        return type(self)(posixpath.join(self, path_encode(segment)))

    def add(self, path):
        u"""
        Add a partial path to this one.

        The only difference between this and :meth:`add_segment` is that slash
        characters will not be encoded, making it suitable for adding more than
        one path segment at a time:

            >>> URLPath(u'/a/b/').add(u'dé/f/g')
            URLPath(u'/a/b/d%C3%A9/f/g')
        """
        return type(self)(posixpath.join(self, path_encode(path, safe='/')))


def path_encode(string, safe=''):
    """UTF-8 encode ``string`` and percent-quote it, leaving ``safe`` as is."""
    return urllib.quote(string.encode('utf-8'), safe=safe)


def path_decode(string):
    """Percent-unquote ``string`` and decode the result as UTF-8."""
    return urllib.unquote(string).decode('utf-8')


# ---- thirdpart/urlobject/ports.py (new file, mode 100644) ----
"""Default port numbers for the URI schemes supported by urlparse."""

DEFAULT_PORTS = {
    'ftp': 21,
    'gopher': 70,
    'hdl': 2641,
    'http': 80,
    'https': 443,
    'imap': 143,
    'mms': 651,
    'news': 2009,
    'nntp': 119,
    'prospero': 191,
    'rsync': 873,
    'rtsp': 554,
    'rtspu': 554,
    'sftp': 115,
    'shttp': 80,
    'sip': 5060,
    'sips': 5061,
    'snews': 2009,
    'svn': 3690,
    'svn+ssh': 22,
    'telnet': 23,
}


# ---- thirdpart/urlobject/query_string.py (new file, mode 100644) ----
import collections
import re
import urllib
import urlparse


class QueryString(unicode):

    """The query component of a URL, with helpers that return new strings."""

    def __repr__(self):
        return 'QueryString(%r)' % (unicode(self),)

    @property
    def list(self):
        """
        The decoded parameters as an ordered list of ``(name, value)`` pairs.

        Pairs are separated by ``&`` or ``;``. A parameter without an equals
        sign gets the value ``None``.
        """
        result = []
        if not self:
            # Empty string => empty list.
            return result

        name_value_pairs = re.split(r'[\&\;]', self)
        for name_value_pair in name_value_pairs:
            # Split the pair string into a naive, encoded (name, value) pair.
            name_value = name_value_pair.split('=', 1)
            # 'param' => ('param', None)
            if len(name_value) == 1:
                name, value = name_value + [None]
            # 'param=value' => ('param', 'value')
            # 'param=' => ('param', '')
            else:
                name, value = name_value

            name = qs_decode(name)
            if value is not None:
                value = qs_decode(value)

            result.append((name, value))
        return result

    @property
    def dict(self):
        """The parameters as a dict; a repeated name keeps its last value."""
        return dict(self.list)

    @property
    def multi_dict(self):
        """The parameters as a dict mapping each name to a list of values."""
        result = collections.defaultdict(list)
        for name, value in self.list:
            result[name].append(value)
        return dict(result)

    def add_param(self, name, value):
        """
        Append a parameter to this query string.

        A ``value`` of ``None`` adds the bare name with no equals sign.
        """
        if value is None:
            parameter = qs_encode(name)
        else:
            parameter = qs_encode(name) + '=' + qs_encode(value)
        if self:
            return type(self)(self + '&' + parameter)
        return type(self)(parameter)

    def add_params(self, *args, **kwargs):
        """Append several parameters; see :func:`get_params_list`."""
        params_list = get_params_list(*args, **kwargs)
        new = self
        for name, value in params_list:
            new = new.add_param(name, value)
        return new

    def del_param(self, name):
        """Remove every parameter called ``name``."""
        return self.del_params([name])

    def set_param(self, name, value):
        """Remove every parameter called ``name``, then append the new one."""
        return self.del_param(name).add_param(name, value)

    def set_params(self, *args, **kwargs):
        """Set several parameters; see :func:`get_params_list`."""
        params_list = get_params_list(*args, **kwargs)
        new = self
        for name, value in params_list:
            new = new.set_param(name, value)
        return new

    def del_params(self, params):
        """Remove every parameter whose name is in the iterable ``params``."""
        deleted = set(params)
        params = [(name, value) for name, value in self.list
                  if name not in deleted]
        # Rebuild from scratch so the survivors are re-encoded uniformly.
        qs = type(self)('')
        for param in params:
            qs = qs.add_param(*param)
        return qs


def qs_encode(s):
    """UTF-8 encode ``s`` and percent-quote it for use in a query string."""
    return urllib.quote(s.encode('utf-8'))


def qs_decode(s):
    """
    Decode one query string name or value.

    Plus signs become spaces, percent escapes are unquoted, and the resulting
    bytes are decoded as UTF-8.
    """
    # Encode explicitly: str() on unicode uses the ASCII codec and raises
    # UnicodeEncodeError for any non-ASCII character.
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return urllib.unquote(str(s).replace('+', ' ')).decode('utf-8')


def get_params_list(*args, **kwargs):
    """
    Turn dict-like arguments into an ordered list of pairs.

    Accepts at most one positional argument, which is either an object with
    an ``items()`` method or an iterable of pairs, followed by any keyword
    arguments. Raises ``TypeError`` if more than one positional is given.
    """
    params = []
    if args:
        if len(args) > 1:
            raise TypeError("Expected at most 1 arguments, got %d"
                            % len(args))
        arg = args[0]
        if hasattr(arg, 'items'):
            params.extend(arg.items())
        else:
            params.extend(list(arg))
    if kwargs:
        params.extend(kwargs.items())
    return params


# ---- thirdpart/urlobject/urlobject.py (new file, mode 100644) ----
import urlparse

from netloc import Netloc
from path import URLPath, path_encode, path_decode
from ports import DEFAULT_PORTS
from query_string import QueryString


class URLObject(unicode):

    """
    A URL.

    This class contains properties and methods for accessing and modifying the
    constituent components of a URL. :class:`URLObject` instances are
    immutable, as they derive from the built-in ``unicode``, and therefore all
    methods return *new* objects; you need to consider this when using
    :class:`URLObject` in your own code.
    """

    def __repr__(self):
        return 'URLObject(%r)' % (unicode(self),)

    @property
    def scheme(self):
        """The scheme of this URL, as split out by ``urlparse.urlsplit``."""
        return urlparse.urlsplit(self).scheme

    def with_scheme(self, scheme):
        """Replace the scheme of this URL."""
        return self.__replace(scheme=scheme)

    @property
    def netloc(self):
        """The network location of this URL, as a :class:`Netloc`."""
        return Netloc(urlparse.urlsplit(self).netloc)

    def with_netloc(self, netloc):
        """Replace the whole network location of this URL."""
        return self.__replace(netloc=netloc)

    @property
    def username(self):
        """The username of this URL's netloc."""
        return self.netloc.username

    def with_username(self, username):
        """Replace or add a username on this URL."""
        return self.with_netloc(self.netloc.with_username(username))

    def without_username(self):
        """Remove any username (and password) from this URL."""
        return self.with_netloc(self.netloc.without_username())

    @property
    def password(self):
        """The password of this URL's netloc."""
        return self.netloc.password

    def with_password(self, password):
        """Replace or add a password on this URL."""
        return self.with_netloc(self.netloc.with_password(password))

    def without_password(self):
        """Remove any password from this URL."""
        return self.with_netloc(self.netloc.without_password())

    @property
    def hostname(self):
        """The hostname of this URL's netloc."""
        return self.netloc.hostname

    def with_hostname(self, hostname):
        """Replace the hostname of this URL."""
        return self.with_netloc(self.netloc.with_hostname(hostname))

    @property
    def port(self):
        """The explicit port number of this URL's netloc."""
        return self.netloc.port

    def with_port(self, port):
        """Replace or add a port number on this URL."""
        return self.with_netloc(self.netloc.with_port(port))

    def without_port(self):
        """Remove any port number from this URL."""
        return self.with_netloc(self.netloc.without_port())

    @property
    def auth(self):
        """The username and password of this URL as a 2-tuple."""
        return self.netloc.auth

    def with_auth(self, *auth):
        """Replace or add a username and (optionally) a password."""
        return self.with_netloc(self.netloc.with_auth(*auth))

    def without_auth(self):
        """Remove the username and password from this URL."""
        return self.with_netloc(self.netloc.without_auth())

    @property
    def default_port(self):
        """
        The destination port number for this URL.

        If no port number is explicitly given in the URL, this will return the
        default port number for the scheme if one is known, or ``None``. The
        mapping of schemes to default ports is defined in
        :const:`urlobject.ports.DEFAULT_PORTS`.
        """
        port = urlparse.urlsplit(self).port
        if port is not None:
            return port
        return DEFAULT_PORTS.get(self.scheme)

    @property
    def path(self):
        """The path of this URL, as a :class:`URLPath`."""
        return URLPath(urlparse.urlsplit(self).path)

    def with_path(self, path):
        """Replace the path of this URL."""
        return self.__replace(path=path)

    @property
    def root(self):
        """This URL with its path replaced by ``/``."""
        return self.with_path('/')

    @property
    def parent(self):
        """This URL with its path replaced by the path's parent."""
        return self.with_path(self.path.parent)

    @property
    def is_leaf(self):
        """Whether this URL's path is a leaf node."""
        return self.path.is_leaf

    def add_path_segment(self, segment):
        """Append a single (encoded) segment to this URL's path."""
        return self.with_path(self.path.add_segment(segment))

    def add_path(self, partial_path):
        """Append a partial path, leaving its slashes unencoded."""
        return self.with_path(self.path.add(partial_path))

    @property
    def query(self):
        """The query of this URL, as a :class:`QueryString`."""
        return QueryString(urlparse.urlsplit(self).query)

    def with_query(self, query):
        """Replace the query string of this URL."""
        return self.__replace(query=query)

    def without_query(self):
        """Remove the query string from this URL."""
        return self.__replace(query='')

    @property
    def query_list(self):
        """The query parameters as a list of ``(name, value)`` pairs."""
        return self.query.list

    @property
    def query_dict(self):
        """The query parameters as a dict."""
        return self.query.dict

    @property
    def query_multi_dict(self):
        """The query parameters as a dict of lists of values."""
        return self.query.multi_dict

    def add_query_param(self, name, value):
        """Append a parameter to this URL's query string."""
        return self.with_query(self.query.add_param(name, value))

    def add_query_params(self, *args, **kwargs):
        """Append several parameters to this URL's query string."""
        return self.with_query(self.query.add_params(*args, **kwargs))

    def set_query_param(self, name, value):
        """Replace all values of a query parameter with a single value."""
        return self.with_query(self.query.set_param(name, value))

    def set_query_params(self, *args, **kwargs):
        """Replace the values of several query parameters."""
        return self.with_query(self.query.set_params(*args, **kwargs))

    def del_query_param(self, name):
        """Remove every query parameter called ``name``."""
        return self.with_query(self.query.del_param(name))

    def del_query_params(self, params):
        """Remove every query parameter whose name is in ``params``."""
        return self.with_query(self.query.del_params(params))

    @property
    def fragment(self):
        """The percent-decoded fragment of this URL."""
        return path_decode(urlparse.urlsplit(self).fragment)

    def with_fragment(self, fragment):
        """Replace the fragment of this URL, percent-encoding it."""
        return self.__replace(fragment=path_encode(fragment))

    def without_fragment(self):
        """Remove the fragment from this URL."""
        return self.__replace(fragment='')

    def relative(self, other):
        """Resolve another URL relative to this one."""
        # Relative URL resolution involves cascading through the properties
        # from left to right: the first component that `other` defines wins,
        # and every component to its left is taken from this URL.
        other = type(self)(other)
        if other.scheme:
            return other
        elif other.netloc:
            return other.with_scheme(self.scheme)
        elif other.path:
            return other.with_scheme(self.scheme).with_netloc(self.netloc) \
                    .with_path(self.path.relative(other.path))
        elif other.query:
            return other.with_scheme(self.scheme).with_netloc(self.netloc) \
                    .with_path(self.path)
        elif other.fragment:
            return other.with_scheme(self.scheme).with_netloc(self.netloc) \
                    .with_path(self.path).with_query(self.query)
        # Empty string just removes fragment; it's treated as a path meaning
        # 'the current location'.
        return self.without_fragment()

    def __replace(self, **replace):
        """Replace a field in the ``urlparse.SplitResult`` for this URL."""
        return type(self)(urlparse.urlunsplit(
            urlparse.urlsplit(self)._replace(**replace)))


# Compatibility shim: when urlparse has no ``ResultMixin`` (presumably older
# Python releases whose split results are not namedtuples -- confirm), give
# ``BaseResult`` a ``_replace`` method so that ``URLObject.__replace`` works.
if not hasattr(urlparse, 'ResultMixin'):
    def _replace(split_result, **replace):
        return urlparse.SplitResult(
            **dict((attr, replace.get(attr, getattr(split_result, attr)))
                for attr in ('scheme', 'netloc', 'path', 'query', 'fragment')))
    urlparse.BaseResult._replace = _replace
    del _replace