[PATCH 01 of 11 RFC] url: provide url object
Adrian Buehlmann
adrian at cadifra.com
Wed Mar 30 09:23:43 UTC 2011
On 2011-03-26 07:29, Brodie Rao wrote:
> # HG changeset patch
> # User Brodie Rao <brodie at bitheap.org>
> # Date 1301119136 25200
> # Node ID fe11ab4ab5cda92dec316a86479604280074876f
> # Parent e45780ac829283080ad6b5a5e064e59a163c08d6
> url: provide url object
>
> This adds a url object that re-implements urlsplit() and
> unsplit(). The implementation splits out usernames, passwords, and
> ports.
>
> The implementation is based on the behavior specified by RFC
> 2396[1]. However, it is much more forgiving than the RFC's
> specification; it places no specific restrictions on what characters
> are allowed in each segment of the URL other than what is necessary to
> split the URL into its constituent parts.
>
> [1]: http://www.ietf.org/rfc/rfc2396.txt
>
> diff --git a/mercurial/url.py b/mercurial/url.py
> --- a/mercurial/url.py
> +++ b/mercurial/url.py
> @@ -23,6 +23,198 @@ def _urlunparse(scheme, netloc, path, pa
> result = scheme + '://' + result[len(scheme + ':'):]
> return result
>
> +class url(object):
> + """Reliable URL parser.
> +
> + This parses URLs and provides attributes for the following
> + components:
> +
> + <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
> +
> + Missing components are set to None. The only exception is
> + fragment, which is set to '' if present but empty.
> +
> + If parse_fragment is False, fragment is included in query. If
> + parse_query is False, query is included in path. If both are
> + False, both fragment and query are included in path.
> +
> + See http://www.ietf.org/rfc/rfc2396.txt for more information.
> +
> + Examples:
> +
> + >>> url('http://www.ietf.org/rfc/rfc2396.txt')
> + <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
> + >>> url('ssh://[::1]:2200//home/joe/repo')
> + <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
> + >>> url('file:///home/joe/repo')
> + <url scheme: 'file', path: '/home/joe/repo'>
> + >>> url('bundle:foo')
> + <url scheme: 'bundle', path: 'foo'>
> +
> + Authentication credentials:
> +
> + >>> url('ssh://joe:xyz@x/repo')
> + <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
> + >>> url('ssh://joe@x/repo')
> + <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
> +
> + Query strings and fragments:
> +
> + >>> url('http://host/a?b#c')
> + <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
> + >>> url('http://host/a?b#c', parse_query=False, parse_fragment=False)
> + <url scheme: 'http', host: 'host', path: 'a?b#c'>
> + """
> +
> + _safechars = "!~*'()+"
> + _safepchars = "/!~*'()+"
> +
> + def __init__(self, path, parse_query=True, parse_fragment=True):
> + # We slowly chomp away at path until we have only the path left
> + self.scheme = self.user = self.passwd = self.host = None
> + self.port = self.path = self.query = self.fragment = None
> + self._localpath = True
> +
> + if not path.startswith('/') and ':' in path:
> + parts = path.split(':', 1)
> + if parts[0]:
> + self.scheme, path = parts
> + self._localpath = False
> +
> + if not path:
> + path = None
> + if self._localpath:
> + self.path = ''
> + return
> + else:
> + if parse_fragment and '#' in path:
> + path, self.fragment = path.split('#', 1)
> + if not path:
> + path = None
> + if self._localpath:
> + self.path = path
> + return
> +
> + if parse_query and '?' in path:
> + path, self.query = path.split('?', 1)
> + if not path:
> + path = None
> + if not self.query:
> + self.query = None
> +
> + # // is required to specify a host/authority
> + if path and path.startswith('//'):
> + parts = path[2:].split('/', 1)
> + if len(parts) > 1:
> + self.host, path = parts
> + path = path
> + else:
> + self.host = parts[0]
> + path = None
> + if not self.host:
> + self.host = None
> + if path:
> + path = '/' + path
> +
> + if self.host and '@' in self.host:
> + self.user, self.host = self.host.rsplit('@', 1)
> + if ':' in self.user:
> + self.user, self.passwd = self.user.split(':', 1)
> + if not self.host:
> + self.host = None
> +
> + # Don't split on colons in IPv6 addresses without ports
> + if (self.host and ':' in self.host and
> + not (self.host.startswith('[') and self.host.endswith(']'))):
> + self.host, self.port = self.host.rsplit(':', 1)
> + if not self.host:
> + self.host = None
> + self.path = path
> +
> + for a in ('user', 'passwd', 'host', 'port',
> + 'path', 'query', 'fragment'):
> + v = getattr(self, a)
> + if v is not None:
> + setattr(self, a, urllib.unquote(v))
> +
> + def __repr__(self):
> + attrs = []
> + for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
> + 'query', 'fragment'):
> + v = getattr(self, a)
> + if v is not None:
> + attrs.append('%s: %r' % (a, v))
> + return '<url %s>' % ', '.join(attrs)
> +
> + def __str__(self):
> + """Join the URL's components back into a URL string.
> +
> + Examples:
> +
> + >>> str(url('http://user:pw@host:80/?foo#bar'))
> + 'http://user:pw@host:80/?foo#bar'
> + >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
> + 'ssh://user:pw@[::1]:2200//home/joe#'
> + >>> str(url('http://localhost:80//'))
> + 'http://localhost:80//'
> + >>> str(url('http://localhost:80/'))
> + 'http://localhost:80/'
> + >>> str(url('http://localhost:80'))
> + 'http://localhost:80'
> + >>> str(url('bundle:foo'))
> + 'bundle:foo'
> + >>> str(url('path'))
> + 'path'
> + """
> + if self._localpath:
> + s = self.path
> + if self.fragment:
> + s += '#' + self.fragment
> + return s
> +
> + s = self.scheme + ':'
> + if (self.user or self.passwd or self.host or
> + self.scheme and not self.path):
> + s += '//'
> + if self.user:
> + s += urllib.quote(self.user, safe=self._safechars)
> + if self.passwd:
> + s += ':' + urllib.quote(self.passwd, safe=self._safechars)
> + if self.user or self.passwd:
> + s += '@'
> + if self.host:
> + if not (self.host.startswith('[') and self.host.endswith(']')):
> + s += urllib.quote(self.host)
> + else:
> + s += self.host
> + if self.port:
> + s += ':' + urllib.quote(self.port)
> + if ((self.host and self.path is not None) or
> + (self.host and self.query or self.fragment)):
> + s += '/'
> + if self.path:
> + s += urllib.quote(self.path, safe=self._safepchars)
> + if self.query:
> + s += '?' + urllib.quote(self.query, safe=self._safepchars)
> + if self.fragment is not None:
> + s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
> + return s
> +
> + def authinfo(self):
> + user, passwd = self.user, self.passwd
> + try:
> + self.user, self.passwd = None, None
> + s = str(self)
> + finally:
> + self.user, self.passwd = user, passwd
> + if not self.user:
> + return (s, None)
> + return (s, (None, (str(self), self.host),
> + self.user, self.passwd or ''))
> +
> +def has_scheme(path):
> + return bool(url(path).scheme)
> +
> def hidepassword(url):
> '''hide user credential in a url string'''
> scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
> diff --git a/tests/test-url.py b/tests/test-url.py
> --- a/tests/test-url.py
> +++ b/tests/test-url.py
> @@ -49,6 +49,142 @@ check(_verifycert({'subject': ()},
> check(_verifycert(None, 'example.com'),
> 'no certificate received')
>
> +import doctest
> +
> +def test_url():
> + """
> + >>> from mercurial.url import url
> +
> + This tests for edge cases in url.URL's parsing algorithm. Most of
> + these aren't useful for documentation purposes, so they aren't
> + part of the class's doc tests.
> +
> + Query strings and fragments:
> +
> + >>> url('http://host/a?b#c')
> + <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
> + >>> url('http://host/a?')
> + <url scheme: 'http', host: 'host', path: 'a'>
> + >>> url('http://host/a#b#c')
> + <url scheme: 'http', host: 'host', path: 'a', fragment: 'b#c'>
> + >>> url('http://host/a#b?c')
> + <url scheme: 'http', host: 'host', path: 'a', fragment: 'b?c'>
> + >>> url('http://host/?a#b')
> + <url scheme: 'http', host: 'host', path: '', query: 'a', fragment: 'b'>
> + >>> url('http://host/?a#b', parse_query=False)
> + <url scheme: 'http', host: 'host', path: '?a', fragment: 'b'>
> + >>> url('http://host/?a#b', parse_fragment=False)
> + <url scheme: 'http', host: 'host', path: '', query: 'a#b'>
> + >>> url('http://host/?a#b', parse_query=False, parse_fragment=False)
> + <url scheme: 'http', host: 'host', path: '?a#b'>
> +
> + IPv6 addresses:
> +
> + >>> url('ldap://[2001:db8::7]/c=GB?objectClass?one')
> + <url scheme: 'ldap', host: '[2001:db8::7]', path: 'c=GB',
> + query: 'objectClass?one'>
> + >>> url('ldap://joe:xxx@[2001:db8::7]:80/c=GB?objectClass?one')
> + <url scheme: 'ldap', user: 'joe', passwd: 'xxx', host: '[2001:db8::7]',
> + port: '80', path: 'c=GB', query: 'objectClass?one'>
> +
> + Missing scheme, host, etc.:
> +
> + >>> url('://192.0.2.16:80/')
> + <url path: '://192.0.2.16:80/'>
> + >>> url('http://mercurial.selenic.com')
> + <url scheme: 'http', host: 'mercurial.selenic.com'>
> + >>> url('/foo')
> + <url path: '/foo'>
> + >>> url('bundle:/foo')
> + <url scheme: 'bundle', path: '/foo'>
> + >>> url('a?b#c')
> + <url path: 'a?b', fragment: 'c'>
> + >>> url('http://x.com?arg=/foo')
> + <url scheme: 'http', host: 'x.com', query: 'arg=/foo'>
> + >>> url('http://joe:xxx@/foo')
> + <url scheme: 'http', user: 'joe', passwd: 'xxx', path: 'foo'>
> +
> + Just a scheme and a path:
> +
> + >>> url('mailto:John.Doe@example.com')
> + <url scheme: 'mailto', path: 'John.Doe@example.com'>
> + >>> url('a:b:c:d')
> + <url scheme: 'a', path: 'b:c:d'>
> +
> + SSH examples:
> +
> + >>> url('ssh://joe@host//home/joe')
> + <url scheme: 'ssh', user: 'joe', host: 'host', path: '/home/joe'>
> + >>> url('ssh://joe:xxx@host/src')
> + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', path: 'src'>
> + >>> url('ssh://joe:xxx@host')
> + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host'>
> + >>> url('ssh://joe@host')
> + <url scheme: 'ssh', user: 'joe', host: 'host'>
> + >>> url('ssh://host')
> + <url scheme: 'ssh', host: 'host'>
> + >>> url('ssh://')
> + <url scheme: 'ssh'>
> + >>> url('ssh:')
> + <url scheme: 'ssh'>
> +
> + Non-numeric port:
> +
> + >>> url('http://example.com:dd')
> + <url scheme: 'http', host: 'example.com', port: 'dd'>
> + >>> url('ssh://joe:xxx@host:ssh/foo')
> + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', port: 'ssh',
> + path: 'foo'>
> +
> + Bad authentication credentials:
> +
> + >>> url('http://joe@joeville:123@4:@host/a?b#c')
> + <url scheme: 'http', user: 'joe@joeville', passwd: '123@4:',
> + host: 'host', path: 'a', query: 'b', fragment: 'c'>
> + >>> url('http://!*#?/@!*#?/:@host/a?b#c')
> + <url scheme: 'http', host: '!*', fragment: '?/@!*#?/:@host/a?b#c'>
> + >>> url('http://!*#?@!*#?:@host/a?b#c')
> + <url scheme: 'http', host: '!*', fragment: '?@!*#?:@host/a?b#c'>
> + >>> url('http://!*@:!*@@host/a?b#c')
> + <url scheme: 'http', user: '!*@', passwd: '!*@', host: 'host',
> + path: 'a', query: 'b', fragment: 'c'>
> +
> + File paths:
> +
> + >>> url('a/b/c/d.g.f')
> + <url path: 'a/b/c/d.g.f'>
> + >>> url('/x///z/y/')
> + <url path: '/x///z/y/'>
> +
It seems we need some Windows path test cases here, including ones that
use the canonical Windows path separator ('\').
Example paths:
C:\some\path\to\file
C:\some\path with spaces\to\file
c:\some\path\to\file
\\server\share\path\to\file
(Note that accessing Mercurial repositories residing on Windows shares
is <diplomatic mode:ON>not recommended best practice, but we don't
forbid it either, so we have to support \\server\share "UNC" paths.)
Obscure examples (not sure if these should be included as test cases):
C:file
C:..\tmp.txt
"Long path" examples:
\\?\UNC\server\share\path\to\file
\\?\D:\very long path
Windows normally accepts forward slashes as path separators too (but
apparently not in "\\?\" long paths -- as I just discovered...).
So these are possible paths as well:
C:\some\path/to/file
C:/some/path/to/file
\\server\share/path/to/file
See also http://msdn.microsoft.com/en-us/library/aa365247(v=vs.85).aspx
"Naming Files, Paths, and Namespaces".
> + Empty URL:
> +
> + >>> u = url('')
> + >>> u
> + <url path: ''>
> + >>> str(u)
> + ''
> +
> + Empty path with query string:
> +
> + >>> str(url('http://foo/?bar'))
> + 'http://foo/?bar'
> +
> + Invalid path:
> +
> + >>> u = url('http://foo/bar')
> + >>> u.path = 'bar'
> + >>> str(u)
> + 'http://foo/bar'
> +
> + >>> u = url('file:///foo/bar/baz')
> + >>> u
> + <url scheme: 'file', path: '/foo/bar/baz'>
> + >>> str(u)
> + 'file:/foo/bar/baz'
> + """
> +
> +doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
> +
> # Unicode (IDN) certname isn't supported
> check(_verifycert(cert(u'\u4f8b.jp'), 'example.jp'),
> 'IDN in certificate not supported')
More information about the Mercurial-devel
mailing list