[PATCH 01 of 11 RFC] url: provide url object

Adrian Buehlmann adrian at cadifra.com
Wed Mar 30 09:23:43 UTC 2011


On 2011-03-26 07:29, Brodie Rao wrote:
> # HG changeset patch
> # User Brodie Rao <brodie at bitheap.org>
> # Date 1301119136 25200
> # Node ID fe11ab4ab5cda92dec316a86479604280074876f
> # Parent  e45780ac829283080ad6b5a5e064e59a163c08d6
> url: provide url object
> 
> This adds a url object that re-implements urlsplit() and
> unsplit(). The implementation splits out usernames, passwords, and
> ports.
> 
> The implementation is based on the behavior specified by RFC
> 2396[1]. However, it is much more forgiving than the RFC's
> specification; it places no specific restrictions on what characters
> are allowed in each segment of the URL other than what is necessary to
> split the URL into its constituent parts.
> 
> [1]: http://www.ietf.org/rfc/rfc2396.txt
> 
> diff --git a/mercurial/url.py b/mercurial/url.py
> --- a/mercurial/url.py
> +++ b/mercurial/url.py
> @@ -23,6 +23,198 @@ def _urlunparse(scheme, netloc, path, pa
>          result = scheme + '://' + result[len(scheme + ':'):]
>      return result
>  
> +class url(object):
> +    """Reliable URL parser.
> +
> +    This parses URLs and provides attributes for the following
> +    components:
> +
> +    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
> +
> +    Missing components are set to None. The only exception is
> +    fragment, which is set to '' if present but empty.
> +
> +    If parse_fragment is False, fragment is included in query. If
> +    parse_query is False, query is included in path. If both are
> +    False, both fragment and query are included in path.
> +
> +    See http://www.ietf.org/rfc/rfc2396.txt for more information.
> +
> +    Examples:
> +
> +    >>> url('http://www.ietf.org/rfc/rfc2396.txt')
> +    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
> +    >>> url('ssh://[::1]:2200//home/joe/repo')
> +    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
> +    >>> url('file:///home/joe/repo')
> +    <url scheme: 'file', path: '/home/joe/repo'>
> +    >>> url('bundle:foo')
> +    <url scheme: 'bundle', path: 'foo'>
> +
> +    Authentication credentials:
> +
> +    >>> url('ssh://joe:xyz@x/repo')
> +    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
> +    >>> url('ssh://joe@x/repo')
> +    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
> +
> +    Query strings and fragments:
> +
> +    >>> url('http://host/a?b#c')
> +    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
> +    >>> url('http://host/a?b#c', parse_query=False, parse_fragment=False)
> +    <url scheme: 'http', host: 'host', path: 'a?b#c'>
> +    """
> +
> +    _safechars = "!~*'()+"
> +    _safepchars = "/!~*'()+"
> +
> +    def __init__(self, path, parse_query=True, parse_fragment=True):
> +        # We slowly chomp away at path until we have only the path left
> +        self.scheme = self.user = self.passwd = self.host = None
> +        self.port = self.path = self.query = self.fragment = None
> +        self._localpath = True
> +
> +        if not path.startswith('/') and ':' in path:
> +            parts = path.split(':', 1)
> +            if parts[0]:
> +                self.scheme, path = parts
> +                self._localpath = False
> +
> +        if not path:
> +            path = None
> +            if self._localpath:
> +                self.path = ''
> +                return
> +        else:
> +            if parse_fragment and '#' in path:
> +                path, self.fragment = path.split('#', 1)
> +                if not path:
> +                    path = None
> +            if self._localpath:
> +                self.path = path
> +                return
> +
> +            if parse_query and '?' in path:
> +                path, self.query = path.split('?', 1)
> +                if not path:
> +                    path = None
> +                if not self.query:
> +                    self.query = None
> +
> +            # // is required to specify a host/authority
> +            if path and path.startswith('//'):
> +                parts = path[2:].split('/', 1)
> +                if len(parts) > 1:
> +                    self.host, path = parts
> +                    path = path
> +                else:
> +                    self.host = parts[0]
> +                    path = None
> +                if not self.host:
> +                    self.host = None
> +                    if path:
> +                        path = '/' + path
> +
> +            if self.host and '@' in self.host:
> +                self.user, self.host = self.host.rsplit('@', 1)
> +                if ':' in self.user:
> +                    self.user, self.passwd = self.user.split(':', 1)
> +                if not self.host:
> +                    self.host = None
> +
> +            # Don't split on colons in IPv6 addresses without ports
> +            if (self.host and ':' in self.host and
> +                not (self.host.startswith('[') and self.host.endswith(']'))):
> +                self.host, self.port = self.host.rsplit(':', 1)
> +                if not self.host:
> +                    self.host = None
> +        self.path = path
> +
> +        for a in ('user', 'passwd', 'host', 'port',
> +                  'path', 'query', 'fragment'):
> +            v = getattr(self, a)
> +            if v is not None:
> +                setattr(self, a, urllib.unquote(v))
> +
> +    def __repr__(self):
> +        attrs = []
> +        for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
> +                  'query', 'fragment'):
> +            v = getattr(self, a)
> +            if v is not None:
> +                attrs.append('%s: %r' % (a, v))
> +        return '<url %s>' % ', '.join(attrs)
> +
> +    def __str__(self):
> +        """Join the URL's components back into a URL string.
> +
> +        Examples:
> +
> +        >>> str(url('http://user:pw@host:80/?foo#bar'))
> +        'http://user:pw@host:80/?foo#bar'
> +        >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
> +        'ssh://user:pw@[::1]:2200//home/joe#'
> +        >>> str(url('http://localhost:80//'))
> +        'http://localhost:80//'
> +        >>> str(url('http://localhost:80/'))
> +        'http://localhost:80/'
> +        >>> str(url('http://localhost:80'))
> +        'http://localhost:80'
> +        >>> str(url('bundle:foo'))
> +        'bundle:foo'
> +        >>> str(url('path'))
> +        'path'
> +        """
> +        if self._localpath:
> +            s = self.path
> +            if self.fragment:
> +                s += '#' + self.fragment
> +            return s
> +
> +        s = self.scheme + ':'
> +        if (self.user or self.passwd or self.host or
> +            self.scheme and not self.path):
> +            s += '//'
> +        if self.user:
> +            s += urllib.quote(self.user, safe=self._safechars)
> +        if self.passwd:
> +            s += ':' + urllib.quote(self.passwd, safe=self._safechars)
> +        if self.user or self.passwd:
> +            s += '@'
> +        if self.host:
> +            if not (self.host.startswith('[') and self.host.endswith(']')):
> +                s += urllib.quote(self.host)
> +            else:
> +                s += self.host
> +        if self.port:
> +            s += ':' + urllib.quote(self.port)
> +        if ((self.host and self.path is not None) or
> +            (self.host and self.query or self.fragment)):
> +            s += '/'
> +        if self.path:
> +            s += urllib.quote(self.path, safe=self._safepchars)
> +        if self.query:
> +            s += '?' + urllib.quote(self.query, safe=self._safepchars)
> +        if self.fragment is not None:
> +            s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
> +        return s
> +
> +    def authinfo(self):
> +        user, passwd = self.user, self.passwd
> +        try:
> +            self.user, self.passwd = None, None
> +            s = str(self)
> +        finally:
> +            self.user, self.passwd = user, passwd
> +        if not self.user:
> +            return (s, None)
> +        return (s, (None, (str(self), self.host),
> +                    self.user, self.passwd or ''))
> +
> +def has_scheme(path):
> +    return bool(url(path).scheme)
> +
>  def hidepassword(url):
>      '''hide user credential in a url string'''
>      scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
> diff --git a/tests/test-url.py b/tests/test-url.py
> --- a/tests/test-url.py
> +++ b/tests/test-url.py
> @@ -49,6 +49,142 @@ check(_verifycert({'subject': ()},
>  check(_verifycert(None, 'example.com'),
>        'no certificate received')
>  
> +import doctest
> +
> +def test_url():
> +    """
> +    >>> from mercurial.url import url
> +
> +    This tests for edge cases in url.URL's parsing algorithm. Most of
> +    these aren't useful for documentation purposes, so they aren't
> +    part of the class's doc tests.
> +
> +    Query strings and fragments:
> +
> +    >>> url('http://host/a?b#c')
> +    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
> +    >>> url('http://host/a?')
> +    <url scheme: 'http', host: 'host', path: 'a'>
> +    >>> url('http://host/a#b#c')
> +    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b#c'>
> +    >>> url('http://host/a#b?c')
> +    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b?c'>
> +    >>> url('http://host/?a#b')
> +    <url scheme: 'http', host: 'host', path: '', query: 'a', fragment: 'b'>
> +    >>> url('http://host/?a#b', parse_query=False)
> +    <url scheme: 'http', host: 'host', path: '?a', fragment: 'b'>
> +    >>> url('http://host/?a#b', parse_fragment=False)
> +    <url scheme: 'http', host: 'host', path: '', query: 'a#b'>
> +    >>> url('http://host/?a#b', parse_query=False, parse_fragment=False)
> +    <url scheme: 'http', host: 'host', path: '?a#b'>
> +
> +    IPv6 addresses:
> +
> +    >>> url('ldap://[2001:db8::7]/c=GB?objectClass?one')
> +    <url scheme: 'ldap', host: '[2001:db8::7]', path: 'c=GB',
> +         query: 'objectClass?one'>
> +    >>> url('ldap://joe:xxx@[2001:db8::7]:80/c=GB?objectClass?one')
> +    <url scheme: 'ldap', user: 'joe', passwd: 'xxx', host: '[2001:db8::7]',
> +         port: '80', path: 'c=GB', query: 'objectClass?one'>
> +
> +    Missing scheme, host, etc.:
> +
> +    >>> url('://192.0.2.16:80/')
> +    <url path: '://192.0.2.16:80/'>
> +    >>> url('http://mercurial.selenic.com')
> +    <url scheme: 'http', host: 'mercurial.selenic.com'>
> +    >>> url('/foo')
> +    <url path: '/foo'>
> +    >>> url('bundle:/foo')
> +    <url scheme: 'bundle', path: '/foo'>
> +    >>> url('a?b#c')
> +    <url path: 'a?b', fragment: 'c'>
> +    >>> url('http://x.com?arg=/foo')
> +    <url scheme: 'http', host: 'x.com', query: 'arg=/foo'>
> +    >>> url('http://joe:xxx@/foo')
> +    <url scheme: 'http', user: 'joe', passwd: 'xxx', path: 'foo'>
> +
> +    Just a scheme and a path:
> +
> +    >>> url('mailto:John.Doe@example.com')
> +    <url scheme: 'mailto', path: 'John.Doe@example.com'>
> +    >>> url('a:b:c:d')
> +    <url scheme: 'a', path: 'b:c:d'>
> +
> +    SSH examples:
> +
> +    >>> url('ssh://joe@host//home/joe')
> +    <url scheme: 'ssh', user: 'joe', host: 'host', path: '/home/joe'>
> +    >>> url('ssh://joe:xxx@host/src')
> +    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', path: 'src'>
> +    >>> url('ssh://joe:xxx@host')
> +    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host'>
> +    >>> url('ssh://joe@host')
> +    <url scheme: 'ssh', user: 'joe', host: 'host'>
> +    >>> url('ssh://host')
> +    <url scheme: 'ssh', host: 'host'>
> +    >>> url('ssh://')
> +    <url scheme: 'ssh'>
> +    >>> url('ssh:')
> +    <url scheme: 'ssh'>
> +
> +    Non-numeric port:
> +
> +    >>> url('http://example.com:dd')
> +    <url scheme: 'http', host: 'example.com', port: 'dd'>
> +    >>> url('ssh://joe:xxx@host:ssh/foo')
> +    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', port: 'ssh',
> +         path: 'foo'>
> +
> +    Bad authentication credentials:
> +
> +    >>> url('http://joe@joeville:123@4:@host/a?b#c')
> +    <url scheme: 'http', user: 'joe@joeville', passwd: '123@4:',
> +         host: 'host', path: 'a', query: 'b', fragment: 'c'>
> +    >>> url('http://!*#?/@!*#?/:@host/a?b#c')
> +    <url scheme: 'http', host: '!*', fragment: '?/@!*#?/:@host/a?b#c'>
> +    >>> url('http://!*#?@!*#?:@host/a?b#c')
> +    <url scheme: 'http', host: '!*', fragment: '?@!*#?:@host/a?b#c'>
> +    >>> url('http://!*@:!*@@host/a?b#c')
> +    <url scheme: 'http', user: '!*@', passwd: '!*@', host: 'host',
> +         path: 'a', query: 'b', fragment: 'c'>
> +
> +    File paths:
> +
> +    >>> url('a/b/c/d.g.f')
> +    <url path: 'a/b/c/d.g.f'>
> +    >>> url('/x///z/y/')
> +    <url path: '/x///z/y/'>
> +

It seems we need some Windows path test cases here as well, including
ones that use the canonical Windows path separator ('\').

Example paths:

  C:\some\path\to\file
  C:\some\path with spaces\to\file
  c:\some\path\to\file
  \\server\share\path\to\file

(Note that accessing Mercurial repositories residing on Windows shares
is <diplomatic mode:ON>not a recommended practice<diplomatic mode:OFF>,
but we don't forbid it either, so we have to support \\server\share
"UNC" paths.)

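Obscure examples: drive-relative paths, which are resolved against the
current directory on the given drive (not sure whether these should be
included as test cases):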

  C:file
  C:..\tmp.txt

"Long path" examples:

  \\?\UNC\server\share\path\to\file
  \\?\D:\very long path

Windows normally accepts forward slashes as path separators too (but
apparently not in "\\?\" long paths -- as I just discovered...).
So these are possible paths as well:

  C:\some\path/to/file
  C:/some/path/to/file
  \\server\share/path/to/file

See also http://msdn.microsoft.com/en-us/library/aa365247(v=vs.85).aspx
"Naming Files, Paths, and Namespaces".

> +    Empty URL:
> +
> +    >>> u = url('')
> +    >>> u
> +    <url path: ''>
> +    >>> str(u)
> +    ''
> +
> +    Empty path with query string:
> +
> +    >>> str(url('http://foo/?bar'))
> +    'http://foo/?bar'
> +
> +    Invalid path:
> +
> +    >>> u = url('http://foo/bar')
> +    >>> u.path = 'bar'
> +    >>> str(u)
> +    'http://foo/bar'
> +
> +    >>> u = url('file:///foo/bar/baz')
> +    >>> u
> +    <url scheme: 'file', path: '/foo/bar/baz'>
> +    >>> str(u)
> +    'file:/foo/bar/baz'
> +    """
> +
> +doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
> +
>  # Unicode (IDN) certname isn't supported
>  check(_verifycert(cert(u'\u4f8b.jp'), 'example.jp'),
>        'IDN in certificate not supported')



More information about the Mercurial-devel mailing list