♻️ REFACTOR: Port mdurl and punycode for URL normalisation #171

Merged: 11 commits, Dec 1, 2021
66 changes: 66 additions & 0 deletions markdown_it/_punycode.py
@@ -0,0 +1,66 @@
# Copyright 2014 Mathias Bynens <https://mathiasbynens.be/>
# Copyright 2021 Taneli Hukkinen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import codecs
import re

REGEX_SEPARATORS = re.compile(r"[\x2E\u3002\uFF0E\uFF61]")
REGEX_NON_ASCII = re.compile(r"[^\0-\x7E]")


def encode(uni: str) -> str:
return codecs.encode(uni, encoding="punycode").decode()


def decode(ascii: str) -> str:
return codecs.decode(ascii, encoding="punycode") # type: ignore[call-overload]


def map_domain(string, fn):
parts = string.split("@")
result = ""
if len(parts) > 1:
# In email addresses, only the domain name should be punycoded. Leave
# the local part (i.e. everything up to `@`) intact.
result = parts[0] + "@"
string = parts[1]
labels = REGEX_SEPARATORS.split(string)
encoded = ".".join(fn(label) for label in labels)
return result + encoded


def to_unicode(obj: str) -> str:
def mapping(obj: str) -> str:
if obj.startswith("xn--"):
return decode(obj[4:].lower())
return obj

return map_domain(obj, mapping)


def to_ascii(obj: str) -> str:
def mapping(obj: str) -> str:
if REGEX_NON_ASCII.search(obj):
return "xn--" + encode(obj)
return obj

return map_domain(obj, mapping)
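
For a quick sanity check of the ported helpers (illustrative, not part of the diff), the sketch below round-trips an internationalised hostname through `to_ascii` and `to_unicode`; the Punycode literals in the comments are just what Python's built-in `punycode` codec is expected to produce.

```python
from markdown_it import _punycode

# Only the domain of an email-style string is converted; the local
# part before "@" is left intact by map_domain.
print(_punycode.to_ascii("bücher.example"))       # e.g. xn--bcher-kva.example
print(_punycode.to_ascii("user@bücher.example"))  # e.g. user@xn--bcher-kva.example

# Labels prefixed with "xn--" are decoded back to Unicode.
print(_punycode.to_unicode("xn--bcher-kva.example"))  # bücher.example
```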
177 changes: 39 additions & 138 deletions markdown_it/common/normalize_url.py
@@ -1,70 +1,13 @@
import html
import re
from typing import Callable, Optional
from urllib.parse import urlparse, urlunparse, quote, unquote # noqa: F401

from .utils import ESCAPABLE
import mdurl

# TODO below we port the use of the JS packages:
# var mdurl = require('mdurl')
# var punycode = require('punycode')
#
# e.g. mdurl: parsed = mdurl.parse(url, True)
#
# but need to check these fixes from https://www.npmjs.com/package/mdurl:
#
# Parse url string. Similar to node's url.parse,
# but without any normalizations and query string parse.
# url - input url (string)
# slashesDenoteHost - if url starts with //, expect a hostname after it. Optional, false.
# Difference with node's url:
from .. import _punycode

# No leading slash in paths, e.g. in url.parse('http://foo?bar') pathname is ``, not /
# Backslashes are not replaced with slashes, so http:\\example.org\ is treated like a relative path
# Trailing colon is treated like a part of the path, i.e. in http://example.org:foo pathname is :foo
# Nothing is URL-encoded in the resulting object,
# (in joyent/node some chars in auth and paths are encoded)
# url.parse() does not have parseQueryString argument
# Removed extraneous result properties: host, path, query, etc.,
# which can be constructed using other parts of the url.


# ################# Copied from Commonmark.py #################

ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"
reBackslashOrAmp = re.compile(r"[\\&]")
reEntityOrEscapedChar = re.compile(
"\\\\" + "[" + ESCAPABLE + "]|" + ENTITY, re.IGNORECASE
)


def unescape_char(s: str) -> str:
if s[0] == "\\":
return s[1]
else:
return html.unescape(s)


def unescape_string(s: str) -> str:
"""Replace entities and backslash escapes with literal characters."""
if re.search(reBackslashOrAmp, s):
return re.sub(reEntityOrEscapedChar, lambda m: unescape_char(m.group()), s)
else:
return s


def normalize_uri(uri: str) -> str:
return quote(uri, safe="/@:+?=&()%#*,")


##################


RECODE_HOSTNAME_FOR = ("http", "https", "mailto")


def unescape_normalize_uri(x: str) -> str:
return normalize_uri(unescape_string(x))
RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:")


def normalizeLink(url: str) -> str:
@@ -75,91 +18,49 @@ def normalizeLink(url: str) -> str:
[label]: destination 'title'
^^^^^^^^^^^
"""
(scheme, netloc, path, params, query, fragment) = urlparse(url)
if scheme in RECODE_HOSTNAME_FOR:
url = urlunparse(
(
scheme,
unescape_normalize_uri(netloc),
normalize_uri(path),
unescape_normalize_uri(params),
normalize_uri(query),
unescape_normalize_uri(fragment),
)
)
else:
url = unescape_normalize_uri(url)

return url

# TODO the selective encoding below should probably be done here,
# something like:
# url_check = urllib.parse.urlparse(destination)
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...

# parsed = urlparse(url)
# if parsed.hostname:
# # Encode hostnames in urls like:
# # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
# #
# # We don't encode unknown schemas, because it's likely that we encode
# # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
# #
# if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR:
# try:
# parsed.hostname = punycode.toASCII(parsed.hostname)
# except Exception:
# pass
# return quote(urlunparse(parsed))


def unescape_unquote(x: str) -> str:
return unquote(unescape_string(x))


def normalizeLinkText(link: str) -> str:
parsed = mdurl.parse(url, slashes_denote_host=True)

if parsed.hostname:
# Encode hostnames in urls like:
# `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
#
# We don't encode unknown schemas, because it's likely that we encode
# something we shouldn't (e.g. `skype:name` treated as `skype:host`)
#
if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
try:
parsed = parsed._replace(hostname=_punycode.to_ascii(parsed.hostname))
except Exception:
pass

return mdurl.encode(mdurl.format(parsed))


def normalizeLinkText(url: str) -> str:
"""Normalize autolink content

::

<destination>
~~~~~~~~~~~
"""
(scheme, netloc, path, params, query, fragment) = urlparse(link)
if scheme in RECODE_HOSTNAME_FOR:
url = urlunparse(
(
scheme,
unescape_unquote(netloc),
unquote(path),
unescape_unquote(params),
unquote(query),
unescape_unquote(fragment),
)
)
else:
url = unescape_unquote(link)
return url

# TODO the selective encoding below should probably be done here,
# something like:
# url_check = urllib.parse.urlparse(destination)
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...

# parsed = urlparse(url)
# if parsed.hostname:
# # Encode hostnames in urls like:
# # `http:#host/`, `https:#host/`, `mailto:user@host`, `#host/`
# #
# # We don't encode unknown schemas, because it's likely that we encode
# # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
# #
# if (not parsed.protocol) or parsed.protocol in RECODE_HOSTNAME_FOR:
# try:
# parsed.hostname = punycode.toUnicode(parsed.hostname)
# except Exception:
# pass
# return unquote(urlunparse(parsed))
parsed = mdurl.parse(url, slashes_denote_host=True)

if parsed.hostname:
# Encode hostnames in urls like:
# `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
#
# We don't encode unknown schemas, because it's likely that we encode
# something we shouldn't (e.g. `skype:name` treated as `skype:host`)
#
if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
try:
parsed = parsed._replace(hostname=_punycode.to_unicode(parsed.hostname))
except Exception:
pass

# add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%")


BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):")
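
As a rough sketch of how the new pieces fit together (illustrative, not part of the diff): `mdurl.parse` splits the URL without any normalisation, the hostname is punycoded only for the whitelisted protocols, and `mdurl.encode`/`mdurl.decode` take care of percent-encoding. Note that `parsed.protocol` includes the trailing colon, which is why `RECODE_HOSTNAME_FOR` now lists `"http:"` rather than `"http"`. The outputs in the comments are assumptions about the round-trip, not asserted by the diff.

```python
from markdown_it.common.normalize_url import normalizeLink, normalizeLinkText

# Destinations get an ASCII (punycoded) hostname and a percent-encoded
# path, e.g. something like:
#   https://xn--e1afmkfd.example/%D0%BF%D1%83%D1%82%D1%8C
print(normalizeLink("https://пример.example/путь"))

# Autolink text goes the other way: "xn--" hostnames are rendered as
# Unicode and percent-escapes are decoded for display (except "%25",
# which stays encoded per the markdown-it issue referenced above).
print(normalizeLinkText("https://xn--e1afmkfd.example/%D0%BF%D1%83%D1%82%D1%8C"))
```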
15 changes: 11 additions & 4 deletions markdown_it/common/utils.py
@@ -6,8 +6,6 @@

from .entities import entities

# from .normalize_url import unescape_string


def charCodeAt(src: str, pos: int) -> Any:
"""
@@ -105,7 +103,7 @@ def fromCodePoint(c: int) -> str:
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31})",
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
re.IGNORECASE,
)
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
@@ -146,7 +144,16 @@ def unescapeMd(string: str) -> str:


def unescapeAll(string: str) -> str:
return html.unescape(string)
def replacer_func(match):
escaped = match.group(1)
if escaped:
return escaped
entity = match.group(2)
return replaceEntityPattern(match.group(), entity)

if "\\" not in string and "&" not in string:
return string
return UNESCAPE_ALL_RE.sub(replacer_func, string)


ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
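
For context (illustrative, not part of the diff): the reworked `unescapeAll` takes a fast path when the string contains neither a backslash nor an ampersand, restores backslash-escaped punctuation, and resolves entity references that end in a semicolon. A minimal sketch, assuming `replaceEntityPattern` resolves known names and leaves unknown ones untouched:

```python
from markdown_it.common.utils import unescapeAll

print(unescapeAll(r"\*not emphasis\*"))  # *not emphasis*
print(unescapeAll("fish &amp; chips"))   # fish & chips

# No "\" or "&" present, so the string is returned unchanged.
print(unescapeAll("plain text"))         # plain text
```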
4 changes: 2 additions & 2 deletions markdown_it/helpers/parse_link_title.py
@@ -1,6 +1,6 @@
"""Parse link title
"""
from ..common.utils import unescapeAll, charCodeAt, stripEscape
from ..common.utils import unescapeAll, charCodeAt


class _Result:
@@ -40,7 +40,7 @@ def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result:
code = charCodeAt(string, pos)
if code == marker:
title = string[start + 1 : pos]
title = unescapeAll(stripEscape(title))
title = unescapeAll(title)
result.pos = pos + 1
result.lines = lines
result.str = title
3 changes: 1 addition & 2 deletions markdown_it/rules_block/fence.py
@@ -1,7 +1,6 @@
# fences (``` lang, ~~~ lang)
import logging

from ..common.utils import stripEscape
from .state_block import StateBlock

LOGGER = logging.getLogger(__name__)
@@ -97,7 +96,7 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool):
state.line = nextLine + (1 if haveEndMarker else 0)

token = state.push("fence", "code", 0)
token.info = stripEscape(params)
token.info = params
token.content = state.getLines(startLine + 1, nextLine, length, True)
token.markup = markup
token.map = [startLine, state.line]
1 change: 1 addition & 0 deletions setup.cfg
@@ -30,6 +30,7 @@ project_urls =
[options]
packages = find:
install_requires =
mdurl
attrs>=19,<22
typing_extensions>=3.7.4;python_version<'3.8'
python_requires = ~=3.6
10 changes: 0 additions & 10 deletions tests/test_port/test_fixtures.py
Expand Up @@ -64,13 +64,6 @@ def test_table(line, title, input, expected):
read_fixture_file(FIXTURE_PATH.joinpath("commonmark_extras.md")),
)
def test_commonmark_extras(line, title, input, expected):
if title in {
"Escaping entities in links:",
"Checking combination of replaceEntities and unescapeMd:",
}:
# TODO fix failing escaping tests
# probably requires a fix of common.utils.stripEscape
pytest.xfail("escaping entities in link titles / fence.info")
md = MarkdownIt("commonmark")
md.options["langPrefix"] = ""
text = md.render(input)
@@ -99,9 +92,6 @@ def test_normalize_url(line, title, input, expected):
"line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("fatal.md"))
)
def test_fatal(line, title, input, expected):
if line in [1, 17]:
# TODO fix failing url escaping tests
pytest.xfail("url normalisation")
md = MarkdownIt("commonmark").enable("replacements")
md.options["typographer"] = True
text = md.render(input)