
Commit b9fcd55

Merge pull request #10378 from jdufresne/unescape
Remove unnecessary html.unescape() calls in index/collector.py
2 parents: b8fc219 + af34057

3 files changed: +46 −23 lines

news/10378.bugfix.rst (+1)

@@ -0,0 +1 @@
+Fix double unescape of HTML ``data-requires-python`` and ``data-yanked`` attributes.
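
For context, a minimal sketch (not pip's code) of what the double unescape did: the HTML parser already decodes character references in attribute values, so running html.unescape() over the parsed value decodes legitimately escaped text a second time.

import html

# What a page author writes in the anchor attribute when the intended
# value is the literal text "&gt;=3.6" (escaped once for HTML):
raw = "&amp;gt;=3.6"

once = html.unescape(raw)    # "&gt;=3.6"  -- what the parser hands back (correct)
twice = html.unescape(once)  # ">=3.6"     -- the old double-unescape result (wrong)

print(once, twice)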

src/pip/_internal/index/collector.py (−5)

@@ -5,7 +5,6 @@
 import cgi
 import collections
 import functools
-import html
 import itertools
 import logging
 import os
@@ -248,11 +247,7 @@ def _create_link_from_element(

     url = _clean_link(urllib.parse.urljoin(base_url, href))
     pyrequire = anchor.get("data-requires-python")
-    pyrequire = html.unescape(pyrequire) if pyrequire else None
-
     yanked_reason = anchor.get("data-yanked")
-    if yanked_reason:
-        yanked_reason = html.unescape(yanked_reason)

     link = Link(
         url,
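
The removed calls were redundant because the HTML parsing backend already unescapes attribute values before the anchor reaches _create_link_from_element(). As a rough illustration, the stdlib html.parser (used here only as a stand-in for pip's actual parser) behaves the same way:

from html.parser import HTMLParser

class AnchorCollector(HTMLParser):
    """Collect attribute dicts from <a> tags; a stand-in for pip's real parser."""

    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.anchors.append(dict(attrs))

parser = AnchorCollector()
parser.feed(
    '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"'
    ' data-yanked="version &lt; 1"></a>'
)
anchor = parser.anchors[0]
print(anchor["data-requires-python"])  # '>=3.6'       -- already unescaped by the parser
print(anchor["data-yanked"])           # 'version < 1' -- ditto, so html.unescape() is redundant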

tests/unit/test_collector.py (+45 −18)

@@ -410,6 +410,45 @@ def test_clean_link(url, clean_url):
     assert _clean_link(url) == clean_url


+def _test_parse_links_data_attribute(anchor_html, attr, expected):
+    html = f'<html><head><meta charset="utf-8"><head><body>{anchor_html}</body></html>'
+    html_bytes = html.encode("utf-8")
+    page = HTMLPage(
+        html_bytes,
+        encoding=None,
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url=f"https://example.com/simple-{uuid.uuid4()}/",
+    )
+    links = list(parse_links(page))
+    (link,) = links
+    actual = getattr(link, attr)
+    assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "anchor_html, expected",
+    [
+        # Test not present.
+        ('<a href="/pkg-1.0.tar.gz"></a>', None),
+        # Test present with no value.
+        ('<a href="/pkg-1.0.tar.gz" data-requires-python></a>', None),
+        # Test a value with an escaped character.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"></a>',
+            ">=3.6",
+        ),
+        # Test requires python is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>',
+            "&gt;=3.6",
+        ),
+    ],
+)
+def test_parse_links__requires_python(anchor_html, expected):
+    _test_parse_links_data_attribute(anchor_html, "requires_python", expected)
+
+
 @pytest.mark.parametrize(
     "anchor_html, expected",
     [
@@ -428,27 +467,15 @@ def test_clean_link(url, clean_url):
             '<a href="/pkg-1.0.tar.gz" data-yanked="curlyquote \u2018"></a>',
             "curlyquote \u2018",
         ),
+        # Test yanked reason is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
+            "version &lt; 1",
+        ),
     ],
 )
 def test_parse_links__yanked_reason(anchor_html, expected):
-    html = (
-        # Mark this as a unicode string for Python 2 since anchor_html
-        # can contain non-ascii.
-        '<html><head><meta charset="utf-8"><head>'
-        "<body>{}</body></html>"
-    ).format(anchor_html)
-    html_bytes = html.encode("utf-8")
-    page = HTMLPage(
-        html_bytes,
-        encoding=None,
-        # parse_links() is cached by url, so we inject a random uuid to ensure
-        # the page content isn't cached.
-        url=f"https://example.com/simple-{uuid.uuid4()}/",
-    )
-    links = list(parse_links(page))
-    (link,) = links
-    actual = link.yanked_reason
-    assert actual == expected
+    _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)


 def test_parse_links_caches_same_page_by_url():
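
The shared helper injects a fresh uuid into each page URL because parse_links() memoises its result per page. A simplified sketch of that interaction (not pip's actual cache implementation):

import functools
import uuid

@functools.lru_cache(maxsize=None)
def parse_page(url: str) -> str:
    # Pretend this parses the page fetched from `url`; the result is memoised
    # per URL, so feeding different HTML under the same URL would return the
    # first (stale) result.
    return f"links parsed from {url}"

# Each test therefore fabricates a unique URL so it never hits the cache:
print(parse_page(f"https://example.com/simple-{uuid.uuid4()}/"))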
