Skip to content

Commit b756a32

Browse files
committed
Switch to using html.parser even when the doctype isn't proper
This ensures that we handle html5lib parsing as non-default deprecated behaviour.
1 parent d5aeced commit b756a32

File tree

2 files changed

+27
-29
lines changed

2 files changed

+27
-29
lines changed

src/pip/_internal/index/collector.py

+2-24
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
from pip._internal.models.search_scope import SearchScope
4242
from pip._internal.network.session import PipSession
4343
from pip._internal.network.utils import raise_for_status
44-
from pip._internal.utils.deprecation import deprecated
4544
from pip._internal.utils.filetypes import is_archive_file
4645
from pip._internal.utils.misc import pairwise, redact_auth_from_url
4746
from pip._internal.vcs import vcs
@@ -346,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin
346345
"""
347346
Parse an HTML document, and yield its anchor elements as Link objects.
348347
"""
349-
encoding = page.encoding or "utf-8"
350-
351-
# Check if the page starts with a valid doctype, to decide whether to use
352-
# html.parser or (deprecated) html5lib for parsing -- unless explicitly
353-
# requested to use html5lib.
354-
if not use_deprecated_html5lib:
355-
expected_doctype = "<!doctype html>".encode(encoding)
356-
actual_start = page.content[: len(expected_doctype)]
357-
if actual_start.decode(encoding).lower() != "<!doctype html>":
358-
deprecated(
359-
reason=(
360-
f"The HTML index page being used ({page.url}) is not a proper "
361-
"HTML 5 document. This is in violation of PEP 503 which requires "
362-
"these pages to be well-formed HTML 5 documents. Please reach out "
363-
"to the owners of this index page, and ask them to update this "
364-
"index page to a valid HTML 5 document."
365-
),
366-
replacement=None,
367-
gone_in="22.2",
368-
issue=10825,
369-
)
370-
use_deprecated_html5lib = True
371348

372349
if use_deprecated_html5lib:
373350
yield from _parse_links_html5lib(page)
374351
return
375352

376-
parser = HTMLLinkParser()
353+
parser = HTMLLinkParser(page.url)
354+
encoding = page.encoding or "utf-8"
377355
parser.feed(page.content.decode(encoding))
378356

379357
url = page.url

tests/unit/test_collector.py

+25-5
Original file line numberDiff line numberDiff line change
@@ -551,21 +551,41 @@ def test_parse_link_handles_deprecated_usage_properly() -> None:
551551
assert "pkg1-2.0" in parsed_links[1].url
552552

553553

554-
@mock.patch("pip._internal.index.collector.deprecated")
555-
def test_parse_links_presents_deprecation_warning_on_non_html5_page(
556-
mock_deprecated: mock.Mock,
554+
def test_parse_links_presents_warning_on_missing_doctype(
555+
caplog: pytest.LogCaptureFixture,
557556
) -> None:
558557
html = b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
559558
url = "https://example.com/simple/"
560559
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
561560

562-
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
561+
with caplog.at_level(logging.WARN):
562+
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
563563

564564
assert len(parsed_links) == 2, parsed_links
565565
assert "pkg1-1.0" in parsed_links[0].url
566566
assert "pkg1-2.0" in parsed_links[1].url
567567

568-
mock_deprecated.assert_called_once()
568+
assert len(caplog.records) == 1
569+
570+
571+
def test_parse_links_presents_warning_on_html4_doctype(
572+
caplog: pytest.LogCaptureFixture,
573+
) -> None:
574+
html = (
575+
b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
576+
b'<a href="/pkg1-1.0.tar.gz"></a><a href="/pkg1-2.0.tar.gz"></a>'
577+
)
578+
url = "https://example.com/simple/"
579+
page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False)
580+
581+
with caplog.at_level(logging.WARN):
582+
parsed_links = list(parse_links(page, use_deprecated_html5lib=False))
583+
584+
assert len(parsed_links) == 2, parsed_links
585+
assert "pkg1-1.0" in parsed_links[0].url
586+
assert "pkg1-2.0" in parsed_links[1].url
587+
588+
assert len(caplog.records) == 1
569589

570590

571591
@mock.patch("pip._internal.index.collector.raise_for_status")

0 commit comments

Comments
 (0)