Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit d5254ab

Browse files
committed
Add stubs package for lxml.
1 parent c01343d commit d5254ab

10 files changed

+108
-49
lines changed

changelog.d/15697.misc

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve type hints.

mypy.ini

-3
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,6 @@ ignore_missing_imports = True
6060
[mypy-ijson.*]
6161
ignore_missing_imports = True
6262

63-
[mypy-lxml]
64-
ignore_missing_imports = True
65-
6663
# https://github.com/msgpack/msgpack-python/issues/448
6764
[mypy-msgpack]
6865
ignore_missing_imports = True

poetry.lock

+20-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ black = ">=22.3.0"
314314
ruff = "0.0.265"
315315

316316
# Typechecking
317+
lxml-stubs = ">=0.4.0"
317318
mypy = "*"
318319
mypy-zope = "*"
319320
types-bleach = ">=4.1.0"

synapse/media/oembed.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import html
1515
import logging
1616
import urllib.parse
17-
from typing import TYPE_CHECKING, List, Optional
17+
from typing import TYPE_CHECKING, List, Optional, cast
1818

1919
import attr
2020

@@ -98,7 +98,7 @@ def get_oembed_url(self, url: str) -> Optional[str]:
9898
# No match.
9999
return None
100100

101-
def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
101+
def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]:
102102
"""
103103
Search an HTML document for oEmbed autodiscovery information.
104104
@@ -109,18 +109,20 @@ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
109109
The URL to use for oEmbed information, or None if no URL was found.
110110
"""
111111
# Search for link elements with the proper rel and type attributes.
112-
for tag in tree.xpath(
113-
"//link[@rel='alternate'][@type='application/json+oembed']"
112+
for tag in cast(
113+
List["etree._Element"],
114+
tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
114115
):
115116
if "href" in tag.attrib:
116-
return tag.attrib["href"]
117+
return cast(str, tag.attrib["href"])
117118

118119
# Some providers (e.g. Flickr) use alternative instead of alternate.
119-
for tag in tree.xpath(
120-
"//link[@rel='alternative'][@type='application/json+oembed']"
120+
for tag in cast(
121+
List["etree._Element"],
122+
tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
121123
):
122124
if "href" in tag.attrib:
123-
return tag.attrib["href"]
125+
return cast(str, tag.attrib["href"])
124126

125127
return None
126128

@@ -212,11 +214,11 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
212214
return OEmbedResult(open_graph_response, author_name, cache_age)
213215

214216

215-
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
217+
def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]:
216218
results = []
217-
for tag in tree.xpath("//*/" + tag_name):
219+
for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)):
218220
if "src" in tag.attrib:
219-
results.append(tag.attrib["src"])
221+
results.append(cast(str, tag.attrib["src"]))
220222
return results
221223

222224

@@ -244,11 +246,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
244246
parser = etree.HTMLParser(recover=True, encoding="utf-8")
245247

246248
# Attempt to parse the body. If this fails, log and return no metadata.
247-
tree = etree.fromstring(html_body, parser)
249+
# TODO The develop branch of lxml-stubs has this correct.
250+
tree = etree.fromstring(html_body, parser) # type: ignore[arg-type]
248251

249252
# The data was successfully parsed, but no tree was found.
250253
if tree is None:
251-
return
254+
return # type: ignore[unreachable]
252255

253256
# Attempt to find interesting URLs (images, videos, embeds).
254257
if "og:image" not in open_graph_response:

synapse/media/preview_html.py

+50-24
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
Optional,
2525
Set,
2626
Union,
27+
cast,
2728
)
2829

2930
if TYPE_CHECKING:
@@ -115,7 +116,7 @@ def _get_html_media_encodings(
115116

116117
def decode_body(
117118
body: bytes, uri: str, content_type: Optional[str] = None
118-
) -> Optional["etree.Element"]:
119+
) -> Optional["etree._Element"]:
119120
"""
120121
This uses lxml to parse the HTML document.
121122
@@ -152,11 +153,12 @@ def decode_body(
152153

153154
# Attempt to parse the body. Returns None if the body was successfully
154155
# parsed, but no tree was found.
155-
return etree.fromstring(body, parser)
156+
# TODO The develop branch of lxml-stubs has this correct.
157+
return etree.fromstring(body, parser) # type: ignore[arg-type]
156158

157159

158160
def _get_meta_tags(
159-
tree: "etree.Element",
161+
tree: "etree._Element",
160162
property: str,
161163
prefix: str,
162164
property_mapper: Optional[Callable[[str], Optional[str]]] = None,
@@ -175,9 +177,14 @@ def _get_meta_tags(
175177
Returns:
176178
A map of tag name to value.
177179
"""
180+
# This actually returns Dict[str, str], but the caller sets this as a variable
181+
# which is Dict[str, Optional[str]].
178182
results: Dict[str, Optional[str]] = {}
179-
for tag in tree.xpath(
180-
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
183+
for tag in cast(
184+
List["etree._Element"],
185+
tree.xpath(
186+
f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
187+
),
181188
):
182189
# if we've got more than 50 tags, someone is taking the piss
183190
if len(results) >= 50:
@@ -187,14 +194,15 @@ def _get_meta_tags(
187194
)
188195
return {}
189196

190-
key = tag.attrib[property]
197+
key = cast(str, tag.attrib[property])
191198
if property_mapper:
192-
key = property_mapper(key)
199+
new_key = property_mapper(key)
193200
# None is a special value used to ignore a value.
194-
if key is None:
201+
if new_key is None:
195202
continue
203+
key = new_key
196204

197-
results[key] = tag.attrib["content"]
205+
results[key] = cast(str, tag.attrib["content"])
198206

199207
return results
200208

@@ -219,7 +227,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]:
219227
return "og" + key[7:]
220228

221229

222-
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
230+
def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]:
223231
"""
224232
Parse the HTML document into an Open Graph response.
225233
@@ -247,7 +255,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
247255
# "og:video:height" : "720",
248256
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
249257

250-
og = _get_meta_tags(tree, "property", "og")
258+
og: Dict[str, Optional[str]] = _get_meta_tags(tree, "property", "og")
251259

252260
# TODO: Search for properties specific to the different Open Graph types,
253261
# such as article: meta tags, e.g.:
@@ -276,15 +284,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
276284

277285
if "og:title" not in og:
278286
# Attempt to find a title from the title tag, or the biggest header on the page.
279-
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
287+
title = cast(
288+
List["etree._ElementUnicodeResult"],
289+
tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"),
290+
)
280291
if title:
281292
og["og:title"] = title[0].strip()
282293
else:
283294
og["og:title"] = None
284295

285296
if "og:image" not in og:
286-
meta_image = tree.xpath(
287-
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
297+
meta_image = cast(
298+
List["etree._ElementUnicodeResult"],
299+
tree.xpath(
300+
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
301+
),
288302
)
289303
# If a meta image is found, use it.
290304
if meta_image:
@@ -293,7 +307,10 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
293307
# Try to find images which are larger than 10px by 10px.
294308
#
295309
# TODO: consider inlined CSS styles as well as width & height attribs
296-
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
310+
images = cast(
311+
List["etree._Element"],
312+
tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"),
313+
)
297314
images = sorted(
298315
images,
299316
key=lambda i: (
@@ -302,20 +319,26 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
302319
)
303320
# If no images were found, try to find *any* images.
304321
if not images:
305-
images = tree.xpath("//img[@src][1]")
322+
images = cast(List["etree._Element"], tree.xpath("//img[@src][1]"))
306323
if images:
307-
og["og:image"] = images[0].attrib["src"]
324+
og["og:image"] = cast(str, images[0].attrib["src"])
308325

309326
# Finally, fallback to the favicon if nothing else.
310327
else:
311-
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
328+
favicons = cast(
329+
List["etree._ElementUnicodeResult"],
330+
tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"),
331+
)
312332
if favicons:
313333
og["og:image"] = favicons[0]
314334

315335
if "og:description" not in og:
316336
# Check the first meta description tag for content.
317-
meta_description = tree.xpath(
318-
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
337+
meta_description = cast(
338+
List["etree._ElementUnicodeResult"],
339+
tree.xpath(
340+
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
341+
),
319342
)
320343
# If a meta description is found with content, use it.
321344
if meta_description:
@@ -332,7 +355,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
332355
return og
333356

334357

335-
def parse_html_description(tree: "etree.Element") -> Optional[str]:
358+
def parse_html_description(tree: "etree._Element") -> Optional[str]:
336359
"""
337360
Calculate a text description based on an HTML document.
338361
@@ -368,6 +391,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
368391
"canvas",
369392
"img",
370393
"picture",
394+
# etree.Comment is a function which creates an etree._Comment element.
395+
# The "tag" attribute of an etree._Comment instance is confusingly the
396+
# etree.Comment function instead of a string.
371397
etree.Comment,
372398
}
373399

@@ -381,8 +407,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
381407

382408

383409
def _iterate_over_text(
384-
tree: Optional["etree.Element"],
385-
tags_to_ignore: Set[Union[str, "etree.Comment"]],
410+
tree: Optional["etree._Element"],
411+
tags_to_ignore: Set[object],
386412
stack_limit: int = 1024,
387413
) -> Generator[str, None, None]:
388414
"""Iterate over the tree returning text nodes in a depth first fashion,
@@ -402,7 +428,7 @@ def _iterate_over_text(
402428

403429
# This is a stack whose items are elements to iterate over *or* strings
404430
# to be returned.
405-
elements: List[Union[str, "etree.Element"]] = [tree]
431+
elements: List[Union[str, "etree._Element"]] = [tree]
406432
while elements:
407433
el = elements.pop()
408434

0 commit comments

Comments
 (0)