24
24
Optional ,
25
25
Set ,
26
26
Union ,
27
+ cast ,
27
28
)
28
29
29
30
if TYPE_CHECKING :
@@ -115,7 +116,7 @@ def _get_html_media_encodings(
115
116
116
117
def decode_body (
117
118
body : bytes , uri : str , content_type : Optional [str ] = None
118
- ) -> Optional ["etree.Element " ]:
119
+ ) -> Optional ["etree._Element " ]:
119
120
"""
120
121
This uses lxml to parse the HTML document.
121
122
@@ -152,11 +153,12 @@ def decode_body(
152
153
153
154
# Attempt to parse the body. Returns None if the body was successfully
154
155
# parsed, but no tree was found.
155
- return etree .fromstring (body , parser )
156
+ # TODO Develop of lxml-stubs has this correct.
157
+ return etree .fromstring (body , parser ) # type: ignore[arg-type]
156
158
157
159
158
160
def _get_meta_tags (
159
- tree : "etree.Element " ,
161
+ tree : "etree._Element " ,
160
162
property : str ,
161
163
prefix : str ,
162
164
property_mapper : Optional [Callable [[str ], Optional [str ]]] = None ,
@@ -175,9 +177,14 @@ def _get_meta_tags(
175
177
Returns:
176
178
A map of tag name to value.
177
179
"""
180
+ # This actually returns Dict[str, str], but the caller sets this as a variable
181
+ # which is Dict[str, Optional[str]].
178
182
results : Dict [str , Optional [str ]] = {}
179
- for tag in tree .xpath (
180
- f"//*/meta[starts-with(@{ property } , '{ prefix } :')][@content][not(@content='')]"
183
+ for tag in cast (
184
+ List ["etree._Element" ],
185
+ tree .xpath (
186
+ f"//*/meta[starts-with(@{ property } , '{ prefix } :')][@content][not(@content='')]"
187
+ ),
181
188
):
182
189
# if we've got more than 50 tags, someone is taking the piss
183
190
if len (results ) >= 50 :
@@ -187,14 +194,15 @@ def _get_meta_tags(
187
194
)
188
195
return {}
189
196
190
- key = tag .attrib [property ]
197
+ key = cast ( str , tag .attrib [property ])
191
198
if property_mapper :
192
- key = property_mapper (key )
199
+ new_key = property_mapper (key )
193
200
# None is a special value used to ignore a value.
194
- if key is None :
201
+ if new_key is None :
195
202
continue
203
+ key = new_key
196
204
197
- results [key ] = tag .attrib ["content" ]
205
+ results [key ] = cast ( str , tag .attrib ["content" ])
198
206
199
207
return results
200
208
@@ -219,7 +227,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]:
219
227
return "og" + key [7 :]
220
228
221
229
222
- def parse_html_to_open_graph (tree : "etree.Element " ) -> Dict [str , Optional [str ]]:
230
+ def parse_html_to_open_graph (tree : "etree._Element " ) -> Dict [str , Optional [str ]]:
223
231
"""
224
232
Parse the HTML document into an Open Graph response.
225
233
@@ -247,7 +255,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
247
255
# "og:video:height" : "720",
248
256
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
249
257
250
- og = _get_meta_tags (tree , "property" , "og" )
258
+ og : Dict [ str , Optional [ str ]] = _get_meta_tags (tree , "property" , "og" )
251
259
252
260
# TODO: Search for properties specific to the different Open Graph types,
253
261
# such as article: meta tags, e.g.:
@@ -276,15 +284,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
276
284
277
285
if "og:title" not in og :
278
286
# Attempt to find a title from the title tag, or the biggest header on the page.
279
- title = tree .xpath ("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()" )
287
+ title = cast (
288
+ List ["etree._ElementUnicodeResult" ],
289
+ tree .xpath ("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()" ),
290
+ )
280
291
if title :
281
292
og ["og:title" ] = title [0 ].strip ()
282
293
else :
283
294
og ["og:title" ] = None
284
295
285
296
if "og:image" not in og :
286
- meta_image = tree .xpath (
287
- "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
297
+ meta_image = cast (
298
+ List ["etree._ElementUnicodeResult" ],
299
+ tree .xpath (
300
+ "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
301
+ ),
288
302
)
289
303
# If a meta image is found, use it.
290
304
if meta_image :
@@ -293,7 +307,10 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
293
307
# Try to find images which are larger than 10px by 10px.
294
308
#
295
309
# TODO: consider inlined CSS styles as well as width & height attribs
296
- images = tree .xpath ("//img[@src][number(@width)>10][number(@height)>10]" )
310
+ images = cast (
311
+ List ["etree._Element" ],
312
+ tree .xpath ("//img[@src][number(@width)>10][number(@height)>10]" ),
313
+ )
297
314
images = sorted (
298
315
images ,
299
316
key = lambda i : (
@@ -302,20 +319,26 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
302
319
)
303
320
# If no images were found, try to find *any* images.
304
321
if not images :
305
- images = tree .xpath ("//img[@src][1]" )
322
+ images = cast ( List [ "etree._Element" ], tree .xpath ("//img[@src][1]" ) )
306
323
if images :
307
- og ["og:image" ] = images [0 ].attrib ["src" ]
324
+ og ["og:image" ] = cast ( str , images [0 ].attrib ["src" ])
308
325
309
326
# Finally, fallback to the favicon if nothing else.
310
327
else :
311
- favicons = tree .xpath ("//link[@href][contains(@rel, 'icon')]/@href[1]" )
328
+ favicons = cast (
329
+ List ["etree._ElementUnicodeResult" ],
330
+ tree .xpath ("//link[@href][contains(@rel, 'icon')]/@href[1]" ),
331
+ )
312
332
if favicons :
313
333
og ["og:image" ] = favicons [0 ]
314
334
315
335
if "og:description" not in og :
316
336
# Check the first meta description tag for content.
317
- meta_description = tree .xpath (
318
- "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
337
+ meta_description = cast (
338
+ List ["etree._ElementUnicodeResult" ],
339
+ tree .xpath (
340
+ "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
341
+ ),
319
342
)
320
343
# If a meta description is found with content, use it.
321
344
if meta_description :
@@ -332,7 +355,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
332
355
return og
333
356
334
357
335
- def parse_html_description (tree : "etree.Element " ) -> Optional [str ]:
358
+ def parse_html_description (tree : "etree._Element " ) -> Optional [str ]:
336
359
"""
337
360
Calculate a text description based on an HTML document.
338
361
@@ -368,6 +391,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
368
391
"canvas" ,
369
392
"img" ,
370
393
"picture" ,
394
+ # etree.Comment is a function which creates an etree._Comment element.
395
+ # The "tag" attribute of an etree._Comment instance is confusingly the
396
+ # etree.Comment function instead of a string.
371
397
etree .Comment ,
372
398
}
373
399
@@ -381,8 +407,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
381
407
382
408
383
409
def _iterate_over_text (
384
- tree : Optional ["etree.Element " ],
385
- tags_to_ignore : Set [Union [ str , "etree.Comment" ] ],
410
+ tree : Optional ["etree._Element " ],
411
+ tags_to_ignore : Set [object ],
386
412
stack_limit : int = 1024 ,
387
413
) -> Generator [str , None , None ]:
388
414
"""Iterate over the tree returning text nodes in a depth first fashion,
@@ -402,7 +428,7 @@ def _iterate_over_text(
402
428
403
429
# This is a stack whose items are elements to iterate over *or* strings
404
430
# to be returned.
405
- elements : List [Union [str , "etree.Element " ]] = [tree ]
431
+ elements : List [Union [str , "etree._Element " ]] = [tree ]
406
432
while elements :
407
433
el = elements .pop ()
408
434
0 commit comments