Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Try to recover from unknown media encodings. #9164

Merged
merged 11 commits into from
Jan 26, 2021
Merged
10 changes: 3 additions & 7 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,32 +717,28 @@ def decode_and_calc_og(
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
except LookupError:
# blindly consider the encoding as utf-8.
try:
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create fallback HTML parser: %s" % (e,))
return {}
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}

# Attempt to parse the body. If this fails, log and return no metadata.
try:
tree = etree.fromstring(body, parser)
return _calc_og(tree, media_uri)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
try:
tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
return _calc_og(tree, media_uri)
except Exception as e:
logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,))
return {}
except Exception as e:
logger.warning("Failed to parse HTML body: %s" % (e,))
return {}

return _calc_og(tree, media_uri)


def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.
Expand Down
27 changes: 27 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,30 @@ def test_empty(self):
html = ""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})

def test_invalid_encoding(self):
    """An unrecognised encoding name must not prevent Open Graph extraction.

    The third argument is an encoding label that does not exist; the title
    and description are still expected to be extracted from the body.
    """
    markup = """
<html>
<head><title>Foo</title></head>
<body>
Some text.
</body>
</html>
"""
    expected = {"og:title": "Foo", "og:description": "Some text."}
    result = decode_and_calc_og(
        markup, "http://example.com/test.html", "invalid-encoding"
    )
    self.assertEqual(result, expected)

def test_invalid_encoding2(self):
    """A bytes body that is not valid UTF-8, passed without an encoding.

    The title and description are still expected to be extracted, with the
    two stray bytes in the title surfacing as "ÿÿ".
    """
    # The title holds two 0xFF bytes, which can never occur in valid UTF-8.
    raw_body = b"""
<html>
<head><title>\xff\xff Foo</title></head>
<body>
Some text.
</body>
</html>
"""
    expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}
    result = decode_and_calc_og(raw_body, "http://example.com/test.html")
    self.assertEqual(result, expected)