matrix-org · clokep · Jan 26, 2021 · Jan 19, 2021 · Jan 19, 2021 · Jan 19, 2021
@@ -717,32 +717,28 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding=request_encoding)
     except LookupError:
         # blindly consider the encoding as utf-8.
-        try:
-            parser = etree.HTMLParser(recover=True, encoding="utf-8")
-        except Exception as e:
-            logger.warning("Unable to create fallback HTML parser: %s" % (e,))
-            return {}
+        parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
         return {}
 
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
         tree = etree.fromstring(body, parser)
+        return _calc_og(tree, media_uri)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
         try:
             tree = etree.fromstring(body.decode("utf-8", "ignore"), parser)
+            return _calc_og(tree, media_uri)
         except Exception as e:
             logger.warning("Failed to parse HTML body as UTF-8: %s" % (e,))
             return {}
     except Exception as e:
         logger.warning("Failed to parse HTML body: %s" % (e,))
         return {}
 
-    return _calc_og(tree, media_uri)
-
 
 def _calc_og(tree, media_uri: str) -> Dict[str, Optional[str]]:
     # suck our tree into lxml and define our OG response.

@@ -261,3 +261,30 @@ def test_empty(self):
         html = ""
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})
+
+    def test_invalid_encoding(self):
+        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
+        html = """
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html", "invalid-encoding")
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self):
+        """A body containing bytes that are not valid UTF-8 should still be parsed and yield metadata."""
+        # Note that this contains an invalid UTF-8 sequence in the title.
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})