fix(utils): Fix UnicodeDecodeError on Python 2 (#2657)

sentrivana · web-flow · commit e373e35851b8 · 2024-01-29T15:57:56.000+01:00
diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
@@ -383,6 +383,13 @@ def __init__(self, value, metadata):
         self.value = value
         self.metadata = metadata
 
+    def __eq__(self, other):
+        # type: (Any) -> bool
+        if not isinstance(other, AnnotatedValue):
+            return False
+
+        return self.value == other.value and self.metadata == other.metadata
+
     @classmethod
     def removed_because_raw_data(cls):
         # type: () -> AnnotatedValue
@@ -1119,6 +1126,39 @@ def _is_in_project_root(abs_path, project_root):
     return False
 
 
+def _truncate_by_bytes(string, max_bytes):
+    # type: (str, int) -> str
+    """
+    Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.
+    """
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
+    if isinstance(string, bytes):
+        truncated = string[: max_bytes - 3]
+    else:
+        truncated = string.encode("utf-8")[: max_bytes - 3].decode(
+            "utf-8", errors="ignore"
+        )
+
+    return truncated + "..."
+
+
+def _get_size_in_bytes(value):
+    # type: (str) -> Optional[int]
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
+    if not isinstance(value, (bytes, text_type)):
+        return None
+
+    if isinstance(value, bytes):
+        return len(value)
+
+    try:
+        return len(value.encode("utf-8"))
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        return None
+
+
 def strip_string(value, max_length=None):
     # type: (str, Optional[int]) -> Union[AnnotatedValue, str]
     if not value:
@@ -1127,17 +1167,27 @@ def strip_string(value, max_length=None):
     if max_length is None:
         max_length = DEFAULT_MAX_VALUE_LENGTH
 
-    length = len(value.encode("utf-8"))
+    byte_size = _get_size_in_bytes(value)
+    text_size = None
+    if isinstance(value, text_type):
+        text_size = len(value)
+
+    if byte_size is not None and byte_size > max_length:
+        # truncate to max_length bytes, preserving code points
+        truncated_value = _truncate_by_bytes(value, max_length)
+    elif text_size is not None and text_size > max_length:
+        # fallback to truncating by string length
+        truncated_value = value[: max_length - 3] + "..."
+    else:
+        return value
 
-    if length > max_length:
-        return AnnotatedValue(
-            value=value[: max_length - 3] + "...",
-            metadata={
-                "len": length,
-                "rem": [["!limit", "x", max_length - 3, max_length]],
-            },
-        )
-    return value
+    return AnnotatedValue(
+        value=truncated_value,
+        metadata={
+            "len": byte_size or text_size,
+            "rem": [["!limit", "x", max_length - 3, max_length]],
+        },
+    )
 
 
 def parse_version(version):
diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py
@@ -572,22 +572,32 @@ def test_failed_base64_conversion(input):
         assert to_base64(input) is None
 
 
-def test_strip_string():
-    # If value is None returns None.
-    assert strip_string(None) is None
-
-    # If max_length is not passed, returns the full text (up to 1024 bytes).
-    text_1024_long = "a" * 1024
-    assert strip_string(text_1024_long).count("a") == 1024
-
-    # If value exceeds the max_length, returns an AnnotatedValue.
-    text_1025_long = "a" * 1025
-    stripped_text = strip_string(text_1025_long)
-    assert isinstance(stripped_text, AnnotatedValue)
-    assert stripped_text.value.count("a") == 1021  # + '...' is 1024
-
-    # If text has unicode characters, it counts bytes and not number of characters.
-    # fmt: off
-    text_with_unicode_character = u"éê"
-    assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
-    # fmt: on
+@pytest.mark.parametrize(
+    "input,max_length,result",
+    [
+        [None, None, None],
+        ["a" * 256, None, "a" * 256],
+        [
+            "a" * 257,
+            256,
+            AnnotatedValue(
+                value="a" * 253 + "...",
+                metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]},
+            ),
+        ],
+        # fmt: off
+        [u"éééé", None, u"éééé"],
+        [u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})],
+        # fmt: on
+        ["éééé", None, "éééé"],
+        [
+            "éééé",
+            5,
+            AnnotatedValue(
+                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]}
+            ),
+        ],
+    ],
+)
+def test_strip_string(input, max_length, result):
+    assert strip_string(input, max_length) == result