Skip to content

Commit e373e35

Browse files
authored Jan 29, 2024
fix(utils): Fix UnicodeDecodeError on Python 2 (#2657)
1 parent 704d259 commit e373e35

File tree

2 files changed

+89
-29
lines changed

2 files changed

+89
-29
lines changed
 

‎sentry_sdk/utils.py

+60-10
Original file line number | Diff line number | Diff line change
@@ -383,6 +383,13 @@ def __init__(self, value, metadata):
383383
self.value = value
384384
self.metadata = metadata
385385

386+
def __eq__(self, other):
    # type: (Any) -> bool
    """
    Compare two annotated values by their ``value`` and ``metadata``.

    Anything that is not an ``AnnotatedValue`` compares unequal.
    """
    if isinstance(other, AnnotatedValue):
        values_match = self.value == other.value
        # ``and`` short-circuits: metadata is only compared when the values match.
        return values_match and self.metadata == other.metadata

    return False
392+
386393
@classmethod
387394
def removed_because_raw_data(cls):
388395
# type: () -> AnnotatedValue
@@ -1119,6 +1126,39 @@ def _is_in_project_root(abs_path, project_root):
11191126
return False
11201127

11211128

1129+
def _truncate_by_bytes(string, max_bytes):
1130+
# type: (str, int) -> str
1131+
"""
1132+
Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.
1133+
"""
1134+
# This function technically supports bytes, but only for Python 2 compat.
1135+
# XXX remove support for bytes when we drop Python 2
1136+
if isinstance(string, bytes):
1137+
truncated = string[: max_bytes - 3]
1138+
else:
1139+
truncated = string.encode("utf-8")[: max_bytes - 3].decode(
1140+
"utf-8", errors="ignore"
1141+
)
1142+
1143+
return truncated + "..."
1144+
1145+
1146+
def _get_size_in_bytes(value):
    # type: (str) -> Optional[int]
    """
    Return the size of value in bytes, or None if it cannot be determined.

    Byte strings report their own length; text is measured by its UTF-8
    encoding. Values of any other type, and text that fails to encode,
    yield None.
    """
    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(value, bytes):
        return len(value)

    if isinstance(value, text_type):
        try:
            encoded = value.encode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return None
        return len(encoded)

    # Neither bytes nor text: there is no meaningful byte size.
    return None
1160+
1161+
11221162
def strip_string(value, max_length=None):
11231163
# type: (str, Optional[int]) -> Union[AnnotatedValue, str]
11241164
if not value:
@@ -1127,17 +1167,27 @@ def strip_string(value, max_length=None):
11271167
if max_length is None:
11281168
max_length = DEFAULT_MAX_VALUE_LENGTH
11291169

1130-
length = len(value.encode("utf-8"))
1170+
byte_size = _get_size_in_bytes(value)
1171+
text_size = None
1172+
if isinstance(value, text_type):
1173+
text_size = len(value)
1174+
1175+
if byte_size is not None and byte_size > max_length:
1176+
# truncate to max_length bytes, preserving code points
1177+
truncated_value = _truncate_by_bytes(value, max_length)
1178+
elif text_size is not None and text_size > max_length:
1179+
# fallback to truncating by string length
1180+
truncated_value = value[: max_length - 3] + "..."
1181+
else:
1182+
return value
11311183

1132-
if length > max_length:
1133-
return AnnotatedValue(
1134-
value=value[: max_length - 3] + "...",
1135-
metadata={
1136-
"len": length,
1137-
"rem": [["!limit", "x", max_length - 3, max_length]],
1138-
},
1139-
)
1140-
return value
1184+
return AnnotatedValue(
1185+
value=truncated_value,
1186+
metadata={
1187+
"len": byte_size or text_size,
1188+
"rem": [["!limit", "x", max_length - 3, max_length]],
1189+
},
1190+
)
11411191

11421192

11431193
def parse_version(version):

‎tests/utils/test_general.py

+29-19
Original file line number | Diff line number | Diff line change
@@ -572,22 +572,32 @@ def test_failed_base64_conversion(input):
572572
assert to_base64(input) is None
573573

574574

575-
def test_strip_string():
576-
# If value is None returns None.
577-
assert strip_string(None) is None
578-
579-
# If max_length is not passed, returns the full text (up to 1024 bytes).
580-
text_1024_long = "a" * 1024
581-
assert strip_string(text_1024_long).count("a") == 1024
582-
583-
# If value exceeds the max_length, returns an AnnotatedValue.
584-
text_1025_long = "a" * 1025
585-
stripped_text = strip_string(text_1025_long)
586-
assert isinstance(stripped_text, AnnotatedValue)
587-
assert stripped_text.value.count("a") == 1021 # + '...' is 1024
588-
589-
# If text has unicode characters, it counts bytes and not number of characters.
590-
# fmt: off
591-
text_with_unicode_character = u"éê"
592-
assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
593-
# fmt: on
575+
@pytest.mark.parametrize(
    "input,max_length,result",
    (
        # None in, None out.
        (None, None, None),
        # 256 ASCII characters with the default limit: expected unchanged.
        ("a" * 256, None, "a" * 256),
        # One character over an explicit limit: expected to be truncated,
        # with the original length recorded in the metadata.
        (
            "a" * 257,
            256,
            AnnotatedValue(
                value="a" * 253 + "...",
                metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]},
            ),
        ),
        # Explicit unicode literals: size is counted in bytes (8 here), not characters.
        # fmt: off
        (u"éééé", None, u"éééé"),
        (u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})),
        # fmt: on
        # Same cases with plain (unprefixed) string literals.
        ("éééé", None, "éééé"),
        (
            "éééé",
            5,
            AnnotatedValue(
                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]}
            ),
        ),
    ),
)
def test_strip_string(input, max_length, result):
    """Check strip_string against each expected plain or annotated result."""
    stripped = strip_string(input, max_length)
    assert stripped == result

0 commit comments

Comments
 (0)
Please sign in to comment.