@@ -383,6 +383,13 @@ def __init__(self, value, metadata):
383
383
self .value = value
384
384
self .metadata = metadata
385
385
386
def __eq__(self, other):
    # type: (Any) -> bool
    """
    Report whether ``other`` is an AnnotatedValue carrying an equal
    ``value`` and equal ``metadata``.

    Anything that is not an AnnotatedValue compares unequal.
    """
    if isinstance(other, AnnotatedValue):
        # Both the payload and its annotations have to match.
        same_value = self.value == other.value
        return same_value and self.metadata == other.metadata

    return False
392
+
386
393
@classmethod
387
394
def removed_because_raw_data (cls ):
388
395
# type: () -> AnnotatedValue
@@ -1119,6 +1126,39 @@ def _is_in_project_root(abs_path, project_root):
1119
1126
return False
1120
1127
1121
1128
1129
def _truncate_by_bytes(string, max_bytes):
    # type: (str, int) -> str
    """
    Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.

    The input is always shortened and an ellipsis is always appended; the
    function does not check whether the input already fits. The kept prefix
    is at most ``max_bytes - 3`` bytes of the UTF-8 encoding, so prefix plus
    the three-byte ellipsis stays within ``max_bytes`` whenever
    ``max_bytes >= 3``. For smaller budgets only the ellipsis is returned.

    :param string: Text to shorten. Bytes are accepted for Python 2 compat
        and are cut at raw byte offsets.
    :param max_bytes: Byte budget for the result, ellipsis included.
    :returns: The shortened value followed by ``...``, of the same type as
        the input.
    """
    # Clamp at zero: a negative slice end would count from the end of the
    # input and keep almost all of it instead of truncating.
    keep = max(max_bytes - 3, 0)

    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(string, bytes):
        # Use a bytes ellipsis; on Python 3 ``bytes + str`` raises TypeError.
        return string[:keep] + b"..."

    # Slicing the encoded form may cut through a multi-byte sequence;
    # errors="ignore" drops the bytes that no longer decode.
    truncated = string.encode("utf-8")[:keep].decode("utf-8", errors="ignore")

    return truncated + "..."
1144
+
1145
+
1146
def _get_size_in_bytes(value):
    # type: (str) -> Optional[int]
    """
    Return the number of bytes ``value`` occupies, or None if unknown.

    Bytes report their own length. ``text_type`` values report the length
    of their UTF-8 encoding. Any other type, or text that fails to encode,
    yields None.
    """
    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(value, bytes):
        return len(value)

    if not isinstance(value, text_type):
        return None

    try:
        encoded = value.encode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Text that cannot be encoded has no measurable byte size.
        return None

    return len(encoded)
1160
+
1161
+
1122
1162
def strip_string (value , max_length = None ):
1123
1163
# type: (str, Optional[int]) -> Union[AnnotatedValue, str]
1124
1164
if not value :
@@ -1127,17 +1167,27 @@ def strip_string(value, max_length=None):
1127
1167
if max_length is None :
1128
1168
max_length = DEFAULT_MAX_VALUE_LENGTH
1129
1169
1130
- length = len (value .encode ("utf-8" ))
1170
+ byte_size = _get_size_in_bytes (value )
1171
+ text_size = None
1172
+ if isinstance (value , text_type ):
1173
+ text_size = len (value )
1174
+
1175
+ if byte_size is not None and byte_size > max_length :
1176
+ # truncate to max_length bytes, preserving code points
1177
+ truncated_value = _truncate_by_bytes (value , max_length )
1178
+ elif text_size is not None and text_size > max_length :
1179
+ # fallback to truncating by string length
1180
+ truncated_value = value [: max_length - 3 ] + "..."
1181
+ else :
1182
+ return value
1131
1183
1132
- if length > max_length :
1133
- return AnnotatedValue (
1134
- value = value [: max_length - 3 ] + "..." ,
1135
- metadata = {
1136
- "len" : length ,
1137
- "rem" : [["!limit" , "x" , max_length - 3 , max_length ]],
1138
- },
1139
- )
1140
- return value
1184
+ return AnnotatedValue (
1185
+ value = truncated_value ,
1186
+ metadata = {
1187
+ "len" : byte_size or text_size ,
1188
+ "rem" : [["!limit" , "x" , max_length - 3 , max_length ]],
1189
+ },
1190
+ )
1141
1191
1142
1192
1143
1193
def parse_version (version ):
0 commit comments