Skip to content

Commit 96163eb

Browse files
RyanGWU82emilyskuo
andauthored
feat: new redaction methodology for Python metrics client (#266)
* feat: new redaction methodology for Python metrics client * release: bump version number to 1.2.0 due to changes with redaction * fix: redacted strings should have a space between REDACTED and their length * fix: remove outdated TODO * Update packages/python/readme_metrics/PayloadBuilder.py Co-authored-by: Emily Kuo <[email protected]> Co-authored-by: Emily Kuo <[email protected]>
1 parent 14a6549 commit 96163eb

File tree

6 files changed

+187
-142
lines changed

6 files changed

+187
-142
lines changed

packages/python/readme_metrics/PayloadBuilder.py

+78-68
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
from collections.abc import Mapping
12
import json
3+
from json import JSONDecodeError
24
import sys
35
import time
46
import importlib
@@ -9,8 +11,6 @@
911
from readme_metrics import ResponseInfoWrapper
1012
from werkzeug import Request
1113

12-
from readme_metrics.util import util_exclude_keys, util_filter_keys
13-
1414

1515
class PayloadBuilder:
1616
"""
@@ -97,56 +97,20 @@ def _build_request_payload(self, request: Request) -> dict:
9797
Returns:
9898
dict: Wrapped request payload
9999
"""
100-
post_data = {}
101-
headers = None
102-
103-
# Convert EnivronHeaders to a dictionary
104-
headers_dict = dict(request.headers.items())
105-
if self.denylist:
106-
headers = util_exclude_keys(headers_dict, self.denylist)
107-
elif self.allowlist:
108-
headers = util_filter_keys(headers_dict, self.allowlist)
109-
else:
110-
headers = headers_dict
111-
112-
if request.content_length is not None and request.content_length > 0:
113-
114-
body = request.rm_body.decode("utf-8") or ""
115-
116-
try:
117-
json_object = json.loads(body)
100+
headers = self._redact_dict(request.headers)
101+
params = parse.parse_qsl(request.query_string.decode("utf-8"))
118102

119-
if self.denylist:
120-
body = util_exclude_keys(json_object, self.denylist)
121-
elif self.allowlist:
122-
body = util_filter_keys(json_object, self.allowlist)
123-
124-
post_data["mimeType"] = "application/json"
125-
post_data["text"] = body
126-
except ValueError as e:
127-
post_data["params"] = [body]
128-
129-
if request.content_type:
130-
post_data["mimeType"] = request.content_type
131-
else:
132-
post_data["mimeType"] = "text/html"
133-
134-
hdr_items = []
135-
for k, v in headers.items():
136-
hdr_items.append({"name": k, "value": v})
137-
138-
qs_items = []
139-
qs_dict = dict(parse.parse_qsl(request.query_string.decode("utf-8")))
140-
141-
for k, v in qs_dict.items():
142-
qs_items.append({"name": k, "value": v})
103+
if request.content_length:
104+
post_data = self._process_body(request.rm_body)
105+
else:
106+
post_data = {}
143107

144108
return {
145109
"method": request.method,
146110
"url": request.base_url,
147111
"httpVersion": request.environ["SERVER_PROTOCOL"],
148-
"headers": hdr_items,
149-
"queryString": qs_items,
112+
"headers": [{"name": k, "value": v} for (k, v) in headers.items()],
113+
"queryString": [{"name": k, "value": v} for (k, v) in params],
150114
**post_data,
151115
}
152116

@@ -159,28 +123,10 @@ def _build_response_payload(self, response: ResponseInfoWrapper) -> dict:
159123
Returns:
160124
dict: Wrapped response payload
161125
"""
162-
if self.denylist:
163-
headers = util_exclude_keys(response.headers, self.denylist)
164-
elif self.allowlist:
165-
headers = util_filter_keys(response.headers, self.allowlist)
166-
else:
167-
headers = response.headers
168-
169-
body = response.body
170-
171-
try:
172-
json_object = json.loads(body)
173-
174-
if self.denylist:
175-
body = util_exclude_keys(json_object, self.denylist)
176-
elif self.allowlist:
177-
body = util_filter_keys(json_object, self.allowlist)
178-
except ValueError:
179-
pass
126+
headers = self._redact_dict(response.headers)
127+
body = self._process_body(response.body).get("text")
180128

181-
hdr_items = []
182-
for k, v in headers.items():
183-
hdr_items.append({"name": k, "value": v})
129+
headers = [{"name": k, "value": v} for (k, v) in headers.items()]
184130

185131
status_string = str(response.status)
186132
status_code = int(status_string.split(" ")[0])
@@ -189,10 +135,74 @@ def _build_response_payload(self, response: ResponseInfoWrapper) -> dict:
189135
return {
190136
"status": status_code,
191137
"statusText": status_text or "",
192-
"headers": hdr_items, # headers.items(),
138+
"headers": headers, # headers.items(),
193139
"content": {
194140
"text": body,
195141
"size": response.content_length,
196142
"mimeType": response.content_type,
197143
},
198144
}
145+
146+
# always returns a dict with some of these fields: text, mimeType, params}
147+
def _process_body(self, body):
148+
if isinstance(body, bytes):
149+
# Non-unicode bytes cannot be directly serialized as a JSON
150+
# payload to send to the ReadMe API, so we need to convert this to a
151+
# unicode string first. But we don't know what encoding it might be
152+
# using, if any (it could also just be raw bytes, like an image).
153+
# We're going to assume that if it's possible to decode at all, then
154+
# it's most likely UTF-8. If we can't decode it, just send an error
155+
# with the JSON payload.
156+
try:
157+
body = body.decode("utf-8")
158+
except UnicodeDecodeError:
159+
return {"text": "[ERROR: NOT VALID UTF-8]"}
160+
161+
if not isinstance(body, str):
162+
# We don't know how to process this body. If it's safe to encode as
163+
# JSON, return it unchanged; otherwise return an error.
164+
try:
165+
json.dumps(body)
166+
return {"text": body}
167+
except TypeError:
168+
return {"text": "[ERROR: NOT SERIALIZABLE]"}
169+
170+
try:
171+
body_data = json.loads(body)
172+
except JSONDecodeError:
173+
params = parse.parse_qsl(body)
174+
if params:
175+
return {
176+
"text": body,
177+
"mimeType": "multipart/form-data",
178+
"params": [{"name": k, "value": v} for (k, v) in params],
179+
}
180+
else:
181+
return {"text": body}
182+
183+
if (self.denylist or self.allowlist) and isinstance(body_data, dict):
184+
redacted_data = self._redact_dict(body_data)
185+
body = json.dumps(redacted_data)
186+
187+
return {"text": body, "mimeType": "application/json"}
188+
189+
def _redact_dict(self, mapping: Mapping):
190+
def _redact_value(v):
191+
if isinstance(v, str):
192+
return f"[REDACTED {len(v)}]"
193+
else:
194+
return "[REDACTED]"
195+
196+
# Short-circuit this function if there's no allowlist or denylist
197+
if not (self.allowlist or self.denylist):
198+
return mapping
199+
200+
result = dict()
201+
for (key, value) in mapping.items():
202+
if self.denylist and key in self.denylist:
203+
result[key] = _redact_value(value)
204+
elif self.allowlist and key not in self.allowlist:
205+
result[key] = _redact_value(value)
206+
else:
207+
result[key] = value
208+
return result
+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from readme_metrics.MetricsApiConfig import MetricsApiConfig
22
from readme_metrics.MetricsMiddleware import MetricsMiddleware
33

4-
__version__ = "1.1.0"
4+
__version__ = "1.2.0"

packages/python/readme_metrics/tests/PayloadBuilder_test.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,10 @@ def testDenylist(self):
113113
text = data["request"]["log"]["entries"][0]["request"]["text"]
114114

115115
assert "ok" in text
116-
assert "password" not in text
116+
assert "123" in text
117+
assert "password" in text
118+
assert "456" not in text
119+
assert "[REDACTED]" in text
117120

118121
def testAllowlist(self):
119122
config = self.mockMiddlewareConfig(allowlist=["ok"])
@@ -131,7 +134,10 @@ def testAllowlist(self):
131134
text = data["request"]["log"]["entries"][0]["request"]["text"]
132135

133136
assert "ok" in text
134-
assert "password" not in text
137+
assert "123" in text
138+
assert "password" in text
139+
assert "456" not in text
140+
assert "[REDACTED]" in text
135141

136142
def testDeprecatedBlackListed(self):
137143

@@ -151,7 +157,10 @@ def testDeprecatedBlackListed(self):
151157
text = data["request"]["log"]["entries"][0]["request"]["text"]
152158

153159
assert "ok" in text
154-
assert "password" not in text
160+
assert "123" in text
161+
assert "password" in text
162+
assert "456" not in text
163+
assert "[REDACTED]" in text
155164

156165
def testDeprecatedWhiteListed(self):
157166
config = self.mockMiddlewareConfig(whitelist=["ok"])
@@ -169,7 +178,10 @@ def testDeprecatedWhiteListed(self):
169178
text = data["request"]["log"]["entries"][0]["request"]["text"]
170179

171180
assert "ok" in text
172-
assert "password" not in text
181+
assert "123" in text
182+
assert "password" in text
183+
assert "456" not in text
184+
assert "[REDACTED]" in text
173185

174186
def testGroupingFunction(self):
175187
config = self.mockMiddlewareConfig(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import pytest # pylint: disable=import-error
2+
from readme_metrics.PayloadBuilder import PayloadBuilder
3+
4+
5+
allowlist = [
6+
"allowed_string",
7+
"allowed_number",
8+
"allowed_dict",
9+
"allowed_list",
10+
"allowed_object",
11+
]
12+
denylist = [
13+
"denied_string",
14+
"denied_number",
15+
"denied_dict",
16+
"denied_list",
17+
"denied_object",
18+
]
19+
20+
subdict = {"allowed_string": "allowed_value", "denied_string": "denied_value"}
21+
sublist = ["allowed_string", "denied_string"]
22+
mapping = {
23+
"allowed_string": "allowed_value",
24+
"denied_string": "denied_value",
25+
"unspecified_string": "unspecified_value",
26+
"allowed_number": 123,
27+
"denied_number": 456,
28+
"unspecified_number": 789,
29+
"allowed_dict": subdict,
30+
"denied_dict": subdict,
31+
"unspecified_dict": subdict,
32+
"allowed_list": sublist,
33+
"denied_list": sublist,
34+
"unspecified_list": sublist,
35+
}
36+
37+
# When using the denylist, we should redact all top-level fields with names like denied_*.
38+
expected_denylist_result = {
39+
"allowed_string": "allowed_value",
40+
"denied_string": "[REDACTED 12]",
41+
"unspecified_string": "unspecified_value",
42+
"allowed_number": 123,
43+
"denied_number": "[REDACTED]",
44+
"unspecified_number": 789,
45+
"allowed_dict": subdict,
46+
"denied_dict": "[REDACTED]",
47+
"unspecified_dict": subdict,
48+
"allowed_list": sublist,
49+
"denied_list": "[REDACTED]",
50+
"unspecified_list": sublist,
51+
}
52+
53+
# When using the denylist, we should redact all denied_* and unspecified_* top-level fields.
54+
expected_allowlist_result = {
55+
"allowed_string": "allowed_value",
56+
"denied_string": "[REDACTED 12]",
57+
"unspecified_string": "[REDACTED 17]",
58+
"allowed_number": 123,
59+
"denied_number": "[REDACTED]",
60+
"unspecified_number": "[REDACTED]",
61+
"allowed_dict": subdict,
62+
"denied_dict": "[REDACTED]",
63+
"unspecified_dict": "[REDACTED]",
64+
"allowed_list": sublist,
65+
"denied_list": "[REDACTED]",
66+
"unspecified_list": "[REDACTED]",
67+
}
68+
69+
70+
def test_redaction_with_allowlist():
71+
allowlist_result = PayloadBuilder(
72+
denylist=None,
73+
allowlist=allowlist,
74+
development_mode=True,
75+
grouping_function=None,
76+
)._redact_dict(mapping)
77+
assert allowlist_result == expected_allowlist_result
78+
79+
80+
def test_redaction_with_denylist():
81+
denylist_result = PayloadBuilder(
82+
denylist=denylist, allowlist=None, development_mode=True, grouping_function=None
83+
)._redact_dict(mapping)
84+
assert denylist_result == expected_denylist_result
85+
86+
87+
def test_redaction_with_both():
88+
# when both allowlist and denylist are present, denylist takes precedence
89+
denylist_result = PayloadBuilder(
90+
denylist=denylist, allowlist=None, development_mode=True, grouping_function=None
91+
)._redact_dict(mapping)
92+
assert denylist_result == expected_denylist_result

packages/python/readme_metrics/tests/util_test.py

-61
This file was deleted.

0 commit comments

Comments
 (0)