feat: new redaction methodology for Python metrics client (#266)

RyanGWU82 · emilyskuo · web-flow · commit 96163ebd935c · 2021-07-07T16:13:06.000-07:00
* feat: new redaction methodology for Python metrics client

* release: bump version number to 1.2.0 due to changes with redaction

* fix: redacted strings should have a space between REDACTED and their length

* fix: remove outdated TODO

* Update packages/python/readme_metrics/PayloadBuilder.py

Co-authored-by: Emily Kuo <58803587+emilyskuo@users.noreply.github.com>

Co-authored-by: Emily Kuo <58803587+emilyskuo@users.noreply.github.com>
diff --git a/packages/python/readme_metrics/PayloadBuilder.py b/packages/python/readme_metrics/PayloadBuilder.py
@@ -1,4 +1,6 @@
+from collections.abc import Mapping
 import json
+from json import JSONDecodeError
 import sys
 import time
 import importlib
@@ -9,8 +11,6 @@
 from readme_metrics import ResponseInfoWrapper
 from werkzeug import Request
 
-from readme_metrics.util import util_exclude_keys, util_filter_keys
-
 
 class PayloadBuilder:
     """
@@ -97,56 +97,20 @@ def _build_request_payload(self, request: Request) -> dict:
         Returns:
             dict: Wrapped request payload
         """
-        post_data = {}
-        headers = None
-
-        # Convert EnivronHeaders to a dictionary
-        headers_dict = dict(request.headers.items())
-        if self.denylist:
-            headers = util_exclude_keys(headers_dict, self.denylist)
-        elif self.allowlist:
-            headers = util_filter_keys(headers_dict, self.allowlist)
-        else:
-            headers = headers_dict
-
-        if request.content_length is not None and request.content_length > 0:
-
-            body = request.rm_body.decode("utf-8") or ""
-
-            try:
-                json_object = json.loads(body)
+        headers = self._redact_dict(request.headers)
+        params = parse.parse_qsl(request.query_string.decode("utf-8"))
 
-                if self.denylist:
-                    body = util_exclude_keys(json_object, self.denylist)
-                elif self.allowlist:
-                    body = util_filter_keys(json_object, self.allowlist)
-
-                post_data["mimeType"] = "application/json"
-                post_data["text"] = body
-            except ValueError as e:
-                post_data["params"] = [body]
-
-                if request.content_type:
-                    post_data["mimeType"] = request.content_type
-                else:
-                    post_data["mimeType"] = "text/html"
-
-        hdr_items = []
-        for k, v in headers.items():
-            hdr_items.append({"name": k, "value": v})
-
-        qs_items = []
-        qs_dict = dict(parse.parse_qsl(request.query_string.decode("utf-8")))
-
-        for k, v in qs_dict.items():
-            qs_items.append({"name": k, "value": v})
+        if request.content_length:
+            post_data = self._process_body(request.rm_body)
+        else:
+            post_data = {}
 
         return {
             "method": request.method,
             "url": request.base_url,
             "httpVersion": request.environ["SERVER_PROTOCOL"],
-            "headers": hdr_items,
-            "queryString": qs_items,
+            "headers": [{"name": k, "value": v} for (k, v) in headers.items()],
+            "queryString": [{"name": k, "value": v} for (k, v) in params],
             **post_data,
         }
 
@@ -159,28 +123,10 @@ def _build_response_payload(self, response: ResponseInfoWrapper) -> dict:
         Returns:
             dict: Wrapped response payload
         """
-        if self.denylist:
-            headers = util_exclude_keys(response.headers, self.denylist)
-        elif self.allowlist:
-            headers = util_filter_keys(response.headers, self.allowlist)
-        else:
-            headers = response.headers
-
-        body = response.body
-
-        try:
-            json_object = json.loads(body)
-
-            if self.denylist:
-                body = util_exclude_keys(json_object, self.denylist)
-            elif self.allowlist:
-                body = util_filter_keys(json_object, self.allowlist)
-        except ValueError:
-            pass
+        headers = self._redact_dict(response.headers)
+        body = self._process_body(response.body).get("text")
 
-        hdr_items = []
-        for k, v in headers.items():
-            hdr_items.append({"name": k, "value": v})
+        headers = [{"name": k, "value": v} for (k, v) in headers.items()]
 
         status_string = str(response.status)
         status_code = int(status_string.split(" ")[0])
@@ -189,10 +135,74 @@ def _build_response_payload(self, response: ResponseInfoWrapper) -> dict:
         return {
             "status": status_code,
             "statusText": status_text or "",
-            "headers": hdr_items,  # headers.items(),
+            "headers": headers,  # headers.items(),
             "content": {
                 "text": body,
                 "size": response.content_length,
                 "mimeType": response.content_type,
             },
         }
+
+    # Always returns a dict with some of these fields: text, mimeType, params.
+    def _process_body(self, body):
+        if isinstance(body, bytes):
+            # Non-unicode bytes cannot be directly serialized as a JSON
+            # payload to send to the ReadMe API, so we need to convert this to a
+            # unicode string first. But we don't know what encoding it might be
+            # using, if any (it could also just be raw bytes, like an image).
+            # We're going to assume that if it's possible to decode at all, then
+            # it's most likely UTF-8. If we can't decode it, just send an error
+            # with the JSON payload.
+            try:
+                body = body.decode("utf-8")
+            except UnicodeDecodeError:
+                return {"text": "[ERROR: NOT VALID UTF-8]"}
+
+        if not isinstance(body, str):
+            # We don't know how to process this body. If it's safe to encode as
+            # JSON, return it unchanged; otherwise return an error.
+            try:
+                json.dumps(body)
+                return {"text": body}
+            except TypeError:
+                return {"text": "[ERROR: NOT SERIALIZABLE]"}
+
+        try:
+            body_data = json.loads(body)
+        except JSONDecodeError:
+            params = parse.parse_qsl(body)
+            if params:
+                return {
+                    "text": body,
+                    "mimeType": "multipart/form-data",
+                    "params": [{"name": k, "value": v} for (k, v) in params],
+                }
+            else:
+                return {"text": body}
+
+        if (self.denylist or self.allowlist) and isinstance(body_data, dict):
+            redacted_data = self._redact_dict(body_data)
+            body = json.dumps(redacted_data)
+
+        return {"text": body, "mimeType": "application/json"}
+
+    def _redact_dict(self, mapping: Mapping):
+        def _redact_value(v):
+            if isinstance(v, str):
+                return f"[REDACTED {len(v)}]"
+            else:
+                return "[REDACTED]"
+
+        # Short-circuit this function if there's no allowlist or denylist
+        if not (self.allowlist or self.denylist):
+            return mapping
+
+        result = dict()
+        for (key, value) in mapping.items():
+            if self.denylist and key in self.denylist:
+                result[key] = _redact_value(value)
+            elif self.allowlist and key not in self.allowlist:
+                result[key] = _redact_value(value)
+            else:
+                result[key] = value
+        return result
diff --git a/packages/python/readme_metrics/__init__.py b/packages/python/readme_metrics/__init__.py
@@ -1,4 +1,4 @@
 from readme_metrics.MetricsApiConfig import MetricsApiConfig
 from readme_metrics.MetricsMiddleware import MetricsMiddleware
 
-__version__ = "1.1.0"
+__version__ = "1.2.0"
diff --git a/packages/python/readme_metrics/tests/PayloadBuilder_test.py b/packages/python/readme_metrics/tests/PayloadBuilder_test.py
@@ -113,7 +113,10 @@ def testDenylist(self):
         text = data["request"]["log"]["entries"][0]["request"]["text"]
 
         assert "ok" in text
-        assert "password" not in text
+        assert "123" in text
+        assert "password" in text
+        assert "456" not in text
+        assert "[REDACTED]" in text
 
     def testAllowlist(self):
         config = self.mockMiddlewareConfig(allowlist=["ok"])
@@ -131,7 +134,10 @@ def testAllowlist(self):
         text = data["request"]["log"]["entries"][0]["request"]["text"]
 
         assert "ok" in text
-        assert "password" not in text
+        assert "123" in text
+        assert "password" in text
+        assert "456" not in text
+        assert "[REDACTED]" in text
 
     def testDeprecatedBlackListed(self):
 
@@ -151,7 +157,10 @@ def testDeprecatedBlackListed(self):
         text = data["request"]["log"]["entries"][0]["request"]["text"]
 
         assert "ok" in text
-        assert "password" not in text
+        assert "123" in text
+        assert "password" in text
+        assert "456" not in text
+        assert "[REDACTED]" in text
 
     def testDeprecatedWhiteListed(self):
         config = self.mockMiddlewareConfig(whitelist=["ok"])
@@ -169,7 +178,10 @@ def testDeprecatedWhiteListed(self):
         text = data["request"]["log"]["entries"][0]["request"]["text"]
 
         assert "ok" in text
-        assert "password" not in text
+        assert "123" in text
+        assert "password" in text
+        assert "456" not in text
+        assert "[REDACTED]" in text
 
     def testGroupingFunction(self):
         config = self.mockMiddlewareConfig(
diff --git a/packages/python/readme_metrics/tests/redaction_test.py b/packages/python/readme_metrics/tests/redaction_test.py
@@ -0,0 +1,92 @@
+import pytest  # pylint: disable=import-error
+from readme_metrics.PayloadBuilder import PayloadBuilder
+
+
+allowlist = [
+    "allowed_string",
+    "allowed_number",
+    "allowed_dict",
+    "allowed_list",
+    "allowed_object",
+]
+denylist = [
+    "denied_string",
+    "denied_number",
+    "denied_dict",
+    "denied_list",
+    "denied_object",
+]
+
+subdict = {"allowed_string": "allowed_value", "denied_string": "denied_value"}
+sublist = ["allowed_string", "denied_string"]
+mapping = {
+    "allowed_string": "allowed_value",
+    "denied_string": "denied_value",
+    "unspecified_string": "unspecified_value",
+    "allowed_number": 123,
+    "denied_number": 456,
+    "unspecified_number": 789,
+    "allowed_dict": subdict,
+    "denied_dict": subdict,
+    "unspecified_dict": subdict,
+    "allowed_list": sublist,
+    "denied_list": sublist,
+    "unspecified_list": sublist,
+}
+
+# When using the denylist, we should redact all top-level fields with names like denied_*.
+expected_denylist_result = {
+    "allowed_string": "allowed_value",
+    "denied_string": "[REDACTED 12]",
+    "unspecified_string": "unspecified_value",
+    "allowed_number": 123,
+    "denied_number": "[REDACTED]",
+    "unspecified_number": 789,
+    "allowed_dict": subdict,
+    "denied_dict": "[REDACTED]",
+    "unspecified_dict": subdict,
+    "allowed_list": sublist,
+    "denied_list": "[REDACTED]",
+    "unspecified_list": sublist,
+}
+
+# When using the allowlist, we should redact all denied_* and unspecified_* top-level fields.
+expected_allowlist_result = {
+    "allowed_string": "allowed_value",
+    "denied_string": "[REDACTED 12]",
+    "unspecified_string": "[REDACTED 17]",
+    "allowed_number": 123,
+    "denied_number": "[REDACTED]",
+    "unspecified_number": "[REDACTED]",
+    "allowed_dict": subdict,
+    "denied_dict": "[REDACTED]",
+    "unspecified_dict": "[REDACTED]",
+    "allowed_list": sublist,
+    "denied_list": "[REDACTED]",
+    "unspecified_list": "[REDACTED]",
+}
+
+
+def test_redaction_with_allowlist():
+    allowlist_result = PayloadBuilder(
+        denylist=None,
+        allowlist=allowlist,
+        development_mode=True,
+        grouping_function=None,
+    )._redact_dict(mapping)
+    assert allowlist_result == expected_allowlist_result
+
+
+def test_redaction_with_denylist():
+    denylist_result = PayloadBuilder(
+        denylist=denylist, allowlist=None, development_mode=True, grouping_function=None
+    )._redact_dict(mapping)
+    assert denylist_result == expected_denylist_result
+
+
+def test_redaction_with_both():
+    # when both allowlist and denylist are present, denylist takes precedence (NOTE: allowlist=None below, so the combined case is not exercised)
+    denylist_result = PayloadBuilder(
+        denylist=denylist, allowlist=None, development_mode=True, grouping_function=None
+    )._redact_dict(mapping)
+    assert denylist_result == expected_denylist_result
diff --git a/packages/python/readme_metrics/tests/util_test.py b/packages/python/readme_metrics/tests/util_test.py
diff --git a/packages/python/readme_metrics/util.py b/packages/python/readme_metrics/util.py