Skip to content

Commit 05628da

Browse files
authored
feat(txnames): Include transactions without source in clustering (#51437)
When an SDK does not provide a [transaction source](https://develop.sentry.dev/sdk/event-payloads/transaction/#transaction-annotations) annotation, Relay does [one of two things](https://github.com/getsentry/relay/blob/ab54df17d285a2210203e3a41dcf0373d7c6a3d2/relay-general/src/store/transactions/processor.rs#L369-L374): 1. When the transaction name is expected to contain identifiers (high-cardinality), leave the source empty. 2. When the transaction name is expected to be low-cardinality, set it to `unknown`. In the first case, we want to run the [clusterer](https://develop.sentry.dev/transaction-clustering/) to detect high-cardinality patterns. We have to make the simplifying assumption that a transaction name containing `/` is a URL. Note: This PR includes `source:null` transactions in the clustering input. To also apply the discovered rules in Relay, we need another change in https://github.com/getsentry/relay/.
1 parent d34c3fc commit 05628da

File tree

2 files changed

+23
-10
lines changed

2 files changed

+23
-10
lines changed

src/sentry/ingest/transaction_clusterer/datasource/redis.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,7 @@ def clear_samples(namespace: ClustererNamespace, project: Project) -> None:
101101

102102

103103
def record_transaction_name(project: Project, event_data: Mapping[str, Any], **kwargs: Any) -> None:
104-
transaction_name = event_data.get("transaction")
105-
106-
if transaction_name and _should_store_transaction_name(event_data):
104+
if transaction_name := _should_store_transaction_name(event_data):
107105
safe_execute(
108106
_record_sample,
109107
ClustererNamespace.TRANSACTIONS,
@@ -116,28 +114,41 @@ def record_transaction_name(project: Project, event_data: Mapping[str, Any], **k
116114
safe_execute(_bump_rule_lifetime, project, event_data, _with_transaction=False)
117115

118116

119-
def _should_store_transaction_name(event_data: Mapping[str, Any]) -> bool:
117+
def _should_store_transaction_name(event_data: Mapping[str, Any]) -> Optional[str]:
120118
"""Returns whether the given event must be stored as input for the
121119
transaction clusterer."""
122-
tags = event_data.get("tags")
120+
transaction_name = event_data.get("transaction")
121+
if not transaction_name:
122+
return None
123+
124+
tags = event_data.get("tags") or {}
123125
transaction_info = event_data.get("transaction_info") or {}
124126
source = transaction_info.get("source")
125127

126-
# For now, we also feed back transactions into the clustering algorithm
128+
# We also feed back transactions into the clustering algorithm
127129
# that have already been sanitized, so we have a chance to discover
128130
# more high cardinality segments after partial sanitation.
129131
# For example, we may have sanitized `/orgs/*/projects/foo`,
130132
# But the clusterer has yet to discover `/orgs/*/projects/*`.
131133
#
132134
# Disadvantage: the load on redis does not decrease over time.
133135
#
134-
if source not in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED):
135-
return False
136+
source_matches = source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED) or (
137+
# Relay leaves source None if it expects it to be high cardinality, (otherwise it sets it to "unknown")
138+
# (see https://github.com/getsentry/relay/blob/2d07bef86415cc0ae8af01d16baecde10cdb23a6/relay-general/src/store/transactions/processor.rs#L369-L373).
139+
#
140+
# Our data shows that a majority of these `None` source transactions contain slashes, so treat them as URL transactions:
141+
source is None
142+
and "/" in transaction_name
143+
)
144+
145+
if not source_matches:
146+
return None
136147

137148
if tags and HTTP_404_TAG in tags:
138-
return False
149+
return None
139150

140-
return True
151+
return transaction_name
141152

142153

143154
def _bump_rule_lifetime(project: Project, event_data: Mapping[str, Any]) -> None:

tests/sentry/ingest/test_transaction_clusterer.py

+2
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ def test_distribution():
155155
("route", "/", [["transaction", "/"]], 0),
156156
("url", None, [], 0),
157157
("url", "/a/b/c", [["http.status_code", "404"]], 0),
158+
(None, "/a/b/c", [], 1),
159+
(None, "foo", [], 0),
158160
],
159161
)
160162
def test_record_transactions(mocked_record, default_organization, source, txname, tags, expected):

0 commit comments

Comments
 (0)