Skip to content

Commit 0d4ca30

Browse files
authored
doc(server): Clarify envelope and request metrics (#614)
Clarifies documentation for metrics that include envelopes with items other than events, as well as metrics that measure web requests rather than events.
1 parent de76f4c commit 0d4ca30

File tree

7 files changed

+93
-80
lines changed

7 files changed

+93
-80
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ We have switched to [CalVer](https://calver.org/)! Relay's version is always in
1515
- All endpoint metrics now report their proper `route` tag. This applies to `requests`, `requests.duration`, and `responses.status_codes`. Previously, some endpoints reported an empty route. ([#595](https://github.com/getsentry/relay/pull/595))
1616
- Properly refresh cached project states based on the configured intervals. Previously, Relay may have gone into an endless refresh cycle if the system clock was not accurate, or the state had not been updated in the upstream. ([#596](https://github.com/getsentry/relay/pull/596))
1717
- Respond with `403 Forbidden` when multiple authentication payloads are sent by the SDK. Previously, Relay would authenticate using one of the payloads and silently ignore the rest. ([#602](https://github.com/getsentry/relay/pull/602))
18+
- Improve metrics documentation. ([#614](https://github.com/getsentry/relay/pull/614))
1819

1920
**Internal**:
2021

relay-server/src/actors/events.rs

+9-7
Original file line numberDiff line numberDiff line change
@@ -836,8 +836,8 @@ impl Handler<ProcessEnvelope> for EventProcessor {
836836
type Result = Result<ProcessEnvelopeResponse, ProcessingError>;
837837

838838
fn handle(&mut self, message: ProcessEnvelope, _context: &mut Self::Context) -> Self::Result {
839-
metric!(timer(RelayTimers::EventWaitTime) = message.start_time.elapsed());
840-
metric!(timer(RelayTimers::EventProcessingTime), {
839+
metric!(timer(RelayTimers::EnvelopeWaitTime) = message.start_time.elapsed());
840+
metric!(timer(RelayTimers::EnvelopeProcessingTime), {
841841
self.process(message)
842842
})
843843
}
@@ -951,10 +951,12 @@ impl Handler<QueueEnvelope> for EventManager {
951951
type Result = Result<Option<EventId>, QueueEnvelopeError>;
952952

953953
fn handle(&mut self, mut message: QueueEnvelope, context: &mut Self::Context) -> Self::Result {
954-
metric!(histogram(RelayHistograms::EventQueueSize) = u64::from(self.current_active_events));
954+
metric!(
955+
histogram(RelayHistograms::EnvelopeQueueSize) = u64::from(self.current_active_events)
956+
);
955957

956958
metric!(
957-
histogram(RelayHistograms::EventQueueSizePct) = {
959+
histogram(RelayHistograms::EnvelopeQueueSizePct) = {
958960
let queue_size_pct = self.current_active_events as f32 * 100.0
959961
/ self.config.event_buffer_size() as f32;
960962
queue_size_pct.floor() as u64
@@ -1167,9 +1169,9 @@ impl Handler<HandleEnvelope> for EventManager {
11671169
}))
11681170
.into_actor(self)
11691171
.timeout(self.config.event_buffer_expiry(), ProcessingError::Timeout)
1170-
.map(|_, _, _| metric!(counter(RelayCounters::EventAccepted) += 1))
1172+
.map(|_, _, _| metric!(counter(RelayCounters::EnvelopeAccepted) += 1))
11711173
.map_err(move |error, slf, _| {
1172-
metric!(counter(RelayCounters::EventRejected) += 1);
1174+
metric!(counter(RelayCounters::EnvelopeRejected) += 1);
11731175

11741176
// Rate limits need special handling: Cache them on the project to avoid
11751177
// expensive processing while the limit is active.
@@ -1219,7 +1221,7 @@ impl Handler<HandleEnvelope> for EventManager {
12191221
}
12201222
})
12211223
.then(move |x, slf, _| {
1222-
metric!(timer(RelayTimers::EventTotalTime) = start_time.elapsed());
1224+
metric!(timer(RelayTimers::EnvelopeTotalTime) = start_time.elapsed());
12231225
slf.current_active_events -= 1;
12241226
fut::result(x)
12251227
})

relay-server/src/actors/outcome.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ mod kafka {
390390
.map_err(OutcomeError::SerializationError)?;
391391

392392
metric!(
393-
counter(RelayCounters::EventOutcomes) += 1,
393+
counter(RelayCounters::Outcomes) += 1,
394394
reason = message.outcome.to_reason().unwrap_or(""),
395395
outcome = message.outcome.name()
396396
);

relay-server/src/actors/store.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ impl Handler<StoreEnvelope> for StoreForwarder {
479479

480480
self.produce(topic, event_message)?;
481481
metric!(
482-
counter(RelayCounters::ProcessingEventProduced) += 1,
482+
counter(RelayCounters::ProcessingMessageProduced) += 1,
483483
event_type = "event"
484484
);
485485
} else if !attachments.is_empty() {
@@ -493,7 +493,7 @@ impl Handler<StoreEnvelope> for StoreForwarder {
493493

494494
self.produce(topic, attachment_message)?;
495495
metric!(
496-
counter(RelayCounters::ProcessingEventProduced) += 1,
496+
counter(RelayCounters::ProcessingMessageProduced) += 1,
497497
event_type = "attachment"
498498
);
499499
}

relay-server/src/body/store_body.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,10 @@ impl Future for StoreBody {
156156
})
157157
.and_then(|body_opt| {
158158
let body = body_opt.ok_or(StorePayloadError::Overflow)?;
159-
metric!(histogram(RelayHistograms::EventSizeBytesRaw) = body.len() as u64);
159+
metric!(histogram(RelayHistograms::RequestSizeBytesRaw) = body.len() as u64);
160160
let decoded = decode_bytes(body.freeze())?;
161161
metric!(
162-
histogram(RelayHistograms::EventSizeBytesUncompressed) = decoded.len() as u64
162+
histogram(RelayHistograms::RequestSizeBytesUncompressed) = decoded.len() as u64
163163
);
164164
Ok(decoded)
165165
});

relay-server/src/endpoints/common.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ where
456456
})
457457
}))
458458
.or_else(move |error: BadStoreRequest| {
459-
metric!(counter(RelayCounters::EventRejected) += 1);
459+
metric!(counter(RelayCounters::EnvelopeRejected) += 1);
460460

461461
if is_event {
462462
outcome_producer.do_send(TrackOutcome {

relay-server/src/metrics.rs

+77-67
Original file line numberDiff line numberDiff line change
@@ -16,42 +16,53 @@ impl SetMetric for RelaySets {
1616

1717
/// Histogram metrics used by Relay.
1818
pub enum RelayHistograms {
19-
/// The number of events in the queue as a percentage of the maximum number of events
20-
/// that can be stored in the queue ( 0 ... the queue is empty, 1 ... the queue is full
21-
/// and no additional events can be added).
22-
EventQueueSizePct,
23-
/// The number of events in the queue. The event queue represents the events that are being
24-
/// processed at a particular time in Relay. Once a request is received the event has
25-
/// some preliminary (quick) processing to determine if it can be processed or it is
26-
/// rejected. Once this determination has been done the http request that
27-
/// created the event terminates and, if the request is to be further processed,
28-
/// the event enters a queue ( a virtual queue, the event is kept in a future that
29-
/// will resolve at some point in time).
30-
/// Once the event finishes processing and is sent downstream (i.e. the future is
31-
/// resolved and the event leaves relay) the event is considered handled and it
32-
/// leaves the queue ( the queue size is decremented).
33-
EventQueueSize,
34-
/// The event size as seen by Relay after it is extracted from a request.
35-
EventSizeBytesRaw,
36-
/// The event size as seen by Relay after it has been decompressed and decoded (e.g. from Base64).
37-
EventSizeBytesUncompressed,
38-
/// Number of projects in the ProjectCache that are waiting for their state to be updated.
19+
/// The number of envelopes in the queue as a percentage of the maximum number of envelopes that
20+
/// can be stored in the queue.
21+
///
22+
/// The value ranges from `0` (the queue is empty) to `100` (the queue is full and no additional
23+
/// envelopes can be added).
24+
EnvelopeQueueSizePct,
25+
/// The number of envelopes in the queue.
26+
///
27+
/// The event queue represents the envelopes that are being processed at a particular time in
28+
/// Relay. Once a request is received, the envelope receives some preliminary (quick) processing
29+
/// to determine if it can be processed or it is rejected. Once this determination has been
30+
/// done, the http request that created the envelope terminates and, if the request is to be
31+
/// further processed, the envelope enters a queue.
32+
///
33+
/// Once the envelope finishes processing and is sent downstream, the envelope is considered
34+
/// handled and it leaves the queue.
35+
EnvelopeQueueSize,
36+
/// The size of the request body as seen by Relay after it is extracted from a request.
37+
///
38+
/// For envelope requests, this is the full size of the envelope. For JSON store requests, this
39+
/// is the size of the JSON body.
40+
///
41+
/// If this request contains a base64 zlib compressed payload without a proper
42+
/// `content-encoding` header, then this is the size before decompression.
43+
RequestSizeBytesRaw,
44+
/// The size of the request body as seen by Relay after it has been decompressed and decoded in
45+
/// case this request contains a base64 zlib compressed payload without a proper
46+
/// `content-encoding` header. Otherwise, this metric is always equal to `event.size_bytes.raw`.
47+
RequestSizeBytesUncompressed,
48+
/// Number of projects in the in-memory project cache that are waiting for their state to be
49+
/// updated.
3950
ProjectStatePending,
40-
/// Number of project state requested from the Upstream for the current batch request.
51+
/// Number of project states requested from the Upstream for the current batch request.
4152
ProjectStateRequestBatchSize,
4253
/// Number of project states received from the Upstream for the current batch request.
4354
ProjectStateReceived,
44-
/// Number of project states currently held in the ProjectState cache.
55+
/// Number of project states currently held in the in-memory project cache.
4556
ProjectStateCacheSize,
4657
}
4758

4859
impl HistogramMetric for RelayHistograms {
4960
fn name(&self) -> &'static str {
5061
match self {
51-
RelayHistograms::EventQueueSizePct => "event.queue_size.pct",
52-
RelayHistograms::EventQueueSize => "event.queue_size",
53-
RelayHistograms::EventSizeBytesRaw => "event.size_bytes.raw",
54-
RelayHistograms::EventSizeBytesUncompressed => "event.size_bytes.uncompressed",
62+
RelayHistograms::EnvelopeQueueSizePct => "event.queue_size.pct",
63+
RelayHistograms::EnvelopeQueueSize => "event.queue_size",
64+
RelayHistograms::RequestSizeBytesRaw => "event.size_bytes.raw",
65+
RelayHistograms::RequestSizeBytesUncompressed => "event.size_bytes.uncompressed",
5566
RelayHistograms::ProjectStatePending => "project_state.pending",
5667
RelayHistograms::ProjectStateRequestBatchSize => "project_state.request.batch_size",
5768
RelayHistograms::ProjectStateReceived => "project_state.received",
@@ -65,31 +76,29 @@ pub enum RelayTimers {
6576
/// The time spent deserializing an event from a JSON byte array into the native data structure
6677
/// on which Relay operates.
6778
EventProcessingDeserialize,
68-
/// Time spent running event processors on an event.
69-
/// Event processing happens before filtering.
79+
/// Time spent running event processors on an event. Event processing happens before filtering.
7080
#[cfg(feature = "processing")]
7181
EventProcessingProcess,
7282
/// Time spent running filtering on an event.
7383
#[cfg(feature = "processing")]
7484
EventProcessingFiltering,
7585
/// Time spent checking for rate limits in Redis.
76-
/// Note that not all events are checked against Redis. After an event is rate limited
77-
/// for period A, any event using the same key coming during period A will be automatically
78-
/// rate limited without checking against Redis (the event will be simply discarded without
79-
/// being placed in the processing queue).
86+
///
87+
/// Note that not all events are checked against Redis. After an event is rate limited for the
88+
/// first time, the rate limit is cached. Events coming in during this period will be discarded
89+
/// earlier in the request queue and do not reach the processing queue.
8090
#[cfg(feature = "processing")]
8191
EventProcessingRateLimiting,
82-
/// Time spent in data scrubbing for the current event.
92+
/// Time spent in data scrubbing for the current event. Data scrubbing happens last before
93+
/// serializing the event back to JSON.
8394
EventProcessingPii,
84-
/// Time spent converting the event from an Annotated<Event> into a String containing the JSON
85-
/// representation of the event.
95+
/// Time spent converting the event from its in-memory representation into a JSON string.
8696
EventProcessingSerialization,
87-
/// Represents the time spent between receiving the event in Relay (i.e. beginning of the
88-
/// request handling) up to the time before starting synchronous processing in the EventProcessor.
89-
EventWaitTime,
90-
/// This is the time the event spends in the EventProcessor (i.e. the sync processing of the
91-
/// event).
92-
/// The time spent in synchronous event processing.
97+
/// Time spent between receiving a request in Relay (that is, beginning of request handling) and
98+
/// the start of synchronous processing in the EventProcessor. This metric primarily indicates
99+
/// backlog in event processing.
100+
EnvelopeWaitTime,
101+
/// The time spent in synchronous processing of envelopes.
93102
///
94103
/// This timing covers the end-to-end processing in the CPU pool and comprises:
95104
///
@@ -102,10 +111,10 @@ pub enum RelayTimers {
102111
/// - `event_processing.process`
103112
/// - `event_processing.filtering`
104113
/// - `event_processing.rate_limiting`
105-
EventProcessingTime,
106-
/// The total time an event spends in Relay from the time it is received until it finishes
107-
/// processing.
108-
EventTotalTime,
114+
EnvelopeProcessingTime,
115+
/// The total time an envelope spends in Relay from the time it is received until it finishes
116+
/// processing and has been submitted.
117+
EnvelopeTotalTime,
109118
/// The total time spent during `ProjectCache.fetch_states` in which eviction of outdated
110119
/// projects happens.
111120
ProjectStateEvictionDuration,
@@ -142,9 +151,9 @@ impl TimerMetric for RelayTimers {
142151
RelayTimers::EventProcessingRateLimiting => "event_processing.rate_limiting",
143152
RelayTimers::EventProcessingPii => "event_processing.pii",
144153
RelayTimers::EventProcessingSerialization => "event_processing.serialization",
145-
RelayTimers::EventWaitTime => "event.wait_time",
146-
RelayTimers::EventProcessingTime => "event.processing_time",
147-
RelayTimers::EventTotalTime => "event.total_time",
154+
RelayTimers::EnvelopeWaitTime => "event.wait_time",
155+
RelayTimers::EnvelopeProcessingTime => "event.processing_time",
156+
RelayTimers::EnvelopeTotalTime => "event.total_time",
148157
RelayTimers::ProjectStateEvictionDuration => "project_state.eviction.duration",
149158
RelayTimers::ProjectStateRequestDuration => "project_state.request.duration",
150159
RelayTimers::ProjectIdRequestDuration => "project_id.request.duration",
@@ -155,20 +164,20 @@ impl TimerMetric for RelayTimers {
155164

156165
/// Counter metrics used by Relay
157166
pub enum RelayCounters {
158-
/// Number of events accepted in the current time slot. This represents events that
159-
/// have successfully passed rate limits, filters and have been successfully handled.
160-
EventAccepted,
161-
/// Number of events rejected in the current time slot. This includes events being rejected
162-
/// because they are malformed or any other error during processing (including filtered
163-
/// events, discarded events and rate limited events).
164-
EventRejected,
165-
/// Represents a group of counters, implemented with using tags. The following tags are
166-
/// present for each event outcome:
167+
/// Number of envelopes accepted in the current time slot. This represents requests that have
168+
/// successfully passed rate limits, filters and have been successfully handled.
169+
EnvelopeAccepted,
170+
/// Number of envelopes rejected in the current time slot. This includes envelopes being
171+
/// rejected because they are malformed or any other errors during processing (including
172+
/// filtered events, invalid payloads and rate limits).
173+
EnvelopeRejected,
174+
/// Represents a group of counters incremented for every outcome emitted by Relay, implemented
175+
/// with tags. The following tags are present for each event outcome:
167176
///
168-
/// - `outcome` which is an `EventOutcome` enumeration
177+
/// - `outcome` which is an `Outcome` enumeration
169178
/// - `reason` which is the reason string for all outcomes that are not `Accepted`.
170179
#[cfg(feature = "processing")]
171-
EventOutcomes,
180+
Outcomes,
172181
/// Counts the number of times a project state lookup is done. This includes requests
173182
/// for projects that are cached and requests for projects that are not yet cached.
174183
/// All requests that return an `EventAction::Accept` i.e. are not rate limited (on
@@ -196,12 +205,13 @@ pub enum RelayCounters {
196205
/// Counts the number of times Relay started.
197206
/// This can be used to track unwanted restarts due to crashes or termination.
198207
ServerStarting,
199-
/// Counts the number of messages placed on the Kafka queue. When Relay operates with processing
200-
/// enabled and a message is successfully processed each message will generate an event on the
201-
/// Kafka queue and zero or more attachments. The counter has an `event_type` tag which is set to
208+
/// Counts the number of messages placed on the Kafka queue.
209+
///
210+
/// When Relay operates with processing enabled and an item is successfully processed, each item
211+
/// will generate a message on the Kafka queue. The counter has an `event_type` tag which is set to
202212
/// either `event` or `attachment` representing the type of message produced on the Kafka queue.
203213
#[cfg(feature = "processing")]
204-
ProcessingEventProduced,
214+
ProcessingMessageProduced,
205215
/// Counts the number of events that hit any of the Store like endpoints (Store, Security,
206216
/// MiniDump, Unreal). The events are counted before they are rate limited, filtered or
207217
/// processed in any way. The counter has a `version` tag that tracks the message event
@@ -224,18 +234,18 @@ pub enum RelayCounters {
224234
impl CounterMetric for RelayCounters {
225235
fn name(&self) -> &'static str {
226236
match self {
227-
RelayCounters::EventAccepted => "event.accepted",
228-
RelayCounters::EventRejected => "event.rejected",
237+
RelayCounters::EnvelopeAccepted => "event.accepted",
238+
RelayCounters::EnvelopeRejected => "event.rejected",
229239
#[cfg(feature = "processing")]
230-
RelayCounters::EventOutcomes => "events.outcomes",
240+
RelayCounters::Outcomes => "events.outcomes",
231241
RelayCounters::ProjectStateGet => "project_state.get",
232242
RelayCounters::ProjectStateRequest => "project_state.request",
233243
RelayCounters::ProjectCacheHit => "project_cache.hit",
234244
RelayCounters::ProjectCacheMiss => "project_cache.miss",
235245
RelayCounters::ProjectIdRequest => "project_id.request",
236246
RelayCounters::ServerStarting => "server.starting",
237247
#[cfg(feature = "processing")]
238-
RelayCounters::ProcessingEventProduced => "processing.event.produced",
248+
RelayCounters::ProcessingMessageProduced => "processing.event.produced",
239249
RelayCounters::EventProtocol => "event.protocol",
240250
RelayCounters::Requests => "requests",
241251
RelayCounters::ResponsesStatusCodes => "responses.status_codes",

0 commit comments

Comments
 (0)