Skip to content

Commit 7cebcf6

Browse files
committed
[Refactor] Complete metrics overhaul
Metrics got an entire overhaul. Instead of relying on a broken prometheus library to publish our metrics, we now use the `tracing` library and with OpenTelemetry that we bind together then publish into a prometheus library. Metrics are now mostly derive-macros. This means that the struct can express what it wants to export and a help text. The library will choose if it is able to export it. Tracing now works by calling `.publish()` on the parent structs, those structs need to call `.publish()` on all the child members it wishes to publish data about. If a "group" is requested, use the `group!()` macro, which under-the-hood calls `tracing::span` with some special labels. At primitive layers, it will call the `publish!()` macro, which will call `tracing::event!()` macro under-the-hood with some special fields set. A custom `tracing::Subscriber` will intercept all the events and spans and convert them into a json-like object. This object can then be exported as real json or encoded into other formats like otel/prometheus. closes: TraceMachina#1164, TraceMachina#650, TraceMachina#384, TraceMachina#209 towards: TraceMachina#206
1 parent f59f8ba commit 7cebcf6

File tree

75 files changed

+2770
-1439
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+2770
-1439
lines changed

Cargo.lock

+318
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+14
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ nativelink-service = { path = "nativelink-service" }
4040
nativelink-store = { path = "nativelink-store" }
4141
nativelink-util = { path = "nativelink-util" }
4242
nativelink-worker = { path = "nativelink-worker" }
43+
nativelink-metric = { path = "nativelink-metric" }
44+
nativelink-metric-collector = { path = "nativelink-metric-collector" }
4345

4446
async-lock = "3.3.0"
4547
axum = "0.6.20"
@@ -58,3 +60,15 @@ tokio-rustls = "0.25.0"
5860
tonic = { version = "0.11.0", features = ["gzip", "tls"] }
5961
tower = "0.4.13"
6062
tracing = "0.1.40"
63+
opentelemetry_sdk = { version = "0.23.0", features = ["metrics"] }
64+
tracing-subscriber = "0.3.18"
65+
tracing-opentelemetry = { version = "0.25.0", features = ["metrics"] }
66+
opentelemetry-stdout = "0.5.0"
67+
opentelemetry_api = { version = "0.20.0", features = ["metrics"] }
68+
opentelemetry = { version = "0.23.0", features = ["metrics"] }
69+
prometheus = "0.13.4"
70+
opentelemetry-prometheus = "0.16.0"
71+
serde_json = "1.0.120"
72+
73+
[dev-dependencies]
74+
nativelink-metric-tests = { path = "nativelink-metric-tests" }

nativelink-error/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ autobenches = false
99

1010
[dependencies]
1111
nativelink-proto = { path = "../nativelink-proto" }
12+
nativelink-metric = { path = "../nativelink-metric" }
1213

1314
hex = "0.4.3"
1415
prost = "0.12.4"

nativelink-error/src/lib.rs

+7
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use prost_types::TimestampError;
1616
use serde::{Deserialize, Serialize};
17+
use nativelink_metric::{MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent};
1718

1819
#[macro_export]
1920
macro_rules! make_err {
@@ -47,6 +48,12 @@ pub struct Error {
4748
pub messages: Vec<String>,
4849
}
4950

51+
impl MetricsComponent for Error {
52+
fn publish(&self, kind: MetricKind, field_metadata: MetricFieldData) -> Result<MetricPublishKnownKindData, nativelink_metric::Error> {
53+
self.to_string().publish(kind, field_metadata)
54+
}
55+
}
56+
5057
impl Error {
5158
pub fn new(code: Code, msg: String) -> Self {
5259
let mut msgs = Vec::with_capacity(1);
+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[package]
2+
name = "nativelink-metric-collector"
3+
version = "0.4.0"
4+
edition = "2021"
5+
rust-version = "1.79.0"
6+
7+
[dependencies]
8+
tracing = "0.1.40"
9+
opentelemetry_sdk = { version = "0.23.0", features = ["metrics", "rt-tokio"] }
10+
tracing-subscriber = "0.3.18"
11+
# tracing-opentelemetry = { version = "0.25.0", features = ["metrics"] }
12+
# opentelemetry-stdout = "0.5.0"
13+
# opentelemetry_api = { version = "0.20.0", features = ["metrics"] }
14+
opentelemetry = { version = "0.23.0", features = ["metrics"] }
15+
parking_lot = "0.12.2"
16+
# tokio = { version = "1.37.0" }
17+
nativelink-metric = { path = "../nativelink-metric" }
18+
serde_json = "1.0.120"
19+
serde = "1.0.204"
20+
21+
[dev-dependencies]
22+
nativelink-macro = { path = "../nativelink-macro" }
23+
# nativelink-util = { path = "../nativelink-util" }
24+
nativelink-error = { path = "../nativelink-error" }
+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
pub use tracing_layers::MetricsCollectorLayer;
2+
3+
mod metrics_collection;
4+
mod metrics_visitors;
5+
mod tracing_layers;
6+
mod otel_exporter;
7+
8+
pub use otel_exporter::otel_export;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
use std::{
2+
borrow::Cow,
3+
collections::HashMap,
4+
ops::{Deref, DerefMut},
5+
};
6+
7+
use serde::Serialize;
8+
9+
use crate::metrics_visitors::CollectionKind;
10+
11+
#[derive(Debug, Serialize)]
12+
#[serde(untagged)]
13+
pub enum CollectedMetricPrimitiveValue {
14+
Counter(u64),
15+
String(Cow<'static, str>),
16+
}
17+
18+
#[derive(Default, Debug)]
19+
pub struct CollectedMetricPrimitive {
20+
pub value: Option<CollectedMetricPrimitiveValue>,
21+
pub help: String,
22+
pub value_type: CollectionKind,
23+
}
24+
25+
impl Serialize for CollectedMetricPrimitive {
26+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
27+
where
28+
S: serde::Serializer,
29+
{
30+
match &self.value {
31+
Some(CollectedMetricPrimitiveValue::Counter(value)) => {
32+
serializer.serialize_u64(*value)
33+
}
34+
Some(CollectedMetricPrimitiveValue::String(value)) => {
35+
serializer.serialize_str(value)
36+
}
37+
None => serializer.serialize_none(),
38+
}
39+
}
40+
}
41+
42+
pub type CollectedMetricChildren = HashMap<String, CollectedMetrics>;
43+
44+
#[derive(Debug, Serialize)]
45+
#[serde(untagged)]
46+
pub enum CollectedMetrics {
47+
Primitive(CollectedMetricPrimitive),
48+
Component(Box<CollectedMetricChildren>),
49+
}
50+
51+
impl CollectedMetrics {
52+
pub fn new_component() -> Self {
53+
Self::Component(Box::new(CollectedMetricChildren::default()))
54+
}
55+
}
56+
57+
#[derive(Default, Debug, Serialize)]
58+
pub struct RootMetricCollectedMetrics {
59+
#[serde(flatten)]
60+
inner: CollectedMetricChildren,
61+
}
62+
63+
impl RootMetricCollectedMetrics {
64+
pub fn to_json5(&self) -> Result<std::string::String, serde_json::Error> {
65+
serde_json::to_string_pretty(self)
66+
}
67+
}
68+
69+
impl Deref for RootMetricCollectedMetrics {
70+
type Target = CollectedMetricChildren;
71+
72+
fn deref(&self) -> &Self::Target {
73+
&self.inner
74+
}
75+
}
76+
77+
impl DerefMut for RootMetricCollectedMetrics {
78+
fn deref_mut(&mut self) -> &mut Self::Target {
79+
&mut self.inner
80+
}
81+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
use std::{borrow::Cow, fmt::Debug};
2+
3+
use nativelink_metric::MetricKind;
4+
use serde::Serialize;
5+
use tracing::field::{Field, Visit};
6+
7+
use crate::metrics_collection::{CollectedMetricPrimitive, CollectedMetricPrimitiveValue};
8+
9+
#[derive(Default, Debug, Serialize)]
10+
pub enum CollectionKind {
11+
#[default]
12+
Counter = 0,
13+
String = 1,
14+
}
15+
16+
impl From<MetricKind> for CollectionKind {
17+
fn from(kind: MetricKind) -> Self {
18+
match kind {
19+
MetricKind::Counter => CollectionKind::Counter,
20+
MetricKind::String => CollectionKind::String,
21+
_ => CollectionKind::String,
22+
}
23+
}
24+
}
25+
26+
#[derive(Debug)]
27+
enum ValueWithPrimitiveType {
28+
String(String),
29+
U64(u64),
30+
}
31+
32+
impl Default for ValueWithPrimitiveType {
33+
fn default() -> Self {
34+
ValueWithPrimitiveType::U64(0)
35+
}
36+
}
37+
38+
#[derive(Default, Debug)]
39+
pub struct MetricDataVisitor {
40+
pub name: String,
41+
value: ValueWithPrimitiveType,
42+
help: String,
43+
value_type: Option<CollectionKind>,
44+
}
45+
46+
impl From<MetricDataVisitor> for CollectedMetricPrimitive {
47+
fn from(visitor: MetricDataVisitor) -> Self {
48+
let (value, derived_type) = match visitor.value {
49+
ValueWithPrimitiveType::String(s) => {
50+
(CollectedMetricPrimitiveValue::String(Cow::Owned(s)), CollectionKind::String)
51+
},
52+
ValueWithPrimitiveType::U64(u) => {
53+
(CollectedMetricPrimitiveValue::Counter(u), CollectionKind::Counter)
54+
},
55+
};
56+
CollectedMetricPrimitive {
57+
value: Some(value),
58+
help: visitor.help,
59+
value_type: visitor.value_type.unwrap_or(derived_type),
60+
}
61+
}
62+
}
63+
64+
impl Visit for MetricDataVisitor {
65+
// Required method
66+
fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}
67+
68+
// Provided methods
69+
fn record_f64(&mut self, field: &Field, value: f64) {
70+
if field.name() == "__value" {
71+
self.value = ValueWithPrimitiveType::String(value.to_string())
72+
}
73+
}
74+
fn record_i64(&mut self, field: &Field, value: i64) {
75+
if field.name() == "__value" {
76+
match u64::try_from(value) {
77+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
78+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
79+
}
80+
}
81+
}
82+
fn record_u64(&mut self, field: &Field, value: u64) {
83+
match field.name() {
84+
"__value" => self.value = ValueWithPrimitiveType::U64(value),
85+
"__type" => self.value_type = Some(MetricKind::from(value).into()),
86+
"__help" => self.help = value.to_string(),
87+
"__name" => self.name = value.to_string(),
88+
field => panic!("UNKNOWN FIELD {field}"),
89+
}
90+
}
91+
fn record_i128(&mut self, field: &Field, value: i128) {
92+
if field.name() == "__value" {
93+
match u64::try_from(value) {
94+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
95+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
96+
}
97+
}
98+
}
99+
fn record_u128(&mut self, field: &Field, value: u128) {
100+
if field.name() == "__value" {
101+
match u64::try_from(value) {
102+
Ok(v) => self.value = ValueWithPrimitiveType::U64(v),
103+
Err(_) => self.value = ValueWithPrimitiveType::String(value.to_string()),
104+
}
105+
}
106+
}
107+
fn record_bool(&mut self, field: &Field, value: bool) {
108+
if field.name() == "__value" {
109+
self.value = ValueWithPrimitiveType::U64(u64::from(value));
110+
}
111+
}
112+
fn record_str(&mut self, field: &Field, value: &str) {
113+
match field.name() {
114+
"__value" => self.value = ValueWithPrimitiveType::String(value.to_string()),
115+
"__help" => self.help = value.to_string(),
116+
"__name" => self.name = value.to_string(),
117+
field => panic!("UNKNOWN FIELD {field}"),
118+
}
119+
}
120+
fn record_error(&mut self, _field: &Field, _value: &(dyn std::error::Error + 'static)) {}
121+
}
122+
123+
pub struct SpanFields {
124+
pub name: Cow<'static, str>,
125+
}
126+
127+
impl Visit for SpanFields {
128+
// Required method
129+
fn record_debug(&mut self, _field: &Field, _value: &dyn Debug) {}
130+
131+
fn record_str(&mut self, field: &Field, value: &str) {
132+
if field.name() == "__name" {
133+
self.name = Cow::Owned(value.to_string());
134+
}
135+
}
136+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
use opentelemetry::metrics::Meter;
2+
use tracing::info;
3+
4+
use crate::metrics_collection::{CollectedMetricChildren, CollectedMetricPrimitive, CollectedMetricPrimitiveValue, CollectedMetrics, RootMetricCollectedMetrics};
5+
6+
const MAX_METRIC_NAME_LENGTH: usize = 256;
7+
8+
pub fn otel_export(mut root_prefix: String, meter: &Meter, root_collected_metrics: &RootMetricCollectedMetrics) {
9+
if !root_prefix.is_empty() {
10+
root_prefix.push('_');
11+
}
12+
process_children(&mut root_prefix, meter, &root_collected_metrics);
13+
}
14+
15+
fn process_children(prefix: &mut String, meter: &Meter, children: &CollectedMetricChildren) {
16+
for (name, child) in children {
17+
prefix.push_str(name);
18+
let mut added_prefix_len = name.len();
19+
match child {
20+
CollectedMetrics::Primitive(primitive) => {
21+
process_primitive(prefix, meter, primitive);
22+
}
23+
CollectedMetrics::Component(component) => {
24+
prefix.push('_');
25+
added_prefix_len += 1;
26+
process_children(prefix, meter, component);
27+
}
28+
}
29+
prefix.truncate(prefix.len() - added_prefix_len);
30+
}
31+
}
32+
33+
fn process_primitive(prefix: &mut String, meter: &Meter, primitive: &CollectedMetricPrimitive) {
34+
match &primitive.value {
35+
Some(CollectedMetricPrimitiveValue::Counter(value)) => {
36+
if prefix.len() > MAX_METRIC_NAME_LENGTH {
37+
info!("Metric name longer than 256 characters: {}", prefix);
38+
return;
39+
}
40+
let counter = meter
41+
.u64_counter(prefix.clone())
42+
.with_description(primitive.help.clone())
43+
.init();
44+
counter.add(*value, &[]);
45+
}
46+
Some(CollectedMetricPrimitiveValue::String(_value)) => {
47+
// We don't publish strings in metrics.
48+
}
49+
None => {}
50+
}
51+
}

0 commit comments

Comments
 (0)