Commit b7b1cab

fix(spool): Revert to page counts for size estimate (#3379)
INC-703 showed that `estimate_spool_size` via `dbstat` is too slow. This PR replaces the estimate with the number of non-free pages multiplied by the page size. Though inaccurate, this estimate is fast regardless of the db size, as benchmarks show. Fixes getsentry/team-ingest#307.
1 parent 1aaf204 · commit b7b1cab
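
For illustration, here is a minimal standalone sketch of the new size estimate, built around the same pragma query this commit adds in `sql.rs` (see the diff below); the free function and the pool argument are assumptions for the sketch, not code from the commit:

```rust
use sqlx::{Row, SqlitePool};

/// Estimates the allocated database size as `(page_count - freelist_count) * page_size`.
/// The pragmas read only database metadata, so the query cost does not grow
/// with the number of spooled envelopes, unlike the previous `dbstat` scan.
async fn estimate_size(db: &SqlitePool) -> Result<i64, sqlx::Error> {
    sqlx::query(
        "SELECT (page_count - freelist_count) * page_size AS size \
         FROM pragma_page_count(), pragma_freelist_count(), pragma_page_size();",
    )
    .fetch_one(db)
    .await?
    .try_get(0)
}
```

Because non-free pages are counted whether or not they are full, the result is an upper bound rather than an exact byte count, which is the accuracy trade-off the benchmark below quantifies.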

File tree: 4 files changed, +100 −18 lines

- CHANGELOG.md
- relay-server/src/services/spooler/mod.rs
- relay-server/src/services/spooler/sql.rs
- tests/integration/test_healthchecks.py

CHANGELOG.md (+4)

```diff
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+**Bug fixes:**
+
+- Fix performance regression in disk spooling by using page counts to estimate the spool size. ([#3379](https://github.com/getsentry/relay/pull/3379))
+
 **Features**:
 
 - Add support for continuous profiling. ([#3270](https://github.com/getsentry/relay/pull/3270))
```

relay-server/src/services/spooler/mod.rs (+86 −12)

```diff
@@ -689,9 +689,9 @@ impl OnDisk {
         }
     }
 
-    /// Estimates the db size by multiplying `page_count * page_size`.
+    /// Estimates the db size by multiplying `used_page_count * page_size`.
     async fn estimate_spool_size(&self) -> Result<i64, BufferError> {
-        let size: i64 = sql::current_size()
+        let size: i64 = sql::estimate_size()
             .fetch_one(&self.db)
             .await
             .and_then(|r| r.try_get(0))
@@ -1268,7 +1268,7 @@ impl Drop for BufferService {
 mod tests {
     use std::str::FromStr;
     use std::sync::Mutex;
-    use std::time::Duration;
+    use std::time::{Duration, Instant};
 
     use insta::assert_debug_snapshot;
     use rand::Rng;
@@ -1496,7 +1496,7 @@ mod tests {
                 "envelopes": {
                     "path": std::env::temp_dir().join(Uuid::new_v4().to_string()),
                     "max_memory_size": "4KB",
-                    "max_disk_size": "20KB",
+                    "max_disk_size": "40KB",
                 }
             }
         }))
@@ -1557,7 +1557,7 @@ mod tests {
             .filter(|name| name.contains("buffer."))
             .collect();
 
-        assert_debug_snapshot!(captures, @r#"
+        assert_debug_snapshot!(captures, @r###"
         [
             "buffer.envelopes_mem:2000|h",
             "buffer.envelopes_mem_count:1|g",
@@ -1569,26 +1569,25 @@ mod tests {
             "buffer.writes:1|c",
             "buffer.envelopes_written:3|c",
             "buffer.envelopes_disk_count:3|g",
-            "buffer.disk_size:1031|h",
+            "buffer.disk_size:24576|h",
             "buffer.envelopes_written:1|c",
             "buffer.envelopes_disk_count:4|g",
             "buffer.writes:1|c",
-            "buffer.disk_size:1372|h",
-            "buffer.disk_size:1372|h",
+            "buffer.disk_size:24576|h",
+            "buffer.disk_size:24576|h",
             "buffer.envelopes_written:1|c",
            "buffer.envelopes_disk_count:5|g",
            "buffer.writes:1|c",
-            "buffer.disk_size:1713|h",
+            "buffer.disk_size:24576|h",
            "buffer.dequeue_attempts:1|h",
            "buffer.reads:1|c",
            "buffer.envelopes_read:-5|c",
            "buffer.envelopes_disk_count:0|g",
            "buffer.dequeue_attempts:1|h",
            "buffer.reads:1|c",
-            "buffer.disk_size:8|h",
-            "buffer.reads:1|c",
+            "buffer.disk_size:24576|h",
        ]
-        "#);
+        "###);
     }
 
     pub enum TestHealth {
@@ -1879,4 +1878,79 @@ mod tests {
         let index = index.lock().unwrap().clone();
         assert_eq!(index.len(), 100);
     }
+
+    #[ignore] // Slow. Should probably be a criterion benchmark.
+    #[tokio::test]
+    async fn compare_counts() {
+        let path = std::env::temp_dir().join(Uuid::new_v4().to_string());
+        let options = SqliteConnectOptions::new()
+            .filename(path)
+            .journal_mode(SqliteJournalMode::Wal)
+            .create_if_missing(true);
+
+        let db = sqlx::SqlitePool::connect_with(options).await.unwrap();
+
+        println!("Migrating...");
+        sqlx::migrate!("../migrations").run(&db).await.unwrap();
+
+        let transaction = db.begin().await.unwrap();
+
+        // println!("Inserting...");
+        let mut query_builder = sqlx::QueryBuilder::new(
+            "INSERT INTO envelopes (received_at, own_key, sampling_key, envelope) ",
+        );
+        let n = 10000;
+        for i in 0..n {
+            if (i % 100) == 0 {
+                println!("Batch {i} of {n}");
+            }
+
+            let chunk = (0..5000).map(|_| ("", "", "this is my chunk how do you like it", ""));
+            let query = query_builder
+                .push_values(chunk, |mut b, (key1, key2, value, received_at)| {
+                    b.push_bind(received_at)
+                        .push_bind(key1)
+                        .push_bind(key2)
+                        .push_bind(value);
+                })
+                .build();
+
+            query.execute(&db).await.unwrap();
+
+            query_builder.reset();
+        }
+        transaction.commit().await.unwrap();
+
+        let t = Instant::now();
+        let q = sqlx::query(
+            r#"SELECT SUM(pgsize - unused) FROM dbstat WHERE name="envelopes" AND aggregate = FALSE"#,
+        );
+        let pgsize: i64 = q.fetch_one(&db).await.unwrap().get(0);
+        println!("pgsize: {} {:?}", pgsize, t.elapsed());
+
+        let t = Instant::now();
+        let q = sqlx::query(
+            r#"SELECT SUM(pgsize - unused) FROM dbstat WHERE name="envelopes" AND aggregate = TRUE"#,
+        );
+        let pgsize_agg: i64 = q.fetch_one(&db).await.unwrap().get(0);
+        println!("pgsize_agg: {} {:?}", pgsize_agg, t.elapsed());
+
+        let t = Instant::now();
+        let q = sqlx::query(r#"SELECT SUM(length(envelope)) FROM envelopes"#);
+        let brute_force: i64 = q.fetch_one(&db).await.unwrap().get(0);
+        println!("brute_force: {} {:?}", brute_force, t.elapsed());
+
+        let t = Instant::now();
+        let q = sqlx::query(
+            r#"SELECT (page_count - freelist_count) * page_size as size FROM pragma_page_count(), pragma_freelist_count(), pragma_page_size();"#,
+        );
+        let pragma: i64 = q.fetch_one(&db).await.unwrap().get(0);
+        println!("pragma: {} {:?}", pragma, t.elapsed());
+
+        // Result:
+        // pgsize = 2'408'464'463 t.elapsed() = 7.007533833s
+        // pgsize_agg = 2'408'464'463 t.elapsed() = 5.010104791s
+        // brute_force = 1'750'000'000 t.elapsed() = 7.893590875s
+        // pragma = 3'036'307'456 t.elapsed() = 213.417µs
+    }
 }
```
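
The `Result` comment at the end of `compare_counts` is the basis for the change: on a roughly 3 GB database, both `dbstat` variants take 5 to 7 seconds and the brute-force `SUM(length(envelope))` takes about 7.9 seconds, while the pragma-based estimate returns in about 213 microseconds. The estimate overshoots the `dbstat` figure by roughly 26 percent, which is the inaccuracy acknowledged in the commit message. Since the test is marked `#[ignore]`, it only runs when requested explicitly, e.g. with `cargo test compare_counts -- --ignored --nocapture`.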

relay-server/src/services/spooler/sql.rs (+6 −4)

```diff
@@ -70,11 +70,13 @@ pub fn delete<'a>(key: QueueKey) -> Query<'a, Sqlite, SqliteArguments<'a>> {
         .bind(key.sampling_key.to_string())
 }
 
-/// Creates a query which fetches the `envelopes` table size.
+/// Creates a query which fetches the number of used database pages multiplied by the page size.
 ///
-/// This info used to calculate the current allocated database size.
-pub fn current_size<'a>() -> Query<'a, Sqlite, SqliteArguments<'a>> {
-    sqlx::query(r#"SELECT SUM(pgsize - unused) FROM dbstat WHERE name="envelopes""#)
+/// This info is used to estimate the current allocated database size.
+pub fn estimate_size<'a>() -> Query<'a, Sqlite, SqliteArguments<'a>> {
+    sqlx::query(
+        r#"SELECT (page_count - freelist_count) * page_size as size FROM pragma_page_count(), pragma_freelist_count(), pragma_page_size();"#,
+    )
 }
 
 /// Creates the query to select only 1 record's `received_at` from the database.
```

tests/integration/test_healthchecks.py (+4 −2)

```diff
@@ -160,7 +160,7 @@ def store_internal_error_event():
             "envelopes": {
                 "path": dbfile,
                 "max_memory_size": 0,
-                "max_disk_size": "100",
+                "max_disk_size": "24577",  # one more than the initial size
             }
         },
     }
@@ -175,7 +175,9 @@ def store_internal_error_event():
         # Wrapping this into the try block, to make sure we ignore those errors and just check the health at the end.
         try:
             # These events will consume all the disk space and we will report not ready.
-            relay.send_event(project_key)
+            for i in range(20):
+                # It takes ~10 events to make SQLite use more pages.
+                relay.send_event(project_key)
         finally:
             # Authentication failures would fail the test
             mini_sentry.test_failures.clear()
```
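
The new `max_disk_size` of 24577 bytes lines up with the page-based estimate: it sits one byte above the 24576-byte figure that the `buffer.disk_size` metric reports in the snapshot test above, so the health check flips to unhealthy as soon as SQLite allocates any pages beyond the initial state; per the inline comment, that takes around 10 events.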
