Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 9900f7c

Browse files
authored
Add admin endpoint to query room sizes (#15482)
1 parent 710502c commit 9900f7c

File tree

6 files changed

+195
-1
lines changed

6 files changed

+195
-1
lines changed

changelog.d/15482.feature

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add admin endpoint to query the largest rooms by disk space used in the database.

docs/admin_api/statistics.md

+49
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,52 @@ The following fields are returned in the JSON response body:
8181
- `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`).
8282
* `next_token` - integer - Opaque value used for pagination. See above.
8383
* `total` - integer - Total number of users after filtering.
84+
85+
86+
# Get largest rooms by size in database
87+
88+
Returns the 10 largest rooms and an estimate of how much space in the database
89+
they are taking.
90+
91+
This does not include the size of any media associated with the room.
92+
93+
Returns an error on SQLite.
94+
95+
*Note:* This uses the planner statistics from PostgreSQL to do the estimates,
96+
which means that the returned information can vary widely from reality. However,
97+
it should be enough to get a rough idea of where database disk space is going.
98+
99+
100+
The API is:
101+
102+
```
103+
GET /_synapse/admin/v1/statistics/database/rooms
104+
```
105+
106+
A response body like the following is returned:
107+
108+
```json
109+
{
110+
"rooms": [
111+
{
112+
"room_id": "!OGEhHVWSdvArJzumhm:matrix.org",
113+
"estimated_size": 47325417353
114+
}
115+
]
116+
}
117+
```
118+
119+
120+
121+
**Response**
122+
123+
The following fields are returned in the JSON response body:
124+
125+
* `rooms` - An array of objects, sorted by largest room first. Objects contain
126+
the following fields:
127+
- `room_id` - string - The room ID.
128+
- `estimated_size` - integer - Estimated disk space used in bytes by the room
129+
in the database.
130+
131+
132+
*Added in Synapse 1.83.0*

synapse/rest/admin/__init__.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,10 @@
6868
RoomTimestampToEventRestServlet,
6969
)
7070
from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet
71-
from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet
71+
from synapse.rest.admin.statistics import (
72+
LargestRoomsStatistics,
73+
UserMediaStatisticsRestServlet,
74+
)
7275
from synapse.rest.admin.username_available import UsernameAvailableRestServlet
7376
from synapse.rest.admin.users import (
7477
AccountDataRestServlet,
@@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None:
259262
UserRestServletV2(hs).register(http_server)
260263
UsersRestServletV2(hs).register(http_server)
261264
UserMediaStatisticsRestServlet(hs).register(http_server)
265+
LargestRoomsStatistics(hs).register(http_server)
262266
EventReportDetailRestServlet(hs).register(http_server)
263267
EventReportsRestServlet(hs).register(http_server)
264268
AccountDataRestServlet(hs).register(http_server)

synapse/rest/admin/statistics.py

+25
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,28 @@ async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
113113
ret["next_token"] = start + len(users_media)
114114

115115
return HTTPStatus.OK, ret
116+
117+
118+
class LargestRoomsStatistics(RestServlet):
    """Admin endpoint listing the rooms estimated to use the most database space.

    Only works when using PostgreSQL.
    """

    PATTERNS = admin_patterns("/statistics/database/rooms$")

    def __init__(self, hs: "HomeServer"):
        self.auth = hs.get_auth()
        self.stats_controller = hs.get_storage_controllers().stats

    async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]:
        """Return the room size estimates as a JSON body.

        Returns:
            A tuple of the HTTP status and a dict with a single `rooms` key,
            holding one `{"room_id", "estimated_size"}` object per room in
            the order given by the stats controller.
        """
        # Reject the request unless it comes from a server admin.
        await assert_requester_is_admin(self.auth, request)

        estimates = await self.stats_controller.get_room_db_size_estimate()

        rooms = []
        for room_id, estimated_size in estimates:
            rooms.append({"room_id": room_id, "estimated_size": estimated_size})

        return HTTPStatus.OK, {"rooms": rooms}

synapse/storage/controllers/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
)
2020
from synapse.storage.controllers.purge_events import PurgeEventsStorageController
2121
from synapse.storage.controllers.state import StateStorageController
22+
from synapse.storage.controllers.stats import StatsController
2223
from synapse.storage.databases import Databases
2324
from synapse.storage.databases.main import DataStore
2425

@@ -40,6 +41,7 @@ def __init__(self, hs: "HomeServer", stores: Databases):
4041

4142
self.purge_events = PurgeEventsStorageController(hs, stores)
4243
self.state = StateStorageController(hs, stores)
44+
self.stats = StatsController(hs, stores)
4345

4446
self.persistence = None
4547
if stores.persist_events:

synapse/storage/controllers/stats.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright 2023 The Matrix.org Foundation C.I.C.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import logging
16+
from collections import Counter
17+
from typing import TYPE_CHECKING, Collection, List, Tuple
18+
19+
from synapse.api.errors import SynapseError
20+
from synapse.storage.database import LoggingTransaction
21+
from synapse.storage.databases import Databases
22+
from synapse.storage.engines import PostgresEngine
23+
24+
if TYPE_CHECKING:
25+
from synapse.server import HomeServer
26+
27+
logger = logging.getLogger(__name__)
28+
29+
30+
class StatsController:
    """High level interface for getting statistics."""

    def __init__(self, hs: "HomeServer", stores: Databases):
        self.stores = stores

    async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]:
        """Get an estimate of the largest rooms and how much database space they
        use, in bytes.

        Only works against PostgreSQL.

        Note: this uses the postgres statistics so is a very rough estimate.

        Returns:
            Up to 10 `(room_id, estimated_size_in_bytes)` tuples, sorted by
            largest estimate first.

        Raises:
            SynapseError: (400) if either the main or the state database is
                not backed by PostgreSQL.
        """

        # Note: We look at both tables on the main and state databases.
        if not isinstance(self.stores.main.database_engine, PostgresEngine):
            raise SynapseError(400, "Endpoint requires using PostgreSQL")

        if not isinstance(self.stores.state.database_engine, PostgresEngine):
            raise SynapseError(400, "Endpoint requires using PostgreSQL")

        # For each "large" table, we go through and get the largest rooms
        # and an estimate of how much space they take. We can then sum the
        # results and return the top 10.
        #
        # This isn't the most accurate, but given all of these are estimates
        # anyway it's good enough.
        room_estimates: Counter[str] = Counter()

        # Return size of the table on disk, including indexes and TOAST.
        table_sql = """
            SELECT pg_total_relation_size(?)
        """

        # Get an estimate for the largest rooms and their frequency.
        #
        # Note: the cast here is a hack to cast from `anyarray` to an actual
        # type. This ensures that psycopg2 passes us back a Python list.
        column_sql = """
            SELECT
                most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[]
            FROM pg_stats
            WHERE tablename = ? and attname = 'room_id'
        """

        def get_room_db_size_estimate_txn(
            txn: LoggingTransaction,
            tables: Collection[str],
        ) -> "Counter[str]":
            # Accumulate into a fresh Counter and return it, rather than
            # mutating state shared with the caller, so the totals stay
            # correct should this function ever be invoked more than once for
            # the same interaction.
            # NOTE(review): confirm whether `runInteraction` can re-run the
            # function (e.g. on a retried transaction).
            estimates: Counter[str] = Counter()

            for table in tables:
                txn.execute(table_sql, (table,))
                row = txn.fetchone()
                # A SELECT of a single function call always yields one row.
                assert row is not None
                (table_size,) = row

                txn.execute(column_sql, (table,))
                row = txn.fetchone()
                if row is None:
                    # No planner statistics for this table's `room_id` column
                    # (e.g. the table has never been analysed): there is
                    # nothing to estimate from, so skip it.
                    continue

                vals, freqs = row
                if vals is None or freqs is None:
                    # PostgreSQL reports NULL when no value is noticeably more
                    # common than the others.
                    continue

                for room_id, freq in zip(vals, freqs):
                    # `freq` is the estimated fraction of rows with this
                    # room ID, so scale the table size by it.
                    estimates[room_id] += int(freq * table_size)

            return estimates

        main_estimates = await self.stores.main.db_pool.runInteraction(
            "get_room_db_size_estimate_main",
            get_room_db_size_estimate_txn,
            (
                "event_json",
                "events",
                "event_search",
                "event_edges",
                "event_push_actions",
                "stream_ordering_to_exterm",
            ),
        )
        # `Counter.update` adds counts rather than replacing them.
        room_estimates.update(main_estimates)

        state_estimates = await self.stores.state.db_pool.runInteraction(
            "get_room_db_size_estimate_state",
            get_room_db_size_estimate_txn,
            ("state_groups_state",),
        )
        room_estimates.update(state_estimates)

        return room_estimates.most_common(10)

0 commit comments

Comments
 (0)