Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 772e8c2

Browse files
authored
Fix stack overflow in _PerHostRatelimiter due to synchronous requests (#14812)
When there are many synchronous requests waiting on a `_PerHostRatelimiter`, each request will be started recursively just after the previous request has completed. Under the right conditions, this leads to stack exhaustion. A common way for requests to become synchronous is when the remote client disconnects early, because the homeserver is overloaded and slow to respond. Avoid stack exhaustion under these conditions by deferring subsequent requests until the next reactor tick. Fixes #14480. Signed-off-by: Sean Quah <[email protected]>
1 parent 12083d3 commit 772e8c2

File tree

5 files changed

+70
-12
lines changed

5 files changed

+70
-12
lines changed

changelog.d/14812.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a long-standing bug where Synapse would exhaust the stack when processing many federation requests where the remote homeserver has disconnected early.

synapse/rest/client/register.py

+1
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ def __init__(self, hs: "HomeServer"):
310310
self.hs = hs
311311
self.registration_handler = hs.get_registration_handler()
312312
self.ratelimiter = FederationRateLimiter(
313+
hs.get_reactor(),
313314
hs.get_clock(),
314315
FederationRatelimitSettings(
315316
# Time window of 2s

synapse/server.py

+1
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ def get_replication_streams(self) -> Dict[str, Stream]:
768768
@cache_in_self
769769
def get_federation_ratelimiter(self) -> FederationRateLimiter:
770770
return FederationRateLimiter(
771+
self.get_reactor(),
771772
self.get_clock(),
772773
config=self.config.ratelimiting.rc_federation,
773774
metrics_name="federation_servlets",

synapse/util/ratelimitutils.py

+25-9
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from typing_extensions import ContextManager
3535

3636
from twisted.internet import defer
37+
from twisted.internet.interfaces import IReactorTime
3738

3839
from synapse.api.errors import LimitExceededError
3940
from synapse.config.ratelimiting import FederationRatelimitSettings
@@ -146,12 +147,14 @@ class FederationRateLimiter:
146147

147148
def __init__(
148149
self,
150+
reactor: IReactorTime,
149151
clock: Clock,
150152
config: FederationRatelimitSettings,
151153
metrics_name: Optional[str] = None,
152154
):
153155
"""
154156
Args:
157+
reactor
155158
clock
156159
config
157160
metrics_name: The name of the rate limiter so we can differentiate it
@@ -163,7 +166,7 @@ def __init__(
163166

164167
def new_limiter() -> "_PerHostRatelimiter":
165168
return _PerHostRatelimiter(
166-
clock=clock, config=config, metrics_name=metrics_name
169+
reactor=reactor, clock=clock, config=config, metrics_name=metrics_name
167170
)
168171

169172
self.ratelimiters: DefaultDict[
@@ -194,19 +197,22 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]
194197
class _PerHostRatelimiter:
195198
def __init__(
196199
self,
200+
reactor: IReactorTime,
197201
clock: Clock,
198202
config: FederationRatelimitSettings,
199203
metrics_name: Optional[str] = None,
200204
):
201205
"""
202206
Args:
207+
reactor
203208
clock
204209
config
205210
metrics_name: The name of the rate limiter so we can differentiate it
206211
from the rest in the metrics. If `None`, we don't track metrics
207212
for this rate limiter.
208213
from the rest in the metrics
209214
"""
215+
self.reactor = reactor
210216
self.clock = clock
211217
self.metrics_name = metrics_name
212218

@@ -364,12 +370,22 @@ def on_both(r: object) -> object:
364370

365371
def _on_exit(self, request_id: object) -> None:
366372
logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id))
367-
self.current_processing.discard(request_id)
368-
try:
369-
# start processing the next item on the queue.
370-
_, deferred = self.ready_request_queue.popitem(last=False)
371373

372-
with PreserveLoggingContext():
373-
deferred.callback(None)
374-
except KeyError:
375-
pass
374+
# When requests complete synchronously, we will recursively start the next
375+
# request in the queue. To avoid stack exhaustion, we defer starting the next
376+
# request until the next reactor tick.
377+
378+
def start_next_request() -> None:
379+
# We only remove the completed request from the list when we're about to
380+
# start the next one, otherwise we can allow extra requests through.
381+
self.current_processing.discard(request_id)
382+
try:
383+
# start processing the next item on the queue.
384+
_, deferred = self.ready_request_queue.popitem(last=False)
385+
386+
with PreserveLoggingContext():
387+
deferred.callback(None)
388+
except KeyError:
389+
pass
390+
391+
self.reactor.callLater(0.0, start_next_request)

tests/util/test_ratelimitutils.py

+42-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
from typing import Optional
1515

16+
from twisted.internet import defer
1617
from twisted.internet.defer import Deferred
1718

1819
from synapse.config.homeserver import HomeServerConfig
@@ -29,7 +30,7 @@ def test_ratelimit(self) -> None:
2930
"""A simple test with the default values"""
3031
reactor, clock = get_clock()
3132
rc_config = build_rc_config()
32-
ratelimiter = FederationRateLimiter(clock, rc_config)
33+
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
3334

3435
with ratelimiter.ratelimit("testhost") as d1:
3536
# shouldn't block
@@ -39,7 +40,7 @@ def test_concurrent_limit(self) -> None:
3940
"""Test what happens when we hit the concurrent limit"""
4041
reactor, clock = get_clock()
4142
rc_config = build_rc_config({"rc_federation": {"concurrent": 2}})
42-
ratelimiter = FederationRateLimiter(clock, rc_config)
43+
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
4344

4445
with ratelimiter.ratelimit("testhost") as d1:
4546
# shouldn't block
@@ -57,6 +58,7 @@ def test_concurrent_limit(self) -> None:
5758

5859
# ... until we complete an earlier request
5960
cm2.__exit__(None, None, None)
61+
reactor.advance(0.0)
6062
self.successResultOf(d3)
6163

6264
def test_sleep_limit(self) -> None:
@@ -65,7 +67,7 @@ def test_sleep_limit(self) -> None:
6567
rc_config = build_rc_config(
6668
{"rc_federation": {"sleep_limit": 2, "sleep_delay": 500}}
6769
)
68-
ratelimiter = FederationRateLimiter(clock, rc_config)
70+
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
6971

7072
with ratelimiter.ratelimit("testhost") as d1:
7173
# shouldn't block
@@ -81,6 +83,43 @@ def test_sleep_limit(self) -> None:
8183
sleep_time = _await_resolution(reactor, d3)
8284
self.assertAlmostEqual(sleep_time, 500, places=3)
8385

86+
def test_lots_of_queued_things(self) -> None:
87+
"""Tests lots of synchronous things queued up behind a slow thing.
88+
89+
The stack should *not* explode when the slow thing completes.
90+
"""
91+
reactor, clock = get_clock()
92+
rc_config = build_rc_config(
93+
{
94+
"rc_federation": {
95+
"sleep_limit": 1000000000, # never sleep
96+
"reject_limit": 1000000000, # never reject requests
97+
"concurrent": 1,
98+
}
99+
}
100+
)
101+
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
102+
103+
with ratelimiter.ratelimit("testhost") as d:
104+
# shouldn't block
105+
self.successResultOf(d)
106+
107+
async def task() -> None:
108+
with ratelimiter.ratelimit("testhost") as d:
109+
await d
110+
111+
for _ in range(1, 100):
112+
defer.ensureDeferred(task())
113+
114+
last_task = defer.ensureDeferred(task())
115+
116+
# Upon exiting the context manager, all the synchronous things will resume.
117+
# If a stack overflow occurs, the final task will not complete.
118+
119+
# Wait for all the things to complete.
120+
reactor.advance(0.0)
121+
self.successResultOf(last_task)
122+
84123

85124
def _await_resolution(reactor: ThreadedMemoryReactorClock, d: Deferred) -> float:
86125
"""advance the clock until the deferred completes.

0 commit comments

Comments
 (0)