Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit a1ff1e9

Browse files
authored
Periodically send pings to detect dead Redis connections (#9218)
This is done by creating a custom `RedisFactory` subclass that periodically pings all connections in its pool. We also ensure that the `replyTimeout` param is non-null, so that we timeout waiting for the reply to those pings (and thus triggering a reconnect).
1 parent 5b857b7 commit a1ff1e9

File tree

4 files changed

+107
-57
lines changed

4 files changed

+107
-57
lines changed

changelog.d/9218.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix bug where we sometimes didn't detect that Redis connections had died, causing workers to not see new data.

stubs/txredisapi.pyi

+8-4
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@ from typing import List, Optional, Type, Union
1919

2020
class RedisProtocol:
2121
def publish(self, channel: str, message: bytes): ...
22+
async def ping(self) -> None: ...
2223

23-
class SubscriberProtocol:
24+
class SubscriberProtocol(RedisProtocol):
2425
def __init__(self, *args, **kwargs): ...
2526
password: Optional[str]
2627
def subscribe(self, channels: Union[str, List[str]]): ...
@@ -39,14 +40,13 @@ def lazyConnection(
3940
convertNumbers: bool = ...,
4041
) -> RedisProtocol: ...
4142

42-
class SubscriberFactory:
43-
def buildProtocol(self, addr): ...
44-
4543
class ConnectionHandler: ...
4644

4745
class RedisFactory:
4846
continueTrying: bool
4947
handler: RedisProtocol
48+
pool: List[RedisProtocol]
49+
replyTimeout: Optional[int]
5050
def __init__(
5151
self,
5252
uuid: str,
@@ -59,3 +59,7 @@ class RedisFactory:
5959
replyTimeout: Optional[int] = None,
6060
convertNumbers: Optional[int] = True,
6161
): ...
62+
def buildProtocol(self, addr) -> RedisProtocol: ...
63+
64+
class SubscriberFactory(RedisFactory):
65+
def __init__(self): ...

synapse/replication/tcp/handler.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# limitations under the License.
1616
import logging
1717
from typing import (
18+
TYPE_CHECKING,
1819
Any,
1920
Awaitable,
2021
Dict,
@@ -63,6 +64,9 @@
6364
TypingStream,
6465
)
6566

67+
if TYPE_CHECKING:
68+
from synapse.server import HomeServer
69+
6670
logger = logging.getLogger(__name__)
6771

6872

@@ -88,7 +92,7 @@ class ReplicationCommandHandler:
8892
back out to connections.
8993
"""
9094

91-
def __init__(self, hs):
95+
def __init__(self, hs: "HomeServer"):
9296
self._replication_data_handler = hs.get_replication_data_handler()
9397
self._presence_handler = hs.get_presence_handler()
9498
self._store = hs.get_datastore()
@@ -300,7 +304,7 @@ def start_replication(self, hs):
300304

301305
# First create the connection for sending commands.
302306
outbound_redis_connection = lazyConnection(
303-
reactor=hs.get_reactor(),
307+
hs=hs,
304308
host=hs.config.redis_host,
305309
port=hs.config.redis_port,
306310
password=hs.config.redis.redis_password,

synapse/replication/tcp/redis.py

+92-51
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@
1515

1616
import logging
1717
from inspect import isawaitable
18-
from typing import TYPE_CHECKING, Optional
18+
from typing import TYPE_CHECKING, Optional, Type, cast
1919

2020
import txredisapi
2121

2222
from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable
2323
from synapse.metrics.background_process_metrics import (
2424
BackgroundProcessLoggingContext,
2525
run_as_background_process,
26+
wrap_as_background_process,
2627
)
2728
from synapse.replication.tcp.commands import (
2829
Command,
@@ -59,16 +60,16 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
5960
immediately after initialisation.
6061
6162
Attributes:
62-
handler: The command handler to handle incoming commands.
63-
stream_name: The *redis* stream name to subscribe to and publish from
64-
(not anything to do with Synapse replication streams).
65-
outbound_redis_connection: The connection to redis to use to send
63+
synapse_handler: The command handler to handle incoming commands.
64+
synapse_stream_name: The *redis* stream name to subscribe to and publish
65+
from (not anything to do with Synapse replication streams).
66+
synapse_outbound_redis_connection: The connection to redis to use to send
6667
commands.
6768
"""
6869

69-
handler = None # type: ReplicationCommandHandler
70-
stream_name = None # type: str
71-
outbound_redis_connection = None # type: txredisapi.RedisProtocol
70+
synapse_handler = None # type: ReplicationCommandHandler
71+
synapse_stream_name = None # type: str
72+
synapse_outbound_redis_connection = None # type: txredisapi.RedisProtocol
7273

7374
def __init__(self, *args, **kwargs):
7475
super().__init__(*args, **kwargs)
@@ -88,19 +89,19 @@ async def _send_subscribe(self):
8889
# it's important to make sure that we only send the REPLICATE command once we
8990
# have successfully subscribed to the stream - otherwise we might miss the
9091
# POSITION response sent back by the other end.
91-
logger.info("Sending redis SUBSCRIBE for %s", self.stream_name)
92-
await make_deferred_yieldable(self.subscribe(self.stream_name))
92+
logger.info("Sending redis SUBSCRIBE for %s", self.synapse_stream_name)
93+
await make_deferred_yieldable(self.subscribe(self.synapse_stream_name))
9394
logger.info(
9495
"Successfully subscribed to redis stream, sending REPLICATE command"
9596
)
96-
self.handler.new_connection(self)
97+
self.synapse_handler.new_connection(self)
9798
await self._async_send_command(ReplicateCommand())
9899
logger.info("REPLICATE successfully sent")
99100

100101
# We send out our positions when there is a new connection in case the
101102
# other side missed updates. We do this for Redis connections as the
102103
# otherside won't know we've connected and so won't issue a REPLICATE.
103-
self.handler.send_positions_to_connection(self)
104+
self.synapse_handler.send_positions_to_connection(self)
104105

105106
def messageReceived(self, pattern: str, channel: str, message: str):
106107
"""Received a message from redis.
@@ -137,7 +138,7 @@ def handle_command(self, cmd: Command) -> None:
137138
cmd: received command
138139
"""
139140

140-
cmd_func = getattr(self.handler, "on_%s" % (cmd.NAME,), None)
141+
cmd_func = getattr(self.synapse_handler, "on_%s" % (cmd.NAME,), None)
141142
if not cmd_func:
142143
logger.warning("Unhandled command: %r", cmd)
143144
return
@@ -155,7 +156,7 @@ def handle_command(self, cmd: Command) -> None:
155156
def connectionLost(self, reason):
156157
logger.info("Lost connection to redis")
157158
super().connectionLost(reason)
158-
self.handler.lost_connection(self)
159+
self.synapse_handler.lost_connection(self)
159160

160161
# mark the logging context as finished
161162
self._logging_context.__exit__(None, None, None)
@@ -183,11 +184,54 @@ async def _async_send_command(self, cmd: Command):
183184
tcp_outbound_commands_counter.labels(cmd.NAME, "redis").inc()
184185

185186
await make_deferred_yieldable(
186-
self.outbound_redis_connection.publish(self.stream_name, encoded_string)
187+
self.synapse_outbound_redis_connection.publish(
188+
self.synapse_stream_name, encoded_string
189+
)
190+
)
191+
192+
193+
class SynapseRedisFactory(txredisapi.RedisFactory):
194+
"""A subclass of RedisFactory that periodically sends pings to ensure that
195+
we detect dead connections.
196+
"""
197+
198+
def __init__(
199+
self,
200+
hs: "HomeServer",
201+
uuid: str,
202+
dbid: Optional[int],
203+
poolsize: int,
204+
isLazy: bool = False,
205+
handler: Type = txredisapi.ConnectionHandler,
206+
charset: str = "utf-8",
207+
password: Optional[str] = None,
208+
replyTimeout: int = 30,
209+
convertNumbers: Optional[int] = True,
210+
):
211+
super().__init__(
212+
uuid=uuid,
213+
dbid=dbid,
214+
poolsize=poolsize,
215+
isLazy=isLazy,
216+
handler=handler,
217+
charset=charset,
218+
password=password,
219+
replyTimeout=replyTimeout,
220+
convertNumbers=convertNumbers,
187221
)
188222

223+
hs.get_clock().looping_call(self._send_ping, 30 * 1000)
224+
225+
@wrap_as_background_process("redis_ping")
226+
async def _send_ping(self):
227+
for connection in self.pool:
228+
try:
229+
await make_deferred_yieldable(connection.ping())
230+
except Exception:
231+
logger.warning("Failed to send ping to a redis connection")
189232

190-
class RedisDirectTcpReplicationClientFactory(txredisapi.SubscriberFactory):
233+
234+
class RedisDirectTcpReplicationClientFactory(SynapseRedisFactory):
191235
"""This is a reconnecting factory that connects to redis and immediately
192236
subscribes to a stream.
193237
@@ -206,65 +250,62 @@ def __init__(
206250
self, hs: "HomeServer", outbound_redis_connection: txredisapi.RedisProtocol
207251
):
208252

209-
super().__init__()
210-
211-
# This sets the password on the RedisFactory base class (as
212-
# SubscriberFactory constructor doesn't pass it through).
213-
self.password = hs.config.redis.redis_password
253+
super().__init__(
254+
hs,
255+
uuid="subscriber",
256+
dbid=None,
257+
poolsize=1,
258+
replyTimeout=30,
259+
password=hs.config.redis.redis_password,
260+
)
214261

215-
self.handler = hs.get_tcp_replication()
216-
self.stream_name = hs.hostname
262+
self.synapse_handler = hs.get_tcp_replication()
263+
self.synapse_stream_name = hs.hostname
217264

218-
self.outbound_redis_connection = outbound_redis_connection
265+
self.synapse_outbound_redis_connection = outbound_redis_connection
219266

220267
def buildProtocol(self, addr):
221-
p = super().buildProtocol(addr) # type: RedisSubscriber
268+
p = super().buildProtocol(addr)
269+
p = cast(RedisSubscriber, p)
222270

223271
# We do this here rather than add to the constructor of `RedisSubcriber`
224272
# as to do so would involve overriding `buildProtocol` entirely, however
225273
# the base method does some other things than just instantiating the
226274
# protocol.
227-
p.handler = self.handler
228-
p.outbound_redis_connection = self.outbound_redis_connection
229-
p.stream_name = self.stream_name
230-
p.password = self.password
275+
p.synapse_handler = self.synapse_handler
276+
p.synapse_outbound_redis_connection = self.synapse_outbound_redis_connection
277+
p.synapse_stream_name = self.synapse_stream_name
231278

232279
return p
233280

234281

235282
def lazyConnection(
236-
reactor,
283+
hs: "HomeServer",
237284
host: str = "localhost",
238285
port: int = 6379,
239286
dbid: Optional[int] = None,
240287
reconnect: bool = True,
241-
charset: str = "utf-8",
242288
password: Optional[str] = None,
243-
connectTimeout: Optional[int] = None,
244-
replyTimeout: Optional[int] = None,
245-
convertNumbers: bool = True,
289+
replyTimeout: int = 30,
246290
) -> txredisapi.RedisProtocol:
247-
"""Equivalent to `txredisapi.lazyConnection`, except allows specifying a
248-
reactor.
291+
"""Creates a connection to Redis that is lazily set up and reconnects if the
292+
connections is lost.
249293
"""
250294

251-
isLazy = True
252-
poolsize = 1
253-
254295
uuid = "%s:%d" % (host, port)
255-
factory = txredisapi.RedisFactory(
256-
uuid,
257-
dbid,
258-
poolsize,
259-
isLazy,
260-
txredisapi.ConnectionHandler,
261-
charset,
262-
password,
263-
replyTimeout,
264-
convertNumbers,
296+
factory = SynapseRedisFactory(
297+
hs,
298+
uuid=uuid,
299+
dbid=dbid,
300+
poolsize=1,
301+
isLazy=True,
302+
handler=txredisapi.ConnectionHandler,
303+
password=password,
304+
replyTimeout=replyTimeout,
265305
)
266306
factory.continueTrying = reconnect
267-
for x in range(poolsize):
268-
reactor.connectTCP(host, port, factory, connectTimeout)
307+
308+
reactor = hs.get_reactor()
309+
reactor.connectTCP(host, port, factory, 30)
269310

270311
return factory.handler

0 commit comments

Comments
 (0)