
Commit dd71eb0

Make federation catchup send last event from any server. (#9640)
Currently federation catch-up sends the last *local* event that we failed to send to the remote. This causes issues in large rooms where lots of servers have sent events while the remote server was down: when it comes back up, it gets flooded with events from various points in the DAG. Instead, make every server send the most recent events in the room, even if they aren't its own. The remote should deduplicate the events, so there shouldn't be much overhead in doing this. Alternatively, each server could send its local events only if they were also extremities, and hope that the other servers send their events over; but that is a bit risky.
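In outline, catch-up now works room by room: instead of retransmitting our last local event, the sender looks at the room's current forward extremities and picks out the ones the remote can actually accept. A minimal sketch of that selection logic, with hypothetical store/state helpers (get_forward_extremities, get_events, get_hosts_at_event) standing in for Synapse's real storage and state APIs:

    # Sketch only: the helper names below are hypothetical stand-ins,
    # not Synapse's real API.
    async def pick_catchup_pdus(
        store, state, room_id, last_local_pdu, last_acked_ordering, destination
    ):
        extremities = await store.get_forward_extremities(room_id)

        # If our unsent event is still a forward extremity, it is already
        # the newest thing in the room, so send it as-is.
        if last_local_pdu.event_id in extremities:
            return [last_local_pdu]

        candidates = []
        for event in await store.get_events(extremities):
            # Skip events the remote already received before going offline.
            if event.stream_ordering < last_acked_ordering:
                continue
            # Skip events at which the remote was no longer in the room.
            hosts = await state.get_hosts_at_event(room_id, event.event_id)
            if destination not in hosts:
                continue
            candidates.append(event)

        # If everything was filtered out, fall back to our own event so the
        # remote still learns that something happened in the room.
        return candidates or [last_local_pdu]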
Parent: 7b06f85

4 files changed: +141 -38 lines

changelog.d/9640.misc (+1)

@@ -0,0 +1 @@
+Improve performance of federation catch up by sending the latest events in the room to the remote, rather than just the last event sent by the local server.

synapse/federation/federation_server.py (+2, -23)

@@ -35,7 +35,7 @@
 from twisted.internet.abstract import isIPAddress
 from twisted.python import failure
 
-from synapse.api.constants import EduTypes, EventTypes, Membership
+from synapse.api.constants import EduTypes, EventTypes
 from synapse.api.errors import (
     AuthError,
     Codes,
@@ -63,7 +63,7 @@
     ReplicationFederationSendEduRestServlet,
     ReplicationGetQueryRestServlet,
 )
-from synapse.types import JsonDict, get_domain_from_id
+from synapse.types import JsonDict
 from synapse.util import glob_to_regex, json_decoder, unwrapFirstError
 from synapse.util.async_helpers import Linearizer, concurrently_execute
 from synapse.util.caches.response_cache import ResponseCache
@@ -727,27 +727,6 @@ async def _handle_received_pdu(self, origin: str, pdu: EventBase) -> None:
             if the event was unacceptable for any other reason (eg, too large,
             too many prev_events, couldn't find the prev_events)
         """
-        # check that it's actually being sent from a valid destination to
-        # workaround bug #1753 in 0.18.5 and 0.18.6
-        if origin != get_domain_from_id(pdu.sender):
-            # We continue to accept join events from any server; this is
-            # necessary for the federation join dance to work correctly.
-            # (When we join over federation, the "helper" server is
-            # responsible for sending out the join event, rather than the
-            # origin. See bug #1893. This is also true for some third party
-            # invites).
-            if not (
-                pdu.type == "m.room.member"
-                and pdu.content
-                and pdu.content.get("membership", None)
-                in (Membership.JOIN, Membership.INVITE)
-            ):
-                logger.info(
-                    "Discarding PDU %s from invalid origin %s", pdu.event_id, origin
-                )
-                return
-            else:
-                logger.info("Accepting join PDU %s from %s", pdu.event_id, origin)
 
         # We've already checked that we know the room version by this point
         room_version = await self.store.get_room_version(pdu.room_id)
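For context on the removed workaround: `get_domain_from_id` extracts the server-name part of a Matrix ID, and the deleted block rejected any PDU whose sender's domain didn't match the transaction origin, carving out an exception for join/invite membership events so the federation join dance still works. A quick illustration of the helper (the snippet is illustrative; only the import is Synapse's real API):

    from synapse.types import get_domain_from_id

    # A Matrix user ID has the form "@localpart:server"; the helper returns
    # the server part, which the removed check compared against `origin`.
    print(get_domain_from_id("@alice:example.com"))  # -> example.com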

synapse/federation/sender/per_destination_queue.py (+89, -15)

@@ -15,7 +15,7 @@
 # limitations under the License.
 import datetime
 import logging
-from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Tuple, cast
+from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Tuple
 
 import attr
 from prometheus_client import Counter
@@ -77,6 +77,7 @@ def __init__(
         self._transaction_manager = transaction_manager
         self._instance_name = hs.get_instance_name()
         self._federation_shard_config = hs.config.worker.federation_shard_config
+        self._state = hs.get_state_handler()
 
         self._should_send_on_this_instance = True
         if not self._federation_shard_config.should_handle(
@@ -415,22 +416,95 @@ async def _catch_up_transmission_loop(self) -> None:
                 "This should not happen." % event_ids
             )
 
-            if logger.isEnabledFor(logging.INFO):
-                rooms = [p.room_id for p in catchup_pdus]
-                logger.info("Catching up rooms to %s: %r", self._destination, rooms)
+            # We send transactions with events from one room only, as it's
+            # likely that the remote will have to do additional processing,
+            # which may take some time. It's better to give it small amounts
+            # of work rather than risk the request timing out and repeatedly
+            # being retried, and not making any progress.
+            #
+            # Note: `catchup_pdus` will have exactly one PDU per room.
+            for pdu in catchup_pdus:
+                # The PDU from the DB will be the last PDU in the room from
+                # *this server* that wasn't sent to the remote. However, other
+                # servers may have sent lots of events since then, and we want
+                # to try and tell the remote only about the *latest* events in
+                # the room. This is so that it doesn't get inundated by events
+                # from various parts of the DAG, which all need to be processed.
+                #
+                # Note: this does mean that in large rooms a server coming back
+                # online will get sent the same events from all the different
+                # servers, but the remote will correctly deduplicate them and
+                # handle it only once.
+
+                # Step 1, fetch the current extremities
+                extrems = await self._store.get_prev_events_for_room(pdu.room_id)
+
+                if pdu.event_id in extrems:
+                    # If the event is in the extremities, then great! We can just
+                    # use that without having to do further checks.
+                    room_catchup_pdus = [pdu]
+                else:
+                    # If not, fetch the extremities and figure out which we can
+                    # send.
+                    extrem_events = await self._store.get_events_as_list(extrems)
+
+                    new_pdus = []
+                    for p in extrem_events:
+                        # We pulled this from the DB, so it'll be non-null
+                        assert p.internal_metadata.stream_ordering
+
+                        # Filter out events that happened before the remote went
+                        # offline
+                        if (
+                            p.internal_metadata.stream_ordering
+                            < self._last_successful_stream_ordering
+                        ):
+                            continue
 
-            await self._transaction_manager.send_new_transaction(
-                self._destination, catchup_pdus, []
-            )
+                        # Filter out events where the server is not in the room,
+                        # e.g. it may have left/been kicked. *Ideally* we'd pull
+                        # out the kick and send that, but it's a rare edge case
+                        # so we don't bother for now (the server that sent the
+                        # kick should send it out if it's online).
+                        hosts = await self._state.get_hosts_in_room_at_events(
+                            p.room_id, [p.event_id]
+                        )
+                        if self._destination not in hosts:
+                            continue
 
-            sent_transactions_counter.inc()
-            final_pdu = catchup_pdus[-1]
-            self._last_successful_stream_ordering = cast(
-                int, final_pdu.internal_metadata.stream_ordering
-            )
-            await self._store.set_destination_last_successful_stream_ordering(
-                self._destination, self._last_successful_stream_ordering
-            )
+                        new_pdus.append(p)
+
+                    # If we've filtered out all the extremities, fall back to
+                    # sending the original event. This should ensure that the
+                    # server gets at least some of the missed events (especially
+                    # if the other sending servers are up).
+                    if new_pdus:
+                        room_catchup_pdus = new_pdus
+
+                logger.info(
+                    "Catching up rooms to %s: %r", self._destination, pdu.room_id
+                )
+
+                await self._transaction_manager.send_new_transaction(
+                    self._destination, room_catchup_pdus, []
+                )
+
+                sent_transactions_counter.inc()
+
+                # We pulled this from the DB, so it'll be non-null
+                assert pdu.internal_metadata.stream_ordering
+
+                # Note that we mark the last successful stream ordering as that
+                # from the *original* PDU, rather than the PDU(s) we actually
+                # send. This is because we use it to mark our position in the
+                # queue of missed PDUs to process.
+                self._last_successful_stream_ordering = (
+                    pdu.internal_metadata.stream_ordering
+                )
+
+                await self._store.set_destination_last_successful_stream_ordering(
+                    self._destination, self._last_successful_stream_ordering
+                )
 
     def _get_rr_edus(self, force_flush: bool) -> Iterable[Edu]:
         if not self._pending_rrs:
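One subtlety in the final hunk is worth calling out: `_last_successful_stream_ordering` advances to the stream ordering of the *original* PDU rather than of the extremities actually transmitted, because it is a cursor into the per-destination queue of missed PDUs. A toy model of why advancing it further would lose work (plain Python, not Synapse's actual schema):

    # `missed` stands in for the destination_rooms table: stream ordering of
    # the last missed PDU per room, for one destination.
    missed = {100: "room_a_pdu", 150: "room_b_pdu", 200: "room_c_pdu"}

    def next_missed(last_acked: int):
        # The next catch-up batch is everything ordered after the cursor.
        return [pdu for order, pdu in sorted(missed.items()) if order > last_acked]

    # Suppose we process room_a_pdu (ordering 100) but actually transmit a
    # newer extremity with ordering 300. Jumping the cursor to 300 would
    # silently drop the missed PDUs for the other rooms:
    assert next_missed(300) == []

    # Advancing only to the original PDU's ordering keeps them queued:
    assert next_missed(100) == ["room_b_pdu", "room_c_pdu"]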

tests/federation/test_federation_catch_up.py (+49)

@@ -2,6 +2,7 @@
 
 from mock import Mock
 
+from synapse.api.constants import EventTypes
 from synapse.events import EventBase
 from synapse.federation.sender import PerDestinationQueue, TransactionManager
 from synapse.federation.units import Edu
@@ -421,3 +422,51 @@ def wake_destination_track(destination):
         self.assertNotIn("zzzerver", woken)
         # - all destinations are woken exactly once; they appear once in woken.
         self.assertCountEqual(woken, server_names[:-1])
+
+    @override_config({"send_federation": True})
+    def test_not_latest_event(self):
+        """Test that we send the latest event in the room even if it's not ours."""
+
+        per_dest_queue, sent_pdus = self.make_fake_destination_queue()
+
+        # Make a room with a local user, and two servers. One will go offline
+        # and one will send some events.
+        self.register_user("u1", "you the one")
+        u1_token = self.login("u1", "you the one")
+        room_1 = self.helper.create_room_as("u1", tok=u1_token)
+
+        self.get_success(
+            event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join")
+        )
+        event_1 = self.get_success(
+            event_injection.inject_member_event(self.hs, room_1, "@user:host3", "join")
+        )
+
+        # First we send something from the local server, so that we notice the
+        # remote is down and go into catchup mode.
+        self.helper.send(room_1, "you hear me!!", tok=u1_token)
+
+        # Now simulate us receiving an event from the still online remote.
+        event_2 = self.get_success(
+            event_injection.inject_event(
+                self.hs,
+                type=EventTypes.Message,
+                sender="@user:host3",
+                room_id=room_1,
+                content={"msgtype": "m.text", "body": "Hello"},
+            )
+        )
+
+        self.get_success(
+            self.hs.get_datastore().set_destination_last_successful_stream_ordering(
+                "host2", event_1.internal_metadata.stream_ordering
+            )
+        )
+
+        self.get_success(per_dest_queue._catch_up_transmission_loop())
+
+        # We expect only the last message from the remote, event_2, to have been
+        # sent, rather than the last *local* event that was sent.
+        self.assertEqual(len(sent_pdus), 1)
+        self.assertEqual(sent_pdus[0].event_id, event_2.event_id)
+        self.assertFalse(per_dest_queue._catching_up)
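To exercise just this test file locally, Synapse's suite runs under Twisted's trial; assuming a development checkout with dependencies installed, an invocation along these lines should work:

    trial tests.federation.test_federation_catch_up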
