Skip to content

Commit 181d5d5

Browse files
authored
Setup one-at-a-time lock for sync_organization tasks (#3612)
Related to grafana/support-escalations#8844. Queuing multiple sync_organization tasks for the same org could cause the sync task to run in parallel for the same organization, potentially creating duplicate entries and/or generating unnecessary API calls. This change prevents an organization sync from running while a sync for that same org is already in progress.
1 parent 0a39f90 commit 181d5d5

File tree

4 files changed

+79
-14
lines changed

4 files changed

+79
-14
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1515

1616
- Handle message to reply to not found in Telegram send log ([#3587](https://github.com/grafana/oncall/pull/3587))
1717
- Upgrade mobx lib to the latest version 6.12.0 ([#3453](https://github.com/grafana/oncall/issues/3453))
18+
- Add task lock to avoid running multiple sync_organization tasks in parallel for the same org ([#3612](https://github.com/grafana/oncall/pull/3612))
1819

1920
## v1.3.81 (2023-12-28)
2021

engine/apps/user_management/sync.py

+14
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import uuid
23

34
from celery.utils.log import get_task_logger
45
from django.conf import settings
@@ -7,12 +8,25 @@
78
from apps.grafana_plugin.helpers.client import GcomAPIClient, GrafanaAPIClient
89
from apps.user_management.models import Organization, Team, User
910
from apps.user_management.signals import org_sync_signal
11+
from common.utils import task_lock
1012

1113
logger = get_task_logger(__name__)
1214
logger.setLevel(logging.DEBUG)
1315

1416

1517
def sync_organization(organization: Organization) -> None:
    """Sync the given organization unless a sync for it is already running.

    A per-organization task lock (keyed on ``organization.id``) is taken for
    the duration of the sync. If the lock cannot be acquired, the call logs
    that a sync is already in progress and returns without syncing.
    """
    lock_id = "sync-organization-lock-{}".format(organization.id)
    # a fresh value per call, stored as the lock's value
    lock_owner = str(uuid.uuid4())
    with task_lock(lock_id, lock_owner) as acquired:
        if not acquired:
            # sync already running
            logger.info(f"Sync for Organization {organization.pk} already in progress.")
            return
        # at most one sync runs for a given org at a given time
        _sync_organization(organization)
27+
28+
29+
def _sync_organization(organization: Organization) -> None:
1630
grafana_api_client = GrafanaAPIClient(api_url=organization.grafana_url, api_token=organization.api_token)
1731

1832
# NOTE: checking whether or not RBAC is enabled depends on whether we are dealing with an open-source or cloud

engine/apps/user_management/tests/test_sync.py

+38-14
Original file line numberDiff line numberDiff line change
@@ -326,10 +326,13 @@ def test_sync_organization_is_rbac_permissions_enabled_open_source(make_organiza
326326

327327
@pytest.mark.parametrize("gcom_api_response", [False, True])
328328
@patch("apps.user_management.sync.GcomAPIClient")
329+
@patch("common.utils.cache")
329330
@override_settings(LICENSE=settings.CLOUD_LICENSE_NAME)
330331
@override_settings(GRAFANA_COM_ADMIN_API_TOKEN="mockedToken")
331332
@pytest.mark.django_db
332-
def test_sync_organization_is_rbac_permissions_enabled_cloud(mocked_gcom_client, make_organization, gcom_api_response):
333+
def test_sync_organization_is_rbac_permissions_enabled_cloud(
334+
mock_cache, mocked_gcom_client, make_organization, gcom_api_response
335+
):
333336
stack_id = 5
334337
organization = make_organization(stack_id=stack_id)
335338

@@ -369,22 +372,27 @@ def test_sync_organization_is_rbac_permissions_enabled_cloud(mocked_gcom_client,
369372
},
370373
)
371374

372-
with patch.object(GrafanaAPIClient, "check_token", return_value=(None, api_check_token_call_status)):
373-
with patch.object(GrafanaAPIClient, "get_users", return_value=api_users_response):
374-
with patch.object(GrafanaAPIClient, "get_teams", return_value=(api_teams_response, None)):
375-
with patch.object(GrafanaAPIClient, "get_team_members", return_value=(api_members_response, None)):
376-
with patch.object(
377-
GrafanaAPIClient,
378-
"get_grafana_incident_plugin_settings",
379-
return_value=(
380-
{"enabled": True, "jsonData": {"backendUrl": MOCK_GRAFANA_INCIDENT_BACKEND_URL}},
381-
None,
382-
),
383-
):
384-
sync_organization(organization)
375+
random_uuid = "random"
376+
with patch("apps.user_management.sync.uuid.uuid4", return_value=random_uuid):
377+
with patch.object(GrafanaAPIClient, "check_token", return_value=(None, api_check_token_call_status)):
378+
with patch.object(GrafanaAPIClient, "get_users", return_value=api_users_response):
379+
with patch.object(GrafanaAPIClient, "get_teams", return_value=(api_teams_response, None)):
380+
with patch.object(GrafanaAPIClient, "get_team_members", return_value=(api_members_response, None)):
381+
with patch.object(
382+
GrafanaAPIClient,
383+
"get_grafana_incident_plugin_settings",
384+
return_value=(
385+
{"enabled": True, "jsonData": {"backendUrl": MOCK_GRAFANA_INCIDENT_BACKEND_URL}},
386+
None,
387+
),
388+
):
389+
sync_organization(organization)
385390

386391
organization.refresh_from_db()
387392

393+
# lock is set and released
394+
mock_cache.add.assert_called_once_with(f"sync-organization-lock-{organization.id}", random_uuid, 60 * 10)
395+
mock_cache.delete.assert_called_once_with(f"sync-organization-lock-{organization.id}")
388396
assert mocked_gcom_client.return_value.called_once_with("mockedToken")
389397
assert mocked_gcom_client.return_value.is_rbac_enabled_for_stack.called_once_with(stack_id)
390398
assert organization.is_rbac_permissions_enabled == gcom_api_response
@@ -433,3 +441,19 @@ def test_cleanup_organization_deleted(make_organization):
433441

434442
organization.refresh_from_db()
435443
assert organization.deleted_at is not None
444+
445+
446+
@pytest.mark.django_db
def test_sync_organization_lock(make_organization):
    """When the task lock is not acquired, the Grafana API client is never used."""
    organization = make_organization()
    fake_uuid = "random"
    expected_lock_id = f"sync-organization-lock-{organization.id}"

    with patch("apps.user_management.sync.GrafanaAPIClient") as mock_client, patch(
        "apps.user_management.sync.uuid.uuid4", return_value=fake_uuid
    ), patch("apps.user_management.sync.task_lock") as mock_task_lock:
        # make the context manager report that the lock couldn't be acquired
        mock_task_lock.return_value.__enter__.return_value = False
        sync_organization(organization)

    # the lock was requested with the per-org key and the generated value
    mock_task_lock.assert_called_once_with(expected_lock_id, fake_uuid)
    # and the sync itself was skipped
    assert not mock_client.called

engine/common/utils.py

+26
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
import random
66
import re
77
import time
8+
from contextlib import contextmanager
89
from functools import reduce
910

1011
import factory
1112
import markdown2
1213
from bs4 import BeautifulSoup
1314
from celery.utils.log import get_task_logger
1415
from celery.utils.time import get_exponential_backoff_interval
16+
from django.core.cache import cache
1517
from django.utils.html import urlize
1618

1719
logger = get_task_logger(__name__)
@@ -73,6 +75,30 @@ def rerun_task(self, countdown):
7375
)
7476

7577

78+
LOCK_EXPIRE = 60 * 10 # Lock expires in 10 minutes
79+
80+
81+
@contextmanager
def task_lock(lock_id, oid):
    """Context manager for tasks that are intended to run one at a time.

    (ie. no parallel instances of the same task running)
    Based on
    https://docs.celeryq.dev/en/stable/tutorials/task-cookbook.html#ensuring-a-task-is-only-executed-one-at-a-time

    The lock is a cache entry stored under ``lock_id`` with ``oid`` as its
    value and an expiry of ``LOCK_EXPIRE`` seconds. The managed block receives
    the result of ``cache.add``: truthy when the lock was acquired, ``False``
    when the key already existed. The caller decides what to do in each case.

    On exit, the entry is deleted only if this call acquired it and the
    release deadline (3 seconds before the expiry) has not passed.
    """
    release_deadline = time.monotonic() + LOCK_EXPIRE - 3
    # cache.add returns False if the key already exists
    acquired = cache.add(lock_id, oid, LOCK_EXPIRE)
    try:
        yield acquired
    finally:
        # cache delete may be slow, but we have to use it to take
        # advantage of using add() for atomic locking
        before_deadline = time.monotonic() < release_deadline
        # don't release the lock if we exceeded the timeout, to lessen the
        # chance of releasing an expired lock owned by someone else;
        # also don't release the lock if we didn't acquire it
        if before_deadline and acquired:
            cache.delete(lock_id)
100+
101+
76102
# lru cache version with addition of timeout.
77103
# Timeout added to not to occupy memory with too old values
78104
def timed_lru_cache(timeout: int, maxsize: int = 128, typed: bool = False):

0 commit comments

Comments
 (0)