Skip to content

Commit 3bcf5ef

Browse files
authored
manually retry for requests.exceptions.Timeout exceptions when sending outgoing webhooks (#3632)
# Which issue(s) this PR fixes Fixes grafana/oncall-private#2439 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] `CHANGELOG.md` updated (or `pr:no changelog` PR label added if not required)
1 parent d57b41b commit 3bcf5ef

File tree

3 files changed

+101
-9
lines changed

3 files changed

+101
-9
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616
### Fixed
1717

1818
- Fixed schedule timezone issues ([#3576](https://github.com/grafana/oncall/issues/3576))
19+
- Ignore `requests.exceptions.Timeout` exceptions when attempting to send outgoing webhook requests by @joeyorlando ([#3632](https://github.com/grafana/oncall/pull/3632))
1920

2021
## v1.3.82 (2024-01-04)
2122

engine/apps/webhooks/tasks/trigger_webhook.py

+46-9
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import json
22
import logging
3+
import typing
4+
from datetime import datetime
35
from json import JSONDecodeError
46

7+
import requests
58
from celery.utils.log import get_task_logger
69
from django.conf import settings
710
from django.db.models import Prefetch
@@ -28,6 +31,10 @@
2831
logger.setLevel(logging.DEBUG)
2932

3033

34+
EXECUTE_WEBHOOK_RETRIES = 3
35+
# these exceptions are fully out of our control (e.g. customer's network issues)
36+
# let's manually retry them without raising an exception
37+
EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY = (requests.exceptions.Timeout,)
3138
TRIGGER_TYPE_TO_LABEL = {
3239
Webhook.TRIGGER_ALERT_GROUP_CREATED: "alert group created",
3340
Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge",
@@ -40,6 +47,17 @@
4047
}
4148

4249

50+
class WebhookRequestStatus(typing.TypedDict):
51+
url: typing.Optional[str]
52+
request_trigger: typing.Optional[str]
53+
request_headers: typing.Optional[str]
54+
request_data: typing.Optional[str]
55+
status_code: typing.Optional[int]
56+
content: typing.Optional[str]
57+
webhook: Webhook
58+
event_data: str
59+
60+
4361
@shared_dedicated_queue_retry_task(
4462
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
4563
)
@@ -52,15 +70,14 @@ def send_webhook_event(trigger_type, alert_group_id, organization_id=None, user_
5270
).exclude(is_webhook_enabled=False)
5371

5472
for webhook in webhooks_qs:
55-
print(webhook.name)
5673
execute_webhook.apply_async((webhook.pk, alert_group_id, user_id, None))
5774

5875

59-
def _isoformat_date(date_value):
76+
def _isoformat_date(date_value: datetime) -> typing.Optional[str]:
6077
return date_value.isoformat() if date_value else None
6178

6279

63-
def _build_payload(webhook, alert_group, user):
80+
def _build_payload(webhook: Webhook, alert_group: AlertGroup, user: User) -> typing.Dict[str, typing.Any]:
6481
trigger_type = webhook.trigger_type
6582
event = {
6683
"type": TRIGGER_TYPE_TO_LABEL[trigger_type],
@@ -96,7 +113,9 @@ def _build_payload(webhook, alert_group, user):
96113
return data
97114

98115

99-
def mask_authorization_header(headers, header_keys_to_mask):
116+
def mask_authorization_header(
117+
headers: typing.Dict[str, str], header_keys_to_mask: typing.List[str]
118+
) -> typing.Dict[str, str]:
100119
masked_headers = headers.copy()
101120
lower_keys = set(k.lower() for k in header_keys_to_mask)
102121
for k in headers.keys():
@@ -105,8 +124,10 @@ def mask_authorization_header(headers, header_keys_to_mask):
105124
return masked_headers
106125

107126

108-
def make_request(webhook, alert_group, data):
109-
status = {
127+
def make_request(
128+
webhook: Webhook, alert_group: AlertGroup, data: typing.Dict[str, typing.Any]
129+
) -> typing.Tuple[bool, WebhookRequestStatus, typing.Optional[str], typing.Optional[Exception]]:
130+
status: WebhookRequestStatus = {
110131
"url": None,
111132
"request_trigger": None,
112133
"request_headers": None,
@@ -172,9 +193,9 @@ def make_request(webhook, alert_group, data):
172193

173194

174195
@shared_dedicated_queue_retry_task(
175-
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 3
196+
autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else EXECUTE_WEBHOOK_RETRIES
176197
)
177-
def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
198+
def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id, manual_retry_num=0):
178199
from apps.webhooks.models import Webhook
179200

180201
try:
@@ -244,5 +265,21 @@ def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
244265
escalation_error_code=error_code,
245266
)
246267

247-
if exception:
268+
if isinstance(exception, EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY):
269+
msg_details = (
270+
f"webhook={webhook_pk} alert_group={alert_group_id} user={user_id} escalation_policy={escalation_policy_id}"
271+
)
272+
273+
if manual_retry_num < EXECUTE_WEBHOOK_RETRIES:
274+
retry_num = manual_retry_num + 1
275+
logger.warning(f"Manually retrying execute_webhook for {msg_details} manual_retry_num={retry_num}")
276+
execute_webhook.apply_async(
277+
(webhook_pk, alert_group_id, user_id, escalation_policy_id, retry_num),
278+
countdown=10,
279+
)
280+
else:
281+
# don't raise an exception if we've exhausted retries for
282+
# exceptions within EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY, simply give up trying
283+
logger.warning(f"Exhausted execute_webhook retries for {msg_details}")
284+
elif exception:
248285
raise exception

engine/apps/webhooks/tests/test_trigger_webhook.py

+54
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from unittest.mock import call, patch
33

44
import pytest
5+
import requests
56
from django.utils import timezone
67

78
from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy
@@ -555,3 +556,56 @@ def test_response_content_limit(
555556
assert log.status_code == 200
556557
assert log.content == f"Response content {content_length} exceeds {WEBHOOK_RESPONSE_LIMIT} character limit"
557558
assert log.url == "https://test/"
559+
560+
561+
@patch("apps.webhooks.tasks.trigger_webhook.execute_webhook", wraps=execute_webhook)
562+
@patch("apps.webhooks.models.webhook.requests")
563+
@patch("apps.webhooks.utils.socket.gethostbyname", return_value="8.8.8.8")
564+
@pytest.mark.django_db
565+
@pytest.mark.parametrize("exception", [requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout])
566+
def test_manually_retried_exceptions(
567+
_mock_gethostbyname,
568+
mock_requests,
569+
spy_execute_webhook,
570+
make_organization,
571+
make_user_for_organization,
572+
make_alert_receive_channel,
573+
make_alert_group,
574+
make_custom_webhook,
575+
exception,
576+
):
577+
mock_requests.post.side_effect = exception("foo bar")
578+
579+
organization = make_organization()
580+
user = make_user_for_organization(organization)
581+
alert_receive_channel = make_alert_receive_channel(organization)
582+
alert_group = make_alert_group(
583+
alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk
584+
)
585+
webhook = make_custom_webhook(
586+
organization=organization,
587+
url="https://test/",
588+
http_method="POST",
589+
trigger_type=Webhook.TRIGGER_ACKNOWLEDGE,
590+
forward_all=False,
591+
)
592+
593+
execute_webhook_args = webhook.pk, alert_group.pk, user.pk, None
594+
595+
# should retry
596+
execute_webhook(*execute_webhook_args)
597+
598+
mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
599+
spy_execute_webhook.apply_async.assert_called_once_with((*execute_webhook_args, 1), countdown=10)
600+
601+
mock_requests.reset_mock()
602+
spy_execute_webhook.reset_mock()
603+
604+
# should stop retrying after 3 attempts without raising issue
605+
try:
606+
execute_webhook(*execute_webhook_args, manual_retry_num=3)
607+
except Exception:
608+
pytest.fail()
609+
610+
mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
611+
spy_execute_webhook.apply_async.assert_not_called()

0 commit comments

Comments
 (0)