grafana · joeyorlando · Jan 9, 2024 · Jan 8, 2024 · Jan 8, 2024 · Jan 8, 2024
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 - Fixed schedule timezone issues ([#3576](https://github.com/grafana/oncall/issues/3576))
+- Ignore `requests.exceptions.Timeout` exceptions when attempting to send outgoing webhook requests by @joeyorlando ([#3632](https://github.com/grafana/oncall/pull/3632))
 
 ## v1.3.82 (2024-01-04)
 

@@ -1,7 +1,10 @@
 import json
 import logging
+import typing
+from datetime import datetime
 from json import JSONDecodeError
 
+import requests
 from celery.utils.log import get_task_logger
 from django.conf import settings
 from django.db.models import Prefetch
@@ -28,6 +31,10 @@
 logger.setLevel(logging.DEBUG)
 
 
+EXECUTE_WEBHOOK_RETRIES = 3
+# these exceptions are fully out of our control (e.g. customer's network issues)
+# let's manually retry them without raising an exception
+EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY = (requests.exceptions.Timeout,)
 TRIGGER_TYPE_TO_LABEL = {
     Webhook.TRIGGER_ALERT_GROUP_CREATED: "alert group created",
     Webhook.TRIGGER_ACKNOWLEDGE: "acknowledge",
@@ -40,6 +47,17 @@
 }
 
 
+class WebhookRequestStatus(typing.TypedDict):
+    url: typing.Optional[str]
+    request_trigger: typing.Optional[str]
+    request_headers: typing.Optional[str]
+    request_data: typing.Optional[str]
+    status_code: typing.Optional[int]
+    content: typing.Optional[str]
+    webhook: Webhook
+    event_data: str
+
+
 @shared_dedicated_queue_retry_task(
     autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None
 )
@@ -52,15 +70,14 @@ def send_webhook_event(trigger_type, alert_group_id, organization_id=None, user_
     ).exclude(is_webhook_enabled=False)
 
     for webhook in webhooks_qs:
-        print(webhook.name)
         execute_webhook.apply_async((webhook.pk, alert_group_id, user_id, None))
 
 
-def _isoformat_date(date_value):
+def _isoformat_date(date_value: datetime) -> typing.Optional[str]:
     return date_value.isoformat() if date_value else None
 
 
-def _build_payload(webhook, alert_group, user):
+def _build_payload(webhook: Webhook, alert_group: AlertGroup, user: User) -> typing.Dict[str, typing.Any]:
     trigger_type = webhook.trigger_type
     event = {
         "type": TRIGGER_TYPE_TO_LABEL[trigger_type],
@@ -96,7 +113,9 @@ def _build_payload(webhook, alert_group, user):
     return data
 
 
-def mask_authorization_header(headers, header_keys_to_mask):
+def mask_authorization_header(
+    headers: typing.Dict[str, str], header_keys_to_mask: typing.List[str]
+) -> typing.Dict[str, str]:
     masked_headers = headers.copy()
     lower_keys = set(k.lower() for k in header_keys_to_mask)
     for k in headers.keys():
@@ -105,8 +124,10 @@ def mask_authorization_header(headers, header_keys_to_mask):
     return masked_headers
 
 
-def make_request(webhook, alert_group, data):
-    status = {
+def make_request(
+    webhook: Webhook, alert_group: AlertGroup, data: typing.Dict[str, typing.Any]
+) -> typing.Tuple[bool, WebhookRequestStatus, typing.Optional[str], typing.Optional[Exception]]:
+    status: WebhookRequestStatus = {
         "url": None,
         "request_trigger": None,
         "request_headers": None,
@@ -172,9 +193,9 @@ def make_request(webhook, alert_group, data):
 
 
 @shared_dedicated_queue_retry_task(
-    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else 3
+    autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else EXECUTE_WEBHOOK_RETRIES
 )
-def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
+def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id, manual_retry_num=0):
     from apps.webhooks.models import Webhook
 
     try:
@@ -244,5 +265,24 @@ def execute_webhook(webhook_pk, alert_group_id, user_id, escalation_policy_id):
             escalation_error_code=error_code,
         )
 
-    if exception:
+    if isinstance(exception, EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY):
+        msg_details = (
+            f"\nwebhook - {webhook_pk}\n"
+            f"alert group - {alert_group_id}\n"
+            f"user - {user_id}\n"
+            f"escalation policy - {escalation_policy_id}"
+        )
+
+        if manual_retry_num < EXECUTE_WEBHOOK_RETRIES:
+            retry_num = manual_retry_num + 1
+            logger.warning(f"Manually retrying execute_webhook for{msg_details}\nmanual retry num - {retry_num}")
+            execute_webhook.apply_async(
+                (webhook_pk, alert_group_id, user_id, escalation_policy_id, retry_num),
+                countdown=10,
+            )
+        else:
+            # don't raise an exception if we've exhausted retries for
+            # exceptions within EXECUTE_WEBHOOK_EXCEPTIONS_TO_MANUALLY_RETRY, simply give up trying
+            logger.warning(f"Exhausted execute_webhook retries for{msg_details}")
+    elif exception:
         raise exception
@@ -2,6 +2,7 @@
 from unittest.mock import call, patch
 
 import pytest
+import requests
 from django.utils import timezone
 
 from apps.alerts.models import AlertGroupLogRecord, EscalationPolicy
@@ -555,3 +556,56 @@ def test_response_content_limit(
     assert log.status_code == 200
     assert log.content == f"Response content {content_length} exceeds {WEBHOOK_RESPONSE_LIMIT} character limit"
     assert log.url == "https://test/"
+
+
+@patch("apps.webhooks.tasks.trigger_webhook.execute_webhook", wraps=execute_webhook)
+@patch("apps.webhooks.models.webhook.requests")
+@patch("apps.webhooks.utils.socket.gethostbyname", return_value="8.8.8.8")
+@pytest.mark.django_db
+@pytest.mark.parametrize("exception", [requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout])
+def test_manually_retried_exceptions(
+    _mock_gethostbyname,
+    mock_requests,
+    spy_execute_webhook,
+    make_organization,
+    make_user_for_organization,
+    make_alert_receive_channel,
+    make_alert_group,
+    make_custom_webhook,
+    exception,
+):
+    mock_requests.post.side_effect = exception("foo bar")
+
+    organization = make_organization()
+    user = make_user_for_organization(organization)
+    alert_receive_channel = make_alert_receive_channel(organization)
+    alert_group = make_alert_group(
+        alert_receive_channel, acknowledged_at=timezone.now(), acknowledged=True, acknowledged_by=user.pk
+    )
+    webhook = make_custom_webhook(
+        organization=organization,
+        url="https://test/",
+        http_method="POST",
+        trigger_type=Webhook.TRIGGER_ACKNOWLEDGE,
+        forward_all=False,
+    )
+
+    execute_webhook_args = webhook.pk, alert_group.pk, user.pk, None
+
+    # should retry
+    execute_webhook(*execute_webhook_args)
+
+    mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
+    spy_execute_webhook.apply_async.assert_called_once_with((*execute_webhook_args, 1), countdown=10)
+
+    mock_requests.reset_mock()
+    spy_execute_webhook.reset_mock()
+
+    # should stop retrying after 3 attempts without raising issue
+    try:
+        execute_webhook(*execute_webhook_args, manual_retry_num=3)
+    except Exception:
+        pytest.fail()
+
+    mock_requests.post.assert_called_once_with("https://test/", timeout=10, headers={})
+    spy_execute_webhook.apply_async.assert_not_called()