Lowering retry period for high priority emails to 25 seconds #2031

Merged on Dec 14, 2023 (28 commits)

Changes shown are from 2 commits

Commits
8d4a8f8
Decoupled scan malware code + lowering retry period for high priority…
jimleroyer Nov 20, 2023
8b3ad4e
Extract common email retry handling logic into its own function
jimleroyer Nov 21, 2023
4b40f65
Cleaned up import
jimleroyer Nov 21, 2023
dab3a12
Forgot to provide default value to optional fn arg
jimleroyer Nov 21, 2023
5ec97ae
Fixed test import
jimleroyer Nov 21, 2023
a6a418a
Isolated retry task param builder in a class
jimleroyer Nov 22, 2023
fd12319
Merge remote-tracking branch 'origin/main' into feat/decrease-retry-f…
jimleroyer Nov 22, 2023
1cef267
Cleaned up import
jimleroyer Nov 22, 2023
f4a6470
Fixed moved refs
jimleroyer Nov 22, 2023
17482f5
Trying a different strategy to fix circular import
jimleroyer Nov 22, 2023
e124f21
Fixing another bad import ref
jimleroyer Nov 23, 2023
e5bfea8
Introducing celery utils module instead of using celery root one
jimleroyer Nov 23, 2023
c7f6057
Merge remote-tracking branch 'origin/main' into feat/decrease-retry-f…
jimleroyer Nov 23, 2023
ec41793
Cover edge cases + modified tests
jimleroyer Nov 24, 2023
e084c8f
Merge remote-tracking branch 'origin/main' into feat/decrease-retry-f…
jimleroyer Nov 24, 2023
5433518
Formatting
jimleroyer Nov 24, 2023
85f5140
Sort imports
jimleroyer Nov 24, 2023
9e4bfe6
Make notification_process_type param optional
jimleroyer Nov 24, 2023
d064161
Merge remote-tracking branch 'origin/main' into feat/decrease-retry-f…
jimleroyer Dec 7, 2023
c3e5ee6
Fixed edge case when template not associated with notification obj
jimleroyer Dec 7, 2023
3b33d29
Merge remote-tracking branch 'origin/main' into feat/decrease-retry-f…
jimleroyer Dec 11, 2023
bcd3f16
Fixing params order
jimleroyer Dec 11, 2023
4e236d4
Fixing regression tests
jimleroyer Dec 11, 2023
7264ae8
More tests
jimleroyer Dec 12, 2023
cc991f2
Added null protection against a potential NPE
jimleroyer Dec 12, 2023
5bf854b
Formatting
jimleroyer Dec 12, 2023
2ea5a4b
Fix imports
jimleroyer Dec 12, 2023
e7daf30
Merge branch 'main' into feat/decrease-retry-for-email-high
sastels Dec 14, 2023
39 changes: 39 additions & 0 deletions app/celery/__init__.py
@@ -0,0 +1,39 @@
from typing import Any, Dict, Optional

from flask import current_app

from app.config import QueueNames
from app.models import BULK, NORMAL, PRIORITY

# Default retry periods for sending notifications.
RETRY_DEFAULT = 300
RETRY_HIGH = 25

RETRY_PERIODS = {
BULK: RETRY_DEFAULT,
NORMAL: RETRY_DEFAULT,
PRIORITY: RETRY_HIGH,
}


def build_retry_task_params(notification_type: str, notification_process_type: str, countdown: Optional[int]) -> Dict[str, Any]:
"""
Build the task params for retrying notification sending tasks.

If the notification is high priority, set the retry period to 25 seconds;
otherwise fall back to the default retry period of 5 minutes.

The countdown parameter lets the calling task override the retry period.
"""
params: dict[str, Any] = {"queue": QueueNames.RETRY}
if current_app.config["FF_CELERY_CUSTOM_TASK_PARAMS"] is False:
return params

if countdown is not None:
params["countdown"] = countdown
else:
# Pick the retry period based on the notification priority.
params["countdown"] = RETRY_PERIODS[notification_process_type]

return params
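
As an aside for reviewers, here is a minimal, standalone sketch of the retry-period selection introduced above. It mirrors the constants from this file; the lowercase process-type strings and the pick_countdown helper are illustrative assumptions, not part of the PR.

from typing import Optional

RETRY_DEFAULT = 300  # seconds, used for bulk and normal notifications
RETRY_HIGH = 25  # seconds, used for priority notifications

RETRY_PERIODS = {"bulk": RETRY_DEFAULT, "normal": RETRY_DEFAULT, "priority": RETRY_HIGH}


def pick_countdown(process_type: str, override: Optional[int] = None) -> int:
    # An explicit override wins; otherwise use the per-priority retry period.
    return override if override is not None else RETRY_PERIODS[process_type]


assert pick_countdown("priority") == 25
assert pick_countdown("normal") == 300
assert pick_countdown("bulk", override=60) == 60
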
73 changes: 41 additions & 32 deletions app/celery/provider_tasks.py
@@ -1,9 +1,12 @@
from typing import Optional

from flask import current_app
from notifications_utils.recipients import InvalidEmailError
from notifications_utils.statsd_decorators import statsd
from sqlalchemy.orm.exc import NoResultFound

from app import notify_celery
from app.celery import build_retry_task_params
from app.config import Config, QueueNames
from app.dao import notifications_dao
from app.dao.notifications_dao import update_notification_status_by_id
@@ -14,9 +17,9 @@
MalwareScanInProgressException,
NotificationTechnicalFailureException,
)
from app.models import NOTIFICATION_TECHNICAL_FAILURE
from app.notifications import build_retry_task_params
from app.models import NOTIFICATION_TECHNICAL_FAILURE, Notification
from app.notifications.callbacks import _check_and_queue_callback_task
from celery import Task


# Celery rate limits are per worker instance and not a global rate limit.
@@ -81,30 +84,24 @@
_check_and_queue_callback_task(notification)
except MalwareDetectedException:
_check_and_queue_callback_task(notification)
except Exception as e:
if isinstance(e, MalwareScanInProgressException) and self.request.retries <= SCAN_MAX_BACKOFF_RETRIES:
countdown = SCAN_RETRY_BACKOFF * (self.request.retries + 1) # do we need to add 1 here?
except MalwareScanInProgressException as me:
if self.request.retries <= SCAN_MAX_BACKOFF_RETRIES:
countdown = SCAN_RETRY_BACKOFF * (self.request.retries + 1)
else:
countdown = None
try:
current_app.logger.warning(f"The exception is {repr(e)}")
Member Author:
Did we add this log warning in a specific context? It does not say anything about the context. Nonetheless, I now pass the exception along when logging it further down.

Collaborator:
This was probably debugging code that slipped through.

if self.request.retries <= 10:
current_app.logger.warning("RETRY {}: Email notification {} failed".format(self.request.retries, notification_id))
else:
current_app.logger.exception("RETRY: Email notification {} failed".format(notification_id))
if countdown is not None:
self.retry(queue=QueueNames.RETRY, countdown=countdown)
else:
self.retry(queue=QueueNames.RETRY)
except self.MaxRetriesExceededError:
message = (
"RETRY FAILED: Max retries reached. "
"The task send_email_to_provider failed for notification {}. "
"Notification has been updated to technical-failure".format(notification_id)
if self.request.retries <= 10:
current_app.logger.warning(
"RETRY {}: Email notification {} is waiting on pending malware scanning".format(
self.request.retries, notification_id
)
)
update_notification_status_by_id(notification_id, NOTIFICATION_TECHNICAL_FAILURE)
_check_and_queue_callback_task(notification)
raise NotificationTechnicalFailureException(message)
else:
current_app.logger.exception(
"RETRY: Email notification {} failed on pending malware scanning".format(notification_id)
)
_handle_email_retry(self, notification, me, countdown)
except Exception as e:
_handle_email_retry(self, notification, e)


def _deliver_sms(self, notification_id):
@@ -120,15 +117,8 @@
_check_and_queue_callback_task(notification)
except Exception:
try:
if self.request.retries == 0:
# Retry immediately, especially as a common failure is for the database data
# replication to be delayed. The immediate retry likely succeeds in these scenarios.
self.retry(queue=QueueNames.RETRY, countdown=0)
else:
# Once the previous retry failed, log the exception and this time,
# retry with the default delay.
current_app.logger.exception("SMS notification delivery for id: {} failed".format(notification_id))
self.retry(**build_retry_task_params(notification.notification_type, notification.template.process_type))
current_app.logger.exception("SMS notification delivery for id: {} failed".format(notification_id))
self.retry(**build_retry_task_params(notification.notification_type, notification.template.process_type))
except self.MaxRetriesExceededError:
message = (
"RETRY FAILED: Max retries reached. The task send_sms_to_provider failed for notification {}. "
@@ -137,3 +127,22 @@
update_notification_status_by_id(notification_id, NOTIFICATION_TECHNICAL_FAILURE)
_check_and_queue_callback_task(notification)
raise NotificationTechnicalFailureException(message)


def _handle_email_retry(task: Task, notification: Notification, e: Exception, countdown: Optional[int] = None):
try:
if task.request.retries <= 10:
current_app.logger.warning("RETRY {}: Email notification {} failed".format(task.request.retries, notification.id))
else:
current_app.logger.exception("RETRY: Email notification {} failed".format(notification.id), exc_info=e)

task.retry(**build_retry_task_params(notification.notification_type, notification.template.process_type, countdown))
except task.MaxRetriesExceededError:
message = (
"RETRY FAILED: Max retries reached. "
"The task send_email_to_provider failed for notification {}. "
"Notification has been updated to technical-failure".format(notification.id)
)
update_notification_status_by_id(notification.id, NOTIFICATION_TECHNICAL_FAILURE)
_check_and_queue_callback_task(notification)
raise NotificationTechnicalFailureException(message)
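
For anyone who wants to exercise the new helper locally, a hedged sketch of a unit test follows. It builds a throwaway Flask app rather than relying on the project's test fixtures, and it assumes app.celery and app.models import cleanly outside the full application factory; the test name is hypothetical.

from flask import Flask

from app.celery import build_retry_task_params
from app.models import EMAIL_TYPE, PRIORITY


def test_priority_email_retry_countdown_is_25_seconds():
    app = Flask(__name__)
    app.config["FF_CELERY_CUSTOM_TASK_PARAMS"] = True
    with app.app_context():
        params = build_retry_task_params(EMAIL_TYPE, PRIORITY, None)
    assert params["countdown"] == 25
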
36 changes: 0 additions & 36 deletions app/notifications/__init__.py
@@ -1,36 +0,0 @@
from typing import Any, Dict

from flask import current_app

from app.config import QueueNames
from app.models import BULK, NORMAL, PRIORITY, SMS_TYPE

# Default retry periods for sending notifications.
RETRY_DEFAULT = 300
RETRY_HIGH = 25

RETRY_PERIODS = {
BULK: RETRY_DEFAULT,
NORMAL: RETRY_DEFAULT,
PRIORITY: RETRY_HIGH,
}


def build_retry_task_params(notification_type: str, notification_process_type: str) -> Dict[str, Any]:
"""
Build task params for the sending parameter retry tasks.

If the notification is a high priority SMS, set the retry policy to retry every 25 seconds
else fall back to the default retry policy of retrying every 5 minutes.
"""
params: dict[str, Any] = {"queue": QueueNames.RETRY}
if current_app.config["FF_CELERY_CUSTOM_TASK_PARAMS"] is False:
return params

# Overriding the retry policy is only supported for SMS for now;
# email support coming later.
if notification_type == SMS_TYPE:
params["countdown"] = RETRY_PERIODS[notification_process_type]
else:
params["countdown"] = RETRY_DEFAULT
return params
1 change: 0 additions & 1 deletion app/notifications/callbacks.py
@@ -10,7 +10,6 @@ def _check_and_queue_callback_task(notification):
service_callback_api = get_service_delivery_status_callback_api_for_service(service_id=notification.service_id)
if service_callback_api:
notification_data = create_delivery_status_callback_data(notification, service_callback_api)

send_delivery_status_to_service.apply_async([str(notification.id), notification_data], queue=QueueNames.CALLBACKS)

