Add filtered index creation DAG (#1833)
* Add filtered index creation DAG. Still WIP; the data refresh external task sensor does not work, and the new DAG is missing logic for deleting previous filtered indexes and unit tests.
* Only wait for filtered index creation if a run actually exists.
* Delete the previous filtered index.
* Reorganise and use a more "airflow-y" approach for the concurrency check.
* Wait for any finished DAG state.
* Fix typo.
* Allow per-media-type filtered index timeout configuration.
* Use a more descriptive error.
* Fix DAG trigger configuration passing method.
* Update DAG docs.
* Remove permalinks from argument documentation.
* Update DAG docs since generator fix.
* Skip unnecessary linting steps.
1 parent 00a26a6 · commit 4d6e995
Showing 13 changed files with 649 additions and 116 deletions.
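One item in the commit message above, "Allow per-media-type filtered index timeout configuration", implies the timeout passed to the wait sensor is configurable per media type. As a rough sketch of what such configuration could look like (the environment variable name, format, and default below are assumptions, not details from this commit):

```python
# Hypothetical sketch of per-media-type timeout configuration; the variable
# name, format, and default are assumptions, not taken from this commit.
import os
from datetime import timedelta


def filtered_index_timeout(media_type: str) -> timedelta:
    """Return the filtered index timeout for a media type, in hours."""
    hours = int(os.getenv(f"{media_type.upper()}_FILTERED_INDEX_TIMEOUT_HOURS", 24))
    return timedelta(hours=hours)


# e.g. AUDIO_FILTERED_INDEX_TIMEOUT_HOURS=48 would give audio a longer window.
```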
New file (+126 lines):

```python
import logging
import os
from datetime import timedelta
from urllib.parse import urlparse

from airflow.exceptions import AirflowException
from airflow.providers.http.operators.http import SimpleHttpOperator
from airflow.providers.http.sensors.http import HttpSensor
from requests import Response

from common.constants import XCOM_PULL_TEMPLATE


logger = logging.getLogger(__name__)

# Seconds between sensor pokes; defaults to 15 minutes.
POKE_INTERVAL = int(os.getenv("DATA_REFRESH_POKE_INTERVAL", 60 * 15))


def response_filter_stat(response: Response) -> str:
    """
    Handle the response for the `get_current_index` task.

    This is used to extract the suffix of the current index that the concerned
    alias points to. The suffix will be available via XCom in the downstream
    tasks.
    """
    index_name = response.json()["alt_names"]
    # Indices are named as '<media type>-<suffix>', so everything after the first
    # hyphen '-' is the suffix.
    _, index_suffix = index_name.split("-", maxsplit=1)
    return index_suffix


def response_filter_status_check_endpoint(response: Response) -> str:
    """
    Handle the response for the `trigger_task` task.

    This is used to grab the endpoint needed to poll for the status of the
    triggered data refresh. This information will then be available via XCom
    in the downstream tasks.
    """
    status_check_url = response.json()["status_check"]
    return urlparse(status_check_url).path


def response_check_wait_for_completion(response: Response) -> bool:
    """
    Handle the response for the `wait_for_completion` sensor.

    Processes the response to determine whether the task can complete.
    """
    data = response.json()

    if data["active"]:
        # The data refresh is still running. Poll again later.
        return False

    if data["error"]:
        raise AirflowException(
            "Ingestion server encountered an error during data refresh."
        )

    logger.info(f"Data refresh done with {data['progress']}% completed.")
    return True


def get_current_index(target_alias: str) -> SimpleHttpOperator:
    return SimpleHttpOperator(
        task_id="get_current_index",
        http_conn_id="data_refresh",
        endpoint=f"stat/{target_alias}",
        method="GET",
        response_check=lambda response: response.status_code == 200,
        response_filter=response_filter_stat,
    )


def trigger_task(
    action: str,
    model: str,
    data: dict | None = None,
) -> SimpleHttpOperator:
    data = {
        **(data or {}),
        "model": model,
        "action": action.upper(),
    }
    return SimpleHttpOperator(
        task_id=f"trigger_{action.lower()}",
        http_conn_id="data_refresh",
        endpoint="task",
        data=data,
        response_check=lambda response: response.status_code == 202,
        response_filter=response_filter_status_check_endpoint,
    )


def wait_for_task(
    action: str,
    task_trigger: SimpleHttpOperator,
    timeout: timedelta,
    poke_interval: int = POKE_INTERVAL,
) -> HttpSensor:
    return HttpSensor(
        task_id=f"wait_for_{action.lower()}",
        http_conn_id="data_refresh",
        # The status check endpoint is pulled from the trigger task's XCom.
        endpoint=XCOM_PULL_TEMPLATE.format(task_trigger.task_id, "return_value"),
        method="GET",
        response_check=response_check_wait_for_completion,
        mode="reschedule",
        poke_interval=poke_interval,
        timeout=timeout.total_seconds(),
    )


def trigger_and_wait_for_task(
    action: str,
    model: str,
    timeout: timedelta,
    data: dict | None = None,
    poke_interval: int = POKE_INTERVAL,
) -> tuple[SimpleHttpOperator, HttpSensor]:
    trigger = trigger_task(action, model, data)
    waiter = wait_for_task(action, trigger, timeout, poke_interval)
    trigger >> waiter
    return trigger, waiter
```
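Taken together, these helpers let a DAG trigger an ingestion-server action and then wait cheaply (via `mode="reschedule"`) until it finishes; `XCOM_PULL_TEMPLATE` is presumably a Jinja `xcom_pull` template defined in `common.constants`. The sketch below shows how a caller might wire the pair up. The module path, DAG id, action name, timeout, and payload are illustrative assumptions, not details taken from this diff:

```python
# Hypothetical usage sketch: the DAG id, action name, timeout, and payload
# below are assumptions for illustration, not details from this commit.
from datetime import datetime, timedelta

from airflow.models import DAG

from common import ingestion_server  # assumed module path for the file above

with DAG(
    dag_id="create_filtered_image_index",  # hypothetical DAG id
    start_date=datetime(2023, 1, 1),
    schedule_interval=None,  # run only when triggered
    catchup=False,
) as dag:
    # trigger_and_wait_for_task chains trigger >> waiter and returns both,
    # so callers can attach further tasks to either end of the pair.
    trigger, waiter = ingestion_server.trigger_and_wait_for_task(
        action="CREATE_FILTERED_INDEX",  # hypothetical action name
        model="image",
        timeout=timedelta(hours=24),  # hypothetical per-media-type timeout
        data={"origin_index_suffix": "abc123"},  # hypothetical payload
    )
```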
New file (+29 lines):

```python
from datetime import datetime

from airflow.models import DagRun


def get_most_recent_dag_run(dag_id) -> list[datetime] | datetime:
    """
    Retrieve the most recent DAG run's execution date.

    For use as ``execution_date_fn`` argument to ``ExternalTaskSensor``.

    Adapted from https://stackoverflow.com/a/74017474
    CC BY-SA 4.0 by Stack Overflow user Nahid O.
    """
    dag_runs = DagRun.find(dag_id=dag_id)
    dag_runs.sort(key=lambda x: x.execution_date, reverse=True)
    if dag_runs:
        return dag_runs[0].execution_date

    # If there are no DAG runs, return an empty list to indicate that
    # there are no execution dates to check. This works because the sensor
    # waits until the number of runs for the execution dates in the
    # ``allowed_states`` matches the length of the list of execution dates
    # to check. If there are no runs for this DAG, then the only possible
    # number of required states we can have is 0. See
    # ``ExternalTaskSensor::poke`` and ``ExternalTaskSensor::get_count``,
    # especially the handling of ``dttm_filter``, for the relevant
    # implementation details.
    return []
```
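This helper plugs into an `ExternalTaskSensor` as its `execution_date_fn`, letting the new DAG wait on an in-flight data refresh (or proceed immediately if that DAG has never run). A minimal sketch of that wiring follows; the module path, external DAG id, and the exact state list are assumptions for illustration, though the commit message's "Wait for any finished DAG state" motivates allowing both success and failure:

```python
# Hypothetical wiring sketch: the module path, external DAG id, and allowed
# states are assumptions for illustration, not details from this commit.
from airflow.sensors.external_task import ExternalTaskSensor
from airflow.utils.state import State

from common.sensors.utils import get_most_recent_dag_run  # assumed module path

wait_for_data_refresh = ExternalTaskSensor(
    task_id="wait_for_data_refresh",
    external_dag_id="image_data_refresh",  # hypothetical external DAG id
    external_task_id=None,  # wait on the whole DAG run, not a single task
    # Check only the most recent run; an empty list means nothing to wait for.
    execution_date_fn=lambda _: get_most_recent_dag_run("image_data_refresh"),
    # "Wait for any finished DAG state" -- don't block on a failed refresh.
    allowed_states=[State.SUCCESS, State.FAILED],
    mode="reschedule",
)
```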