From 81555a25f31222ed7475630cd3492f1a4d8c9389 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Wed, 31 Jan 2024 14:20:07 -0600 Subject: [PATCH 1/2] Update title pattern by applying rule interpretation --- sde_collections/models/pattern.py | 17 ++++++++++++++++- sde_collections/pattern_interpreter.py | 8 ++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 sde_collections/pattern_interpreter.py diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 1c7eae57..c7dd9d72 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -1,7 +1,9 @@ import re from django.db import models +from django.db.models import Case, F, Q, When +from ..pattern_interpreter import interpret_title_pattern from .collection_choice_fields import DocumentTypes @@ -149,7 +151,20 @@ class TitlePattern(BaseMatchPattern): def apply(self) -> None: matched_urls = self.matched_urls() - matched_urls.update(generated_title=self.title_pattern) + + # Update generated_title using the update function + matched_urls.update( + generated_title=Case( + When( + Q(url=F("url"), scraped_title=F("scraped_title")), + then=interpret_title_pattern( + F("url"), F("scraped_title"), self.title_pattern + ), + ), + default=F("generated_title"), + ) + ) + candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create( objs=[ diff --git a/sde_collections/pattern_interpreter.py b/sde_collections/pattern_interpreter.py new file mode 100644 index 00000000..53a2aa8e --- /dev/null +++ b/sde_collections/pattern_interpreter.py @@ -0,0 +1,8 @@ +def interpret_title_pattern(url, scraped_title, title_pattern): + """Interpret a title pattern.""" + # If "{title}" is in the title_pattern, replace it with scraped_title + if "{title}" in title_pattern: + return title_pattern.replace("{title}", scraped_title) + # If "{title}" is not in the title_pattern, return title_pattern as is + else: + return title_pattern From 0b863f512e00de725f9d76c7f985fe131409d3a7 Mon Sep 17 00:00:00 2001 From: Ashish Acharya Date: Wed, 31 Jan 2024 14:40:13 -0600 Subject: [PATCH 2/2] Change pattern apply code to use a loop --- sde_collections/models/pattern.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index c7dd9d72..9eb755e6 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -1,7 +1,6 @@ import re from django.db import models -from django.db.models import Case, F, Q, When from ..pattern_interpreter import interpret_title_pattern from .collection_choice_fields import DocumentTypes @@ -152,18 +151,14 @@ class TitlePattern(BaseMatchPattern): def apply(self) -> None: matched_urls = self.matched_urls() - # Update generated_title using the update function - matched_urls.update( - generated_title=Case( - When( - Q(url=F("url"), scraped_title=F("scraped_title")), - then=interpret_title_pattern( - F("url"), F("scraped_title"), self.title_pattern - ), - ), - default=F("generated_title"), + # since this is not running in celery, this is a bit slow + for url, scraped_title in matched_urls.values_list("url", "scraped_title"): + generated_title = interpret_title_pattern( + url, scraped_title, self.title_pattern + ) + matched_urls.filter(url=url, scraped_title=scraped_title).update( + generated_title=generated_title ) - ) candidate_url_ids = list(matched_urls.values_list("id", flat=True)) self.candidate_urls.through.objects.bulk_create(