Skip to content

Commit

Permalink
Merge pull request #1109 from NASA-IMPACT/1105-improve-pattern-application-and-exclusion-management
Browse files: browse the repository at this point in the history

1105 improve pattern application and exclusion management
  • Loading branch information
CarsonDavis authored Nov 26, 2024
2 parents 1ea5168 + 4c7834f commit e285697
Show file tree
Hide file tree
Showing 27 changed files with 3,219 additions and 551 deletions.
3 changes: 2 additions & 1 deletion sde_collections/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

from sde_collections.models.delta_patterns import (
DeltaDivisionPattern,
DeltaResolvedTitle,
DeltaTitlePattern,
)

from .models.candidate_url import CandidateURL, ResolvedTitle
from .models.collection import Collection, WorkflowHistory
from .models.delta_url import CuratedUrl, DeltaResolvedTitle, DeltaUrl, DumpUrl
from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl
from .models.pattern import DivisionPattern, IncludePattern, TitlePattern
from .tasks import fetch_and_replace_full_text, import_candidate_urls_from_api

Expand Down
47 changes: 47 additions & 0 deletions sde_collections/management/commands/deduplicate_patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# docker-compose -f local.yml run --rm django python manage.py deduplicate_patterns
# docker-compose -f production.yml run --rm django python manage.py deduplicate_patterns

from collections import defaultdict

from django.core.management.base import BaseCommand
from django.db.models import Count

from sde_collections.models.pattern import (
DivisionPattern,
DocumentTypePattern,
ExcludePattern,
IncludePattern,
TitlePattern,
)


class Command(BaseCommand):
    """Management command that removes duplicate patterns within each collection.

    For every pattern model listed in ``handle``, rows that share the same
    ``(collection, match_pattern)`` pair are treated as duplicates: the row with
    the lowest ``id`` is kept and every other row in the group is deleted.
    """

    help = "Remove duplicate patterns within collections for all pattern types"

    def handle(self, *args, **kwargs):
        """Delete duplicate patterns and write per-model deletion counts to stdout."""
        pattern_models = [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]

        deletion_counts = defaultdict(int)

        for model in pattern_models:
            # One row per (collection, match_pattern) pair that occurs more than once.
            duplicate_groups = (
                model.objects.values("collection", "match_pattern")
                .annotate(pattern_count=Count("id"))
                .filter(pattern_count__gt=1)
            )

            for group in duplicate_groups:
                # Explicit ordering makes the slice deterministic: the lowest id is
                # always the survivor. Slicing an unordered queryset gives no such
                # guarantee. The list is materialized before any row is deleted.
                patterns_to_delete = list(
                    model.objects.filter(
                        collection_id=group["collection"],
                        match_pattern=group["match_pattern"],
                    ).order_by("id")[1:]
                )

                # Delete instance by instance (not queryset.delete()) so that any
                # model-level delete() override still runs for each row.
                for pattern in patterns_to_delete:
                    pattern.delete()
                    deletion_counts[model.__name__] += 1

        # Print final summary; models with nothing deleted are not listed.
        for model_name, count in deletion_counts.items():
            self.stdout.write(f"{model_name}: {count}")
        self.stdout.write(f"Total: {sum(deletion_counts.values())}")
88 changes: 88 additions & 0 deletions sde_collections/management/commands/deduplicate_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import time

from django.core.management.base import BaseCommand
from django.db.models import Count, Min

from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices


class Command(BaseCommand):
    """Management command entry point for CandidateURL deduplication."""

    help = "Deduplicate CandidateURLs"

    def handle(self, *args, **kwargs):
        """Run the deduplication; positional and keyword options are ignored."""
        # All of the work lives in the module-level function below.
        deduplicate_candidate_urls()


def is_priority_collection(collection):
    """Return True if ``collection.workflow_status`` is one of the priority statuses.

    The deduplication routine in this module uses the result to decide which
    collection keeps a URL that appears in more than one collection.
    """
    return collection.workflow_status in {
        # Curation and quality-fix stages
        WorkflowStatusChoices.CURATED,
        WorkflowStatusChoices.QUALITY_FIXED,
        # Secret deployment stages
        WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED,
        WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED,
        # Quality-check stages
        WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK,
        WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK,
        WorkflowStatusChoices.QUALITY_CHECK_FAILED,
        WorkflowStatusChoices.QUALITY_CHECK_MINOR,
        WorkflowStatusChoices.QUALITY_CHECK_PERFECT,
        # Production stages
        WorkflowStatusChoices.PROD_PERFECT,
        WorkflowStatusChoices.PROD_MINOR,
        WorkflowStatusChoices.PROD_MAJOR,
    }


def deduplicate_candidate_urls():
    """Delete duplicate CandidateURL rows, first within and then across collections.

    Phase 1 keeps the lowest-id row for every ``(collection_id, url)`` pair and
    deletes the rest.

    Phase 2 keeps exactly one row per ``url`` across all collections. Rows whose
    collection is a priority collection (see ``is_priority_collection``) are
    preferred; among the preferred rows the one whose collection had the fewest
    candidate URLs wins, and any remaining tie goes to the lowest id.

    A one-line summary with the deletion counts and elapsed time is printed.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Build both lookup tables in a single pass over the collections. The URL
    # counts are a snapshot taken before anything is deleted.
    collection_counts = {}
    collection_status = {}
    for collection in Collection.objects.annotate(url_count=Count("candidate_urls")):
        collection_counts[collection.id] = collection.url_count
        collection_status[collection.id] = is_priority_collection(collection)

    # Phase 1: Intra-collection duplicates
    intra_dupes = (
        CandidateURL.objects.values("collection_id", "url")
        .annotate(count=Count("id"), min_id=Min("id"))
        .filter(count__gt=1)
    )

    intra_ids_to_delete = []
    for dupe in intra_dupes:
        # Everything in the group except the lowest id is redundant.
        intra_ids_to_delete.extend(
            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
            .exclude(id=dupe["min_id"])
            .values_list("id", flat=True)
        )

    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()

    # Phase 2: Cross-collection duplicates
    # This queryset is lazy; it is evaluated by the loop below, i.e. after the
    # Phase 1 rows are already gone.
    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)

    cross_ids_to_delete = []
    for dupe in cross_dupes:
        # Ordering by id makes the tie-break deterministic: min() returns the
        # first minimal element, which is then the lowest id.
        instances = list(CandidateURL.objects.filter(url=dupe["url"]).order_by("id").values("id", "collection_id"))

        priority_instances = [i for i in instances if collection_status[i["collection_id"]]]

        # With no priority rows, every row is non-priority, so fall back to all.
        candidates = priority_instances or instances
        keep_instance = min(candidates, key=lambda x: collection_counts[x["collection_id"]])

        cross_ids_to_delete.extend(i["id"] for i in instances if i["id"] != keep_instance["id"])

    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()

    elapsed_time = time.perf_counter() - start_time
    intra_total = len(intra_ids_to_delete)
    cross_total = len(cross_ids_to_delete)
    print(
        f"Deleted {intra_total} intra-collection and {cross_total} cross-collection duplicates "
        f"(total: {intra_total + cross_total}) in {elapsed_time:.2f} seconds"
    )
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def handle(self, *args, **kwargs):
visited=candidate_url.visited,
document_type=candidate_url.document_type,
division=candidate_url.division,
delete=False,
to_delete=False,
)
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Generated by Django 4.2.9 on 2024-11-23 17:44

from django.db import migrations, models
import sde_collections.models.delta_patterns


class Migration(migrations.Migration):
    """Clear unique_together and redefine pattern CharFields on the delta pattern models.

    Operations, in order:
      1. ``unique_together`` is set to empty on the five delta pattern models.
      2. ``match_pattern`` is altered on each of those models.
      3. ``title_pattern`` is altered on ``deltatitlepattern``.
    """

    dependencies = [
        ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"),
    ]

    # Model names in the order the operations must be emitted.
    _delta_pattern_models = (
        "deltadivisionpattern",
        "deltadocumenttypepattern",
        "deltaexcludepattern",
        "deltaincludepattern",
        "deltatitlepattern",
    )

    operations = [
        # Step 1: drop the unique_together constraint on every delta pattern model.
        *(
            migrations.AlterUniqueTogether(
                name=model_name,
                unique_together=set(),
            )
            for model_name in _delta_pattern_models
        ),
        # Step 2: redefine match_pattern; a fresh CharField instance is built per model.
        *(
            migrations.AlterField(
                model_name=model_name,
                name="match_pattern",
                field=models.CharField(
                    help_text="This pattern is compared against the URL of all documents in the collection",
                    verbose_name="Pattern",
                ),
            )
            for model_name in _delta_pattern_models
        ),
        # Step 3: redefine title_pattern on the title pattern model only.
        migrations.AlterField(
            model_name="deltatitlepattern",
            name="title_pattern",
            field=models.CharField(
                help_text="Pattern for the new title. Support exact replacement or sinequa-valid code",
                validators=[sde_collections.models.delta_patterns.validate_title_pattern],
                verbose_name="Title Pattern",
            ),
        ),
    ]

    # Remove the helper so the class namespace matches the generated migration.
    del _delta_pattern_models
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Generated by Django 4.2.9 on 2024-11-23 18:14

from django.db import migrations


class Migration(migrations.Migration):
    """Set model options and unique_together on the delta pattern models.

    Operations, in order:
      1. Each delta pattern model gets ``ordering = ["match_pattern"]`` plus
         its singular and plural verbose names.
      2. Each delta pattern model gets ``unique_together`` on
         ``("collection", "match_pattern")``.
    """

    dependencies = [
        ("sde_collections", "0066_alter_deltadivisionpattern_unique_together_and_more"),
    ]

    # (model name, singular verbose name) in the order the operations are emitted.
    # Every plural verbose name is the singular with a trailing "s".
    _delta_pattern_models = (
        ("deltadivisionpattern", "Delta Division Pattern"),
        ("deltadocumenttypepattern", "Delta Document Type Pattern"),
        ("deltaexcludepattern", "Delta Exclude Pattern"),
        ("deltaincludepattern", "Delta Include Pattern"),
        ("deltatitlepattern", "Delta Title Pattern"),
    )

    operations = [
        # Step 1: ordering and display names.
        *(
            migrations.AlterModelOptions(
                name=model_name,
                options={
                    "ordering": ["match_pattern"],
                    "verbose_name": label,
                    "verbose_name_plural": f"{label}s",
                },
            )
            for model_name, label in _delta_pattern_models
        ),
        # Step 2: one pattern per (collection, match_pattern) pair.
        *(
            migrations.AlterUniqueTogether(
                name=model_name,
                unique_together={("collection", "match_pattern")},
            )
            for model_name, _label in _delta_pattern_models
        ),
    ]

    # Remove the helper so the class namespace matches the generated migration.
    del _delta_pattern_models
Loading

0 comments on commit e285697

Please sign in to comment.