-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1109 from NASA-IMPACT/1105-improve-pattern-application-and-exclusion-management
1105 improve pattern application and exclusion management
- Loading branch information
Showing
27 changed files
with
3,219 additions
and
551 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
sde_collections/management/commands/deduplicate_patterns.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# docker-compose -f local.yml run --rm django python manage.py deduplicate_patterns | ||
# docker-compose -f production.yml run --rm django python manage.py deduplicate_patterns | ||
|
||
from collections import defaultdict | ||
|
||
from django.core.management.base import BaseCommand | ||
from django.db.models import Count | ||
|
||
from sde_collections.models.pattern import ( | ||
DivisionPattern, | ||
DocumentTypePattern, | ||
ExcludePattern, | ||
IncludePattern, | ||
TitlePattern, | ||
) | ||
|
||
|
||
class Command(BaseCommand):
    """Management command that removes duplicate patterns inside each collection.

    Two rows are treated as duplicates when they belong to the same pattern
    model, reference the same collection and have the same ``match_pattern``.
    The row with the lowest primary key in each group is kept; every other
    row in the group is deleted.
    """

    help = "Remove duplicate patterns within collections for all pattern types"

    def handle(self, *args, **kwargs):
        """Delete duplicate patterns for every pattern model and print a summary.

        Writes one ``<ModelName>: <count>`` line for each model that had at
        least one deletion, followed by a ``Total`` line, to ``self.stdout``.
        """
        pattern_models = [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern]

        deletion_counts = defaultdict(int)

        for model in pattern_models:
            # Every (collection, match_pattern) pair that occurs more than once.
            duplicate_groups = (
                model.objects.values("collection", "match_pattern")
                .annotate(pattern_count=Count("id"))
                .filter(pattern_count__gt=1)
            )

            for group in duplicate_groups:
                patterns = model.objects.filter(
                    collection_id=group["collection"],
                    match_pattern=group["match_pattern"],
                )

                # Order explicitly before slicing: without an ORDER BY the
                # database may return the rows in any order, so "skip the
                # first row" would not reliably keep the same pattern.
                # Ordering by id always keeps the lowest-id row.
                duplicates = patterns.order_by("id")[1:]

                # Delete row by row (not with a bulk queryset delete) so each
                # model's own delete() method is invoked.
                # NOTE(review): presumably the pattern models override
                # delete() -- confirm before switching to a bulk delete.
                for pattern in duplicates:
                    pattern.delete()
                    deletion_counts[model.__name__] += 1

        # Print final summary
        for model_name, count in deletion_counts.items():
            self.stdout.write(f"{model_name}: {count}")
        self.stdout.write(f"Total: {sum(deletion_counts.values())}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import time | ||
|
||
from django.core.management.base import BaseCommand | ||
from django.db.models import Count, Min | ||
|
||
from sde_collections.models.candidate_url import CandidateURL | ||
from sde_collections.models.collection import Collection | ||
from sde_collections.models.collection_choice_fields import WorkflowStatusChoices | ||
|
||
|
||
class Command(BaseCommand):
    """Management command entry point that runs ``deduplicate_candidate_urls``."""

    help = "Deduplicate CandidateURLs"

    def handle(self, *args, **kwargs):
        """Run the deduplication; any positional or keyword options are ignored."""
        deduplicate_candidate_urls()
|
||
|
||
def is_priority_collection(collection):
    """Return True when the collection's workflow status is a priority status.

    Args:
        collection: object exposing a ``workflow_status`` attribute.

    Returns:
        bool: whether ``collection.workflow_status`` is one of the
        ``WorkflowStatusChoices`` members listed below.
    """
    status = WorkflowStatusChoices
    priority = frozenset(
        (
            status.CURATED,
            status.QUALITY_FIXED,
            status.SECRET_DEPLOYMENT_STARTED,
            status.SECRET_DEPLOYMENT_FAILED,
            status.READY_FOR_LRM_QUALITY_CHECK,
            status.READY_FOR_FINAL_QUALITY_CHECK,
            status.QUALITY_CHECK_FAILED,
            status.QUALITY_CHECK_MINOR,
            status.QUALITY_CHECK_PERFECT,
            status.PROD_PERFECT,
            status.PROD_MINOR,
            status.PROD_MAJOR,
        )
    )
    return collection.workflow_status in priority
|
||
|
||
def deduplicate_candidate_urls():
    """Delete duplicate CandidateURL rows, first within and then across collections.

    Phase 1 (intra-collection): for every (collection, url) pair that occurs
    more than once, the row with the smallest id is kept and the others are
    deleted.

    Phase 2 (cross-collection): for every URL that still occurs more than
    once, exactly one row is kept, chosen as follows:

    * rows whose collection is a priority collection (as decided by
      ``is_priority_collection``) win over rows from non-priority collections;
    * among the remaining candidates, the row whose collection has the
      fewest candidate URLs wins;
    * any remaining tie is broken by the smallest row id, so repeated runs
      over the same data always keep the same row.

    A one-line summary (number of ids selected for deletion in each phase and
    the elapsed time) is printed to standard output. Returns None.
    """
    start_time = time.time()

    # URL count per collection, used in phase 2 to prefer smaller collections.
    # NOTE(review): this snapshot is taken before phase 1 deletes anything, so
    # the counts still include intra-collection duplicates -- confirm that is
    # the intended basis for the comparison.
    collection_counts = {
        row["id"]: row["url_count"]
        for row in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count")
    }

    # collection id -> True when the collection is a priority collection.
    collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()}

    def _keep_rank(instance):
        """Sort key for phase 2: smallest collection first, then lowest row id."""
        return (collection_counts[instance["collection_id"]], instance["id"])

    # Phase 1: Intra-collection duplicates
    intra_dupes = (
        CandidateURL.objects.values("collection_id", "url")
        .annotate(count=Count("id"), min_id=Min("id"))
        .filter(count__gt=1)
    )

    intra_ids_to_delete = []
    for dupe in intra_dupes:
        # Everything in the group except the lowest-id row is a duplicate.
        intra_ids_to_delete.extend(
            CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"])
            .exclude(id=dupe["min_id"])
            .values_list("id", flat=True)
        )

    CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete()

    # Phase 2: Cross-collection duplicates
    cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1)

    cross_ids_to_delete = []
    for dupe in cross_dupes:
        instances = list(CandidateURL.objects.filter(url=dupe["url"]).values("id", "collection_id"))

        priority_instances = [i for i in instances if collection_status[i["collection_id"]]]

        # Prefer rows from priority collections; fall back to all rows (which
        # are then all non-priority) when there is no priority row.
        candidates = priority_instances or instances
        keep_id = min(candidates, key=_keep_rank)["id"]

        cross_ids_to_delete.extend(i["id"] for i in instances if i["id"] != keep_id)

    CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete()

    elapsed_time = time.time() - start_time
    total = len(intra_ids_to_delete) + len(cross_ids_to_delete)
    print(
        f"Deleted {len(intra_ids_to_delete)} intra-collection and "
        f"{len(cross_ids_to_delete)} cross-collection duplicates "
        f"(total: {total}) in {elapsed_time:.2f} seconds"
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 83 additions & 0 deletions
83
sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# Generated by Django 4.2.9 on 2024-11-23 17:44 | ||
|
||
from django.db import migrations, models | ||
import sde_collections.models.delta_patterns | ||
|
||
|
||
# Lower-cased names of the five delta pattern models touched by this
# migration, in the order their operations are emitted.
_DELTA_PATTERN_MODELS = (
    "deltadivisionpattern",
    "deltadocumenttypepattern",
    "deltaexcludepattern",
    "deltaincludepattern",
    "deltatitlepattern",
)


def _match_pattern_field():
    """Build a fresh ``match_pattern`` CharField definition for one AlterField."""
    return models.CharField(
        help_text="This pattern is compared against the URL of all documents in the collection",
        verbose_name="Pattern",
    )


class Migration(migrations.Migration):
    """Clear ``unique_together`` and redefine pattern fields on the delta pattern models.

    Operations, in order:
    1. set ``unique_together`` to the empty set on each delta pattern model;
    2. alter ``match_pattern`` on each delta pattern model;
    3. alter ``title_pattern`` on ``deltatitlepattern``.
    """

    dependencies = [
        ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"),
    ]

    operations = [
        *(
            migrations.AlterUniqueTogether(name=model_name, unique_together=set())
            for model_name in _DELTA_PATTERN_MODELS
        ),
        *(
            migrations.AlterField(
                model_name=model_name,
                name="match_pattern",
                field=_match_pattern_field(),
            )
            for model_name in _DELTA_PATTERN_MODELS
        ),
        migrations.AlterField(
            model_name="deltatitlepattern",
            name="title_pattern",
            field=models.CharField(
                help_text="Pattern for the new title. Support exact replacement or sinequa-valid code",
                validators=[sde_collections.models.delta_patterns.validate_title_pattern],
                verbose_name="Title Pattern",
            ),
        ),
    ]
73 changes: 73 additions & 0 deletions
73
sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
# Generated by Django 4.2.9 on 2024-11-23 18:14 | ||
|
||
from django.db import migrations | ||
|
||
|
||
# (lower-cased model name, singular verbose name) for each delta pattern
# model, in the order their operations are emitted. The plural verbose name
# is the singular one with a trailing "s".
_DELTA_PATTERN_VERBOSE_NAMES = (
    ("deltadivisionpattern", "Delta Division Pattern"),
    ("deltadocumenttypepattern", "Delta Document Type Pattern"),
    ("deltaexcludepattern", "Delta Exclude Pattern"),
    ("deltaincludepattern", "Delta Include Pattern"),
    ("deltatitlepattern", "Delta Title Pattern"),
)


class Migration(migrations.Migration):
    """Set model options and ``unique_together`` on the delta pattern models.

    Operations, in order:
    1. set ordering (by ``match_pattern``) and verbose names on each model;
    2. set ``unique_together`` to ``("collection", "match_pattern")`` on each
       model.
    """

    dependencies = [
        ("sde_collections", "0066_alter_deltadivisionpattern_unique_together_and_more"),
    ]

    operations = [
        *(
            migrations.AlterModelOptions(
                name=model_name,
                options={
                    "ordering": ["match_pattern"],
                    "verbose_name": verbose_name,
                    "verbose_name_plural": f"{verbose_name}s",
                },
            )
            for model_name, verbose_name in _DELTA_PATTERN_VERBOSE_NAMES
        ),
        *(
            migrations.AlterUniqueTogether(
                name=model_name,
                unique_together={("collection", "match_pattern")},
            )
            for model_name, _verbose_name in _DELTA_PATTERN_VERBOSE_NAMES
        ),
    ]
Oops, something went wrong.