From eddad8479d411f7ee4f220595593ae845b42717b Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 10:55:47 -0600 Subject: [PATCH 1/7] refactor readme for unapply logic --- .../models/README_UNAPPLY_LOGIC.md | 105 +++++++++++++----- 1 file changed, 77 insertions(+), 28 deletions(-) diff --git a/sde_collections/models/README_UNAPPLY_LOGIC.md b/sde_collections/models/README_UNAPPLY_LOGIC.md index f4c75f8f..536a2281 100644 --- a/sde_collections/models/README_UNAPPLY_LOGIC.md +++ b/sde_collections/models/README_UNAPPLY_LOGIC.md @@ -14,33 +14,61 @@ - Delta URL exists with pattern effect - Pattern is removed ``` -Curated: None -Delta: division=BIOLOGY (from pattern) -[Pattern removed] -Result: Delta remains with division=None +Curated: None exists +Delta: url=new.com, division=None +``` +`[Pattern: division=BIOLOGY], created` +``` +Curated: None exists +Delta: url=new.com, division=BIOLOGY +``` +`[Pattern: division=BIOLOGY], deleted` +``` +Curated: None exists +Delta: url=new.com, division=None ``` -### Case 2: Delta and Curated Exist +### Case 2: Delta Created to Apply Pattern **Scenario:** -- Both curated and delta URLs exist +- A Curated with no division already exists +- A pattern is created +- A delta is created to to apply a pattern - Pattern is removed +- Delta should be deleted +``` +Curated: division=None ``` -Curated: division=GENERAL +`[Pattern: division=BIOLOGY], created` +``` +Curated: division=None Delta: division=BIOLOGY (from pattern) -[Pattern removed] -Result: Delta reverts to curated value (division=GENERAL) -If delta now matches curated exactly, delta is deleted +``` +`[Pattern: division=BIOLOGY], deleted` +``` +Curated: division=None ``` -### Case 3: Curated Only -**Scenario:** -- Only curated URL exists +### Case 3: Pre-existing Delta +- A Curated with no division already exists +- A Delta with an updated scraped_title exists +- A pattern is created to set division +- A delta is created to apply a pattern - Pattern is removed +- Delta should be maintained because of scraped_title + +``` +Curated: division=None +Delta: scraped_title="Modified", division=None +``` +`[Pattern: division=BIOLOGY], created` +``` +Curated: division=None +Delta: scraped_title="Modified", division=BIOLOGY (from pattern) +``` +`[Pattern: division=BIOLOGY], deleted` ``` -Curated: division=GENERAL -Delta: None -[Pattern removed] -Result: New delta created with division=None +Curated: division=None +Delta: scraped_title="Modified", division=None ``` ### Case 4: Multiple Pattern Effects @@ -48,22 +76,43 @@ Result: New delta created with division=None - Delta has changes from multiple patterns - One pattern is removed ``` -Curated: division=GENERAL, doc_type=DOCUMENTATION Delta: division=BIOLOGY, doc_type=DATA (from two patterns) -[Division pattern removed] -Result: Delta remains with division=GENERAL, doc_type=DATA preserved +Pattern: division=BIOLOGY +Pattern: doc_type=DATA +``` +`[Pattern: division=BIOLOGY], deleted` +``` +Delta: division=None, doc_type=DATA +Pattern: doc_type=DATA ``` -### Case 5: Pattern Removal with Manual Changes -**Scenario:** -- Delta has both pattern effect and manual changes -- Pattern is removed +### Case 5: Overlapping Patterns, Specific Deleted ``` -Curated: division=GENERAL, title="Original" -Delta: division=BIOLOGY, title="Modified" (pattern + manual) -[Pattern removed] -Result: Delta remains with division=GENERAL, title="Modified" preserved +Curated: division=ASTROPHYSICS (because of specific pattern) +Specific Pattern: division=ASTROPHYSICS +General Pattern: division=BIOLOGY ``` +`[Specific Pattern: division=ASTROPHYSICS], deleted` + +``` +Curated: division=BIOLOGY (because of general pattern) +General Pattern: division=BIOLOGY +``` + + +### Case 6: Overlapping Patterns, General Deleted +``` +Curated: division=ASTROPHYSICS (because of specific pattern) +Specific Pattern: division=ASTROPHYSICS +General Pattern: division=BIOLOGY +``` +`[General Pattern: division=BIOLOGY], deleted` + +``` +Curated: division=ASTROPHYSICS (because of specific pattern) +Specific Pattern: division=ASTROPHYSICS +``` + ## Implementation Steps From 6a52eafa95987fb12273856f1d607ba2c52a8048 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:00:56 -0600 Subject: [PATCH 2/7] update promotion code to treat empty stings and null values as meaningful --- sde_collections/models/collection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index ddc5b241..937134dc 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -211,18 +211,18 @@ def promote_to_curated(self): continue delta_value = getattr(delta, field_name) - if delta_value not in [None, ""] and getattr(curated, field_name) != delta_value: + if getattr(curated, field_name) != delta_value: updated_fields[field_name] = delta_value if updated_fields: CuratedUrl.objects.filter(pk=curated.pk).update(**updated_fields) else: - # If no matching CuratedUrl, create a new one using all non-null and non-empty fields + # Previously, we excluded fields with values of None and "" + # however, such null values are considered meaningful and should be copied over new_data = { field.name: getattr(delta, field.name) for field in delta._meta.fields - if field.name not in ["to_delete", "collection", "id"] - and getattr(delta, field.name) not in [None, ""] + if field.name not in ["to_delete", "collection", "id"] and getattr(delta, field.name) } CuratedUrl.objects.create(collection=self, **new_data) From ec471b014507ac7ee78c77406ceaa0d81e131c5b Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:01:12 -0600 Subject: [PATCH 3/7] correct examples 5 and 6 in unapply logic readme --- sde_collections/models/README_UNAPPLY_LOGIC.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sde_collections/models/README_UNAPPLY_LOGIC.md b/sde_collections/models/README_UNAPPLY_LOGIC.md index 536a2281..1000f5f8 100644 --- a/sde_collections/models/README_UNAPPLY_LOGIC.md +++ b/sde_collections/models/README_UNAPPLY_LOGIC.md @@ -88,28 +88,28 @@ Pattern: doc_type=DATA ### Case 5: Overlapping Patterns, Specific Deleted ``` -Curated: division=ASTROPHYSICS (because of specific pattern) +Delta: division=ASTROPHYSICS (because of specific pattern) Specific Pattern: division=ASTROPHYSICS General Pattern: division=BIOLOGY ``` `[Specific Pattern: division=ASTROPHYSICS], deleted` ``` -Curated: division=BIOLOGY (because of general pattern) +Delta: division=BIOLOGY (because of general pattern) General Pattern: division=BIOLOGY ``` ### Case 6: Overlapping Patterns, General Deleted ``` -Curated: division=ASTROPHYSICS (because of specific pattern) +Delta: division=ASTROPHYSICS (because of specific pattern) Specific Pattern: division=ASTROPHYSICS General Pattern: division=BIOLOGY ``` `[General Pattern: division=BIOLOGY], deleted` ``` -Curated: division=ASTROPHYSICS (because of specific pattern) +Delta: division=ASTROPHYSICS (because of specific pattern) Specific Pattern: division=ASTROPHYSICS ``` From 393402c0f53a67ff8a33d78f4638f62d5b3a9e27 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:18:48 -0600 Subject: [PATCH 4/7] update Field modifier unapply to handle pattern overlaps --- sde_collections/models/delta_patterns.py | 55 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py index dade950a..aa00f41c 100644 --- a/sde_collections/models/delta_patterns.py +++ b/sde_collections/models/delta_patterns.py @@ -333,16 +333,28 @@ def unapply(self) -> None: affected_deltas = self.delta_urls.all() affected_curated = self.curated_urls.all() + # Get all other patterns of same type for this collection + pattern_class = self.__class__ + other_patterns = pattern_class.objects.filter(collection=self.collection).exclude(id=self.id) + # Process each affected delta URL for delta in affected_deltas: curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() - if not curated: - # Scenario 1: Delta only - new URL - setattr(delta, field, None) + # Find next most specific matching pattern if any + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)] + + next_pattern = None + if matching_patterns: + # Sort by number of URLs matched (ascending) to find most specific + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + if next_pattern: + # Apply next most specific pattern's value + setattr(delta, field, next_pattern.get_new_value()) delta.save() - else: - # Scenario 2: Both exist + elif curated: + # No other patterns match, revert to curated value setattr(delta, field, getattr(curated, field)) delta.save() @@ -354,17 +366,36 @@ def unapply(self) -> None: ) if fields_match: delta.delete() + else: + # No curated URL or other patterns, set to None + setattr(delta, field, None) + delta.save() # Handle curated URLs that don't have deltas for curated in affected_curated: if not DeltaUrl.objects.filter(url=curated.url).exists(): - # Scenario 3: Curated only - # Copy all fields from curated except the one we're nulling - fields = { - f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", "collection"] - } - fields[field] = None # Set the pattern's field to None - delta = DeltaUrl.objects.create(collection=self.collection, **fields) + # Find any matching patterns + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)] + + if matching_patterns: + # Apply most specific pattern's value + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = next_pattern.get_new_value() + DeltaUrl.objects.create(collection=self.collection, **fields) + else: + # No other patterns, create delta with None + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = None + DeltaUrl.objects.create(collection=self.collection, **fields) # Clear pattern relationships self.delta_urls.clear() From 48e0ac986046a5f14e4696061263c2d0b790e57e Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:20:12 -0600 Subject: [PATCH 5/7] add dedicated test suite for field modifier unapply --- .../tests/test_field_modifier_unapply.py | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 sde_collections/tests/test_field_modifier_unapply.py diff --git a/sde_collections/tests/test_field_modifier_unapply.py b/sde_collections/tests/test_field_modifier_unapply.py new file mode 100644 index 00000000..9f9f5b01 --- /dev/null +++ b/sde_collections/tests/test_field_modifier_unapply.py @@ -0,0 +1,252 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_field_modifier_unapply.py + +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl + +from .factories import CollectionFactory, DumpUrlFactory + + +class TestDeltaPatternUnapplyLogic(TestCase): + """Test complete lifecycle of pattern application and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_dump_to_delta_migration_with_pattern_lifecycle(self): + """ + Test complete lifecycle: + 1. Create dump URLs + 2. Migrate to delta URLs + 3. Apply patterns + 4. Promote to curated + 5. Delete pattern + 6. Verify deltas are created + 7. Promote to curated + 8. Verify curated URLs have division set to None + """ + # Create initial dump URLs + [ + DumpUrlFactory( + collection=self.collection, + url=f"https://example.com/science/data{i}.html", + ) + for i in range(3) + ] + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Verify dump URLs were migrated to delta URLs + self.assertEqual(DeltaUrl.objects.count(), 3) + self.assertEqual(DumpUrl.objects.count(), 0) + + # Apply division pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/science/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Verify pattern was applied to existing deltas + for delta_url in DeltaUrl.objects.all(): + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Promote to curated + self.collection.promote_to_curated() + + # Verify promotion + self.assertEqual(CuratedUrl.objects.count(), 3) + self.assertEqual(DeltaUrl.objects.count(), 0) + for curated_url in CuratedUrl.objects.all(): + self.assertEqual(curated_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Should have created new deltas for all URLs setting division to None + self.assertEqual(DeltaUrl.objects.count(), 3) + for delta_url in DeltaUrl.objects.all(): + self.assertIsNone(delta_url.division) + + # Promote to curated + self.collection.promote_to_curated() + + # Should updated all Curated setting division to None + self.assertEqual(CuratedUrl.objects.count(), 3) + for delta_url in CuratedUrl.objects.all(): + self.assertIsNone(delta_url.division) + + # Test for README_UNNAPLY_LOGIC.md Case 1: Delta Only (New URL) + def test_pattern_removal_with_delta_only(self): + """Test pattern removal when delta exists without corresponding curated URL.""" + # Create initial delta URL (simulating a new URL) + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/new.html") + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.BIOLOGY + ) + + # Verify pattern was applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Verify delta still exists but with division set to None + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(DeltaUrl.objects.count(), 1) + + # Test for README_UNNAPLY_LOGIC.md Case 2: Delta Created to Apply Pattern + def test_pattern_removal_with_simple_delta(self): + """Test pattern removal when delta was created just to apply pattern.""" + # Create initial curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, url="https://example.com/doc.html", division=None + ) + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + # Verify delta was created with pattern's value + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Remove pattern + pattern.delete() + + # Verify delta was deleted since it would match curated + self.assertEqual(DeltaUrl.objects.filter(url=curated_url.url).count(), 0) + + # Test for README_UNNAPLY_LOGIC.md Case 3: Pre-existing Delta + def test_pattern_removal_preserves_other_changes(self): + """Test pattern removal when delta has other changes that should be preserved.""" + # Create curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, + url="https://example.com/doc.html", + division=None, + scraped_title="Original Title", + ) + + # Create delta with modified title + delta_url = DeltaUrl.objects.create( + collection=self.collection, url=curated_url.url, division=None, scraped_title="Modified Title" + ) + + # Create and apply pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + # Verify pattern was applied while preserving title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Remove pattern + pattern.delete() + + # Verify delta still exists with original changes but pattern effect removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Test for README_UNNAPLY_LOGIC.md Case 4: Multiple Pattern Effects + def test_pattern_removal_with_multiple_patterns(self): + """Test removal of one pattern when URL is affected by multiple patterns.""" + # Create curated URL + curated_url = CuratedUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create two patterns affecting the same URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Verify both patterns were applied + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + self.assertEqual(delta_url.document_type, DocumentTypes.DATA) + + # Remove division pattern + division_pattern.delete() + + # Verify delta still exists with doc type but division removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertIsNone(delta_url.division) + self.assertEqual(delta_url.document_type, DocumentTypes.DATA) + + # Test for Case 5: Overlapping Patterns, Specific Deleted + def test_specific_pattern_removal_with_overlapping_patterns(self): + """Test removal of specific pattern when more general pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Create specific pattern + specific_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.ASTROPHYSICS + ) + + # Verify specific pattern took precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern now applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.BIOLOGY) + + # Test for Case 6: Overlapping Patterns, General Deleted + def test_general_pattern_removal_with_overlapping_patterns(self): + """Test removal of general pattern when more specific pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + general_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.BIOLOGY, + ) + + # Create specific pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, division=Divisions.ASTROPHYSICS + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) + + # Remove general pattern + general_pattern.delete() + + # Verify specific pattern still applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.division, Divisions.ASTROPHYSICS) From cf86eee8b0026d702ec5dce590366d15c6f0b840 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:30:31 -0600 Subject: [PATCH 6/7] add tests for title pattern unapply --- .../tests/test_title_pattern_unapply.py | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 sde_collections/tests/test_title_pattern_unapply.py diff --git a/sde_collections/tests/test_title_pattern_unapply.py b/sde_collections/tests/test_title_pattern_unapply.py new file mode 100644 index 00000000..db8ed7e5 --- /dev/null +++ b/sde_collections/tests/test_title_pattern_unapply.py @@ -0,0 +1,281 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_pattern_unapply.py + +from django.test import TestCase + +from sde_collections.models.delta_patterns import ( + DeltaResolvedTitle, + DeltaResolvedTitleError, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import CollectionFactory, DumpUrlFactory + + +class TestTitlePatternUnapplyLogic(TestCase): + """Test complete lifecycle of title pattern application and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_dump_to_delta_migration_with_pattern_lifecycle(self): + """ + Test complete lifecycle: + 1. Create dump URLs + 2. Migrate to delta URLs + 3. Apply title pattern + 4. Promote to curated + 5. Delete pattern + 6. Verify deltas are created + 7. Promote to curated + 8. Verify curated URLs have empty generated titles + """ + # Create initial dump URLs + [ + DumpUrlFactory( + collection=self.collection, + url=f"https://example.com/science/data{i}.html", + ) + for i in range(3) + ] + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Apply title pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/science/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="Science Document {url}", + ) + + # Verify pattern was applied to all deltas and resolution tracked + for delta_url in DeltaUrl.objects.all(): + self.assertTrue(delta_url.generated_title.startswith("Science Document")) + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Promote to curated + self.collection.promote_to_curated() + + # Verify promotion + self.assertEqual(CuratedUrl.objects.count(), 3) + self.assertEqual(DeltaUrl.objects.count(), 0) + for curated_url in CuratedUrl.objects.all(): + self.assertTrue(curated_url.generated_title.startswith("Science Document")) + + # Remove pattern + pattern.delete() + + # Verify new deltas created with empty titles + self.assertEqual(DeltaUrl.objects.count(), 3) + for delta_url in DeltaUrl.objects.all(): + self.assertEqual(delta_url.generated_title, "") + + # Verify resolution tracking cleared + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + self.assertEqual(DeltaResolvedTitleError.objects.count(), 0) + + def test_pattern_removal_with_delta_only(self): + """Test pattern removal when delta exists without corresponding curated URL.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/new.html") + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="New Document {url}" + ) + + # Verify pattern was applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("New Document")) + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify delta still exists but with empty title + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.generated_title, "") + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + + def test_pattern_removal_with_simple_delta(self): + """Test pattern removal when delta was created just to apply pattern.""" + # Create initial curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, url="https://example.com/doc.html", generated_title="" + ) + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, title_pattern="Documentation {url}" + ) + + # Verify delta was created with pattern's title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertTrue(delta_url.generated_title.startswith("Documentation")) + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify delta was deleted since it would match curated + self.assertEqual(DeltaUrl.objects.filter(url=curated_url.url).count(), 0) + self.assertEqual(DeltaResolvedTitle.objects.count(), 0) + + def test_pattern_removal_preserves_other_changes(self): + """Test pattern removal when delta has other changes that should be preserved.""" + # Create curated URL + curated_url = CuratedUrl.objects.create( + collection=self.collection, + url="https://example.com/doc.html", + generated_title="", + scraped_title="Original Title", + ) + + # Create delta with modified title + delta_url = DeltaUrl.objects.create( + collection=self.collection, url=curated_url.url, generated_title="", scraped_title="Modified Title" + ) + + # Create and apply pattern + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, title_pattern="API Doc {url}" + ) + + # Verify pattern was applied while preserving scraped title + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Doc")) + self.assertEqual(delta_url.scraped_title, "Modified Title") + + # Remove pattern + pattern.delete() + + # Verify delta still exists with original changes but pattern effect removed + delta_url = DeltaUrl.objects.get(url=curated_url.url) + self.assertEqual(delta_url.generated_title, "") + self.assertEqual(delta_url.scraped_title, "Modified Title") + + def test_pattern_removal_with_multiple_patterns(self): + """Test removal of one pattern when URL is affected by multiple patterns.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="Specific Title {url}" + ) + + # Create another pattern for the same URL + generic_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="Generic Title {url}", + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("Specific Title")) + + # Verify resolution tracking + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern is now applied + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("Generic Title")) + + # Verify resolution tracking updated + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=generic_pattern).exists()) + + def test_specific_pattern_removal_with_overlapping_patterns(self): + """Test removal of specific pattern when more general pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="General Document {url}", + ) + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="API Spec {url}" + ) + + # Verify specific pattern took precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Remove specific pattern + specific_pattern.delete() + + # Verify general pattern now applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("General Document")) + + def test_general_pattern_removal_with_overlapping_patterns(self): + """Test removal of general pattern when more specific pattern exists.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/docs/api/v2/spec.html") + + # Create general pattern + general_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/*.html", + match_pattern_type=DeltaTitlePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + title_pattern="General Document {url}", + ) + + # Create specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=self.collection, match_pattern=delta_url.url, title_pattern="API Spec {url}" + ) + + # Verify specific pattern takes precedence + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Verify correct resolution tracking + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + # Remove general pattern + general_pattern.delete() + + # Verify specific pattern still applies + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertTrue(delta_url.generated_title.startswith("API Spec")) + + # Verify resolution tracking unchanged + self.assertTrue(DeltaResolvedTitle.objects.filter(delta_url=delta_url, title_pattern=specific_pattern).exists()) + + def test_pattern_removal_with_title_error(self): + """Test handling of title resolution errors during pattern removal.""" + # Create initial delta URL + delta_url = DeltaUrl.objects.create(collection=self.collection, url="https://example.com/doc.html") + + # Create pattern that will cause error (invalid template) + pattern = DeltaTitlePattern.objects.create( + collection=self.collection, + match_pattern=delta_url.url, + title_pattern="{invalid}", # This should cause an error + ) + + # Verify error was recorded + self.assertTrue(DeltaResolvedTitleError.objects.filter(delta_url=delta_url, title_pattern=pattern).exists()) + + # Remove pattern + pattern.delete() + + # Verify error tracking cleared + self.assertEqual(DeltaResolvedTitleError.objects.count(), 0) + + # Verify delta has empty title + delta_url = DeltaUrl.objects.get(url=delta_url.url) + self.assertEqual(delta_url.generated_title, "") From 85a65d094e53652e690cc3035aabc87c6476f2d8 Mon Sep 17 00:00:00 2001 From: Carson Davis Date: Fri, 13 Dec 2024 13:33:34 -0600 Subject: [PATCH 7/7] update unapply logic for title patterns to include overlapping patterns --- sde_collections/models/delta_patterns.py | 88 +++++++++++++++++++----- 1 file changed, 69 insertions(+), 19 deletions(-) diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py index aa00f41c..61c8e9ea 100644 --- a/sde_collections/models/delta_patterns.py +++ b/sde_collections/models/delta_patterns.py @@ -554,11 +554,12 @@ def apply(self) -> None: def unapply(self) -> None: """ - Remove title modifications: - 1. Create Delta URLs for affected Curated URLs to explicitly clear titles - 2. Remove generated titles from affected Delta URLs - 3. Clean up Delta URLs that become identical to their Curated URL - 4. Clear resolution tracking + Remove title modifications, maintaining pattern precedence: + 1. Find any remaining patterns that match each URL + 2. Apply most specific matching pattern's title if one exists + 3. Otherwise revert to curated title or clear title + 4. Update title resolution tracking + 5. Clean up redundant deltas """ DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") @@ -569,16 +570,36 @@ def unapply(self) -> None: affected_deltas = self.delta_urls.all() affected_curated = self.curated_urls.all() + # Get all other title patterns for this collection + other_patterns = DeltaTitlePattern.objects.filter(collection=self.collection).exclude(id=self.id) + # Process each affected delta URL for delta in affected_deltas: curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() - if not curated: - # Scenario 1: Delta only - clear generated title - delta.generated_title = "" - delta.save() - else: - # Scenario 2: Both exist - revert to curated title + # Find next most specific matching pattern if any + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)] + + next_pattern = None + if matching_patterns: + # Sort by number of URLs matched (ascending) to find most specific + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + if next_pattern: + # Apply next most specific pattern's title + new_title, error = next_pattern.generate_title_for_url(delta) + if error: + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta, defaults={"title_pattern": next_pattern, "error_string": error} + ) + else: + delta.generated_title = new_title + delta.save() + DeltaResolvedTitle.objects.update_or_create( + delta_url=delta, defaults={"title_pattern": next_pattern, "resolved_title": new_title} + ) + elif curated: + # No other patterns match, revert to curated title delta.generated_title = curated.generated_title delta.save() @@ -590,18 +611,47 @@ def unapply(self) -> None: ) if fields_match: delta.delete() + else: + # No curated URL or other patterns, clear title + delta.generated_title = "" + delta.save() # Handle curated URLs that don't have deltas for curated in affected_curated: if not DeltaUrl.objects.filter(url=curated.url).exists(): - # Scenario 3: Curated only - create delta with cleared title - fields = { - f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", "collection"] - } - fields["generated_title"] = "" - DeltaUrl.objects.create(collection=self.collection, **fields) - - # Clear resolution tracking + # Find any matching patterns + matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)] + + if matching_patterns: + # Apply most specific pattern's title + next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count()) + + # Copy all fields from curated + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + + # Generate and apply new title + new_title, error = next_pattern.generate_title_for_url(curated) + if not error: + fields["generated_title"] = new_title + delta = DeltaUrl.objects.create(collection=self.collection, **fields) + DeltaResolvedTitle.objects.create( + title_pattern=next_pattern, delta_url=delta, resolved_title=new_title + ) + else: + # No other patterns, create delta with cleared title + fields = { + f.name: getattr(curated, f.name) + for f in curated._meta.fields + if f.name not in ["id", "collection"] + } + fields["generated_title"] = "" + DeltaUrl.objects.create(collection=self.collection, **fields) + + # Clear resolution tracking for this pattern DeltaResolvedTitle.objects.filter(title_pattern=self).delete() DeltaResolvedTitleError.objects.filter(title_pattern=self).delete()