diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py index c0c3123b..dade950a 100644 --- a/sde_collections/models/delta_patterns.py +++ b/sde_collections/models/delta_patterns.py @@ -6,10 +6,10 @@ from django.db import models from ..utils.title_resolver import ( - is_valid_fstring, is_valid_xpath, parse_title, resolve_title, + validate_fstring, ) from .collection_choice_fields import Divisions, DocumentTypes @@ -50,11 +50,19 @@ def get_url_match_count(self): def is_most_distinctive_pattern(self, url) -> bool: """ - Determine if this pattern should apply to a URL by checking if it matches - the smallest number of URLs among all patterns that match this URL. + Determine if this pattern should apply to a URL by checking: + 1. First checks if this pattern matches this URL + 2. If it matches the smallest number of URLs among all patterns that match this URL + 3. If tied for smallest number of matches, uses the longest pattern string Returns True if this pattern should be applied. """ + # First check if this pattern matches the URL + regex_pattern = self.get_regex_pattern() + if not re.search(regex_pattern, url.url): + return False + my_match_count = self.get_url_match_count() + my_pattern_length = len(self.match_pattern) # Get patterns from same type that affect this URL pattern_class = self.__class__ @@ -63,12 +71,19 @@ def is_most_distinctive_pattern(self, url) -> bool: .filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url)) .exclude(id=self.id) .distinct() - ) # TODO: does this have a distinct urls, or distinct model objects. + ) - # If any matching pattern has a smaller URL set, don't apply + # Use M2M relationships for checking other patterns since those are already established for pattern in matching_patterns: - if pattern.get_url_match_count() < my_match_count: + other_match_count = pattern.get_url_match_count() + if other_match_count < my_match_count: + # Other pattern matches fewer URLs - definitely not most distinctive return False + if other_match_count == my_match_count: + # Same match count - check pattern length + if len(pattern.match_pattern) > my_pattern_length: + # Other pattern is longer - not most distinctive + return False return True @@ -398,7 +413,7 @@ def validate_title_pattern(title_pattern_string: str) -> None: raise ValidationError(f"Invalid xpath: {element_value}") elif element_type == "brace": try: - is_valid_fstring(element_value) + validate_fstring(element_value) except ValueError as e: raise ValidationError(str(e)) @@ -454,8 +469,9 @@ def apply(self) -> None: new_title, error = self.generate_title_for_url(curated_url) if error: - # Log error and continue to next URL - DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=curated_url, error_string=error) + DeltaResolvedTitleError.objects.update_or_create( + delta_url=curated_url, defaults={"title_pattern": self, "error_string": error} # lookup field + ) continue # Skip if the generated title matches existing or if Delta already exists @@ -488,7 +504,9 @@ def apply(self) -> None: new_title, error = self.generate_title_for_url(delta_url) if error: - DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=delta_url, error_string=error) + DeltaResolvedTitleError.objects.update_or_create( + delta_url=delta_url, defaults={"title_pattern": self, "error_string": error} # lookup field + ) continue # Update title and record resolution - key change here diff --git a/sde_collections/models/pattern.py b/sde_collections/models/pattern.py index 1e14042b..774b988e 100644 --- a/sde_collections/models/pattern.py +++ b/sde_collections/models/pattern.py @@ -5,10 +5,10 @@ from django.db import models from ..utils.title_resolver import ( - is_valid_fstring, is_valid_xpath, parse_title, resolve_title, + validate_fstring, ) from .collection_choice_fields import Divisions, DocumentTypes @@ -146,7 +146,7 @@ def validate_title_pattern(title_pattern_string): raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231 elif element_type == "brace": try: - is_valid_fstring(element_value) + validate_fstring(element_value) except ValueError as e: raise ValidationError(str(e)) diff --git a/sde_collections/tests/test_delta_patterns.py b/sde_collections/tests/test_delta_patterns.py index b72981fc..a7941fbd 100644 --- a/sde_collections/tests/test_delta_patterns.py +++ b/sde_collections/tests/test_delta_patterns.py @@ -266,3 +266,48 @@ def test_pattern_reapplication_does_not_duplicate_delta_urls(self): # Ensure no new `DeltaUrl` is created after reapplying the pattern pattern.apply() assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0 + + @pytest.mark.django_db + def test_title_pattern_error_updates(self): + """ + Test that when a more specific pattern creates an error, + it updates rather than duplicates the error record. + """ + # Create a collection and URL + collection = CollectionFactory() + url = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item.html", scraped_title="Original Title" + ) + + # Create a general pattern first + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", + title_pattern="{invalid}", # Invalid variable name will cause error + match_pattern_type=2, + ) + + # Verify initial error state + error = url.deltaresolvedtitleerror + assert error.title_pattern == general_pattern + assert "Variable 'invalid' not allowed in f-string pattern" in error.error_string + + # Create a more specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific*", + title_pattern="{another_invalid}", + match_pattern_type=2, + ) + + # Re-fetch error to see latest state + error.refresh_from_db() + + # Error should now be from specific pattern + assert ( + error.title_pattern == specific_pattern + ), f"Error still associated with {error.title_pattern} instead of {specific_pattern}" + assert "Variable 'another_invalid' not allowed in f-string pattern" in error.error_string + + # Verify we still only have one error record + assert DeltaResolvedTitleError.objects.filter(delta_url=url).count() == 1 diff --git a/sde_collections/tests/test_pattern_specificity.py b/sde_collections/tests/test_pattern_specificity.py index 98c7f006..7eafe978 100644 --- a/sde_collections/tests/test_pattern_specificity.py +++ b/sde_collections/tests/test_pattern_specificity.py @@ -156,3 +156,65 @@ def test_field_modifying_pattern_layered_specificity(): assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) + + +@pytest.mark.django_db +def test_pattern_specificity_tiebreaker(): + """Test that when patterns match the same number of URLs, longer patterns are considered more specific.""" + collection = CollectionFactory() + + # Create URLs that would result in same match count for different patterns + url1 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item1.html", scraped_title="Title 1" + ) + url2 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/specific/item2.html", scraped_title="Title 2" + ) + + # Create patterns with same match count but different lengths + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", # Shorter pattern + title_pattern="{title}", + match_pattern_type=2, + ) + + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific*", # Longer pattern + title_pattern="{title} - Specific", + match_pattern_type=2, + ) + + # Both patterns will match both URLs (same match count) + assert general_pattern.get_url_match_count() == 2 + assert specific_pattern.get_url_match_count() == 2 + + # But the longer pattern should be considered more specific + assert general_pattern.is_most_distinctive_pattern(url1) is False + assert specific_pattern.is_most_distinctive_pattern(url1) is True + + # Check that this applies to both URLs + assert general_pattern.is_most_distinctive_pattern(url2) is False + assert specific_pattern.is_most_distinctive_pattern(url2) is True + + # Create an even more specific pattern + very_specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs/specific/item1*", # Even longer pattern + title_pattern="{title} - Very Specific", + match_pattern_type=2, + ) + + # It matches fewer URLs + assert very_specific_pattern.get_url_match_count() == 1 + + # For URL1, the very specific pattern should win due to fewer matches + assert general_pattern.is_most_distinctive_pattern(url1) is False + assert specific_pattern.is_most_distinctive_pattern(url1) is False + assert very_specific_pattern.is_most_distinctive_pattern(url1) is True + + # For URL2, the middle pattern should still win since very_specific doesn't match + assert general_pattern.is_most_distinctive_pattern(url2) is False + assert specific_pattern.is_most_distinctive_pattern(url2) is True + assert very_specific_pattern.is_most_distinctive_pattern(url2) is False diff --git a/sde_collections/tests/test_title_resolution.py b/sde_collections/tests/test_title_resolution.py new file mode 100644 index 00000000..88c15aef --- /dev/null +++ b/sde_collections/tests/test_title_resolution.py @@ -0,0 +1,133 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_resolution.py + +from unittest.mock import Mock, patch + +import pytest + +from ..utils.title_resolver import ( + clean_text, + is_valid_xpath, + parse_title, + resolve_brace, + resolve_title, + resolve_xpath, + validate_fstring, +) + + +def test_parse_title(): + # Test basic string + assert parse_title("Simple Title") == [("str", "Simple Title")] + + # Test f-string + assert parse_title("Hello {title}") == [("str", "Hello "), ("brace", "{title}")] + + # Test xpath + assert parse_title("xpath://h1") == [("xpath", "//h1")] + + # Test complex pattern + result = parse_title("xpath://h1 | {title} - {collection}") + assert result == [ + ("xpath", "//h1"), + ("str", " | "), + ("brace", "{title}"), + ("str", " - "), + ("brace", "{collection}"), + ] + + +def test_is_valid_xpath(): + assert is_valid_xpath("//h1") is True + assert is_valid_xpath("//div[@class='title']") is True + assert is_valid_xpath("invalid xpath") is False + assert is_valid_xpath("//h1[") is False + + +def test_validate_fstring(): + # Valid cases - should not raise + validate_fstring("{title}") + validate_fstring("{url}") + validate_fstring("{collection}") + + # Invalid cases + with pytest.raises(ValueError): + validate_fstring("{invalid_var}") + with pytest.raises(ValueError): + validate_fstring("{title.upper()}") + with pytest.raises(ValueError): + validate_fstring("{len(title)}") + + +def test_resolve_brace(): + context = {"title": "Test Title", "url": "https://example.com", "collection": "Test Collection"} + + assert resolve_brace("{title}", context) == "Test Title" + assert resolve_brace("{title} - {collection}", context) == "Test Title - Test Collection" + + with pytest.raises(ValueError): + resolve_brace("{invalid}", context) + + +def test_clean_text(): + # Test whitespace handling + assert clean_text(" Title \n With\tSpaces ") == "Title With Spaces" + + # Test HTML entities + assert clean_text("Title & More") == "Title & More" + + # Test unicode normalization + assert clean_text("Café") == "Cafe" + + +@patch("requests.get") +def test_resolve_xpath(mock_get): + mock_response = Mock() + mock_response.ok = True + mock_response.content = b""" + +
+