Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1115 improve title processing and tests #1118

Merged
merged 4 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions sde_collections/models/delta_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from django.db import models

from ..utils.title_resolver import (
is_valid_fstring,
is_valid_xpath,
parse_title,
resolve_title,
validate_fstring,
)
from .collection_choice_fields import Divisions, DocumentTypes

Expand Down Expand Up @@ -50,11 +50,19 @@ def get_url_match_count(self):

def is_most_distinctive_pattern(self, url) -> bool:
"""
Determine if this pattern should apply to a URL by checking if it matches
the smallest number of URLs among all patterns that match this URL.
Determine if this pattern should apply to a URL by checking:
1. First checks if this pattern matches this URL
2. If it matches the smallest number of URLs among all patterns that match this URL
3. If tied for smallest number of matches, uses the longest pattern string
Returns True if this pattern should be applied.
"""
# First check if this pattern matches the URL
regex_pattern = self.get_regex_pattern()
if not re.search(regex_pattern, url.url):
return False

my_match_count = self.get_url_match_count()
my_pattern_length = len(self.match_pattern)

# Get patterns from same type that affect this URL
pattern_class = self.__class__
Expand All @@ -63,12 +71,19 @@ def is_most_distinctive_pattern(self, url) -> bool:
.filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url))
.exclude(id=self.id)
.distinct()
) # TODO: does this have a distinct urls, or distinct model objects.
)

# If any matching pattern has a smaller URL set, don't apply
# Use M2M relationships for checking other patterns since those are already established
for pattern in matching_patterns:
if pattern.get_url_match_count() < my_match_count:
other_match_count = pattern.get_url_match_count()
if other_match_count < my_match_count:
# Other pattern matches fewer URLs - definitely not most distinctive
return False
if other_match_count == my_match_count:
# Same match count - check pattern length
if len(pattern.match_pattern) > my_pattern_length:
# Other pattern is longer - not most distinctive
return False

return True

Expand Down Expand Up @@ -398,7 +413,7 @@ def validate_title_pattern(title_pattern_string: str) -> None:
raise ValidationError(f"Invalid xpath: {element_value}")
elif element_type == "brace":
try:
is_valid_fstring(element_value)
validate_fstring(element_value)
except ValueError as e:
raise ValidationError(str(e))

Expand Down Expand Up @@ -454,8 +469,9 @@ def apply(self) -> None:
new_title, error = self.generate_title_for_url(curated_url)

if error:
# Log error and continue to next URL
DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=curated_url, error_string=error)
DeltaResolvedTitleError.objects.update_or_create(
delta_url=curated_url, defaults={"title_pattern": self, "error_string": error} # lookup field
)
continue

# Skip if the generated title matches existing or if Delta already exists
Expand Down Expand Up @@ -488,7 +504,9 @@ def apply(self) -> None:
new_title, error = self.generate_title_for_url(delta_url)

if error:
DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=delta_url, error_string=error)
DeltaResolvedTitleError.objects.update_or_create(
delta_url=delta_url, defaults={"title_pattern": self, "error_string": error} # lookup field
)
continue

# Update title and record resolution - key change here
Expand Down
4 changes: 2 additions & 2 deletions sde_collections/models/pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from django.db import models

from ..utils.title_resolver import (
is_valid_fstring,
is_valid_xpath,
parse_title,
resolve_title,
validate_fstring,
)
from .collection_choice_fields import Divisions, DocumentTypes

Expand Down Expand Up @@ -146,7 +146,7 @@ def validate_title_pattern(title_pattern_string):
raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231
elif element_type == "brace":
try:
is_valid_fstring(element_value)
validate_fstring(element_value)
except ValueError as e:
raise ValidationError(str(e))

Expand Down
45 changes: 45 additions & 0 deletions sde_collections/tests/test_delta_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,48 @@ def test_pattern_reapplication_does_not_duplicate_delta_urls(self):
# Ensure no new `DeltaUrl` is created after reapplying the pattern
pattern.apply()
assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0

@pytest.mark.django_db
def test_title_pattern_error_updates(self):
"""
Test that when a more specific pattern creates an error,
it updates rather than duplicates the error record.
"""
# Create a collection and URL
collection = CollectionFactory()
url = DeltaUrlFactory(
collection=collection, url="https://example.com/docs/specific/item.html", scraped_title="Original Title"
)

# Create a general pattern first
general_pattern = DeltaTitlePattern.objects.create(
collection=collection,
match_pattern="*docs*",
title_pattern="{invalid}", # Invalid variable name will cause error
match_pattern_type=2,
)

# Verify initial error state
error = url.deltaresolvedtitleerror
assert error.title_pattern == general_pattern
assert "Variable 'invalid' not allowed in f-string pattern" in error.error_string

# Create a more specific pattern
specific_pattern = DeltaTitlePattern.objects.create(
collection=collection,
match_pattern="*docs/specific*",
title_pattern="{another_invalid}",
match_pattern_type=2,
)

# Re-fetch error to see latest state
error.refresh_from_db()

# Error should now be from specific pattern
assert (
error.title_pattern == specific_pattern
), f"Error still associated with {error.title_pattern} instead of {specific_pattern}"
assert "Variable 'another_invalid' not allowed in f-string pattern" in error.error_string

# Verify we still only have one error record
assert DeltaResolvedTitleError.objects.filter(delta_url=url).count() == 1
62 changes: 62 additions & 0 deletions sde_collections/tests/test_pattern_specificity.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,65 @@ def test_field_modifying_pattern_layered_specificity():
assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)

assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True)


@pytest.mark.django_db
def test_pattern_specificity_tiebreaker():
"""Test that when patterns match the same number of URLs, longer patterns are considered more specific."""
collection = CollectionFactory()

# Create URLs that would result in same match count for different patterns
url1 = DeltaUrlFactory(
collection=collection, url="https://example.com/docs/specific/item1.html", scraped_title="Title 1"
)
url2 = DeltaUrlFactory(
collection=collection, url="https://example.com/docs/specific/item2.html", scraped_title="Title 2"
)

# Create patterns with same match count but different lengths
general_pattern = DeltaTitlePattern.objects.create(
collection=collection,
match_pattern="*docs*", # Shorter pattern
title_pattern="{title}",
match_pattern_type=2,
)

specific_pattern = DeltaTitlePattern.objects.create(
collection=collection,
match_pattern="*docs/specific*", # Longer pattern
title_pattern="{title} - Specific",
match_pattern_type=2,
)

# Both patterns will match both URLs (same match count)
assert general_pattern.get_url_match_count() == 2
assert specific_pattern.get_url_match_count() == 2

# But the longer pattern should be considered more specific
assert general_pattern.is_most_distinctive_pattern(url1) is False
assert specific_pattern.is_most_distinctive_pattern(url1) is True

# Check that this applies to both URLs
assert general_pattern.is_most_distinctive_pattern(url2) is False
assert specific_pattern.is_most_distinctive_pattern(url2) is True

# Create an even more specific pattern
very_specific_pattern = DeltaTitlePattern.objects.create(
collection=collection,
match_pattern="*docs/specific/item1*", # Even longer pattern
title_pattern="{title} - Very Specific",
match_pattern_type=2,
)

# It matches fewer URLs
assert very_specific_pattern.get_url_match_count() == 1

# For URL1, the very specific pattern should win due to fewer matches
assert general_pattern.is_most_distinctive_pattern(url1) is False
assert specific_pattern.is_most_distinctive_pattern(url1) is False
assert very_specific_pattern.is_most_distinctive_pattern(url1) is True

# For URL2, the middle pattern should still win since very_specific doesn't match
assert general_pattern.is_most_distinctive_pattern(url2) is False
assert specific_pattern.is_most_distinctive_pattern(url2) is True
assert very_specific_pattern.is_most_distinctive_pattern(url2) is False
133 changes: 133 additions & 0 deletions sde_collections/tests/test_title_resolution.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_title_resolution.py

from unittest.mock import Mock, patch

import pytest

from ..utils.title_resolver import (
clean_text,
is_valid_xpath,
parse_title,
resolve_brace,
resolve_title,
resolve_xpath,
validate_fstring,
)


def test_parse_title():
# Test basic string
assert parse_title("Simple Title") == [("str", "Simple Title")]

# Test f-string
assert parse_title("Hello {title}") == [("str", "Hello "), ("brace", "{title}")]

# Test xpath
assert parse_title("xpath://h1") == [("xpath", "//h1")]

# Test complex pattern
result = parse_title("xpath://h1 | {title} - {collection}")
assert result == [
("xpath", "//h1"),
("str", " | "),
("brace", "{title}"),
("str", " - "),
("brace", "{collection}"),
]


def test_is_valid_xpath():
assert is_valid_xpath("//h1") is True
assert is_valid_xpath("//div[@class='title']") is True
assert is_valid_xpath("invalid xpath") is False
assert is_valid_xpath("//h1[") is False


def test_validate_fstring():
# Valid cases - should not raise
validate_fstring("{title}")
validate_fstring("{url}")
validate_fstring("{collection}")

# Invalid cases
with pytest.raises(ValueError):
validate_fstring("{invalid_var}")
with pytest.raises(ValueError):
validate_fstring("{title.upper()}")
with pytest.raises(ValueError):
validate_fstring("{len(title)}")


def test_resolve_brace():
context = {"title": "Test Title", "url": "https://example.com", "collection": "Test Collection"}

assert resolve_brace("{title}", context) == "Test Title"
assert resolve_brace("{title} - {collection}", context) == "Test Title - Test Collection"

with pytest.raises(ValueError):
resolve_brace("{invalid}", context)


def test_clean_text():
# Test whitespace handling
assert clean_text(" Title \n With\tSpaces ") == "Title With Spaces"

# Test HTML entities
assert clean_text("Title &amp; More") == "Title & More"

# Test unicode normalization
assert clean_text("Café") == "Cafe"


@patch("requests.get")
def test_resolve_xpath(mock_get):
mock_response = Mock()
mock_response.ok = True
mock_response.content = b"""
<html>
<body>
<h1>Test Title</h1>
<div class="content">Inner Content</div>
</body>
</html>
"""
mock_get.return_value = mock_response

# Test basic xpath
assert resolve_xpath("//h1", "https://example.com") == "Test Title"
assert resolve_xpath("//div[@class='content']", "https://example.com") == "Inner Content"

# Test error cases
mock_response.ok = False
with pytest.raises(ValueError):
resolve_xpath("//h1", "https://example.com")

mock_response.ok = True
with pytest.raises(ValueError):
resolve_xpath("//nonexistent", "https://example.com")


@patch("requests.get")
def test_resolve_title(mock_get):
mock_response = Mock()
mock_response.ok = True
mock_response.content = b"""
<html>
<body>
<h1>Dynamic Content</h1>
</body>
</html>
"""
mock_get.return_value = mock_response

context = {"title": "Original Title", "url": "https://example.com", "collection": "Test Collection"}

# Test combination of xpath and f-string
pattern = "xpath://h1 | {title} - {collection}"
assert resolve_title(pattern, context) == "Dynamic Content | Original Title - Test Collection"

# Test simple f-string
assert resolve_title("{title} ({collection})", context) == "Original Title (Test Collection)"

# Test plain string
assert resolve_title("Static Title", context) == "Static Title"
4 changes: 2 additions & 2 deletions sde_collections/utils/title_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def is_valid_xpath(xpath: str) -> bool:
return False


def is_valid_fstring(pattern: str) -> bool:
def validate_fstring(pattern: str) -> bool:
context = {
"url": "",
"title": "",
Expand Down Expand Up @@ -53,7 +53,7 @@ def resolve_brace(pattern: str, context: dict[str, Any]) -> str:
"""Safely interpolates the variables in an f-string pattern using the provided context."""
parsed = ast.parse(f"f'''{pattern}'''", mode="eval")

is_valid_fstring(pattern) # Refactor this
validate_fstring(pattern)

compiled = compile(parsed, "<string>", "eval")
return str(eval(compiled, {}, context))
Expand Down
Loading