diff --git a/sde_collections/admin.py b/sde_collections/admin.py index 8c9acf9b..02f4b11f 100644 --- a/sde_collections/admin.py +++ b/sde_collections/admin.py @@ -5,12 +5,13 @@ from sde_collections.models.delta_patterns import ( DeltaDivisionPattern, + DeltaResolvedTitle, DeltaTitlePattern, ) from .models.candidate_url import CandidateURL, ResolvedTitle from .models.collection import Collection, WorkflowHistory -from .models.delta_url import CuratedUrl, DeltaResolvedTitle, DeltaUrl, DumpUrl +from .models.delta_url import CuratedUrl, DeltaUrl, DumpUrl from .models.pattern import DivisionPattern, IncludePattern, TitlePattern from .tasks import fetch_and_replace_full_text, import_candidate_urls_from_api diff --git a/sde_collections/management/commands/deduplicate_patterns.py b/sde_collections/management/commands/deduplicate_patterns.py new file mode 100644 index 00000000..f9de42e6 --- /dev/null +++ b/sde_collections/management/commands/deduplicate_patterns.py @@ -0,0 +1,47 @@ +# docker-compose -f local.yml run --rm django python manage.py deduplicate_patterns +# docker-compose -f production.yml run --rm django python manage.py deduplicate_patterns + +from collections import defaultdict + +from django.core.management.base import BaseCommand +from django.db.models import Count + +from sde_collections.models.pattern import ( + DivisionPattern, + DocumentTypePattern, + ExcludePattern, + IncludePattern, + TitlePattern, +) + + +class Command(BaseCommand): + help = "Remove duplicate patterns within collections for all pattern types" + + def handle(self, *args, **kwargs): + pattern_models = [ExcludePattern, IncludePattern, TitlePattern, DocumentTypePattern, DivisionPattern] + + deletion_counts = defaultdict(int) + + for model in pattern_models: + # Get all collections that have duplicate patterns + collections_with_dupes = ( + model.objects.values("collection", "match_pattern") + .annotate(pattern_count=Count("id")) + .filter(pattern_count__gt=1) + ) + + for group in 
collections_with_dupes: + # Get all patterns for this collection/match_pattern combo + patterns = model.objects.filter(collection_id=group["collection"], match_pattern=group["match_pattern"]) + + # Keep one pattern, delete the rest + patterns_to_delete = patterns[1:] + for pattern in patterns_to_delete: + pattern.delete() + deletion_counts[model.__name__] += 1 + + # Print final summary + for model_name, count in deletion_counts.items(): + self.stdout.write(f"{model_name}: {count}") + self.stdout.write(f"Total: {sum(deletion_counts.values())}") diff --git a/sde_collections/management/commands/deduplicate_urls.py b/sde_collections/management/commands/deduplicate_urls.py new file mode 100644 index 00000000..251ae887 --- /dev/null +++ b/sde_collections/management/commands/deduplicate_urls.py @@ -0,0 +1,88 @@ +import time + +from django.core.management.base import BaseCommand +from django.db.models import Count, Min + +from sde_collections.models.candidate_url import CandidateURL +from sde_collections.models.collection import Collection +from sde_collections.models.collection_choice_fields import WorkflowStatusChoices + + +class Command(BaseCommand): + help = "Deduplicate CandidateURLs" + + def handle(self, *args, **kwargs): + deduplicate_candidate_urls() + + +def is_priority_collection(collection): + priority_statuses = { + WorkflowStatusChoices.CURATED, + WorkflowStatusChoices.QUALITY_FIXED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_STARTED, + WorkflowStatusChoices.SECRET_DEPLOYMENT_FAILED, + WorkflowStatusChoices.READY_FOR_LRM_QUALITY_CHECK, + WorkflowStatusChoices.READY_FOR_FINAL_QUALITY_CHECK, + WorkflowStatusChoices.QUALITY_CHECK_FAILED, + WorkflowStatusChoices.QUALITY_CHECK_MINOR, + WorkflowStatusChoices.QUALITY_CHECK_PERFECT, + WorkflowStatusChoices.PROD_PERFECT, + WorkflowStatusChoices.PROD_MINOR, + WorkflowStatusChoices.PROD_MAJOR, + } + return collection.workflow_status in priority_statuses + + +def deduplicate_candidate_urls(): + start_time = time.time() + + 
collection_counts = { + c["id"]: c["url_count"] + for c in Collection.objects.annotate(url_count=Count("candidate_urls")).values("id", "url_count") + } + + collection_status = {c.id: is_priority_collection(c) for c in Collection.objects.all()} + + # Phase 1: Intra-collection duplicates + intra_dupes = ( + CandidateURL.objects.values("collection_id", "url") + .annotate(count=Count("id"), min_id=Min("id")) + .filter(count__gt=1) + ) + + intra_ids_to_delete = [] + for dupe in intra_dupes: + dupe_ids = set( + CandidateURL.objects.filter(collection_id=dupe["collection_id"], url=dupe["url"]) + .exclude(id=dupe["min_id"]) + .values_list("id", flat=True) + ) + intra_ids_to_delete.extend(dupe_ids) + + CandidateURL.objects.filter(id__in=intra_ids_to_delete).delete() + + # Phase 2: Cross-collection duplicates + cross_dupes = CandidateURL.objects.values("url").annotate(count=Count("id")).filter(count__gt=1) + + cross_ids_to_delete = [] + for dupe in cross_dupes: + instances = list(CandidateURL.objects.filter(url=dupe["url"]).values("id", "collection_id")) + + priority_instances = [i for i in instances if collection_status[i["collection_id"]]] + non_priority_instances = [i for i in instances if not collection_status[i["collection_id"]]] + + if priority_instances: + keep_instance = min(priority_instances, key=lambda x: collection_counts[x["collection_id"]]) + else: + keep_instance = min(non_priority_instances, key=lambda x: collection_counts[x["collection_id"]]) + + delete_ids = [i["id"] for i in instances if i["id"] != keep_instance["id"]] + cross_ids_to_delete.extend(delete_ids) + + CandidateURL.objects.filter(id__in=cross_ids_to_delete).delete() + + elapsed_time = time.time() - start_time + action = "Deleted" + print( + f"{action} {len(intra_ids_to_delete)} intra-collection and {len(cross_ids_to_delete)} cross-collection duplicates (total: {len(intra_ids_to_delete) + len(cross_ids_to_delete)}) in {elapsed_time:.2f} seconds" # noqa + ) diff --git 
a/sde_collections/management/commands/migrate_urls_and_patterns.py b/sde_collections/management/commands/migrate_urls_and_patterns.py index 7110cd30..7c28d1d4 100644 --- a/sde_collections/management/commands/migrate_urls_and_patterns.py +++ b/sde_collections/management/commands/migrate_urls_and_patterns.py @@ -87,7 +87,7 @@ def handle(self, *args, **kwargs): visited=candidate_url.visited, document_type=candidate_url.document_type, division=candidate_url.division, - delete=False, + to_delete=False, ) ) diff --git a/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py b/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py new file mode 100644 index 00000000..f9be360b --- /dev/null +++ b/sde_collections/migrations/0066_alter_deltadivisionpattern_unique_together_and_more.py @@ -0,0 +1,83 @@ +# Generated by Django 4.2.9 on 2024-11-23 17:44 + +from django.db import migrations, models +import sde_collections.models.delta_patterns + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0065_rename_delete_deltaurl_to_delete_and_more"), + ] + + operations = [ + migrations.AlterUniqueTogether( + name="deltadivisionpattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltadocumenttypepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltaexcludepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltaincludepattern", + unique_together=set(), + ), + migrations.AlterUniqueTogether( + name="deltatitlepattern", + unique_together=set(), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="match_pattern", + field=models.CharField( + 
help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="match_pattern", + field=models.CharField( + help_text="This pattern is compared against the URL of all documents in the collection", + verbose_name="Pattern", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="title_pattern", + field=models.CharField( + help_text="Pattern for the new title. Support exact replacement or sinequa-valid code", + validators=[sde_collections.models.delta_patterns.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ] diff --git a/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py b/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py new file mode 100644 index 00000000..4a244362 --- /dev/null +++ b/sde_collections/migrations/0067_alter_deltadivisionpattern_options_and_more.py @@ -0,0 +1,73 @@ +# Generated by Django 4.2.9 on 2024-11-23 18:14 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0066_alter_deltadivisionpattern_unique_together_and_more"), + ] + + operations = [ + migrations.AlterModelOptions( + name="deltadivisionpattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Division Pattern", + "verbose_name_plural": "Delta Division Patterns", + }, + ), + migrations.AlterModelOptions( + 
name="deltadocumenttypepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Document Type Pattern", + "verbose_name_plural": "Delta Document Type Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltaexcludepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Exclude Pattern", + "verbose_name_plural": "Delta Exclude Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltaincludepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Include Pattern", + "verbose_name_plural": "Delta Include Patterns", + }, + ), + migrations.AlterModelOptions( + name="deltatitlepattern", + options={ + "ordering": ["match_pattern"], + "verbose_name": "Delta Title Pattern", + "verbose_name_plural": "Delta Title Patterns", + }, + ), + migrations.AlterUniqueTogether( + name="deltadivisionpattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltadocumenttypepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltaexcludepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltaincludepattern", + unique_together={("collection", "match_pattern")}, + ), + migrations.AlterUniqueTogether( + name="deltatitlepattern", + unique_together={("collection", "match_pattern")}, + ), + ] diff --git a/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py b/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py new file mode 100644 index 00000000..91d87951 --- /dev/null +++ b/sde_collections/migrations/0068_alter_deltadivisionpattern_collection_and_more.py @@ -0,0 +1,124 @@ +# Generated by Django 4.2.9 on 2024-11-24 19:39 + +from django.db import migrations, models +import django.db.models.deletion +import sde_collections.models.delta_patterns + + +class 
Migration(migrations.Migration): + + dependencies = [ + ("sde_collections", "0067_alter_deltadivisionpattern_options_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="deltadivisionpattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltadivisionpattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltadocumenttypepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltaexcludepattern", + name="delta_urls", + 
field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltaincludepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="collection", + field=models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="%(class)ss", + related_query_name="%(class)ss", + to="sde_collections.collection", + ), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="curated_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.curatedurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="delta_urls", + field=models.ManyToManyField(related_name="%(class)ss", to="sde_collections.deltaurl"), + ), + migrations.AlterField( + model_name="deltatitlepattern", + name="title_pattern", + field=models.CharField( + help_text="Pattern for the new title. 
Can be an exact replacement string or sinequa-valid code", + validators=[sde_collections.models.delta_patterns.validate_title_pattern], + verbose_name="Title Pattern", + ), + ), + ] diff --git a/sde_collections/models/README.md b/sde_collections/models/README.md new file mode 100644 index 00000000..1c5202c4 --- /dev/null +++ b/sde_collections/models/README.md @@ -0,0 +1,78 @@ +# URL Pattern Management System + +## Overview +This system provides a framework for managing and curating collections of URLs through pattern-based rules. It enables systematic modification, categorization, and filtering of URLs while maintaining a clear separation between work-in-progress changes and production content. + +## Core Concepts + +### URL States +Content progresses through three states: +- **Dump URLs**: Raw content from initial scraping/indexing +- **Delta URLs**: Work-in-progress changes and modifications +- **Curated URLs**: Production-ready, approved content + +### Pattern Types +- **Include/Exclude Patterns**: Control which URLs are included in collections + - Include patterns always override exclude patterns + - Use wildcards for matching multiple URLs + +- **Modification Patterns**: Change URL properties + - Title patterns modify final titles shown in search results + - Document type patterns affect which tab the URL appears under + - Division patterns assign URLs within the Science Knowledge Sources + +### Pattern Resolution +The system uses a "smallest set priority" strategy which resolves conflicts by always using the most specific pattern that matches a URL: +- Multiple patterns can match the same URL +- Pattern matching the smallest number of URLs takes precedence +- Applies to title, division, and document type patterns +- More specific patterns naturally override general ones + +## Getting Started + +To effectively understand this system, we recommend reading through the documentation in the following order: + +1. 
Begin with the Pattern System Overview to learn the fundamental concepts of how patterns work and interact with URLs +2. Next, explore the URL Lifecycle documentation to understand how content moves through different states +3. The Pattern Resolution documentation will show you how the system handles overlapping patterns +4. Learn how to control which URLs appear in your collection with the Include/Exclude patterns guide +5. Finally, review the Pattern Unapplication Logic to understand how pattern removal affects your URLs + +Each section builds upon knowledge from previous sections, providing a comprehensive understanding of the system. + +## Documentation + +[Pattern System Overview](./README_PATTERN_SYSTEM.md) +- Core concepts and pattern types +- Pattern lifecycle and effects +- Delta URL generation rules +- Working principles (idempotency, separation of concerns) +- Pattern interaction examples + +[URL Lifecycle Management](./README_LIFECYCLE.md) +- Migration process (Dump → Delta) +- Promotion process (Delta → Curated) +- Field handling during transitions +- Pattern application timing +- Data integrity considerations + +[Pattern Resolution](./README_PATTERN_RESOLUTION.md) +- Smallest set priority mechanism +- URL counting and precedence +- Performance considerations +- Edge case handling +- Implementation details + +[URL Inclusion/Exclusion](./README_INCLUSION.md) +- Wildcard pattern matching +- Include/exclude precedence +- Example pattern configurations +- Best practices +- Common pitfalls and solutions + +[Pattern Unapplication Logic](./README_UNAPPLY_LOGIC.md) +- Pattern removal handling +- Delta management during unapplication +- Manual change preservation +- Cleanup procedures +- Edge case handling diff --git a/sde_collections/models/README_INCLUSION.md b/sde_collections/models/README_INCLUSION.md new file mode 100644 index 00000000..d2fedf51 --- /dev/null +++ b/sde_collections/models/README_INCLUSION.md @@ -0,0 +1,146 @@ +# URL Include and Exclude 
Patterns + +## Overview + +The pattern system allows you to control which URLs are included in or excluded from your collection using two types of patterns: +- **Exclude Patterns**: Mark URLs for exclusion from the collection +- **Include Patterns**: Explicitly include URLs, overriding any exclude patterns + +## Pattern Types + +### Individual URL Patterns +- Matches exact URLs +- Best for targeting specific pages +- No wildcards allowed +```python +# Matches only exactly this URL +match_pattern = "https://example.com/docs/specific-page.html" +``` + +### Multi-URL (Wildcard) Patterns +- Uses `*` as a wildcard to match multiple URLs +- Best for targeting entire directories or file types +- Can have wildcards anywhere in the pattern +```python +# Matches all files in the /docs directory +match_pattern = "https://example.com/docs/*" + +# Matches all PDF files +match_pattern = "https://example.com/*.pdf" +``` + +## Pattern Precedence + +1. Include patterns **always** take precedence over exclude patterns +2. More specific patterns take precedence over general patterns +3. 
If a URL matches both an include and exclude pattern, it will be included + +## Common Examples + +### Excluding a Directory But Including Specific Files + +```python +# Exclude the internal docs directory +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/*", + match_pattern_type=2 # Multi-URL pattern +) + +# But include specific approved pages +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/public-roadmap.html", + match_pattern_type=1 # Individual URL pattern +) +``` + +### Including Only Specific File Types + +```python +# Exclude everything in docs directory +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*", + match_pattern_type=2 +) + +# Include only PDF files +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*.pdf", + match_pattern_type=2 +) +``` + +### Folder-Based Access Control + +```python +# Exclude all draft documents +DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/drafts/*", + match_pattern_type=2 +) + +# Include the approved drafts subfolder +DeltaIncludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/drafts/approved/*", + match_pattern_type=2 +) +``` + +## Best Practices + +1. **Start Specific**: Begin with specific patterns and broaden as needed + ```python + # Better + match_pattern = "https://example.com/docs/api/v1/*" + # Less precise + match_pattern = "https://example.com/docs/*" + ``` + +2. **Use Include for Exceptions**: When excluding a large section, use include patterns for exceptions + ```python + # Exclude staging environment + exclude_pattern = "https://staging.example.com/*" + # Include specific staging features that should be public + include_pattern = "https://staging.example.com/features/released/*" + ``` + +3. 
**Document Patterns**: Keep track of why each pattern was added + ```python + DeltaExcludePattern.objects.create( + collection=collection, + match_pattern="https://example.com/internal/*", + reason="Internal documentation not ready for public release" + ) + ``` + +4. **Regular Maintenance**: Review patterns periodically to ensure they're still needed and correct + +## Common Gotchas + +1. **Trailing Slashes**: URLs with and without trailing slashes are treated as different + ```python + # These are different patterns + "https://example.com/docs" + "https://example.com/docs/" + ``` + +2. **Over-Inclusive Wildcards**: Be careful with patterns that might match too much + ```python + # Dangerous: Could match more than intended + match_pattern = "https://example.com/*internal*" + + # Better: More specific + match_pattern = "https://example.com/internal/*" + ``` + +3. **Pattern Order**: Remember that include patterns always win, regardless of the order they're created + ```python + # This URL will be included despite the exclude pattern + exclude_pattern = "https://example.com/docs/*" + include_pattern = "https://example.com/docs/public.html" + ``` diff --git a/sde_collections/models/README_LIFECYCLE.md b/sde_collections/models/README_LIFECYCLE.md new file mode 100644 index 00000000..61afa3f7 --- /dev/null +++ b/sde_collections/models/README_LIFECYCLE.md @@ -0,0 +1,203 @@ +# URL Migration and Promotion Guide + +## Overview +This document explains the lifecycle of URLs in the system, focusing on two critical processes: +1. Migration from DumpUrls to DeltaUrls +2. 
Promotion from DeltaUrls to CuratedUrls + +## Core Concepts + +### URL States +- **DumpUrls**: Raw data from initial scraping/indexing +- **DeltaUrls**: Work-in-progress changes and modifications +- **CuratedUrls**: Production-ready, approved content + +### Fields That Transfer +All fields are transferred between states, including: +- URL +- Scraped Title +- Generated Title +- Document Type +- Division +- Excluded Status +- Scraped Text +- Any additional metadata + +## Migration Process (Dump → Delta) + +### Overview +Migration converts DumpUrls to DeltaUrls, preserving all fields and applying patterns. This process happens when: +- New content is scraped +- Content is reindexed +- Collection is being prepared for curation + +### Steps +1. Clear existing DeltaUrls +2. Process each DumpUrl: + - If matching CuratedUrl exists: Create Delta with all fields + - If no matching CuratedUrl: Create Delta as new URL +3. Process missing CuratedUrls: + - Create deletion Deltas for any not in Dump +4. Apply all patterns to new Deltas +5. 
Clear DumpUrls + +### Examples + +#### Example 1: Basic Migration +```python +# Starting State +dump_url = DumpUrl( + url="example.com/doc", + scraped_title="Original Title", + document_type=DocumentTypes.DOCUMENTATION +) + +# After Migration +delta_url = DeltaUrl( + url="example.com/doc", + scraped_title="Original Title", + document_type=DocumentTypes.DOCUMENTATION, + to_delete=False +) +``` + +#### Example 2: Migration with Existing Curated +```python +# Starting State +dump_url = DumpUrl( + url="example.com/doc", + scraped_title="New Title", + document_type=DocumentTypes.DOCUMENTATION +) + +curated_url = CuratedUrl( + url="example.com/doc", + scraped_title="Old Title", + document_type=DocumentTypes.DOCUMENTATION +) + +# After Migration +delta_url = DeltaUrl( + url="example.com/doc", + scraped_title="New Title", # Different from curated + document_type=DocumentTypes.DOCUMENTATION, + to_delete=False +) +``` + +#### Example 3: Migration with Pattern Application +```python +# Starting State +dump_url = DumpUrl( + url="example.com/data/file.pdf", + scraped_title="Data File", + document_type=None +) + +document_type_pattern = DocumentTypePattern( + match_pattern="*.pdf", + document_type=DocumentTypes.DATA +) + +# After Migration and Pattern Application +delta_url = DeltaUrl( + url="example.com/data/file.pdf", + scraped_title="Data File", + document_type=DocumentTypes.DATA, # Set by pattern + to_delete=False +) +``` + +## Promotion Process (Delta → Curated) + +### Overview +Promotion moves DeltaUrls to CuratedUrls, applying all changes including explicit NULL values. This occurs when: +- A curator marks a collection as Curated. + +### Steps +1. Process each DeltaUrl: + - If marked for deletion: Remove matching CuratedUrl + - Otherwise: Update/create CuratedUrl with ALL fields +2. Clear all DeltaUrls +3. 
Refresh pattern relationships + +### Examples + +#### Example 1: Basic Promotion +```python +# Starting State +delta_url = DeltaUrl( + url="example.com/doc", + scraped_title="New Title", + document_type=DocumentTypes.DOCUMENTATION, + to_delete=False +) + +# After Promotion +curated_url = CuratedUrl( + url="example.com/doc", + scraped_title="New Title", + document_type=DocumentTypes.DOCUMENTATION +) +``` + +#### Example 2: Promotion with NULL Override +```python +# Starting State +delta_url = DeltaUrl( + url="example.com/doc", + scraped_title="Title", + document_type=None, # Explicitly set to None by pattern + to_delete=False +) + +curated_url = CuratedUrl( + url="example.com/doc", + scraped_title="Title", + document_type=DocumentTypes.DOCUMENTATION +) + +# After Promotion +curated_url = CuratedUrl( + url="example.com/doc", + scraped_title="Title", + document_type=None # NULL value preserved +) +``` + +#### Example 3: Deletion During Promotion +```python +# Starting State +delta_url = DeltaUrl( + url="example.com/old-doc", + scraped_title="Old Title", + to_delete=True +) + +curated_url = CuratedUrl( + url="example.com/old-doc", + scraped_title="Old Title" +) + +# After Promotion +# CuratedUrl is deleted +# DeltaUrl is cleared +``` + +## Important Notes + +### Field Handling +- ALL fields are copied during migration and promotion +- NULL values in DeltaUrls are treated as explicit values +- Pattern-set values take precedence over original values + +### Pattern Application +- Patterns are applied after migration +- Pattern effects persist through promotion +- Multiple patterns can affect the same URL + +### Data Integrity +- Migrations preserve all field values +- Promotions apply all changes +- Deletion flags are honored during promotion +- Pattern relationships are maintained diff --git a/sde_collections/models/README_PATTERN_RESOLUTION.md b/sde_collections/models/README_PATTERN_RESOLUTION.md new file mode 100644 index 00000000..936e8424 --- /dev/null +++ 
b/sde_collections/models/README_PATTERN_RESOLUTION.md @@ -0,0 +1,48 @@ +# Pattern Resolution System + +## Overview +The pattern system uses a "smallest set priority" strategy for resolving conflicts between overlapping patterns. This applies to title patterns, division patterns, and document type patterns. The pattern that matches the smallest set of URLs takes precedence. + +## How It Works + +When multiple patterns match a URL, the system: +1. Counts how many total URLs each pattern matches +2. Compares the counts +3. Applies the pattern that matches the fewest URLs + +### Example +``` +Pattern A: */docs/* # Matches 100 URLs +Pattern B: */docs/api/* # Matches 20 URLs +Pattern C: */docs/api/v2/* # Matches 5 URLs + +For URL "/docs/api/v2/users": +- All patterns match +- Pattern C wins (5 URLs < 20 URLs < 100 URLs) +``` + +## Pattern Types and Resolution + +### Title Patterns +```python +# More specific title pattern takes precedence +Pattern A: */docs/* → title="Documentation" # 100 URLs +Pattern B: */docs/api/* → title="API Reference" # 20 URLs +Result: URL gets title "API Reference" +``` + +### Division Patterns +```python +# More specific division assignment wins +Pattern A: *.pdf → division="GENERAL" # 500 URLs +Pattern B: */specs/*.pdf → division="ENGINEERING" # 50 URLs +Result: URL gets division "ENGINEERING" +``` + +### Document Type Patterns +```python +# Most specific document type classification applies +Pattern A: */docs/* → type="DOCUMENTATION" # 200 URLs +Pattern B: */docs/data/* → type="DATA" # 30 URLs +Result: URL gets type "DATA" +``` diff --git a/sde_collections/models/README_PATTERN_SYSTEM.md b/sde_collections/models/README_PATTERN_SYSTEM.md new file mode 100644 index 00000000..b8381747 --- /dev/null +++ b/sde_collections/models/README_PATTERN_SYSTEM.md @@ -0,0 +1,112 @@ +# Understanding the Pattern System + +## Overview +The pattern system is designed to manage and track changes to URLs in a content curation workflow. 
It provides a way to systematically modify, exclude, or categorize URLs while maintaining a clear separation between work-in-progress changes (Delta URLs) and production content (Curated URLs). + +## Core Concepts + +### URL States +- **Curated URLs**: Production-ready, approved content +- **Delta URLs**: Work-in-progress changes, additions, or deletions to curated content +- **Dump URLs**: Raw content from the dev server + +### Pattern Types +1. **Exclude Patterns**: Mark URLs for exclusion from the collection +2. **Include Patterns**: Explicitly include URLs in the collection +3. **Title Patterns**: Change or modify the original title +4. **Document Type Patterns**: Assign document type classifications +5. **Division Patterns**: Assign SMD division + +## Pattern Lifecycle + +### 1. Pattern Creation & Application +When a new pattern is created: +1. System identifies all matching URLs based on the pattern criteria +2. For matching Curated URLs: + - If the pattern would change the URL's properties + - And no Delta URL exists → Create a Delta URL with the changes + - If Delta URL exists → Update it with additional changes +3. For matching Delta URLs: + - Apply the pattern's effects directly + + +### 2. Pattern Effects +- Each pattern type has specific effects: + - Exclude: Sets exclusion status + - Include: Clears exclusion status + - Title: Modifies scraped title + - Document Type: Sets document classification + - Division: Sets organizational division + +### 3. Delta URL Generation Rules +Delta URLs are created when: +1. A new pattern would modify a Curated URL +2. An existing pattern affecting a Curated URL is removed, requiring reversal of its effects +3. Reindexed content in DumpUrl differs from Curated content + +Delta URLs are not created when: +1. Pattern effects match current Curated URL state +2. Reindexed content matches Curated content + +### 4. Pattern Removal +When a pattern is deleted: +1. System identifies all URLs affected by the pattern +2. 
For each affected Curated URL: + - Create Delta URL to reverse effects +3. For affected Delta URLs: + - Remove pattern's effects + - If other patterns still affect it → Keep with updated state + - If Delta URL becomes identical to Curated URL → Delete Delta URL + +## Working Principles + +### 1. Idempotency +- Applying the same pattern multiple times should have the same effect as applying it once +- System tracks pattern effects to ensure consistency +- Multiple patterns can affect the same URL + +### 2. Separation of Concerns +- Pattern effects on Delta URLs don't directly affect Curated URLs +- Exclusion status tracked separately for Delta and Curated URLs +- Changes only propagate to Curated URLs during promotion + +### 3. Change Tracking +- System maintains relationships between patterns and affected URLs +- Each pattern's effects are tracked separately +- Changes can be reversed if patterns are removed + +### 4. Delta URL Lifecycle +1. Creation: + - When patterns would modify Curated URLs + - When DumpUrl content differs from Curated content + - When patterns are removed and effects on CuratedUrls need reversal + +2. Updates: + - When new patterns affect the URL + - When pattern effects change + - When source content changes + +3. 
Deletion: + - When identical to Curated URL with no pattern effects + - When explicitly marked for deletion + - During promotion to Curated status + +## Pattern Interaction Examples + +### Scenario 1: Multiple Patterns +- Pattern A excludes URLs containing "draft" +- Pattern B sets document type for URLs containing "spec" +- URL: "example.com/draft-spec" +- Result: URL is excluded, document type is set (both patterns apply) + +### Scenario 2: Pattern Removal +- Pattern sets custom title for URLs +- URLs have custom titles in production +- Pattern is deleted +- Result: Delta URLs created to restore original titles + +### Scenario 3: Conflicting Patterns +- Pattern A includes URLs containing "docs" +- Pattern B excludes URLs containing "internal" +- URL: "example.com/docs/internal" +- Result: URL is included - Include patterns always take precedence diff --git a/sde_collections/models/README_UNAPPLY_LOGIC.md b/sde_collections/models/README_UNAPPLY_LOGIC.md new file mode 100644 index 00000000..f4c75f8f --- /dev/null +++ b/sde_collections/models/README_UNAPPLY_LOGIC.md @@ -0,0 +1,103 @@ +# Pattern System Unapply Logic + +## Core Principles +1. When patterns are removed, we need to handle deltas based on their relationship to curated URLs +2. Deltas should only exist if they differ from their curated counterparts, or if no curated URL exists +3. Multiple patterns can affect the same URL +4.
Manual changes to deltas should be preserved + +## Cases to Handle + +### Case 1: Delta Only (New URL) +**Scenario:** +- No curated URL exists for this URL +- Delta URL exists with pattern effect +- Pattern is removed +``` +Curated: None +Delta: division=BIOLOGY (from pattern) +[Pattern removed] +Result: Delta remains with division=None +``` + +### Case 2: Delta and Curated Exist +**Scenario:** +- Both curated and delta URLs exist +- Pattern is removed +``` +Curated: division=GENERAL +Delta: division=BIOLOGY (from pattern) +[Pattern removed] +Result: Delta reverts to curated value (division=GENERAL) +If delta now matches curated exactly, delta is deleted +``` + +### Case 3: Curated Only +**Scenario:** +- Only curated URL exists +- Pattern is removed +``` +Curated: division=GENERAL +Delta: None +[Pattern removed] +Result: New delta created with division=None +``` + +### Case 4: Multiple Pattern Effects +**Scenario:** +- Delta has changes from multiple patterns +- One pattern is removed +``` +Curated: division=GENERAL, doc_type=DOCUMENTATION +Delta: division=BIOLOGY, doc_type=DATA (from two patterns) +[Division pattern removed] +Result: Delta remains with division=GENERAL, doc_type=DATA preserved +``` + +### Case 5: Pattern Removal with Manual Changes +**Scenario:** +- Delta has both pattern effect and manual changes +- Pattern is removed +``` +Curated: division=GENERAL, title="Original" +Delta: division=BIOLOGY, title="Modified" (pattern + manual) +[Pattern removed] +Result: Delta remains with division=GENERAL, title="Modified" preserved +``` + +## Implementation Steps + +1. **Get Affected URLs** + - Get all deltas and curated URLs that match pattern + - For each URL determine what exists (delta only, both, or curated only) + +2. **For Each Delta URL Found** + - If no matching curated exists: + - Set pattern's field to null + - If matching curated exists: + - Set pattern's field to curated value + - If delta now matches curated exactly, delete delta + +3. 
**For Each Curated URL without Delta** + - Create new delta with pattern's field set to null + +4. **Cleanup** + - Clear pattern's relationships with URLs + - Remove pattern from database + +## Edge Cases to Handle + +1. **Field Comparison** + - When comparing delta to curated, ignore id and to_delete fields + - All other fields must match exactly for delta deletion + +2. **Manual Changes** + - Preserve any delta fields not modified by this pattern + - Only delete delta if ALL fields match curated + +3. **Multiple Collections** + - Only affect URLs in pattern's collection + +4. **Invalid States** + - Handle missing URLs gracefully + - Skip URLs that no longer exist diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py index 083e231e..5788f335 100644 --- a/sde_collections/models/collection.py +++ b/sde_collections/models/collection.py @@ -114,10 +114,16 @@ def refresh_url_lists_for_all_patterns(self): # Filter patterns for the current collection and update relations for pattern in model.objects.filter(collection=self): - pattern.refresh_url_lists() + pattern.update_affected_delta_urls_list() + pattern.update_affected_curated_urls_list() def migrate_dump_to_delta(self): - """Main function to handle migration from DumpUrls to DeltaUrls with specific rules.""" + """ + Migrates data from DumpUrls to DeltaUrls, preserving all fields. + Creates DeltaUrls that reflect: + 1. Changes from DumpUrls vs CuratedUrls + 2. 
Missing URLs in DumpUrls that exist in CuratedUrls (marked for deletion) + """ # Step 1: Clear existing DeltaUrls for this collection self.clear_delta_urls() @@ -145,27 +151,31 @@ def migrate_dump_to_delta(self): # Step 5: Clear DumpUrls after migration is complete self.clear_dump_urls() - # Step 6: Reapply patterns to DeltaUrls - self.refresh_url_lists_for_all_patterns() + # Step 6: Apply all patterns to DeltaUrls + # self.refresh_url_lists_for_all_patterns() # TODO: I'm pretty confident we shouldn't be running this + self.apply_all_patterns() def create_or_update_delta_url(self, url_instance, to_delete=False): """ Creates or updates a DeltaUrl entry based on the given DumpUrl or CuratedUrl object. - If to_delete is True, only sets the to_delete flag and url. + Always copies all fields, even for deletion cases. + + Args: + url_instance: DumpUrl or CuratedUrl instance to copy from + to_delete: Whether to mark the resulting DeltaUrl for deletion """ - if to_delete: - # Only set the URL and to_delete flag - DeltaUrl.objects.update_or_create(collection=self, url=url_instance.url, defaults={"to_delete": True}) - else: - # Automatically move over all fields from url_instance - fields_to_copy = { - field.name: getattr(url_instance, field.name) - for field in DumpUrl._meta.fields # Assumes same fields for CuratedUrl via inheritance - if field.name not in ["id", "collection", "url"] - } - fields_to_copy["to_delete"] = False # Ensure to_delete flag is False + # Get all copyable fields from the source instance + fields_to_copy = { + field.name: getattr(url_instance, field.name) + for field in url_instance._meta.fields + if field.name not in ["id", "collection"] + } - DeltaUrl.objects.update_or_create(collection=self, url=url_instance.url, defaults=fields_to_copy) + # Set deletion status + fields_to_copy["to_delete"] = to_delete + + # Update or create the DeltaUrl + DeltaUrl.objects.update_or_create(collection=self, url=url_instance.url, defaults=fields_to_copy) def 
promote_to_curated(self): """ @@ -599,15 +609,22 @@ def sync_with_production_webapp(self) -> None: self.save() - def apply_all_patterns(self) -> None: - """Apply all the patterns.""" - for pattern in self.excludepattern.all(): + def apply_all_patterns(self): + """Apply all the patterns with debug information.""" + + for pattern in self.deltaexcludepatterns.all(): pattern.apply() - for pattern in self.includepattern.all(): + + for pattern in self.deltaincludepatterns.all(): pattern.apply() - for pattern in self.titlepattern.all(): + + for pattern in self.deltatitlepatterns.all(): pattern.apply() - for pattern in self.documenttypepattern.all(): + + for pattern in self.deltadocumenttypepatterns.all(): + pattern.apply() + + for pattern in self.deltadivisionpatterns.all(): pattern.apply() def save(self, *args, **kwargs): diff --git a/sde_collections/models/delta_patterns.py b/sde_collections/models/delta_patterns.py index 44aaf863..ae3e92ea 100644 --- a/sde_collections/models/delta_patterns.py +++ b/sde_collections/models/delta_patterns.py @@ -1,4 +1,5 @@ import re +from typing import Any from django.apps import apps from django.core.exceptions import ValidationError @@ -14,6 +15,8 @@ class BaseMatchPattern(models.Model): + """Base class for all delta patterns.""" + class MatchPatternTypeChoices(models.IntegerChoices): INDIVIDUAL_URL = 1, "Individual URL Pattern" MULTI_URL_PATTERN = 2, "Multi-URL Pattern" @@ -21,105 +24,94 @@ class MatchPatternTypeChoices(models.IntegerChoices): collection = models.ForeignKey( "Collection", on_delete=models.CASCADE, - related_name="%(class)s", + related_name="%(class)ss", # Makes collection.deltaincludepatterns.all() related_query_name="%(class)ss", ) match_pattern = models.CharField( - "Pattern", - help_text="This pattern is compared against the URL of all the documents in the collection " - "and matching documents will be returned", + "Pattern", help_text="This pattern is compared against the URL of all documents in the collection" ) 
match_pattern_type = models.IntegerField(choices=MatchPatternTypeChoices.choices, default=1) delta_urls = models.ManyToManyField( "DeltaUrl", - related_name="%(class)s_delta_urls", + related_name="%(class)ss", # Makes delta_url.deltaincludepatterns.all() ) curated_urls = models.ManyToManyField( "CuratedUrl", - related_name="%(class)s_curated_urls", + related_name="%(class)ss", # Makes curated_url.deltaincludepatterns.all() ) - def matched_urls(self): + def get_url_match_count(self): """ - Find all URLs matching the pattern. - This does not update pattern.delta_urls or pattern.curated_urls. + Get the number of unique URLs this pattern matches across both delta and curated URLs. """ - DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") - CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + delta_urls = set(self.get_matching_delta_urls().values_list("url", flat=True)) + curated_urls = set(self.get_matching_curated_urls().values_list("url", flat=True)) + return len(delta_urls.union(curated_urls)) - # Construct the regex pattern based on match type - escaped_match_pattern = re.escape(self.match_pattern) - regex_pattern = ( - f"{escaped_match_pattern}$" - if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL - else escaped_match_pattern.replace(r"\*", ".*") + def is_most_distinctive_pattern(self, url) -> bool: + """ + Determine if this pattern should apply to a URL by checking if it matches + the smallest number of URLs among all patterns that match this URL. + Returns True if this pattern should be applied. 
+ """ + my_match_count = self.get_url_match_count() + + # Get patterns from same type that affect this URL + pattern_class = self.__class__ + matching_patterns = ( + pattern_class.objects.filter(collection=self.collection) + .filter(models.Q(delta_urls__url=url.url) | models.Q(curated_urls__url=url.url)) + .exclude(id=self.id) + .distinct() ) - # Directly query DeltaUrl and CuratedUrl with collection filter - matching_delta_urls = DeltaUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) - matching_curated_urls = CuratedUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) + # If any matching pattern has a smaller URL set, don't apply + for pattern in matching_patterns: + if pattern.get_url_match_count() < my_match_count: + return False - return { - "matching_delta_urls": matching_delta_urls, - "matching_curated_urls": matching_curated_urls, - } + return True - def refresh_url_lists(self): - """Update the delta_urls and curated_urls ManyToMany relationships.""" - matched_urls = self.matched_urls() - self.delta_urls.set(matched_urls["matching_delta_urls"]) - self.curated_urls.set(matched_urls["matching_curated_urls"]) + def get_regex_pattern(self) -> str: + """Convert the match pattern into a proper regex based on pattern type.""" + escaped_pattern = re.escape(self.match_pattern) + if self.match_pattern_type == self.MatchPatternTypeChoices.INDIVIDUAL_URL: + return f"{escaped_pattern}$" + return escaped_pattern.replace(r"\*", ".*") - def generate_delta_url(self, curated_url, fields_to_copy=None): - """ - Generates or updates a DeltaUrl based on a CuratedUrl. - Only specified fields are copied if fields_to_copy is provided. 
- """ - # Import DeltaUrl dynamically to avoid circular import issues + def get_matching_delta_urls(self) -> models.QuerySet: + """Get all DeltaUrls that match this pattern.""" DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + regex_pattern = self.get_regex_pattern() + return DeltaUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) - delta_url, created = DeltaUrl.objects.get_or_create( - collection=self.collection, - url=curated_url.url, - defaults={field: getattr(curated_url, field) for field in (fields_to_copy or [])}, - ) - if not created and fields_to_copy: - # Update only if certain fields are missing in DeltaUrl - for field in fields_to_copy: - if getattr(delta_url, field, None) in [None, ""]: - setattr(delta_url, field, getattr(curated_url, field)) - delta_url.save() + def get_matching_curated_urls(self) -> models.QuerySet: + """Get all CuratedUrls that match this pattern.""" + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + regex_pattern = self.get_regex_pattern() + return CuratedUrl.objects.filter(collection=self.collection, url__regex=regex_pattern) - def apply(self, fields_to_copy=None, update_fields=None): - matched_urls = self.matched_urls() + def update_affected_delta_urls_list(self) -> None: + """Update the many-to-many relationship for matched DeltaUrls.""" + self.delta_urls.set(self.get_matching_delta_urls()) - # Step 1: Generate or update DeltaUrls for each matching CuratedUrl - for curated_url in matched_urls["matching_curated_urls"]: - # Check if the curated_url is already linked to this pattern - if self.curated_urls.filter(pk=curated_url.pk).exists(): - # Skip creating a DeltaUrl if the curated_url is already associated with this pattern - continue - self.generate_delta_url(curated_url, fields_to_copy) + def update_affected_curated_urls_list(self) -> None: + """Update the many-to-many relationship for matched CuratedUrls.""" + self.curated_urls.set(self.get_matching_curated_urls()) - # Step 2: Apply 
updates to fields on matching DeltaUrls - if update_fields: - matched_urls["matching_delta_urls"].update(**update_fields) - - # Update ManyToMany relationships - self.refresh_url_lists() + def apply(self) -> None: + """Apply pattern effects. Must be implemented by subclasses.""" + raise NotImplementedError - def unapply(self): - """Default unapply behavior.""" - self.delta_urls.clear() - self.curated_urls.clear() + def unapply(self) -> None: + """Remove pattern effects. Must be implemented by subclasses.""" + raise NotImplementedError - def save(self, *args, **kwargs): - """Save the pattern and apply it.""" + def save(self, *args, **kwargs) -> None: super().save(*args, **kwargs) self.apply() - def delete(self, *args, **kwargs): - """Delete the pattern and unapply it.""" + def delete(self, *args, **kwargs) -> None: self.unapply() super().delete(*args, **kwargs) @@ -132,36 +124,278 @@ def __str__(self): return self.match_pattern -class DeltaExcludePattern(BaseMatchPattern): - reason = models.TextField("Reason for excluding", default="", blank=True) +class InclusionPatternBase(BaseMatchPattern): + """ + Base class for patterns that handle URL inclusion/exclusion. + Both ExcludePattern and IncludePattern share the same core logic for managing + relationships and Delta URL creation/cleanup. + """ - # No need to override `apply`—we use the base class logic as-is. - # This pattern's functionality is handled by the `excluded` annotation in the manager. + class Meta(BaseMatchPattern.Meta): + abstract = True - class Meta: + def apply(self) -> None: + """ + Apply pattern effects to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs for newly affected Curated URLs if needed + 3. 
Update pattern relationships to manage inclusion/exclusion status + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + + # Get QuerySet of all matching CuratedUrls + matching_curated_urls = self.get_matching_curated_urls() + + # Find Curated URLs that match but weren't previously affected + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) + + # Create Delta URLs for newly affected Curated URLs if needed + for curated_url in previously_unaffected_curated: + # Skip if Delta already exists + if DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists(): + continue + + # Create new Delta URL copying fields from Curated URL + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.create(**fields) + + # Update relationships - this handles inclusion/exclusion status + self.update_affected_delta_urls_list() + + def unapply(self) -> None: + """ + Remove this pattern's effects by: + 1. Creating Delta URLs for previously excluded Curated URLs to show they're no longer excluded/included + 2. 
Cleaning up any Delta URLs that are now identical to their Curated URL counterparts + (these would have only existed to show their exclusion/inclusion) + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + + # Create Delta URLs for previously affected Curated URLs + for curated_url in self.curated_urls.all(): + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.get_or_create(**fields) + + # Clean up redundant Delta URLs + for delta_url in self.delta_urls.filter(to_delete=False): + try: + curated_url = CuratedUrl.objects.get(collection=self.collection, url=delta_url.url) + + # Check if Delta is now identical to Curated + fields_match = all( + getattr(delta_url, field.name) == getattr(curated_url, field.name) + for field in delta_url._meta.fields + if field.name not in ["id", "to_delete"] + ) + + if fields_match: + delta_url.delete() + + except CuratedUrl.DoesNotExist: + continue + + # Clear pattern relationships + self.delta_urls.clear() + self.curated_urls.clear() + + +class DeltaExcludePattern(InclusionPatternBase): + """Pattern for marking URLs for exclusion.""" + + reason = models.TextField("Reason for excluding", default="", blank=True) + + class Meta(InclusionPatternBase.Meta): verbose_name = "Delta Exclude Pattern" verbose_name_plural = "Delta Exclude Patterns" - unique_together = ("collection", "match_pattern") -class DeltaIncludePattern(BaseMatchPattern): - # No additional logic needed for `apply`—using base class functionality. 
+class DeltaIncludePattern(InclusionPatternBase): + """Pattern for explicitly including URLs.""" - class Meta: + class Meta(InclusionPatternBase.Meta): verbose_name = "Delta Include Pattern" verbose_name_plural = "Delta Include Patterns" - unique_together = ("collection", "match_pattern") -def validate_title_pattern(title_pattern_string): - parsed_title = parse_title(title_pattern_string) +class FieldModifyingPattern(BaseMatchPattern): + """ + Abstract base class for patterns that modify a single field on matching URLs. + Examples: DeltaDivisionPattern, DeltaDocumentTypePattern + """ + + class Meta(BaseMatchPattern.Meta): + abstract = True + + def get_field_to_modify(self) -> str: + """Return the name of the field this pattern modifies. Must be implemented by subclasses.""" + raise NotImplementedError + + def get_new_value(self) -> Any: + """Return the new value for the field. Must be implemented by subclasses.""" + raise NotImplementedError + + def apply(self) -> None: + """ + Apply field modification to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs only for Curated URLs where the field value would change + 3. Update the pattern's list of affected URLs + 4. 
Set the field value on all matching Delta URLs + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + + field = self.get_field_to_modify() + new_value = self.get_new_value() + + # Get newly matching Curated URLs + matching_curated_urls = self.get_matching_curated_urls() + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) + + # Create DeltaUrls only where field value would change + for curated_url in previously_unaffected_curated: + if not self.is_most_distinctive_pattern(curated_url): + continue + + if ( + getattr(curated_url, field) == new_value + or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists() + ): + continue + + fields = { + f.name: getattr(curated_url, f.name) + for f in curated_url._meta.fields + if f.name not in ["id", "collection"] + } + fields[field] = new_value + fields["to_delete"] = False + fields["collection"] = self.collection + + DeltaUrl.objects.create(**fields) + + # Update all matching DeltaUrls with the new field value if this is the most distinctive pattern + for delta_url in self.get_matching_delta_urls(): + if self.is_most_distinctive_pattern(delta_url): + setattr(delta_url, field, new_value) + delta_url.save() + + # Update pattern relationships + self.update_affected_delta_urls_list() + + def unapply(self) -> None: + """ + Remove field modifications: + 1. Create Delta URLs for affected Curated URLs to explicitly set NULL + 2. Remove field value from affected Delta URLs only if no other patterns affect them + 3. 
Clean up Delta URLs that become identical to their Curated URL + """ + + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + + field = self.get_field_to_modify() + + # Get all affected URLs + affected_deltas = self.delta_urls.all() + affected_curated = self.curated_urls.all() + + # Process each affected delta URL + for delta in affected_deltas: + curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() + + if not curated: + # Scenario 1: Delta only - new URL + setattr(delta, field, None) + delta.save() + else: + # Scenario 2: Both exist + setattr(delta, field, getattr(curated, field)) + delta.save() + + # Check if delta is now redundant + fields_match = all( + getattr(delta, f.name) == getattr(curated, f.name) + for f in delta._meta.fields + if f.name not in ["id", "to_delete"] + ) + if fields_match: + delta.delete() + + # Handle curated URLs that don't have deltas + for curated in affected_curated: + if not DeltaUrl.objects.filter(url=curated.url).exists(): + # Scenario 3: Curated only + # Copy all fields from curated except the one we're nulling + fields = { + f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", "collection"] + } + fields[field] = None # Set the pattern's field to None + delta = DeltaUrl.objects.create(collection=self.collection, **fields) + + # Clear pattern relationships + self.delta_urls.clear() + self.curated_urls.clear() + - for element in parsed_title: - element_type, element_value = element +class DeltaDocumentTypePattern(FieldModifyingPattern): + """Pattern for setting document types.""" + document_type = models.IntegerField(choices=DocumentTypes.choices) + + def get_field_to_modify(self) -> str: + return "document_type" + + def get_new_value(self) -> Any: + return self.document_type + + class Meta(FieldModifyingPattern.Meta): + verbose_name = "Delta Document Type Pattern" + verbose_name_plural = "Delta 
Document Type Patterns" + + +class DeltaDivisionPattern(FieldModifyingPattern): + """Pattern for setting divisions.""" + + division = models.IntegerField(choices=Divisions.choices) + + def get_field_to_modify(self) -> str: + return "division" + + def get_new_value(self) -> Any: + return self.division + + class Meta(FieldModifyingPattern.Meta): + verbose_name = "Delta Division Pattern" + verbose_name_plural = "Delta Division Patterns" + + +def validate_title_pattern(title_pattern_string: str) -> None: + """Validate title pattern format.""" + parsed_title = parse_title(title_pattern_string) + + for element_type, element_value in parsed_title: if element_type == "xpath": if not is_valid_xpath(element_value): - raise ValidationError(f"'xpath:{element_value}' is not a valid xpath.") # noqa: E231 + raise ValidationError(f"Invalid xpath: {element_value}") elif element_type == "brace": try: is_valid_fstring(element_value) @@ -170,151 +404,191 @@ def validate_title_pattern(title_pattern_string): class DeltaTitlePattern(BaseMatchPattern): + """Pattern for modifying titles of URLs based on a template pattern.""" + title_pattern = models.CharField( "Title Pattern", - help_text="This is the pattern for the new title. You can either write an exact replacement string" - " (no quotes required) or you can write sinequa-valid code", + help_text="Pattern for the new title. 
Can be an exact replacement string or sinequa-valid code", validators=[validate_title_pattern], ) - def apply(self) -> None: - # Dynamically get the DeltaResolvedTitle and DeltaResolvedTitleError models to avoid circular import issues - DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") - DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") - - matched_urls = self.matched_urls() - - # Step 1: Apply title pattern to matching DeltaUrls - for delta_url in matched_urls["matching_delta_urls"]: - self.apply_title_to_url(delta_url, DeltaResolvedTitle, DeltaResolvedTitleError) - - # Step 2: Check and potentially create DeltaUrls for matching CuratedUrls - for curated_url in matched_urls["matching_curated_urls"]: - self.create_delta_if_title_differs(curated_url, DeltaResolvedTitle, DeltaResolvedTitleError) - - # Step 3: Update ManyToMany relationships for DeltaUrls and CuratedUrls - self.refresh_url_lists() - - def create_delta_if_title_differs(self, curated_url, DeltaResolvedTitle, DeltaResolvedTitleError): + def generate_title_for_url(self, url_obj) -> tuple[str, str | None]: """ - Checks if the title generated by the pattern differs from the existing generated title - in CuratedUrl. If it does, creates or updates a DeltaUrl with the new title. + Generate a new title for a URL using the pattern. + Returns tuple of (generated_title, error_message). 
""" - # Calculate the title that would be generated if the pattern is applied context = { - "url": curated_url.url, - "title": curated_url.scraped_title, + "url": url_obj.url, + "title": url_obj.scraped_title, "collection": self.collection.name, } - try: - new_generated_title = resolve_title(self.title_pattern, context) - - # Compare against the existing generated title in CuratedUrl - if curated_url.generated_title != new_generated_title: - # Only create a DeltaUrl if the titles differ - DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") - delta_url, created = DeltaUrl.objects.get_or_create( - collection=self.collection, - url=curated_url.url, - defaults={"scraped_title": curated_url.scraped_title}, - ) - delta_url.generated_title = new_generated_title - delta_url.save() - self.apply_title_to_url(delta_url, DeltaResolvedTitle, DeltaResolvedTitleError) - except (ValueError, ValidationError) as e: - self.log_title_error(curated_url, DeltaResolvedTitleError, str(e)) + try: + return resolve_title(self.title_pattern, context), None + except Exception as e: + return None, str(e) - def apply_title_to_url(self, url_obj, DeltaResolvedTitle, DeltaResolvedTitleError): + def apply(self) -> None: """ - Applies the title pattern to a DeltaUrl or CuratedUrl and records the resolved title or errors. + Apply the title pattern to matching URLs: + 1. Find new Curated URLs that match but weren't previously affected + 2. Create Delta URLs only where the generated title differs + 3. Update all matching Delta URLs with new titles + 4. 
Track title resolution status and errors """ - context = { - "url": url_obj.url, - "title": url_obj.scraped_title, - "collection": self.collection.name, - } - try: - generated_title = resolve_title(self.title_pattern, context) + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") + DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") + + # Get newly matching Curated URLs + matching_curated_urls = self.get_matching_curated_urls() + previously_unaffected_curated = matching_curated_urls.exclude( + id__in=self.curated_urls.values_list("id", flat=True) + ) - # Remove existing resolved title entries for this URL - DeltaResolvedTitle.objects.filter(delta_url=url_obj).delete() + # Process each previously unaffected curated URL + for curated_url in previously_unaffected_curated: + if not self.is_most_distinctive_pattern(curated_url): + continue - # Create a new resolved title entry - DeltaResolvedTitle.objects.create(title_pattern=self, delta_url=url_obj, resolved_title=generated_title) + new_title, error = self.generate_title_for_url(curated_url) - # Set generated title only on DeltaUrl - url_obj.generated_title = generated_title - url_obj.save() + if error: + # Log error and continue to next URL + DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=curated_url, error_string=error) + continue - except (ValueError, ValidationError) as e: - self.log_title_error(url_obj, DeltaResolvedTitleError, str(e)) + # Skip if the generated title matches existing or if Delta already exists + if ( + curated_url.generated_title == new_title + or DeltaUrl.objects.filter(url=curated_url.url, collection=self.collection).exists() + ): + continue - def log_title_error(self, url_obj, DeltaResolvedTitleError, message): - """Logs an error when resolving a title.""" - resolved_title_error = DeltaResolvedTitleError.objects.create( - title_pattern=self, 
delta_url=url_obj, error_string=message - ) - status_code = re.search(r"Status code: (\d+)", message) - if status_code: - resolved_title_error.http_status_code = int(status_code.group(1)) - resolved_title_error.save() + # Create new Delta URL with the new title + fields = { + field.name: getattr(curated_url, field.name) + for field in curated_url._meta.fields + if field.name not in ["id", "collection"] + } + fields["generated_title"] = new_title + fields["to_delete"] = False + fields["collection"] = self.collection - def unapply(self) -> None: - """Clears generated titles for DeltaUrls affected by this pattern and dissociates URLs from the pattern.""" - matched_urls = self.matched_urls() + delta_url = DeltaUrl.objects.create(**fields) - # Clear the `generated_title` for all matching DeltaUrls - matched_urls["matching_delta_urls"].update(generated_title="") + # Record successful title resolution + DeltaResolvedTitle.objects.create(title_pattern=self, delta_url=delta_url, resolved_title=new_title) - # Clear relationships - self.delta_urls.clear() - self.curated_urls.clear() + # Update titles for all matching Delta URLs + for delta_url in self.get_matching_delta_urls(): + if not self.is_most_distinctive_pattern(delta_url): + continue - class Meta: - verbose_name = "Delta Title Pattern" - verbose_name_plural = "Delta Title Patterns" - unique_together = ("collection", "match_pattern") + new_title, error = self.generate_title_for_url(delta_url) + if error: + DeltaResolvedTitleError.objects.create(title_pattern=self, delta_url=delta_url, error_string=error) + continue -class DeltaDocumentTypePattern(BaseMatchPattern): - document_type = models.IntegerField(choices=DocumentTypes.choices) + # Update title and record resolution - key change here + DeltaResolvedTitle.objects.update_or_create( + delta_url=delta_url, # Only use delta_url for lookup + defaults={"title_pattern": self, "resolved_title": new_title}, + ) - # We use `update_fields` in the base apply method to set 
`document_type`. - def apply(self) -> None: - super().apply(update_fields={"document_type": self.document_type}) + delta_url.generated_title = new_title + delta_url.save() + + # Update pattern relationships + self.update_affected_delta_urls_list() def unapply(self) -> None: - """Clear document type from associated delta and curated URLs.""" - self.delta_urls.update(document_type=None) + """ + Remove title modifications: + 1. Create Delta URLs for affected Curated URLs to explicitly clear titles + 2. Remove generated titles from affected Delta URLs + 3. Clean up Delta URLs that become identical to their Curated URL + 4. Clear resolution tracking + """ + DeltaUrl = apps.get_model("sde_collections", "DeltaUrl") + CuratedUrl = apps.get_model("sde_collections", "CuratedUrl") + DeltaResolvedTitle = apps.get_model("sde_collections", "DeltaResolvedTitle") + DeltaResolvedTitleError = apps.get_model("sde_collections", "DeltaResolvedTitleError") + + # Get all affected URLs + affected_deltas = self.delta_urls.all() + affected_curated = self.curated_urls.all() + + # Process each affected delta URL + for delta in affected_deltas: + curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first() + + if not curated: + # Scenario 1: Delta only - clear generated title + delta.generated_title = "" + delta.save() + else: + # Scenario 2: Both exist - revert to curated title + delta.generated_title = curated.generated_title + delta.save() + + # Check if delta is now redundant + fields_match = all( + getattr(delta, f.name) == getattr(curated, f.name) + for f in delta._meta.fields + if f.name not in ["id", "to_delete"] + ) + if fields_match: + delta.delete() + + # Handle curated URLs that don't have deltas + for curated in affected_curated: + if not DeltaUrl.objects.filter(url=curated.url).exists(): + # Scenario 3: Curated only - create delta with cleared title + fields = { + f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", 
"collection"] + } + fields["generated_title"] = "" + DeltaUrl.objects.create(collection=self.collection, **fields) + + # Clear resolution tracking + DeltaResolvedTitle.objects.filter(title_pattern=self).delete() + DeltaResolvedTitleError.objects.filter(title_pattern=self).delete() + + # Clear pattern relationships self.delta_urls.clear() self.curated_urls.clear() - class Meta: - verbose_name = "Delta Document Type Pattern" - verbose_name_plural = "Delta Document Type Patterns" - unique_together = ("collection", "match_pattern") + class Meta(BaseMatchPattern.Meta): + verbose_name = "Delta Title Pattern" + verbose_name_plural = "Delta Title Patterns" -class DeltaDivisionPattern(BaseMatchPattern): - division = models.IntegerField(choices=Divisions.choices) +class DeltaResolvedTitleBase(models.Model): + # TODO: need to understand this logic and whether we need to have these match to CuratedUrls as well - # We use `update_fields` in the base apply method to set `division`. - def apply(self) -> None: - super().apply(update_fields={"division": self.division}) + title_pattern = models.ForeignKey(DeltaTitlePattern, on_delete=models.CASCADE) + delta_url = models.OneToOneField("sde_collections.DeltaUrl", on_delete=models.CASCADE) + created_at = models.DateTimeField(auto_now_add=True) - def unapply(self) -> None: - """Clear division from associated delta and curated URLs.""" - # TODO: need to double check this logic for complicated cases - self.delta_urls.update(division=None) + class Meta: + abstract = True + + +class DeltaResolvedTitle(DeltaResolvedTitleBase): + resolved_title = models.CharField(blank=True, default="") class Meta: - verbose_name = "Delta Division Pattern" - verbose_name_plural = "Delta Division Patterns" - unique_together = ("collection", "match_pattern") + verbose_name = "Resolved Title" + verbose_name_plural = "Resolved Titles" + + def save(self, *args, **kwargs): + # Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it + 
DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete() + super().save(*args, **kwargs) -# @receiver(post_save, sender=DeltaTitlePattern) -# def post_save_handler(sender, instance, created, **kwargs): -# if created: -# transaction.on_commit(lambda: resolve_title_pattern.delay(instance.pk)) +class DeltaResolvedTitleError(DeltaResolvedTitleBase): + error_string = models.TextField(null=False, blank=False) + http_status_code = models.IntegerField(null=True, blank=True) diff --git a/sde_collections/models/delta_url.py b/sde_collections/models/delta_url.py index 3f1212e0..cefeae7a 100644 --- a/sde_collections/models/delta_url.py +++ b/sde_collections/models/delta_url.py @@ -4,27 +4,56 @@ from django.db import models from .collection_choice_fields import Divisions, DocumentTypes -from .delta_patterns import DeltaExcludePattern, DeltaTitlePattern +from .delta_patterns import DeltaExcludePattern, DeltaIncludePattern class DeltaUrlQuerySet(models.QuerySet): def with_exclusion_status(self): + """ + Annotate queryset with exclusion status, taking into account both exclude and include patterns. + Include patterns take precedence over exclude patterns. + """ return self.annotate( - excluded=models.Exists( + has_exclude=models.Exists( DeltaExcludePattern.delta_urls.through.objects.filter(deltaurl=models.OuterRef("pk")) - ) + ), + has_include=models.Exists( + DeltaIncludePattern.delta_urls.through.objects.filter(deltaurl=models.OuterRef("pk")) + ), + excluded=models.Case( + # If has_include is True, URL is not excluded regardless of exclude patterns + models.When(has_include=True, then=models.Value(False)), + # Otherwise, excluded status is determined by presence of exclude pattern + default=models.F("has_exclude"), + output_field=models.BooleanField(), + ), ) class CuratedUrlQuerySet(models.QuerySet): def with_exclusion_status(self): + """ + Annotate queryset with exclusion status, taking into account both exclude and include patterns. 
+ Include patterns take precedence over exclude patterns. + """ return self.annotate( - excluded=models.Exists( + has_exclude=models.Exists( DeltaExcludePattern.curated_urls.through.objects.filter(curatedurl=models.OuterRef("pk")) - ) + ), + has_include=models.Exists( + DeltaIncludePattern.curated_urls.through.objects.filter(curatedurl=models.OuterRef("pk")) + ), + excluded=models.Case( + # If has_include is True, URL is not excluded regardless of exclude patterns + models.When(has_include=True, then=models.Value(False)), + # Otherwise, excluded status is determined by presence of exclude pattern + default=models.F("has_exclude"), + output_field=models.BooleanField(), + ), ) +# Manager classes remain unchanged since they just use the updated QuerySets class DeltaUrlManager(models.Manager): def get_queryset(self): return DeltaUrlQuerySet(self.model, using=self._db).with_exclusion_status() @@ -145,31 +174,3 @@ class Meta: verbose_name = "Curated Urls" verbose_name_plural = "Curated Urls" ordering = ["url"] - - -class DeltaResolvedTitleBase(models.Model): - # TODO: need to understand this logic and whether we need to have thess match to CuratedUrls as well - title_pattern = models.ForeignKey(DeltaTitlePattern, on_delete=models.CASCADE) - delta_url = models.OneToOneField(DeltaUrl, on_delete=models.CASCADE) - created_at = models.DateTimeField(auto_now_add=True) - - class Meta: - abstract = True - - -class DeltaResolvedTitle(DeltaResolvedTitleBase): - resolved_title = models.CharField(blank=True, default="") - - class Meta: - verbose_name = "Resolved Title" - verbose_name_plural = "Resolved Titles" - - def save(self, *args, **kwargs): - # Finds the linked delta URL and deletes DeltaResolvedTitleError objects linked to it - DeltaResolvedTitleError.objects.filter(delta_url=self.delta_url).delete() - super().save(*args, **kwargs) - - -class DeltaResolvedTitleError(DeltaResolvedTitleBase): - error_string = models.TextField(null=False, blank=False) - http_status_code = 
models.IntegerField(null=True, blank=True) diff --git a/sde_collections/serializers.py b/sde_collections/serializers.py index b3470ce3..7b2fdc7f 100644 --- a/sde_collections/serializers.py +++ b/sde_collections/serializers.py @@ -64,15 +64,15 @@ class DeltaURLSerializer(serializers.ModelSerializer): delta_urls_count = serializers.SerializerMethodField(read_only=True) def get_delta_urls_count(self, obj): - titlepattern = obj.deltatitlepattern_delta_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.delta_urls.count() if titlepattern else 0 def get_generated_title_id(self, obj): - titlepattern = obj.deltatitlepattern_delta_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.id if titlepattern else None def get_match_pattern_type(self, obj): - titlepattern = obj.deltatitlepattern_delta_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.match_pattern_type if titlepattern else None class Meta: @@ -104,15 +104,15 @@ class CuratedURLSerializer(serializers.ModelSerializer): curated_urls_count = serializers.SerializerMethodField(read_only=True) def get_curated_urls_count(self, obj): - titlepattern = obj.deltatitlepattern_curated_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.curated_urls.count() if titlepattern else 0 def get_generated_title_id(self, obj): - titlepattern = obj.deltatitlepattern_curated_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.id if titlepattern else None def get_match_pattern_type(self, obj): - titlepattern = obj.deltatitlepattern_curated_urls.last() + titlepattern = obj.deltatitlepatterns.last() return titlepattern.match_pattern_type if titlepattern else None class Meta: diff --git a/sde_collections/tests/factories.py b/sde_collections/tests/factories.py index 414221d5..dded5d5c 100644 --- a/sde_collections/tests/factories.py +++ b/sde_collections/tests/factories.py @@ -66,25 +66,25 @@ class Meta: # 
division = 1 -class CuratedUrlFactory(factory.django.DjangoModelFactory): +class DeltaUrlFactory(factory.django.DjangoModelFactory): class Meta: - model = CuratedUrl + model = DeltaUrl collection = factory.SubFactory(CollectionFactory) url = factory.Faker("url") scraped_title = factory.Faker("sentence") - scraped_text = factory.Faker("paragraph") - generated_title = factory.Faker("sentence") - visited = factory.Faker("boolean") - document_type = 1 - division = 1 + to_delete = False -class DeltaUrlFactory(factory.django.DjangoModelFactory): +class CuratedUrlFactory(factory.django.DjangoModelFactory): class Meta: - model = DeltaUrl + model = CuratedUrl collection = factory.SubFactory(CollectionFactory) url = factory.Faker("url") scraped_title = factory.Faker("sentence") - to_delete = False + scraped_text = factory.Faker("paragraph") + generated_title = factory.Faker("sentence") + visited = factory.Faker("boolean") + document_type = 1 + division = 1 diff --git a/sde_collections/tests/test_delta_patterns.py b/sde_collections/tests/test_delta_patterns.py index 1fd9c886..b72981fc 100644 --- a/sde_collections/tests/test_delta_patterns.py +++ b/sde_collections/tests/test_delta_patterns.py @@ -3,177 +3,92 @@ import pytest from sde_collections.models.delta_patterns import ( - DeltaDivisionPattern, - DeltaDocumentTypePattern, DeltaExcludePattern, - DeltaIncludePattern, - DeltaTitlePattern, -) -from sde_collections.models.delta_url import ( - CuratedUrl, DeltaResolvedTitleError, - DeltaUrl, + DeltaTitlePattern, ) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl from sde_collections.tests.factories import ( CollectionFactory, CuratedUrlFactory, DeltaUrlFactory, - DumpUrlFactory, ) from sde_collections.utils.title_resolver import resolve_title @pytest.mark.django_db def test_exclusion_status(): + """ + new patterns should only exclude DeltaUrls, not CuratedUrls + """ collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, 
url="https://example.com/page") + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page/1") + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page/2") - # Create an exclusion pattern that should apply to this URL - DeltaExcludePattern.objects.create(collection=collection, match_pattern="https://example.com/page") + # confirm they both start as not excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False - # Assert that the `excluded` field is set to True, as expected - assert CuratedUrl.objects.get(pk=curated_url.pk).excluded + # Create an exclusion pattern matches both urls + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern="*page*", match_pattern_type=2) + pattern.apply() + + # curated urls should not be affected by patterns until the collection is promoted + # curated should be included, but delta should be excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False @pytest.mark.django_db class TestBaseMatchPattern: - def test_individual_url_pattern_matching(self): + def test_pattern_save_applies_effects(self): + """Test that pattern creation automatically applies effects.""" collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page") - pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern="https://example.com/page", match_pattern_type=1 # INDIVIDUAL_URL - ) - pattern.apply() - matching_urls = pattern.matched_urls() - CuratedUrl.objects.filter(collection=collection, url__regex=pattern.match_pattern) + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/test") - assert curated_url in matching_urls["matching_curated_urls"] - - def test_multi_url_pattern_matching(self): - collection = 
CollectionFactory() - curated_url_1 = CuratedUrlFactory(collection=collection, url="https://example.com/page1") - curated_url_2 = CuratedUrlFactory(collection=collection, url="https://example.com/page2") - pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2 # MULTI_URL_PATTERN + # Create pattern - should automatically apply + pattern = DeltaExcludePattern.objects.create( + collection=collection, match_pattern=curated_url.url, match_pattern_type=1 ) - matching_urls = pattern.matched_urls() - assert curated_url_1 in matching_urls["matching_curated_urls"] - assert curated_url_2 in matching_urls["matching_curated_urls"] - - def test_generate_delta_url_creation_and_update(self): - collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page") - pattern = DeltaIncludePattern.objects.create(collection=collection, match_pattern="https://example.com/page") - - # First call to generate DeltaUrl - pattern.generate_delta_url(curated_url, fields_to_copy=["scraped_title"]) + # Delta URL should be created and excluded delta_url = DeltaUrl.objects.get(url=curated_url.url) - original_delta_title = delta_url.scraped_title - assert delta_url.scraped_title == curated_url.scraped_title - - # Update DeltaUrl with additional fields - # this is kinda weird, but basically if you have a deltaurl with a - # scraped_title, that value is gospel. if for some reason generate_delta_url is called - # again and it hits that deltaurl, it will not update the scraped_title field, since that - # field already exists and is assumed correct. - # this is true of title. but i think not of other fields? 
- curated_url.scraped_title = "Updated Title" - curated_url.save() - curated_url.refresh_from_db() - pattern.generate_delta_url(curated_url, fields_to_copy=["scraped_title"]) - delta_url.refresh_from_db() - assert delta_url.scraped_title == original_delta_title - - def test_apply_creates_delta_url_if_curated_url_does_not_exist(self): - """ - Ensures that the `apply` logic creates a new `DeltaUrl` if a matching `CuratedUrl` does not exist. - """ - collection = CollectionFactory() - delta_url = DeltaUrlFactory( - collection=collection, url="https://example.com/page", scraped_title="Original Title" - ) - - # Create a pattern matching the URL - pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2 - ) - - # Apply the pattern - pattern.apply() - - # Verify that a DeltaUrl is created - assert DeltaUrl.objects.filter(url=delta_url.url).exists() + assert delta_url.excluded is True + assert pattern.delta_urls.filter(id=delta_url.id).exists() - def test_apply_skips_delta_url_creation_if_curated_url_exists(self): - """ - Ensures that the `apply` logic does not create a new `DeltaUrl` if a matching `CuratedUrl` already exists. 
- """ + def test_pattern_delete_removes_effects(self): + """Test that deleting a pattern properly removes its effects.""" collection = CollectionFactory() - delta_url = DeltaUrlFactory( - collection=collection, url="https://example.com/page", scraped_title="Original Title" - ) + curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/test") - # Create a pattern matching the URL - pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2 - ) + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern=curated_url.url) - # Promote the DeltaUrl to a CuratedUrl - collection.promote_to_curated() - curated_url = CuratedUrl.objects.get(url=delta_url.url) + # Verify initial state + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.excluded is True - # ReApply the pattern - pattern.apply() + # Delete pattern + pattern.delete() - # Verify that no DeltaUrl is created after the CuratedUrl exists + # Delta URL should be gone since it was only created for exclusion assert not DeltaUrl.objects.filter(url=curated_url.url).exists() - def test_apply_creates_delta_url_if_no_curated_url_exists(self): - """ - Ensures that if no `CuratedUrl` exists for a given pattern, a new `DeltaUrl` is created. 
- """ - collection = CollectionFactory() - dump_url = DumpUrlFactory(collection=collection, url="https://example.com/page", scraped_title="New Title") + def test_different_collections_isolation(self): + """Test that patterns only affect URLs in their collection.""" + collection1 = CollectionFactory() + collection2 = CollectionFactory() - # Migrate DumpUrl to DeltaUrl - collection.migrate_dump_to_delta() + # Create URLs with different paths + curated_url1 = CuratedUrlFactory(collection=collection1, url="https://example.com/test1") + curated_url2 = CuratedUrlFactory(collection=collection2, url="https://example.com/test2") - # Create a pattern matching the URL - pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2 + DeltaExcludePattern.objects.create( + collection=collection1, match_pattern="https://example.com/*", match_pattern_type=2 ) - # Apply the pattern - pattern.apply() - - # A `DeltaUrl` should now exist - delta_url = DeltaUrl.objects.get(url=dump_url.url) - assert delta_url.scraped_title == dump_url.scraped_title - - def test_apply_and_unapply_pattern(self): - # if we make a new exclude pattern and it affects an old url - # that wasn't previously affected, what should happen? 
- # for now, let's say the curated_url should be excluded, and a delta_url is created which is also excluded - # when the pattern is deleted, they should both be unexcluded again - collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page") - assert not CuratedUrl.objects.get(pk=curated_url.pk).excluded - - pattern = DeltaExcludePattern.objects.create( - collection=collection, - match_pattern="https://example.com/*", - match_pattern_type=2, # MULTI_URL_PATTERN - ) - - assert CuratedUrl.objects.get(pk=curated_url.pk).excluded - assert DeltaUrl.objects.get(url=curated_url.url).excluded - - pattern.delete() - - # TODO: for now the DeltaUrl is persisting, but i think we might want to find a way to delete it eventually - assert not CuratedUrl.objects.get(pk=curated_url.pk).excluded - assert not DeltaUrl.objects.get(url=curated_url.url).excluded + # Only collection1's URL should be affected + assert DeltaUrl.objects.filter(collection=collection1, url=curated_url1.url).exists() + assert not DeltaUrl.objects.filter(collection=collection2, url=curated_url2.url).exists() @pytest.mark.django_db @@ -186,7 +101,6 @@ def test_apply_generates_delta_url_if_title_differs(self): collection=collection, url="https://example.com/page", scraped_title="Sample Title", - generated_title="Old Title - Processed", ) # Step 2: Create a `DeltaTitlePattern` with a new title pattern @@ -197,9 +111,6 @@ def test_apply_generates_delta_url_if_title_differs(self): title_pattern="{title} - Processed New", ) - # Apply the pattern - pattern.apply() - # Step 3: A new DeltaUrl should be created with the updated `generated_title` delta_url = DeltaUrl.objects.get(url=curated_url.url) expected_generated_title = resolve_title( @@ -302,7 +213,7 @@ def test_unapply_removes_pattern_relationships(self): curated_url = CuratedUrlFactory( collection=collection, url="https://example.com/page", scraped_title="Sample Title" ) - delta_url = 
DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="Sample Title") + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/page", scraped_title="New Title") # Create and apply a `DeltaTitlePattern` pattern = DeltaTitlePattern.objects.create( @@ -316,7 +227,8 @@ def test_unapply_removes_pattern_relationships(self): # Ensure relationships are set assert pattern.delta_urls.filter(pk=delta_url.pk).exists() - assert pattern.curated_urls.filter(pk=curated_url.pk).exists() + # this actually shouldn't match until after promotion + assert not pattern.curated_urls.filter(pk=curated_url.pk).exists() # Unapply the pattern pattern.unapply() @@ -354,60 +266,3 @@ def test_pattern_reapplication_does_not_duplicate_delta_urls(self): # Ensure no new `DeltaUrl` is created after reapplying the pattern pattern.apply() assert DeltaUrl.objects.filter(url=curated_url.url).count() == 0 - - -@pytest.mark.django_db -class TestDeltaDocumentTypePattern: - def test_apply_document_type_pattern(self): - collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page") - pattern = DeltaDocumentTypePattern.objects.create( - collection=collection, - match_pattern="https://example.com/page", - document_type=2, # A different document type than default - ) - pattern.apply() - - delta_url = DeltaUrl.objects.get(url=curated_url.url) - assert delta_url.document_type == 2 - - def test_unapply_document_type_pattern(self): - collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page") - pattern = DeltaDocumentTypePattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2, document_type=2 - ) - pattern.apply() - - delta_url = DeltaUrl.objects.get(url=curated_url.url) - assert delta_url.document_type == 2 - - pattern.unapply() - delta_url.refresh_from_db() - assert delta_url.document_type is 
None - - -@pytest.mark.django_db -class TestDeltaDivisionPattern: - def test_apply_and_unapply_division_pattern(self): - # Step 1: Create a collection and a CuratedUrl that matches the pattern - collection = CollectionFactory() - curated_url = CuratedUrlFactory(collection=collection, url="https://example.com/page", division=1) - - # Step 2: Create a DeltaDivisionPattern to apply to matching URLs - pattern = DeltaDivisionPattern.objects.create( - collection=collection, match_pattern="https://example.com/*", match_pattern_type=2, division=2 - ) - - # Step 3: Apply the pattern, which should generate a DeltaUrl with the division set to 2 - delta_url = DeltaUrl.objects.get(url=curated_url.url) - assert delta_url.division == 2 - - # confirm the curated url maintains its original division - curated_url = CuratedUrl.objects.get(url=curated_url.url) - assert curated_url.division == 1 - - # Step 4: Unapply the pattern and confirm the division field is cleared - pattern.unapply() - delta_url.refresh_from_db() - assert delta_url.division is None diff --git a/sde_collections/tests/test_exclude_patterns.py b/sde_collections/tests/test_exclude_patterns.py new file mode 100644 index 00000000..3bf474d2 --- /dev/null +++ b/sde_collections/tests/test_exclude_patterns.py @@ -0,0 +1,366 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_exclude_patterns.py + +import pytest +from django.contrib.contenttypes.models import ContentType +from django.db import IntegrityError +from django.test import TestCase + +from sde_collections.models.delta_patterns import DeltaExcludePattern +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + + +class BaseCollectionTest(TestCase): + def setUp(self): + super().setUp() + self.collection = CollectionFactory() + + # Ensure ContentTypes are created for all pattern models + 
ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltaexcludepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltaincludepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltatitlepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltadocumenttypepattern", + ) + ContentType.objects.get_or_create( + app_label="sde_collections", + model="deltadivisionpattern", + ) + + +@pytest.mark.django_db +class TestDeltaExcludePatternBasics(TestCase): + """Test basic functionality of exclude patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_create_simple_exclude_pattern(self): + """Test creation of a basic exclude pattern.""" + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="https://example.com/exclude-me", reason="Test exclusion" + ) + assert pattern.match_pattern_type == DeltaExcludePattern.MatchPatternTypeChoices.INDIVIDUAL_URL + + def test_exclude_single_curated_url(self): + """Test excluding a single curated URL creates appropriate delta.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/exclude-me", scraped_title="Test Title" + ) + + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Pattern should create a delta URL + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert pattern.delta_urls.filter(id=delta_url.id).exists() + assert not pattern.curated_urls.filter(id=curated_url.id).exists() + + def test_exclude_single_curated_url_multiple_applies(self): + """ + Test excluding a single curated URL creates appropriate delta. 
+ even if the pattern is applied multiple times + """ + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/exclude-me", scraped_title="Test Title" + ) + + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + pattern.save() + pattern.apply() + pattern.apply() + pattern.save() + + # Pattern should create a delta URL + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert pattern.delta_urls.filter(id=delta_url.id).exists() + assert not pattern.curated_urls.filter(id=curated_url.id).exists() + + def test_wildcard_pattern_exclusion(self): + """Test excluding multiple URLs with wildcard pattern.""" + # Create multiple curated URLs + urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/docs/internal/{i}", + scraped_title=f"Internal Doc {i}", + ) + for i in range(3) + ] + + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/internal/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # All URLs should have corresponding deltas + assert DeltaUrl.objects.filter(collection=self.collection).count() == 3 + for url in urls: + assert pattern.delta_urls.filter(url=url.url).exists() + assert not pattern.curated_urls.filter(id=url.id).exists() + + def test_exclusion_selectivity(self): + """ + new patterns should only exclude DeltaUrls, not CuratedUrls + """ + curated_url = CuratedUrlFactory(collection=self.collection, url="https://example.com/page/1") + delta_url = DeltaUrlFactory(collection=self.collection, url="https://example.com/page/2") + + # confirm they both start as not excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + # Create an exclusion pattern matches both urls + pattern = 
DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="*page*", match_pattern_type=2 + ) + pattern.apply() + + # curated urls should not be affected by patterns until the collection is promoted + # curated should be included, but delta should be excluded + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + assert CuratedUrl.objects.get(pk=curated_url.pk).excluded is False + + +class TestDeltaExcludePatternWorkflow(BaseCollectionTest): + """Test complex workflows involving exclude patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_removal_creates_reversal_deltas(self): + """ + Test that removing an exclude pattern after promotion creates delta URLs + to reverse the exclusion of previously excluded curated URLs. + """ + collection = self.collection + # Create curated URL + curated_url = CuratedUrlFactory( + collection=collection, url="https://example.com/test", scraped_title="Test Title" + ) + + # Create exclude pattern - this should create excluded delta URL + pattern = DeltaExcludePattern.objects.create(collection=collection, match_pattern=curated_url.url) + + # Verify delta URL was created and is excluded + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.excluded is True + + # Promote collection - this should convert excluded delta URL to excluded curated URL + collection.promote_to_curated() + + # Verify curated URL is now excluded and delta URL is gone + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + curated_url = CuratedUrl.objects.get(url=curated_url.url) + assert curated_url.excluded is True + + # Remove pattern - this should create new delta URL to show URL will be included + pattern.delete() + + reversal_delta = DeltaUrl.objects.get(url=curated_url.url) + assert reversal_delta.excluded is False + + collection.promote_to_curated() + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + curated_url = 
CuratedUrl.objects.get(url=curated_url.url) + assert curated_url.excluded is False + + def test_promote_and_new_exclude_workflow(self): + """Test workflow: add URLs, exclude some, promote, then add new exclude pattern.""" + # Initial setup with curated URLs + [ + CuratedUrlFactory(collection=self.collection, url=f"https://example.com/page{i}", scraped_title=f"Page {i}") + for i in range(3) + ] + + # Create first exclude pattern + DeltaExcludePattern.objects.create(collection=self.collection, match_pattern="https://example.com/page1") + + # Verify delta URL created + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + + # Simulate promotion + self.collection.promote_to_curated() + + # Create new exclude pattern after promotion + pattern2 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern="https://example.com/page2" + ) + + # Should have new delta URL for newly excluded URL + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + assert pattern2.delta_urls.count() == 1 + + def test_dump_migration_with_excludes(self): + """Test handling of excluded URLs during dump migration.""" + # Create initial curated URLs + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Original Title" + ) + + # Create exclude pattern, this should not effect the curated + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Create dump URL with different content, same as curated, different title, will make delta + DumpUrlFactory(collection=self.collection, url=curated_url.url, scraped_title="Updated Title") + + # Migrate dump to delta + self.collection.migrate_dump_to_delta() + + # Should have delta URL reflecting both exclusion and content change + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.scraped_title == "Updated Title" + assert 
pattern.delta_urls.filter(id=delta_url.id).exists() + assert delta_url.excluded is True + + +class TestDeltaExcludePatternEdgeCases(TestCase): + """Test edge cases and complex scenarios.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_exclude_pattern_uniqueness(self): + """Test that we cannot create duplicate exclude patterns for the same URL in a collection.""" + from django.db import transaction + + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Test Title" + ) + + # Create first exclude pattern + pattern1 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="First exclusion" + ) + + # Verify we start with one pattern + assert DeltaExcludePattern.objects.filter(collection=self.collection).count() == 1 + + # Attempt to create second exclude pattern with same match_pattern should fail + with pytest.raises(IntegrityError), transaction.atomic(): + DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="Second exclusion" + ) + + # Verify we still only have one pattern + assert DeltaExcludePattern.objects.filter(collection=self.collection).count() == 1 + + # Verify only one delta URL exists and is associated with the pattern + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert pattern1.delta_urls.filter(id=delta_url.id).exists() + + def test_different_patterns_matching_same_url(self): + """Test that different patterns can affect the same URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test/page", scraped_title="Test Title" + ) + + # Create pattern matching exact URL + pattern1 = DeltaExcludePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, reason="Exact match exclusion" + ) + + # Create pattern with wildcard 
that also matches the URL + pattern2 = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/test/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + reason="Wildcard exclusion", + ) + + # Should still only have one delta URL + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + delta_url = DeltaUrl.objects.get(url=curated_url.url) + + # URL should be associated with both patterns + assert pattern1.delta_urls.filter(id=delta_url.id).exists() + assert pattern2.delta_urls.filter(id=delta_url.id).exists() + + def test_exclude_modified_url(self): + """Test excluding a URL that already has modifications in delta doesn't lose delta mods""" + # Create curated URL + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/test", scraped_title="Original Title" + ) + + # Create modified delta URL + DeltaUrlFactory(collection=self.collection, url=curated_url.url, scraped_title="Modified Title") + + # Create exclude pattern + pattern = DeltaExcludePattern.objects.create(collection=self.collection, match_pattern=curated_url.url) + + # Should still only have one delta URL with both modification and exclusion + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + updated_delta = DeltaUrl.objects.get(url=curated_url.url) + assert updated_delta.scraped_title == "Modified Title" + assert pattern.delta_urls.filter(id=updated_delta.id).exists() + + def test_pattern_update_workflow(self): + """ + Test updating an exclude pattern's criteria properly updates URL associations + while preserving existing delta changes. 
+ """ + # Create multiple curated URLs + urls = [ + CuratedUrlFactory( + collection=self.collection, url=f"https://example.com/section{i}/page", scraped_title=f"Page {i}" + ) + for i in range(3) + ] + + # Create a delta URL for section1 with a modified title + DeltaUrlFactory( + collection=self.collection, url=urls[1].url, scraped_title="Modified Title for Section 1" # section1 + ) + + # Create initial pattern matching section1/* + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/section1/*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Verify initial state + assert not pattern.delta_urls.filter(url=urls[0].url).exists() # section0 + assert pattern.delta_urls.filter(url=urls[1].url).exists() # section1 + assert not pattern.delta_urls.filter(url=urls[2].url).exists() # section2 + + # Verify the delta URL still exists and has its modified title + assert DeltaUrl.objects.filter(url=urls[1].url).exists() + assert DeltaUrl.objects.get(url=urls[1].url).scraped_title == "Modified Title for Section 1" + + # Update pattern to match section2/* instead + pattern.match_pattern = "https://example.com/section2/*" + pattern.save() + + # Verify pattern associations have updated correctly + assert not pattern.delta_urls.filter(url=urls[0].url).exists() # section0 + assert not pattern.delta_urls.filter(url=urls[1].url).exists() # section1 + assert pattern.delta_urls.filter(url=urls[2].url).exists() # section2 + + # Verify section1's delta URL still exists with its modified title + assert DeltaUrl.objects.filter(url=urls[1].url).exists() + delta_after_update = DeltaUrl.objects.get(url=urls[1].url) + assert delta_after_update.scraped_title == "Modified Title for Section 1" diff --git a/sde_collections/tests/test_field_modifier_patterns.py b/sde_collections/tests/test_field_modifier_patterns.py new file mode 100644 index 00000000..db15a21e --- /dev/null +++ 
b/sde_collections/tests/test_field_modifier_patterns.py @@ -0,0 +1,490 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_field_modifier_patterns.py + +import pytest +from django.contrib.contenttypes.models import ContentType +from django.db import IntegrityError +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, +) +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl + +from .factories import CollectionFactory, CuratedUrlFactory, DeltaUrlFactory + + +class BaseCollectionTest(TestCase): + def setUp(self): + super().setUp() + self.collection = CollectionFactory() + + # Ensure ContentTypes are created for all pattern models + for model in [ + "deltaexcludepattern", + "deltaincludepattern", + "deltatitlepattern", + "deltadocumenttypepattern", + "deltadivisionpattern", + ]: + ContentType.objects.get_or_create( + app_label="sde_collections", + model=model, + ) + + +@pytest.mark.django_db +class TestFieldModifierPatternBasics(TestCase): + """Test basic functionality of field modifier patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_create_document_type_pattern_single(self): + """Test creation of a document type pattern for single URL.""" + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/docs/guide.pdf", + document_type=DocumentTypes.DOCUMENTATION, + ) + assert pattern.match_pattern_type == DeltaDocumentTypePattern.MatchPatternTypeChoices.INDIVIDUAL_URL + assert pattern.document_type == DocumentTypes.DOCUMENTATION + + def test_create_document_type_pattern_multi(self): + """Test creation of a document type pattern with wildcard.""" + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + 
match_pattern="https://example.com/docs/*.pdf", + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + document_type=DocumentTypes.DOCUMENTATION, + ) + assert pattern.match_pattern_type == DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN + assert pattern.document_type == DocumentTypes.DOCUMENTATION + + def test_create_division_pattern(self): + """Test creation of a division pattern.""" + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/helio/data.html", + division=Divisions.HELIOPHYSICS, + ) + assert pattern.match_pattern_type == DeltaDivisionPattern.MatchPatternTypeChoices.INDIVIDUAL_URL + assert pattern.division == Divisions.HELIOPHYSICS + + def test_modify_single_curated_url_document_type(self): + """Test modifying document type for a single curated URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/tools/analysis.html", document_type=DocumentTypes.DATA + ) + + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.SOFTWARETOOLS + ) + + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.document_type == DocumentTypes.SOFTWARETOOLS + assert pattern.delta_urls.filter(id=delta_url.id).exists() + # curated url should be unchanged + assert CuratedUrl.objects.get(url=curated_url.url).document_type == DocumentTypes.DATA + + def test_modify_single_curated_url_division(self): + """Test modifying division for a single curated URL.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/planetary/mars.html", division=Divisions.EARTH_SCIENCE + ) + + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.PLANETARY + ) + + delta_url = 
DeltaUrl.objects.get(url=curated_url.url) + assert delta_url is not None + assert delta_url.division == Divisions.PLANETARY + assert pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternBehavior(TestCase): + """Test complex behaviors of field modifier patterns.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_with_existing_delta(self): + """Test applying pattern when delta URL already exists.""" + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/instruments/telescope.html", + document_type=DocumentTypes.DOCUMENTATION, + ) + + # Create delta URL with different title + delta_url = DeltaUrlFactory( + collection=self.collection, + url=curated_url.url, + scraped_title="Updated Telescope Info", + document_type=DocumentTypes.DOCUMENTATION, + ) + + # Apply pattern - should modify existing delta + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.MISSIONSINSTRUMENTS + ) + + # Should still be only one delta URL with both changes + assert DeltaUrl.objects.filter(collection=self.collection).count() == 1 + updated_delta = DeltaUrl.objects.get(url=curated_url.url) + assert updated_delta.id == delta_url.id + assert updated_delta.document_type == DocumentTypes.MISSIONSINSTRUMENTS + assert updated_delta.scraped_title == "Updated Telescope Info" + assert CuratedUrl.objects.get(url=curated_url.url).document_type == DocumentTypes.DOCUMENTATION + + def test_multi_url_pattern_modification(self): + """Test modifying multiple URLs with wildcard pattern.""" + # Create multiple curated URLs + [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/images/galaxy{i}.jpg", + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(3) + ] + + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + 
match_pattern="https://example.com/images/*.jpg", + document_type=DocumentTypes.IMAGES, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + assert DeltaUrl.objects.filter(collection=self.collection).count() == 3 + for delta_url in DeltaUrl.objects.all(): + assert delta_url.document_type == DocumentTypes.IMAGES + assert pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternLifecycle(TestCase): + """Test pattern lifecycle including promotion and removal.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_removal_creates_reversal_deltas(self): + """Test that removing a pattern creates deltas to reverse its effects.""" + curated_url = CuratedUrlFactory( + collection=self.collection, url="https://example.com/bio/experiment.html", division=Divisions.GENERAL + ) + + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.BIOLOGY + ) + + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.division == Divisions.BIOLOGY + + self.collection.promote_to_curated() + + curated_url = CuratedUrl.objects.get(url=curated_url.url) + + assert curated_url.division == Divisions.BIOLOGY + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + pattern.delete() + + # when all you have in the system is a curated url and a pattern setting a value + # removal of the pattern should make a delta that sets the value to None + reversal_delta = DeltaUrl.objects.get(url=curated_url.url) + assert reversal_delta.division is None + + def test_multiple_patterns_same_url(self): + """Test that different types of patterns can affect same URL.""" + url = "https://example.com/astro/telescope_data.fits" + + CuratedUrlFactory( + collection=self.collection, url=url, division=Divisions.GENERAL, document_type=DocumentTypes.DOCUMENTATION + ) + + # Apply both division and 
document type patterns + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=url, division=Divisions.ASTROPHYSICS + ) + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DATA + ) + + # Should have one delta URL reflecting both changes + assert DeltaUrl.objects.count() == 1 + delta_url = DeltaUrl.objects.get() + assert delta_url.division == Divisions.ASTROPHYSICS + assert delta_url.document_type == DocumentTypes.DATA + assert division_pattern.delta_urls.filter(id=delta_url.id).exists() + assert doc_type_pattern.delta_urls.filter(id=delta_url.id).exists() + + +@pytest.mark.django_db +class TestFieldModifierPatternConstraints(TestCase): + """Test pattern constraints and validation.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_pattern_uniqueness_per_collection(self): + """Test that patterns must be unique per collection.""" + url = "https://example.com/data/sample.fits" + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DATA + ) + + with pytest.raises(IntegrityError): + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=url, document_type=DocumentTypes.DOCUMENTATION + ) + + +@pytest.mark.django_db +class TestFieldModifierDeltaCleanup(TestCase): + """ + Test complex delta URL cleanup scenarios, particularly around pattern removal + and interaction between multiple patterns. + """ + + def setUp(self): + self.collection = CollectionFactory() + + def test_delta_retained_with_other_changes(self): + """ + Test that a delta URL with changes from multiple patterns is properly + handled when one pattern is removed. 
+ """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + scraped_title="Original Title", # Adding this to test preservation of manual changes + ) + + # Create two patterns affecting the same URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.ASTROPHYSICS + ) + + DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Manually modify the title to simulate a non-pattern change + delta_url = DeltaUrl.objects.get(url=curated_url.url) + delta_url.scraped_title = "Modified Title" + delta_url.save() + + # Remove one pattern - delta should be retained with other changes + division_pattern.delete() + + # Delta should still exist with doc type change and manual title change + retained_delta = DeltaUrl.objects.get(url=curated_url.url) + assert retained_delta.document_type == DocumentTypes.DATA + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.division == Divisions.GENERAL # Division reverted to curated value + + def test_delta_cleanup_after_all_patterns_removed(self): + """ + Test cleanup of delta URLs when all patterns affecting them are removed, + but only if no other changes exist. 
+ """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + ) + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, document_type=DocumentTypes.DATA + ) + + # Verify delta exists with both changes + delta_url = DeltaUrl.objects.get(url=curated_url.url) + assert delta_url.document_type == DocumentTypes.DATA + + # Remove pattern + doc_type_pattern.delete() + + assert not DeltaUrl.objects.filter(url=curated_url.url).exists() + + def test_delta_cleanup_with_manual_changes(self): + """ + Test that deltas are retained when patterns are removed but manual changes exist. + """ + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + scraped_title="Original Title", + ) + + # Create pattern and let it create a delta + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.ASTROPHYSICS + ) + + # Add manual change to delta + delta_url = DeltaUrl.objects.get(url=curated_url.url) + delta_url.scraped_title = "Modified Title" + delta_url.save() + + # Remove pattern + pattern.delete() + + # Delta should be retained due to manual title change + retained_delta = DeltaUrl.objects.get(url=curated_url.url) + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.division == Divisions.GENERAL + + def test_multi_url_pattern_cleanup(self): + """ + Test cleanup behavior when removing a pattern that affects multiple URLs. 
+ """ + # Create several curated URLs + curated_urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/data/set{i}.fits", + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(3) + ] + + # Create pattern affecting all URLs + pattern = DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/data/*.fits", + document_type=DocumentTypes.DATA, + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Modify one delta with additional changes + delta_to_retain = DeltaUrl.objects.get(url=curated_urls[0].url) + delta_to_retain.scraped_title = "Modified Title" + delta_to_retain.save() + + # Remove pattern + pattern.delete() + + # Only the delta with manual changes should remain + assert DeltaUrl.objects.count() == 1 + retained_delta = DeltaUrl.objects.get() + assert retained_delta.url == curated_urls[0].url + assert retained_delta.scraped_title == "Modified Title" + assert retained_delta.document_type == DocumentTypes.DOCUMENTATION + + def test_pattern_removal_after_promotion(self): + """ + Test that removing a pattern after promotion creates appropriate reversal deltas. 
+ """ + curated_urls = [ + CuratedUrlFactory( + collection=self.collection, + url=f"https://example.com/helio/data{i}.fits", + division=Divisions.GENERAL, + document_type=DocumentTypes.DOCUMENTATION, + ) + for i in range(2) + ] + + # Create patterns and manually modify one URL + division_pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="https://example.com/helio/*.fits", + division=Divisions.HELIOPHYSICS, + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Modify first delta with additional changes + delta = DeltaUrl.objects.get(url=curated_urls[0].url) + delta.scraped_title = "Modified Title" + delta.save() + + # Promote collection + self.collection.promote_to_curated() + + # Remove pattern - should create reversal deltas + division_pattern.delete() + + # Should have two deltas: one with just division reversal, + # one with division reversal plus preserved title change + assert DeltaUrl.objects.count() == 2 + + # Check delta with manual changes + modified_delta = DeltaUrl.objects.get(url=curated_urls[0].url) + assert modified_delta.division is None + assert modified_delta.scraped_title == "Modified Title" + + # Check plain reversal delta + plain_delta = DeltaUrl.objects.get(url=curated_urls[1].url) + assert plain_delta.division is None + assert plain_delta.scraped_title == curated_urls[1].scraped_title + + def test_pattern_removal_creates_null_deltas(self): + """ """ + curated_url = DeltaUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.fits", + division=Divisions.ASTROPHYSICS, + document_type=DocumentTypes.DATA, + ) + + # Create pattern + pattern = DeltaDivisionPattern.objects.create( + collection=self.collection, match_pattern=curated_url.url, division=Divisions.HELIOPHYSICS + ) + + # Verify initial state + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.division == Divisions.HELIOPHYSICS + + # Remove pattern + pattern.delete() + 
+ # Should have delta with explicit NULL + new_delta = DeltaUrl.objects.get(url=curated_url.url) + assert new_delta.division is None + + # def test_pattern_removal_with_multiple_patterns(self): + # """ + # Test that removing one pattern doesn't NULL the field if other + # patterns of same type still affect the URL. + # """ + # # TODO: The official stance right now is to simply not make overlapping patterns like this + # # in the future, if this behavior is allowed, then this would be the test case. + # # right now, this behavior is not coded for, and this test does not pass. + + # curated_url = CuratedUrlFactory( + # collection=self.collection, url="https://example.com/astro/data.fits", division=Divisions.GENERAL + # ) + + # # Create two patterns affecting same URL + # pattern1 = DeltaDivisionPattern.objects.create( + # collection=self.collection, + # match_pattern="*.fits", + # division=Divisions.ASTROPHYSICS, + # match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + # ) + + # DeltaDivisionPattern.objects.create( + # collection=self.collection, match_pattern=curated_url.url, division=Divisions.HELIOPHYSICS + # ) + + # # Remove one pattern + # pattern1.delete() + + # # Delta should retain value from remaining pattern + # delta = DeltaUrl.objects.get(url=curated_url.url) + # assert delta.division == Divisions.HELIOPHYSICS diff --git a/sde_collections/tests/test_include_patterns.py b/sde_collections/tests/test_include_patterns.py new file mode 100644 index 00000000..4212efa5 --- /dev/null +++ b/sde_collections/tests/test_include_patterns.py @@ -0,0 +1,132 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_include_patterns.py +import pytest + +from sde_collections.models.delta_patterns import ( + DeltaExcludePattern, + DeltaIncludePattern, +) +from sde_collections.models.delta_url import DeltaUrl +from sde_collections.tests.factories import ( + CollectionFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + 
+ +@pytest.mark.django_db +def test_patterns_applied_after_migration(): + collection = CollectionFactory() + + # Add DumpUrls to migrate - using folder-based structure + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/1") + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/2") + DumpUrlFactory(collection=collection, url="https://example.com/included_docs/1") + DumpUrlFactory(collection=collection, url="https://example.com/other_docs/1") + # This URL should be included despite being in excluded_docs folder + DumpUrlFactory(collection=collection, url="https://example.com/excluded_docs/included") + + # Create exclude pattern for excluded_docs folder + exclude_pattern = DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded_docs/*", match_pattern_type=2 + ) + + # Create include patterns + include_pattern = DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/included_docs/*", match_pattern_type=2 + ) + + # Specific include pattern that overrides the excluded_docs folder + specific_include = DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded_docs/included", match_pattern_type=1 + ) + + # Perform the migration + collection.migrate_dump_to_delta() + + # Verify pattern relationships + assert exclude_pattern.delta_urls.filter( + url="https://example.com/excluded_docs/1" + ).exists(), "Exclude pattern not applied to excluded_docs" + + assert include_pattern.delta_urls.filter( + url="https://example.com/included_docs/1" + ).exists(), "Include pattern not applied to included_docs" + + # Verify URL in other_docs is unaffected + assert not exclude_pattern.delta_urls.filter( + url="https://example.com/other_docs/1" + ).exists(), "Exclude pattern incorrectly applied to other_docs" + assert not include_pattern.delta_urls.filter( + url="https://example.com/other_docs/1" + ).exists(), 
"Include pattern incorrectly applied to other_docs" + + # Verify excluded status + excluded_url = DeltaUrl.objects.get(url="https://example.com/excluded_docs/1") + included_url = DeltaUrl.objects.get(url="https://example.com/included_docs/1") + neutral_url = DeltaUrl.objects.get(url="https://example.com/other_docs/1") + override_url = DeltaUrl.objects.get(url="https://example.com/excluded_docs/included") + + assert excluded_url.excluded is True, "URL in excluded_docs should be excluded" + assert included_url.excluded is False, "URL in included_docs should not be excluded" + assert neutral_url.excluded is False, "URL in other_docs should not be excluded" + assert ( + override_url.excluded is False + ), "Specifically included URL should not be excluded despite being in excluded_docs" + + # Verify both patterns are applied to the override URL + assert exclude_pattern.delta_urls.filter(url="https://example.com/excluded_docs/included").exists() + assert specific_include.delta_urls.filter(url="https://example.com/excluded_docs/included").exists() + + +# Test cases for the updated functionality +@pytest.mark.django_db +class TestUrlExclusionInclusion: + def test_exclusion_with_no_patterns(self): + """Test that URLs are not excluded by default""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection) + + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + + def test_exclusion_pattern_only(self): + """Test that exclude patterns work when no include patterns exist""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/excluded") + + DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/excluded", match_pattern_type=1 + ) + + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is True + + def test_include_pattern_overrides_exclude(self): + """Test that include patterns take precedence over exclude patterns""" + collection = 
CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/both") + + # Create both exclude and include patterns for the same URL + DeltaExcludePattern.objects.create( + collection=collection, match_pattern="https://example.com/both", match_pattern_type=1 + ) + + DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/both", match_pattern_type=1 + ) + + # URL should not be excluded because include takes precedence + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False + + def test_wildcard_patterns(self): + """Test that wildcard patterns work correctly with include/exclude precedence""" + collection = CollectionFactory() + delta_url = DeltaUrlFactory(collection=collection, url="https://example.com/docs/file.pdf") + + # Exclude all PDFs but include those in /docs/ + DeltaExcludePattern.objects.create(collection=collection, match_pattern="*.pdf", match_pattern_type=2) + + DeltaIncludePattern.objects.create( + collection=collection, match_pattern="https://example.com/docs/*", match_pattern_type=2 + ) + + # URL should not be excluded because the include pattern matches + assert DeltaUrl.objects.get(pk=delta_url.pk).excluded is False diff --git a/sde_collections/tests/test_migrate_dump.py b/sde_collections/tests/test_migrate_dump.py index 451dd0be..c0f460d6 100644 --- a/sde_collections/tests/test_migrate_dump.py +++ b/sde_collections/tests/test_migrate_dump.py @@ -3,11 +3,12 @@ import pytest +from sde_collections.models.collection_choice_fields import DocumentTypes from sde_collections.models.delta_patterns import ( + DeltaDocumentTypePattern, DeltaExcludePattern, - DeltaIncludePattern, ) -from sde_collections.models.delta_url import DeltaUrl, DumpUrl +from sde_collections.models.delta_url import CuratedUrl, DeltaUrl, DumpUrl from sde_collections.tests.factories import ( CollectionFactory, CuratedUrlFactory, @@ -47,7 +48,7 @@ def test_create_or_update_delta_url_delete(self): 
collection.create_or_update_delta_url(curated_url, to_delete=True) delta = DeltaUrl.objects.get(url=curated_url.url) assert delta.to_delete is True - assert delta.scraped_title == "" + assert delta.scraped_title == curated_url.scraped_title @pytest.mark.django_db @@ -76,7 +77,7 @@ def test_url_in_curated_only(self): collection.migrate_dump_to_delta() delta = DeltaUrl.objects.get(url=curated_url.url) assert delta.to_delete is True - assert delta.scraped_title == "" + assert delta.scraped_title == curated_url.scraped_title def test_identical_url_in_both(self): collection = CollectionFactory() @@ -233,77 +234,80 @@ def test_partial_data_in_curated_urls(): @pytest.mark.django_db -def test_patterns_applied_after_migration(): +def test_full_migration_with_patterns(): + """ + Test a complete migration flow with exclude patterns and document type patterns. + Tests the following scenarios: + - New URL from dump (should create delta) + - Updated URL from dump (should create delta with new title) + - Deleted URL (should create delta marked for deletion) + - URL matching exclude pattern (should be excluded) + - URL matching document type pattern (should have correct doc type) + """ collection = CollectionFactory() - # Add DumpUrls to migrate - DumpUrlFactory(collection=collection, url="https://exclude.com") - DumpUrlFactory(collection=collection, url="https://include.com") - DumpUrlFactory(collection=collection, url="https://neutral.com") + # Set up initial DumpUrls and CuratedUrls + DumpUrlFactory(collection=collection, url="https://example.com/new", scraped_title="New Page") + DumpUrlFactory(collection=collection, url="https://example.com/update", scraped_title="Updated Title") + DumpUrlFactory(collection=collection, url="https://example.com/docs/guide", scraped_title="Documentation Guide") - # Create exclude and include patterns + CuratedUrlFactory(collection=collection, url="https://example.com/update", scraped_title="Old Title") + CuratedUrlFactory(collection=collection, 
url="https://example.com/delete", scraped_title="Delete Me") + CuratedUrlFactory(collection=collection, url="https://example.com/docs/guide", scraped_title="Documentation Guide") + + # Create patterns before migration exclude_pattern = DeltaExcludePattern.objects.create( - collection=collection, match_pattern_type=2, match_pattern="exclude.*" + collection=collection, + match_pattern="https://example.com/delete", + match_pattern_type=1, # Individual URL + reason="Test exclusion", ) - include_pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern_type=2, match_pattern="include.*" + + doc_type_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="https://example.com/docs/*", + match_pattern_type=2, # Multi-URL pattern + document_type=DocumentTypes.DOCUMENTATION, ) - # Perform the migration + # Perform migration collection.migrate_dump_to_delta() - # Check that the patterns were applied - exclude_pattern.refresh_from_db() - include_pattern.refresh_from_db() - - # Verify exclude pattern relationship - assert exclude_pattern.delta_urls.filter( - url="https://exclude.com" - ).exists(), "Exclude pattern not applied to DeltaUrls." - - # Verify include pattern relationship - assert include_pattern.delta_urls.filter( - url="https://include.com" - ).exists(), "Include pattern not applied to DeltaUrls." + # 1. Check new URL was created as delta + new_delta = DeltaUrl.objects.get(url="https://example.com/new") + assert new_delta.to_delete is False + assert new_delta.scraped_title == "New Page" - # Ensure neutral URL is unaffected - assert not exclude_pattern.delta_urls.filter( - url="https://neutral.com" - ).exists(), "Exclude pattern incorrectly applied." - assert not include_pattern.delta_urls.filter( - url="https://neutral.com" - ).exists(), "Include pattern incorrectly applied." + # 2. 
Check updated URL has new title in delta + update_delta = DeltaUrl.objects.get(url="https://example.com/update") + assert update_delta.to_delete is False + assert update_delta.scraped_title == "Updated Title" + # 3. Check deleted URL is marked for deletion + delete_delta = DeltaUrl.objects.get(url="https://example.com/delete") + assert delete_delta.to_delete is True + assert delete_delta.excluded is True # Should be excluded due to pattern -@pytest.mark.django_db -def test_full_migration_with_patterns(): - collection = CollectionFactory() + # 4. Check documentation URL has correct type + docs_delta = DeltaUrl.objects.get(url="https://example.com/docs/guide") + assert docs_delta.document_type == DocumentTypes.DOCUMENTATION + assert docs_delta.to_delete is False - # Set up DumpUrls and CuratedUrls - DumpUrlFactory(collection=collection, url="https://new.com") - DumpUrlFactory(collection=collection, url="https://update.com", scraped_title="Updated Title") - CuratedUrlFactory(collection=collection, url="https://update.com", scraped_title="Old Title") - CuratedUrlFactory(collection=collection, url="https://delete.com") - - # Create patterns - exclude_pattern = DeltaExcludePattern.objects.create( - collection=collection, match_pattern_type=2, match_pattern="delete.*" - ) - include_pattern = DeltaIncludePattern.objects.create( - collection=collection, match_pattern_type=2, match_pattern="update.*" - ) + # 5. 
Verify pattern relationships + exclude_pattern.refresh_from_db() + doc_type_pattern.refresh_from_db() - # Perform migration - collection.migrate_dump_to_delta() + assert exclude_pattern.delta_urls.filter(url="https://example.com/delete").exists() + assert doc_type_pattern.delta_urls.filter(url="https://example.com/docs/guide").exists() - # Check DeltaUrls - assert DeltaUrl.objects.filter(url="https://new.com", to_delete=False).exists() - assert DeltaUrl.objects.filter(url="https://update.com", to_delete=False, scraped_title="Updated Title").exists() - assert DeltaUrl.objects.filter(url="https://delete.com", to_delete=True).exists() + # 6. Check total number of deltas is correct + assert DeltaUrl.objects.filter(collection=collection).count() == 4 - # Check patterns - exclude_pattern.refresh_from_db() - include_pattern.refresh_from_db() + # Optional: Test promotion to verify patterns stick + collection.promote_to_curated() - assert exclude_pattern.delta_urls.filter(url="https://delete.com").exists(), "Exclude pattern not applied." - assert include_pattern.delta_urls.filter(url="https://update.com").exists(), "Include pattern not applied." 
+ # Verify results after promotion + assert not CuratedUrl.objects.filter(url="https://example.com/delete").exists() + assert CuratedUrl.objects.get(url="https://example.com/docs/guide").document_type == DocumentTypes.DOCUMENTATION + assert CuratedUrl.objects.get(url="https://example.com/update").scraped_title == "Updated Title" + assert not CuratedUrl.objects.filter(scraped_title="Old Title").exists() diff --git a/sde_collections/tests/test_migration.py b/sde_collections/tests/test_migration.py new file mode 100644 index 00000000..211145e9 --- /dev/null +++ b/sde_collections/tests/test_migration.py @@ -0,0 +1,264 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_migration.py + +import pytest +from django.test import TestCase + +from sde_collections.models.collection_choice_fields import Divisions, DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDivisionPattern, + DeltaDocumentTypePattern, + DeltaExcludePattern, +) +from sde_collections.models.delta_url import DeltaUrl, DumpUrl +from sde_collections.tests.factories import ( + CollectionFactory, + CuratedUrlFactory, + DeltaUrlFactory, + DumpUrlFactory, +) + + +@pytest.mark.django_db +class TestMigrateDumpToDelta(TestCase): + """Test the migrate_dump_to_delta process comprehensively.""" + + def setUp(self): + self.collection = CollectionFactory() + + def test_basic_migration_new_url(self): + """Test basic migration of a new URL with no existing curated version.""" + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/new", + scraped_title="New Doc", + document_type=DocumentTypes.DOCUMENTATION, + division=Divisions.ASTROPHYSICS, + ) + + self.collection.migrate_dump_to_delta() + + # Verify delta created with all fields + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == dump_url.scraped_title + assert delta.document_type == dump_url.document_type + assert delta.division == dump_url.division + assert 
delta.to_delete is False + + def test_migration_with_differing_curated(self): + """Test migration when dump differs from existing curated URL.""" + url = "https://example.com/doc" + + dump_url = DumpUrlFactory( + collection=self.collection, + url=url, + scraped_title="New Title", + document_type=DocumentTypes.DATA, + ) + + CuratedUrlFactory( + collection=self.collection, + url=url, + scraped_title="Old Title", + document_type=DocumentTypes.DOCUMENTATION, + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=url) + assert delta.scraped_title == dump_url.scraped_title + assert delta.document_type == dump_url.document_type + assert delta.to_delete is False + + def test_migration_marks_missing_urls_for_deletion(self): + """Test that curated URLs not in dump are marked for deletion.""" + # Create only curated URL, no dump + curated_url = CuratedUrlFactory( + collection=self.collection, + url="https://example.com/old", + scraped_title="Old Doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=curated_url.url) + assert delta.to_delete is True + assert delta.scraped_title == curated_url.scraped_title + + def test_migration_handles_null_fields(self): + """Test migration properly handles null/empty fields.""" + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/doc", + scraped_title="", # Empty string + document_type=None, # Null + division=None, # Null + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.scraped_title == "" + assert delta.document_type is None + assert delta.division is None + + def test_migration_clears_existing_deltas(self): + """Test that existing deltas are cleared before migration.""" + # Create pre-existing delta + old_delta = DeltaUrlFactory( + collection=self.collection, + url="https://example.com/old", + scraped_title="Old Delta", + ) + + # Create new dump URL + new_dump = DumpUrlFactory( 
+ collection=self.collection, + url="https://example.com/new", + scraped_title="New Dump", + ) + + self.collection.migrate_dump_to_delta() + + # Verify old delta is gone and only new one exists + assert not DeltaUrl.objects.filter(url=old_delta.url).exists() + assert DeltaUrl.objects.filter(url=new_dump.url).exists() + + def test_migration_with_exclude_pattern(self): + """Test migration interacts correctly with exclude patterns.""" + # Create pattern first + DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="*internal*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + # Create dump URL that should be excluded + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/internal/doc", + scraped_title="Internal Doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert delta.excluded is True + + def test_migration_with_field_modifying_pattern(self): + """Test migration with patterns that modify fields.""" + # Create document type pattern + DeltaDocumentTypePattern.objects.create( + collection=self.collection, + match_pattern="*.pdf", + match_pattern_type=DeltaDocumentTypePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + document_type=DocumentTypes.DATA, + ) + + # Create division pattern + DeltaDivisionPattern.objects.create( + collection=self.collection, + match_pattern="*/astro/*", + match_pattern_type=DeltaDivisionPattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + division=Divisions.ASTROPHYSICS, + ) + + # Create dump URL that matches both patterns + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/astro/data.pdf", + scraped_title="Astro Data", + document_type=DocumentTypes.DOCUMENTATION, # Different from pattern + division=Divisions.EARTH_SCIENCE, # Different from pattern + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + 
assert delta.document_type == DocumentTypes.DATA + assert delta.division == Divisions.ASTROPHYSICS + + def test_migration_with_multiple_urls(self): + """Test migration with multiple URLs in various states.""" + # Create mix of dump and curated URLs + dump_urls = [DumpUrlFactory(collection=self.collection) for _ in range(3)] + curated_urls = [CuratedUrlFactory(collection=self.collection) for _ in range(2)] + + self.collection.migrate_dump_to_delta() + + # Should have deltas for all dump URLs + for dump_url in dump_urls: + assert DeltaUrl.objects.filter(url=dump_url.url, to_delete=False).exists() + + # Should have deletion deltas for curated URLs not in dump + for curated_url in curated_urls: + assert DeltaUrl.objects.filter(url=curated_url.url, to_delete=True).exists() + + def test_migration_with_empty_states(self): + """Test migration handles empty dump and curated states.""" + # No dump or curated URLs exist + self.collection.migrate_dump_to_delta() + assert DeltaUrl.objects.count() == 0 + + # Only curated URLs exist + CuratedUrlFactory(collection=self.collection) + self.collection.migrate_dump_to_delta() + assert DeltaUrl.objects.count() == 1 + assert DeltaUrl.objects.first().to_delete is True + + def test_migration_preserves_all_fields(self): + """Test that ALL fields are preserved during migration, not just changed ones.""" + # Create dump URL with all fields populated + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/doc", + scraped_title="Title", + scraped_text="Full text content", + generated_title="Generated Title", + document_type=DocumentTypes.DOCUMENTATION, + division=Divisions.ASTROPHYSICS, + visited=True, + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + + # Verify all fields were copied + fields_to_check = [ + "scraped_title", + "scraped_text", + "generated_title", + "document_type", + "division", + "visited", + ] + + for field in fields_to_check: + assert 
getattr(delta, field) == getattr(dump_url, field) + + def test_clearing_dump_urls(self): + """Test that dump URLs are cleared after migration.""" + DumpUrlFactory(collection=self.collection) + DumpUrlFactory(collection=self.collection) + + self.collection.migrate_dump_to_delta() + + assert DumpUrl.objects.filter(collection=self.collection).count() == 0 + + def test_pattern_relationships_updated(self): + """Test that pattern relationships are properly updated after migration.""" + pattern = DeltaExcludePattern.objects.create( + collection=self.collection, + match_pattern="*test*", + match_pattern_type=DeltaExcludePattern.MatchPatternTypeChoices.MULTI_URL_PATTERN, + ) + + dump_url = DumpUrlFactory( + collection=self.collection, + url="https://example.com/test/doc", + ) + + self.collection.migrate_dump_to_delta() + + delta = DeltaUrl.objects.get(url=dump_url.url) + assert pattern.delta_urls.filter(id=delta.id).exists() diff --git a/sde_collections/tests/test_pattern_specificity.py b/sde_collections/tests/test_pattern_specificity.py new file mode 100644 index 00000000..98c7f006 --- /dev/null +++ b/sde_collections/tests/test_pattern_specificity.py @@ -0,0 +1,158 @@ +# docker-compose -f local.yml run --rm django pytest sde_collections/tests/test_pattern_specificity.py + +import pytest + +from sde_collections.models.collection_choice_fields import DocumentTypes +from sde_collections.models.delta_patterns import ( + DeltaDocumentTypePattern, + DeltaResolvedTitle, + DeltaTitlePattern, +) +from sde_collections.models.delta_url import DeltaUrl +from sde_collections.tests.factories import CollectionFactory, DeltaUrlFactory + + +@pytest.mark.django_db +def test_title_pattern_multiple_resolved_titles_extended(): + """Test that patterns properly handle title resolution based on URL set size.""" + collection = CollectionFactory() + + # Create URLs with different levels of specificity + url1 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/item.html", 
scraped_title="Original Title" + ) + url2 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/item2.html", scraped_title="Original Title" + ) + url3 = DeltaUrlFactory( + collection=collection, url="https://example.com/docs/pdfs/item1.html", scraped_title="Original Title" + ) + + # Create general pattern (matches all URLs) + general_pattern = DeltaTitlePattern.objects.create( + collection=collection, + match_pattern="*docs*", + title_pattern="{title} - Docs", + match_pattern_type=2, + ) + + # Verify initial pattern application + assert general_pattern.get_url_match_count() == 3 + assert DeltaUrl.objects.get(pk=url1.pk).generated_title == "Original Title - Docs" + assert DeltaUrl.objects.get(pk=url2.pk).generated_title == "Original Title - Docs" + assert DeltaUrl.objects.get(pk=url3.pk).generated_title == "Original Title - Docs" + + # Verify DeltaResolvedTitle entries + assert DeltaResolvedTitle.objects.count() == 3 + for url in [url1, url2, url3]: + resolved = DeltaResolvedTitle.objects.get(delta_url=url) + assert resolved.title_pattern == general_pattern + assert resolved.resolved_title == "Original Title - Docs" + + # Create more specific pattern + specific_pattern = DeltaTitlePattern.objects.create( + collection=collection, match_pattern="*docs/pdfs*", title_pattern="{title} - HTML", match_pattern_type=2 + ) + + # Verify pattern match counts + assert specific_pattern.get_url_match_count() == 1 # Only matches pdfs URL + assert general_pattern.get_url_match_count() == 3 # Matches all URLs + + # Verify titles were updated appropriately + assert DeltaUrl.objects.get(pk=url1.pk).generated_title == "Original Title - Docs" # Unchanged + assert DeltaUrl.objects.get(pk=url2.pk).generated_title == "Original Title - Docs" # Unchanged + assert DeltaUrl.objects.get(pk=url3.pk).generated_title == "Original Title - HTML" # Updated + + # Verify DeltaResolvedTitle entries + assert DeltaResolvedTitle.objects.count() == 3 # Still one per URL + + # URLs with 
general pattern should be unchanged + for url in [url1, url2]: + resolved = DeltaResolvedTitle.objects.get(delta_url=url) + assert resolved.title_pattern == general_pattern + assert resolved.resolved_title == "Original Title - Docs" + + # PDF URL should now use specific pattern + resolved_pdf = DeltaResolvedTitle.objects.get(delta_url=url3) + assert resolved_pdf.title_pattern == specific_pattern + assert resolved_pdf.resolved_title == "Original Title - HTML" + + # Verify pattern relationships are maintained + assert url1 in general_pattern.delta_urls.all() + assert url2 in general_pattern.delta_urls.all() + assert url3 in general_pattern.delta_urls.all() + assert url3 in specific_pattern.delta_urls.all() + + +@pytest.mark.django_db +def test_field_modifying_pattern_layered_specificity(): + """Test overlapping patterns with different levels of specificity.""" + collection = CollectionFactory() + + # Create URLs in a hierarchy that allows for overlapping pattern matches + deep_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/analysis/v2/processor.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + mid_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/analysis/helper.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + top_tool = DeltaUrlFactory( + collection=collection, + url="https://example.com/tools/simple.py", + document_type=DocumentTypes.DOCUMENTATION, # Starting as documentation + ) + + # Create patterns with overlapping matches + broad_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/tools/*.py", # Matches all 3 URLs + document_type=DocumentTypes.SOFTWARETOOLS, + match_pattern_type=2, + ) + + mid_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/tools/analysis/*.py", # Matches 2 URLs (mid and deep) + document_type=DocumentTypes.DATA, # 
Different type to clearly show which pattern won + match_pattern_type=2, + ) + + specific_pattern = DeltaDocumentTypePattern.objects.create( + collection=collection, + match_pattern="*/analysis/v2/*.py", # Matches only 1 URL (deep) + document_type=DocumentTypes.DOCUMENTATION, # Different type to clearly show which pattern won + match_pattern_type=2, + ) + + # Verify URL match counts + assert broad_pattern.get_url_match_count() == 3 + assert mid_pattern.get_url_match_count() == 2 + assert specific_pattern.get_url_match_count() == 1 + + # Verify patterns were applied correctly based on specificity + deep_tool.refresh_from_db() + mid_tool.refresh_from_db() + top_tool.refresh_from_db() + + # The most specific pattern (1 match) should win for the deep URL + assert deep_tool.document_type == DocumentTypes.DOCUMENTATION, "Deep URL should use most specific pattern" + + # The mid-level pattern (2 matches) should win for the middle URL + assert mid_tool.document_type == DocumentTypes.DATA, "Mid URL should use mid-level pattern" + + # The broad pattern (3 matches) should only affect the top URL + assert top_tool.document_type == DocumentTypes.SOFTWARETOOLS, "Top URL should use broad pattern" + + # Verify the relationships are tracked correctly + assert deep_tool.pk in specific_pattern.delta_urls.values_list("pk", flat=True) + assert deep_tool.pk in mid_pattern.delta_urls.values_list("pk", flat=True) + assert deep_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) + + assert mid_tool.pk in mid_pattern.delta_urls.values_list("pk", flat=True) + assert mid_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) + + assert top_tool.pk in broad_pattern.delta_urls.values_list("pk", flat=True) diff --git a/sde_collections/utils/title_resolver.py b/sde_collections/utils/title_resolver.py index 20211bf7..165065d9 100644 --- a/sde_collections/utils/title_resolver.py +++ b/sde_collections/utils/title_resolver.py @@ -63,29 +63,33 @@ def resolve_xpath(xpath: str, 
url: str) -> str: if not is_valid_xpath(xpath): raise ValueError(f"The xpath, {xpath}, is not valid.") - response = requests.get(url) - - if response.ok: - tree = html.fromstring(response.content) - values = tree.xpath(xpath) - - if len(values) == 1: - if isinstance(values[0], str): - text_content = values[0] - else: - text_content = values[0].text - - if text_content: - text_content = clean_text(text_content) - return text_content + try: + response = requests.get(url) + + if response.ok: + tree = html.fromstring(response.content) + values = tree.xpath(xpath) + + if len(values) == 1: + if isinstance(values[0], str): + text_content = values[0] + else: + text_content = values[0].text + + if text_content: + text_content = clean_text(text_content) + return text_content + else: + raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") + elif len(values) > 1: + raise ValueError(f"More than one element found for the xpath, {xpath}") else: - raise ValueError(f"The element at the xpath, {xpath}, does not contain any text content.") - elif len(values) > 1: - raise ValueError(f"More than one element found for the xpath, {xpath}") + raise ValueError(f"No element found for the xpath, {xpath}") else: - raise ValueError(f"No element found for the xpath, {xpath}") - else: - raise ValueError(f"Failed to retrieve the {url}. Status code: {response.status_code}") + raise ValueError(f"Failed to retrieve the {url}. 
Status code: {response.status_code}") + + except requests.RequestException as e: + raise ValueError(f"Network error while accessing {url}: {str(e)}") def parse_title(input_string: str) -> list[tuple[str, str]]: diff --git a/sde_collections/views.py b/sde_collections/views.py index 6c63abfe..3ceaed84 100644 --- a/sde_collections/views.py +++ b/sde_collections/views.py @@ -31,14 +31,11 @@ DeltaDocumentTypePattern, DeltaExcludePattern, DeltaIncludePattern, - DeltaTitlePattern, -) -from .models.delta_url import ( - CuratedUrl, DeltaResolvedTitle, DeltaResolvedTitleError, - DeltaUrl, + DeltaTitlePattern, ) +from .models.delta_url import CuratedUrl, DeltaUrl from .serializers import ( CollectionReadSerializer, CollectionSerializer,