Skip to content

Commit

Permalink
Merge pull request #1146 from NASA-IMPACT/1142-test-field-modified-un…
Browse files Browse the repository at this point in the history
…apply-logic

refactor readme for unapply logic and refactor unapply to account for overlapping patterns
  • Loading branch information
CarsonDavis authored Dec 13, 2024
2 parents a0a80e9 + 85a65d0 commit aef7cdb
Show file tree
Hide file tree
Showing 5 changed files with 726 additions and 63 deletions.
105 changes: 77 additions & 28 deletions sde_collections/models/README_UNAPPLY_LOGIC.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,105 @@
- Delta URL exists with pattern effect
- Pattern is removed
```
Curated: None
Delta: division=BIOLOGY (from pattern)
[Pattern removed]
Result: Delta remains with division=None
Curated: None exists
Delta: url=new.com, division=None
```
`[Pattern: division=BIOLOGY], created`
```
Curated: None exists
Delta: url=new.com, division=BIOLOGY
```
`[Pattern: division=BIOLOGY], deleted`
```
Curated: None exists
Delta: url=new.com, division=None
```

### Case 2: Delta and Curated Exist
### Case 2: Delta Created to Apply Pattern
**Scenario:**
- Both curated and delta URLs exist
- A Curated with no division already exists
- A pattern is created
- A delta is created to apply a pattern
- Pattern is removed
- Delta should be deleted
```
Curated: division=None
```
Curated: division=GENERAL
`[Pattern: division=BIOLOGY], created`
```
Curated: division=None
Delta: division=BIOLOGY (from pattern)
[Pattern removed]
Result: Delta reverts to curated value (division=GENERAL)
If delta now matches curated exactly, delta is deleted
```
`[Pattern: division=BIOLOGY], deleted`
```
Curated: division=None
```

### Case 3: Curated Only
**Scenario:**
- Only curated URL exists
### Case 3: Pre-existing Delta
- A Curated with no division already exists
- A Delta with an updated scraped_title exists
- A pattern is created to set division
- A delta is created to apply a pattern
- Pattern is removed
- Delta should be maintained because of scraped_title

```
Curated: division=None
Delta: scraped_title="Modified", division=None
```
`[Pattern: division=BIOLOGY], created`
```
Curated: division=None
Delta: scraped_title="Modified", division=BIOLOGY (from pattern)
```
`[Pattern: division=BIOLOGY], deleted`
```
Curated: division=GENERAL
Delta: None
[Pattern removed]
Result: New delta created with division=None
Curated: division=None
Delta: scraped_title="Modified", division=None
```

### Case 4: Multiple Pattern Effects
**Scenario:**
- Delta has changes from multiple patterns
- One pattern is removed
```
Curated: division=GENERAL, doc_type=DOCUMENTATION
Delta: division=BIOLOGY, doc_type=DATA (from two patterns)
[Division pattern removed]
Result: Delta remains with division=GENERAL, doc_type=DATA preserved
Pattern: division=BIOLOGY
Pattern: doc_type=DATA
```
`[Pattern: division=BIOLOGY], deleted`
```
Delta: division=None, doc_type=DATA
Pattern: doc_type=DATA
```

### Case 5: Pattern Removal with Manual Changes
**Scenario:**
- Delta has both pattern effect and manual changes
- Pattern is removed
### Case 5: Overlapping Patterns, Specific Deleted
```
Curated: division=GENERAL, title="Original"
Delta: division=BIOLOGY, title="Modified" (pattern + manual)
[Pattern removed]
Result: Delta remains with division=GENERAL, title="Modified" preserved
Delta: division=ASTROPHYSICS (because of specific pattern)
Specific Pattern: division=ASTROPHYSICS
General Pattern: division=BIOLOGY
```
`[Specific Pattern: division=ASTROPHYSICS], deleted`

```
Delta: division=BIOLOGY (because of general pattern)
General Pattern: division=BIOLOGY
```


### Case 6: Overlapping Patterns, General Deleted
```
Delta: division=ASTROPHYSICS (because of specific pattern)
Specific Pattern: division=ASTROPHYSICS
General Pattern: division=BIOLOGY
```
`[General Pattern: division=BIOLOGY], deleted`

```
Delta: division=ASTROPHYSICS (because of specific pattern)
Specific Pattern: division=ASTROPHYSICS
```


## Implementation Steps

Expand Down
8 changes: 4 additions & 4 deletions sde_collections/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,18 +211,18 @@ def promote_to_curated(self):
continue

delta_value = getattr(delta, field_name)
if delta_value not in [None, ""] and getattr(curated, field_name) != delta_value:
if getattr(curated, field_name) != delta_value:
updated_fields[field_name] = delta_value

if updated_fields:
CuratedUrl.objects.filter(pk=curated.pk).update(**updated_fields)
else:
# If no matching CuratedUrl, create a new one using all non-null and non-empty fields
# Previously, we excluded fields with values of None and ""
# however, such null values are considered meaningful and should be copied over
new_data = {
field.name: getattr(delta, field.name)
for field in delta._meta.fields
if field.name not in ["to_delete", "collection", "id"]
and getattr(delta, field.name) not in [None, ""]
if field.name not in ["to_delete", "collection", "id"] and getattr(delta, field.name)
}
CuratedUrl.objects.create(collection=self, **new_data)

Expand Down
143 changes: 112 additions & 31 deletions sde_collections/models/delta_patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,16 +333,28 @@ def unapply(self) -> None:
affected_deltas = self.delta_urls.all()
affected_curated = self.curated_urls.all()

# Get all other patterns of same type for this collection
pattern_class = self.__class__
other_patterns = pattern_class.objects.filter(collection=self.collection).exclude(id=self.id)

# Process each affected delta URL
for delta in affected_deltas:
curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first()

if not curated:
# Scenario 1: Delta only - new URL
setattr(delta, field, None)
# Find next most specific matching pattern if any
matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)]

next_pattern = None
if matching_patterns:
# Sort by number of URLs matched (ascending) to find most specific
next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count())

if next_pattern:
# Apply next most specific pattern's value
setattr(delta, field, next_pattern.get_new_value())
delta.save()
else:
# Scenario 2: Both exist
elif curated:
# No other patterns match, revert to curated value
setattr(delta, field, getattr(curated, field))
delta.save()

Expand All @@ -354,17 +366,36 @@ def unapply(self) -> None:
)
if fields_match:
delta.delete()
else:
# No curated URL or other patterns, set to None
setattr(delta, field, None)
delta.save()

# Handle curated URLs that don't have deltas
for curated in affected_curated:
if not DeltaUrl.objects.filter(url=curated.url).exists():
# Scenario 3: Curated only
# Copy all fields from curated except the one we're nulling
fields = {
f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", "collection"]
}
fields[field] = None # Set the pattern's field to None
delta = DeltaUrl.objects.create(collection=self.collection, **fields)
# Find any matching patterns
matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)]

if matching_patterns:
# Apply most specific pattern's value
next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count())
fields = {
f.name: getattr(curated, f.name)
for f in curated._meta.fields
if f.name not in ["id", "collection"]
}
fields[field] = next_pattern.get_new_value()
DeltaUrl.objects.create(collection=self.collection, **fields)
else:
# No other patterns, create delta with None
fields = {
f.name: getattr(curated, f.name)
for f in curated._meta.fields
if f.name not in ["id", "collection"]
}
fields[field] = None
DeltaUrl.objects.create(collection=self.collection, **fields)

# Clear pattern relationships
self.delta_urls.clear()
Expand Down Expand Up @@ -523,11 +554,12 @@ def apply(self) -> None:

def unapply(self) -> None:
"""
Remove title modifications:
1. Create Delta URLs for affected Curated URLs to explicitly clear titles
2. Remove generated titles from affected Delta URLs
3. Clean up Delta URLs that become identical to their Curated URL
4. Clear resolution tracking
Remove title modifications, maintaining pattern precedence:
1. Find any remaining patterns that match each URL
2. Apply most specific matching pattern's title if one exists
3. Otherwise revert to curated title or clear title
4. Update title resolution tracking
5. Clean up redundant deltas
"""
DeltaUrl = apps.get_model("sde_collections", "DeltaUrl")
CuratedUrl = apps.get_model("sde_collections", "CuratedUrl")
Expand All @@ -538,16 +570,36 @@ def unapply(self) -> None:
affected_deltas = self.delta_urls.all()
affected_curated = self.curated_urls.all()

# Get all other title patterns for this collection
other_patterns = DeltaTitlePattern.objects.filter(collection=self.collection).exclude(id=self.id)

# Process each affected delta URL
for delta in affected_deltas:
curated = CuratedUrl.objects.filter(collection=self.collection, url=delta.url).first()

if not curated:
# Scenario 1: Delta only - clear generated title
delta.generated_title = ""
delta.save()
else:
# Scenario 2: Both exist - revert to curated title
# Find next most specific matching pattern if any
matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), delta.url)]

next_pattern = None
if matching_patterns:
# Sort by number of URLs matched (ascending) to find most specific
next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count())

if next_pattern:
# Apply next most specific pattern's title
new_title, error = next_pattern.generate_title_for_url(delta)
if error:
DeltaResolvedTitleError.objects.update_or_create(
delta_url=delta, defaults={"title_pattern": next_pattern, "error_string": error}
)
else:
delta.generated_title = new_title
delta.save()
DeltaResolvedTitle.objects.update_or_create(
delta_url=delta, defaults={"title_pattern": next_pattern, "resolved_title": new_title}
)
elif curated:
# No other patterns match, revert to curated title
delta.generated_title = curated.generated_title
delta.save()

Expand All @@ -559,18 +611,47 @@ def unapply(self) -> None:
)
if fields_match:
delta.delete()
else:
# No curated URL or other patterns, clear title
delta.generated_title = ""
delta.save()

# Handle curated URLs that don't have deltas
for curated in affected_curated:
if not DeltaUrl.objects.filter(url=curated.url).exists():
# Scenario 3: Curated only - create delta with cleared title
fields = {
f.name: getattr(curated, f.name) for f in curated._meta.fields if f.name not in ["id", "collection"]
}
fields["generated_title"] = ""
DeltaUrl.objects.create(collection=self.collection, **fields)

# Clear resolution tracking
# Find any matching patterns
matching_patterns = [p for p in other_patterns if re.search(p.get_regex_pattern(), curated.url)]

if matching_patterns:
# Apply most specific pattern's title
next_pattern = min(matching_patterns, key=lambda p: p.get_url_match_count())

# Copy all fields from curated
fields = {
f.name: getattr(curated, f.name)
for f in curated._meta.fields
if f.name not in ["id", "collection"]
}

# Generate and apply new title
new_title, error = next_pattern.generate_title_for_url(curated)
if not error:
fields["generated_title"] = new_title
delta = DeltaUrl.objects.create(collection=self.collection, **fields)
DeltaResolvedTitle.objects.create(
title_pattern=next_pattern, delta_url=delta, resolved_title=new_title
)
else:
# No other patterns, create delta with cleared title
fields = {
f.name: getattr(curated, f.name)
for f in curated._meta.fields
if f.name not in ["id", "collection"]
}
fields["generated_title"] = ""
DeltaUrl.objects.create(collection=self.collection, **fields)

# Clear resolution tracking for this pattern
DeltaResolvedTitle.objects.filter(title_pattern=self).delete()
DeltaResolvedTitleError.objects.filter(title_pattern=self).delete()

Expand Down
Loading

0 comments on commit aef7cdb

Please sign in to comment.